In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [4]:
# Read the absenteeism records from the CSV file and preview the first 20 rows
data = pd.read_csv(filepath_or_buffer='Absenteeism_data.csv')
data.iloc[:20]

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,11,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,36,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,3,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,11,23,23/07/2015,289,36,33,239.554,30,1,2,1,2
5,3,23,10/07/2015,179,51,38,239.554,31,1,0,0,2
6,10,22,17/07/2015,361,52,28,239.554,27,1,1,4,8
7,20,23,24/07/2015,260,50,36,239.554,23,1,4,0,4
8,14,19,06/07/2015,155,12,34,239.554,25,1,2,0,40
9,1,22,13/07/2015,235,11,37,239.554,29,3,1,1,8


In [3]:
# Check no. of nulls: info() prints the non-null count and dtype of every column,
# so a column with fewer non-null entries than rows would contain missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID                         700 non-null    int64  
 1   Reason for Absence         700 non-null    int64  
 2   Date                       700 non-null    object 
 3   Transportation Expense     700 non-null    int64  
 4   Distance to Work           700 non-null    int64  
 5   Age                        700 non-null    int64  
 6   Daily Work Load Average    700 non-null    float64
 7   Body Mass Index            700 non-null    int64  
 8   Education                  700 non-null    int64  
 9   Children                   700 non-null    int64  
 10  Pets                       700 non-null    int64  
 11  Absenteeism Time in Hours  700 non-null    int64  
dtypes: float64(1), int64(10), object(1)
memory usage: 65.8+ KB


The reasons for being absent from work can be split into 4 distinct types.
Rather than having 29 different columns, one for each specific reason,
we can group them as below.

In [49]:
# Get dummies for the "Reason for Absence" column (one indicator column per reason code)
dummy_cols = pd.get_dummies(data['Reason for Absence'])

data_copy = data.copy()

# Group the different types of absences into four indicator columns.
# The dummy columns are labelled with the reason codes themselves, and get_dummies
# only creates a column for codes that actually occur in the data. Selecting by
# label (.loc slices are inclusive on both ends) therefore keeps every code in its
# intended group even if some code is absent, whereas positional slicing would
# shift all later codes by one place.
data_copy['group_1'] = dummy_cols.loc[:, :14].sum(axis=1).astype(int)    # codes 0-14
data_copy['group_2'] = dummy_cols.loc[:, 15:17].sum(axis=1).astype(int)  # codes 15-17
data_copy['group_3'] = dummy_cols.loc[:, 18:21].sum(axis=1).astype(int)  # codes 18-21
data_copy['group_4'] = dummy_cols.loc[:, 22:].sum(axis=1).astype(int)    # codes 22 and above

data_copy.head()

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,group_1,group_2,group_3,group_4
0,11,26,07/07/2015,289,36,33,239.554,30,1,2,1,4,0,0,0,1
1,36,0,14/07/2015,118,13,50,239.554,31,1,1,0,0,1,0,0,0
2,3,23,15/07/2015,179,51,38,239.554,31,1,0,0,2,0,0,0,1
3,7,7,16/07/2015,279,5,39,239.554,24,1,2,0,4,1,0,0,0
4,11,23,23/07/2015,289,36,33,239.554,30,1,2,1,2,0,0,0,1


In [50]:
# The raw reason code is now represented by the group_* indicators, so remove it
data_copy = data_copy.drop(columns=['Reason for Absence'])

In [51]:
from datetime import datetime  # not needed by this cell any more; kept in case other cells use the name

# Parse the day/month/year strings once, then derive both calendar features
# from the parsed dates instead of re-parsing every string per feature
parsed_dates = pd.to_datetime(data_copy['Date'], format='%d/%m/%Y')
data_copy['month'] = parsed_dates.dt.month
# dayofweek uses Monday=0 ... Sunday=6, the same convention as datetime.weekday()
data_copy['day of week'] = parsed_dates.dt.dayofweek
data_copy.head()

Unnamed: 0,ID,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,group_1,group_2,group_3,group_4,month,day of week
0,11,07/07/2015,289,36,33,239.554,30,1,2,1,4,0,0,0,1,7,1
1,36,14/07/2015,118,13,50,239.554,31,1,1,0,0,1,0,0,0,7,1
2,3,15/07/2015,179,51,38,239.554,31,1,0,0,2,0,0,0,1,7,2
3,7,16/07/2015,279,5,39,239.554,24,1,2,0,4,1,0,0,0,7,3
4,11,23/07/2015,289,36,33,239.554,30,1,2,1,2,0,0,0,1,7,3


In [52]:
# Month and day of week have been extracted, so remove the raw date string
data_copy = data_copy.drop(columns=['Date'])

In [57]:
# Distinct education codes present in the data
data_copy['Education'].unique()

array([1, 3, 2, 4])

Education (codes as they appear in the data, see the unique values above):
    1: High school degree
    2: Graduate
    3: Post Grad
    4: Master or PhD

We will transform this into a binary category.

In [70]:
# Collapse the four education codes into a binary flag: code 1 -> 0, codes 2, 3, 4 -> 1
data_copy['bEducation'] = data_copy['Education'].ne(1).astype('int64')
data_copy[['bEducation', 'Education']].head(10)

Unnamed: 0,bEducation,Education
0,0,1
1,0,1
2,0,1
3,0,1
4,0,1
5,0,1
6,0,1
7,0,1
8,0,1
9,1,3


In [71]:
# Remove the ID column from the working frame
data_copy = data_copy.drop(columns=['ID'])

In [72]:
# Preview the first five rows of the prepared frame
data_copy.head(5)

Unnamed: 0,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,group_1,group_2,group_3,group_4,month,day of week,bEducation
0,289,36,33,239.554,30,1,2,1,4,0,0,0,1,7,1,0
1,118,13,50,239.554,31,1,1,0,0,1,0,0,0,7,1,0
2,179,51,38,239.554,31,1,0,0,2,0,0,0,1,7,2,0
3,279,5,39,239.554,24,1,2,0,4,1,0,0,0,7,3,0
4,289,36,33,239.554,30,1,2,1,2,0,0,0,1,7,3,0


## Logistic regression model

First:

    - Split into train and test sets
    - Standardize features that need standardizing
    - Clean up data where necessary

In [95]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# NOTE(review): the target is the raw number of absence hours, so the classifier
# fitted later treats every distinct hour value as its own class — confirm this
# is intended.
y = data_copy['Absenteeism Time in Hours']  # dependent var
X = data_copy.drop('Absenteeism Time in Hours', axis=1)  # independent vars

# Split into train and test (80/20) with a fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Features to be standardised; every other column is passed through unchanged
cols_to_scale = ['Transportation Expense', 'Distance to Work',
                 'Age', 'Daily Work Load Average', 'Body Mass Index']

# Fit the scaler on the training rows only, then apply the same transform to the
# test rows so no test-set statistics leak into the scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train[cols_to_scale])
X_scaled_test = scaler.transform(X_test[cols_to_scale])

train_scaled_df = pd.DataFrame(X_scaled, columns=cols_to_scale)
test_scaled_df = pd.DataFrame(X_scaled_test, columns=cols_to_scale)


def _with_scaled_features(features, scaled_features):
    """Return the unscaled columns of `features` followed by `scaled_features`.

    `features` still carries the shuffled row labels from train_test_split,
    while `scaled_features` is numbered 0..n-1. The labels of `features` are
    therefore discarded (reset_index(drop=True)) so that the two parts are
    concatenated row by row in their existing order.
    """
    unscaled = features.drop(columns=cols_to_scale).reset_index(drop=True)
    return pd.concat([unscaled, scaled_features], axis=1)


# Replace the original (unstandardised) features with their standardised versions
X_train_scaled = _with_scaled_features(X_train, train_scaled_df)
X_test_scaled = _with_scaled_features(X_test, test_scaled_df)

X_train_scaled.head()

Unnamed: 0,Education,Children,Pets,group_1,group_2,group_3,group_4,month,day of week,bEducation,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index
0,1,0,0,1,0,0,0,11,3,0,-0.663158,1.395943,0.219843,0.890554,0.998129
1,1,1,2,1,0,0,0,9,0,0,0.029553,-0.271423,-1.334604,-0.758513,-0.645157
2,1,0,0,0,0,1,0,6,0,0,0.345791,-0.338118,0.686177,2.700691,-0.879913
3,3,0,0,0,0,0,1,4,4,1,-0.663158,-0.271423,-1.023714,-0.636185,-1.818933
4,1,0,0,0,0,0,1,11,0,0,-1.581753,-1.338538,0.064399,-0.07104,0.293863


Note: the original Education column is still in the feature table. It should be removed, since bEducation replaces it.

In [None]:
# Remove the original Education codes from both feature sets (bEducation is kept)
X_train_scaled = X_train_scaled.drop(columns=['Education'])
X_test_scaled = X_test_scaled.drop(columns=['Education'])

In [98]:
# Start model with default hyperparameters
from sklearn.linear_model import LogisticRegression
lr_clf = LogisticRegression()
lr_clf.fit(X_train_scaled, y_train)
lr_clf.score(X_train_scaled, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.49642857142857144

In [103]:
from sklearn.model_selection import GridSearchCV

# Search space: L1 and L2 penalties over several values of C,
# always with the liblinear solver
params = dict(
    penalty=['l1', 'l2'],
    C=[0.01, 0.1, 1, 3, 5, 10],
    solver=['liblinear'],
)

lr_clf = LogisticRegression(random_state=42, max_iter=1000)

# 5-fold cross-validated grid search on the training set
model = GridSearchCV(estimator=lr_clf, param_grid=params, cv=5)
model.fit(X_train_scaled, y_train)
model.best_params_



{'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}

In [105]:
# Refit on the full training set using the hyperparameters selected by the grid
# search. They are read from model.best_params_ rather than copied by hand, so
# this cell stays correct if the grid or the data changes.
lr_clf = LogisticRegression(max_iter=1000, random_state=42, **model.best_params_)
lr_clf.fit(X_train_scaled, y_train)
# Mean accuracy on the held-out test set
lr_clf.score(X_test_scaled, y_test)

0.4