In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to all subsequent matplotlib plots
sns.set()

## Logistic regression model

First:

    - Split into train and test sets
    - Standardize features
    - Clean up data where necessary

In [2]:
# Load the preprocessed absenteeism data set
preprocessed_data = pd.read_csv('Absenteeism_preprocessed.csv')

y = preprocessed_data['Absenteeism Time in Hours']  # dependent variable
X = preprocessed_data.drop(columns='Absenteeism Time in Hours')  # independent variables

# Split into train and test sets (80/20); the fixed seed keeps the split reproducible
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create scaler object to standardise features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Features to be standardised; every other column is passed through unscaled
cols_to_scale = ['Transportation Expense', 'Distance to Work', 'Age',
                 'Daily Work Load Average', 'Body Mass Index', 'month',
                 'day of week', 'Children', 'Pets']

# Fit on the training split only, then reuse those statistics for the test split
X_scaled = scaler.fit_transform(X_train[cols_to_scale])
X_scaled_test = scaler.transform(X_test[cols_to_scale])

train_scaled_df = pd.DataFrame(X_scaled, columns=cols_to_scale)
test_scaled_df = pd.DataFrame(X_scaled_test, columns=cols_to_scale)

# Combine the unscaled columns with the standardised ones.
# reset_index(drop=True) discards the shuffled row labels left by train_test_split,
# so both frames share a 0..n-1 index and concat aligns them row by row
# (no temporary 'index' column has to be dropped afterwards).
train_scaled = pd.concat(
    [X_train.drop(columns=cols_to_scale).reset_index(drop=True), train_scaled_df],
    axis=1,
)
test_scaled = pd.concat(
    [X_test.drop(columns=cols_to_scale).reset_index(drop=True), test_scaled_df],
    axis=1,
)

train_scaled.head()

Unnamed: 0,Education,group_1,group_2,group_3,group_4,bEducation,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,month,day of week,Children,Pets
0,1,1,0,0,0,0,-0.663158,1.395943,0.219843,0.890554,0.998129,1.357572,0.649803,-0.905873,-0.567181
1,1,0,0,0,0,0,0.029553,-0.271423,-1.334604,-0.758513,-0.645157,0.784022,-1.356943,-0.02669,1.100123
2,1,0,0,1,0,0,0.345791,-0.338118,0.686177,2.700691,-0.879913,-0.076303,-1.356943,-0.905873,-0.567181
3,3,0,0,0,1,1,-0.663158,-0.271423,-1.023714,-0.636185,-1.818933,-0.649852,1.318719,-0.905873,-0.567181
4,1,0,0,0,1,0,-1.581753,-1.338538,0.064399,-0.07104,0.293863,1.357572,-1.356943,-0.905873,-0.567181


In [3]:
# Drop the raw Education column
# (presumably superseded by the binary bEducation column - TODO confirm).
# Reassigning with errors='ignore' keeps this cell idempotent: re-running it
# no longer raises KeyError once the column has already been removed.

train_scaled = train_scaled.drop(columns='Education', errors='ignore')
test_scaled = test_scaled.drop(columns='Education', errors='ignore')

### Target feature engineering

    - Distinguish between groups that have been excessively absent and those that have been moderately absent
    - We can use the median value of Absenteeism time as the cut-off
    - if value > median -> excessively absent

In [4]:
# Peek at the raw target (hours absent) before it is binarised
y_train.head()

82     8
51     0
220    8
669    2
545    8
Name: Absenteeism Time in Hours, dtype: int64

In [5]:
# Cut-off is the median of the training targets (3 hours); it is computed from
# the training split only, so the test split does not influence the threshold
threshold = y_train.median()

# 1 = excessively absent (strictly above the median), 0 = moderately absent.
# A vectorised comparison replaces the row-wise apply/lambda.
train_targets = (y_train > threshold).astype(int)
test_targets = (y_test > threshold).astype(int)

train_targets.head()

82     1
51     0
220    1
669    0
545    1
Name: Absenteeism Time in Hours, dtype: int64

### Look at balance of targets

In [6]:
# Share of positive (excessively absent) examples in the training set;
# the mean of a 0/1 series is its sum divided by its length
train_targets.mean()

0.4660714285714286

### Build models

#### Use non-optimised model as baseline

In [7]:
# Start model with default hyperparameters
from sklearn.linear_model import LogisticRegression
lr_clf = LogisticRegression()
lr_clf.fit(train_scaled, train_targets)
lr_clf.score(test_scaled, test_targets)

0.8071428571428572

### Check score on training set

In [8]:
# Training accuracy of the baseline, for comparison with its test accuracy
lr_clf.score(train_scaled, train_targets)

0.7839285714285714

### Optimise with gridsearch

In [9]:
from sklearn.model_selection import GridSearchCV

# Search grid: L1 and L2 penalties over a range of inverse regularisation
# strengths, all with the liblinear solver
params = dict(
    penalty=['l1', 'l2'],
    C=[0.01, 0.1, 1, 3, 5, 10],
    solver=['liblinear'],
)

# Base estimator whose hyperparameters are tuned
lr_clf1 = LogisticRegression(random_state=42, max_iter=1000)

# Exhaustive 5-fold cross-validated search on the training data
model = GridSearchCV(estimator=lr_clf1, param_grid=params, cv=5)
model.fit(train_scaled, train_targets)
model.best_params_

{'C': 5, 'penalty': 'l1', 'solver': 'liblinear'}

### Final model

In [10]:
# Refit the final model with the hyperparameters selected by the grid search
# instead of hard-coding them, so this cell cannot drift from model.best_params_
# (C=5, penalty='l1', solver='liblinear' for the run recorded in this notebook).
lr_clf_opt = LogisticRegression(max_iter=1000, random_state=42, **model.best_params_)
lr_clf_opt.fit(train_scaled, train_targets)
lr_clf_opt.score(test_scaled, test_targets)  # accuracy on the held-out test set

0.8071428571428572

### Find coefficients and intercepts

In [11]:
# Intercept (bias term) of the fitted final model
lr_clf_opt.intercept_

array([-4.41397083])

In [12]:
# Coefficients of the fitted final model, one per column of train_scaled (same order)
lr_clf_opt.coef_

array([[ 5.80597026,  4.31074856,  6.52822243,  3.53827747, -0.61226306,
         0.75066957, -0.12235074, -0.3068625 , -0.04648414,  0.26273031,
         0.12828057, -0.15184481,  0.43074674, -0.57379703]])

In [13]:
# Pair every feature name with its fitted coefficient for easier reading
features = train_scaled.columns.values

summary_table = pd.DataFrame(features, columns=['Feature name']).assign(
    coefficients=lr_clf_opt.coef_[0]
)
summary_table

Unnamed: 0,Feature name,coefficients
0,group_1,5.80597
1,group_2,4.310749
2,group_3,6.528222
3,group_4,3.538277
4,bEducation,-0.612263
5,Transportation Expense,0.75067
6,Distance to Work,-0.122351
7,Age,-0.306863
8,Daily Work Load Average,-0.046484
9,Body Mass Index,0.26273


### Save Model and Scaler

In [16]:
import pickle

# Persist both fitted objects: the model itself and the scaler that was fitted
# on the training features, each under its own file name
for artifact_path, artifact in (('model', lr_clf_opt), ('scaler', scaler)):
    with open(artifact_path, 'wb') as file:
        pickle.dump(artifact, file)