# Classification to predict Absenteeism

## Import the libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
# Raise pandas' display limits (rows, columns, console width) for the whole notebook.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

## Load the Data

In [2]:
# Read the preprocessed absenteeism data; the path is relative to the working directory.
df_preprocessed = pd.read_csv(filepath_or_buffer='Absenteeism_preprocessed.csv')

## Target Variable

In [3]:
# Count how often each distinct number of absence hours occurs.
df_preprocessed.loc[:, 'Absenteeism Time in Hours'].value_counts()

8      195
2      149
3      106
1       87
4       57
0       39
16      18
24      15
40       7
5        7
32       6
64       3
112      2
56       2
80       2
120      2
48       1
7        1
104      1
Name: Absenteeism Time in Hours, dtype: int64

'Absenteeism Time in Hours' has a lot of different values. We want to predict if an employee will be Excessively absent or Moderately absent.
The target variable will be prepared by taking the median value for absenteeism.
- The people at or below the median value will be moderate.
- The people above the median value will be excessive.

In [4]:
# Median of the absence hours (the class cut-off used when encoding the target).
df_preprocessed['Absenteeism Time in Hours'].median()

3.0

##### If Absent hours is greater than median, encode it to 1, and 0 otherwise.

In [5]:
# Binary target: 1 when the absence hours are strictly above the median, 0 otherwise
# (so rows exactly at the median fall into class 0).
absence_hours = df_preprocessed['Absenteeism Time in Hours']
is_excessive = absence_hours > absence_hours.median()
targets = np.where(is_excessive, 1, 0)
Targets = pd.Series(data=targets, name='Target')

In [6]:
# Attach the binary target and remove the raw hours column it was derived from.
# Written as one non-mutating chain: any 'Target' column left by an earlier run is
# dropped first (errors='ignore' tolerates columns that are already gone), so the
# cell can be re-run safely. The previous concat + in-place drop added a duplicate
# 'Target' column and then raised KeyError on a second run.
df_preprocessed = (
    df_preprocessed
    .drop(columns=['Absenteeism Time in Hours', 'Target'], errors='ignore')
    .assign(Target=Targets)  # aligned on the row index, placed as the last column
)

df_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,month,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Target
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0


## Inputs and Targets

### Input features

In [7]:
# Every column except the target. Selecting by name (not by position) keeps the
# feature set correct even if the column order of df_preprocessed changes.
inputs_unscaled = df_preprocessed.drop(columns=['Target'])

### Target Feature

In [8]:
# The binary label column, selected by name so it does not depend on being the last column.
target = df_preprocessed['Target']

## Standardize

##### Since we have dummy variables that should not be scaled, we will define our own scaler.

In [9]:
# Independent (deep) copy of the unscaled inputs.
# NOTE(review): `dfcopy` is not referenced again in the visible notebook — confirm it is still needed.
dfcopy = inputs_unscaled.copy(deep=True)

In [10]:
class MyScaler(BaseEstimator,TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and pass the others through.

    Wraps a ``StandardScaler`` that is fitted on, and applied to, only the
    columns named in ``columns``. All remaining columns of the input frame are
    returned unchanged, and the original column order is restored.

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardize.
    with_mean, with_std, copy : bool, default True
        Forwarded to ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler.
    mean_, var_ : numpy.ndarray or None
        Per-column mean and population variance (ddof=0) of ``columns``,
        set by ``fit``; ``None`` before fitting.
    """

    def __init__(self,columns,with_mean=True,with_std=True,copy=True):
        # Keep every constructor argument under an attribute of the same name:
        # BaseEstimator.get_params() (and therefore repr and clone) reads them back.
        self.columns = columns
        self.with_mean = with_mean
        self.with_std = with_std
        self.copy = copy
        # Pass by keyword: positional use of these parameters is rejected by
        # recent scikit-learn releases.
        self.scaler = StandardScaler(copy=copy,with_mean=with_mean,with_std=with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self,X,y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        selected = X[self.columns]
        self.scaler.fit(selected,y)
        # Explicit axis/ddof so the result is one value per column regardless of
        # how the installed pandas reduces a DataFrame passed to np.mean/np.var.
        self.mean_ = selected.mean(axis=0).to_numpy()
        self.var_ = selected.var(axis=0,ddof=0).to_numpy()
        return self

    def transform(self,X,y=None,copy=None):
        """Return a copy of ``X`` with ``self.columns`` standardized.

        ``y`` and ``copy`` are accepted for signature compatibility and ignored.
        """
        initial_col_order = X.columns
        # Reuse X's index: with a default RangeIndex here, the concat below would
        # misalign rows (and introduce NaNs) whenever X is not indexed 0..n-1,
        # e.g. a shuffled train/test subset.
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]),columns=self.columns,index=X.index)
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled,X_scaled],axis=1)[initial_col_order]
    
    

In [11]:
# Columns to standardize: every input except Reason_1..Reason_4 and Education.
col_to_scale = [
    'month',
    'Day of the Week',
    'Transportation Expense',
    'Distance to Work',
    'Age',
    'Daily Work Load Average',
    'Body Mass Index',
    'Children',
    'Pets',
]

In [12]:
# Build the column-selective scaler and learn its statistics.
# NOTE(review): the fit uses every row of inputs_unscaled, before the train/test
# split further down, so the test rows contribute to the scaling statistics —
# consider fitting on the training rows only.
scaler = MyScaler(columns=col_to_scale)

scaler.fit(X=inputs_unscaled)

MyScaler(columns=['month', 'Day of the Week', 'Transportation Expense',
                  'Distance to Work', 'Age', 'Daily Work Load Average',
                  'Body Mass Index', 'Children', 'Pets'],
         copy=None, with_mean=None, with_std=None)

In [13]:
# Apply the fitted scaler; columns outside col_to_scale are returned unchanged.
inputs_scaled = scaler.transform(X=inputs_unscaled)

In [14]:
# Preview the first rows of the scaled inputs.
inputs_scaled.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,month,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,-0.683704,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.182726,-0.683704,-1.574681,-1.141882,2.130803,-0.806331,1.002633,0,-0.01928,-0.58969
2,0,0,0,1,0.182726,-0.007725,-0.654143,1.426749,0.24831,-0.806331,1.002633,0,-0.91903,-0.58969
3,1,0,0,0,0.182726,0.668253,0.854936,-1.682647,0.405184,-0.806331,-0.643782,0,0.880469,-0.58969
4,0,0,0,1,0.182726,0.668253,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487


In [15]:
# (rows, columns) of the scaled inputs.
inputs_scaled.shape

(700, 14)

## Train Test Split

In [16]:
# Hold out 20% of the rows for testing; the fixed seed makes the shuffle repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    inputs_scaled,
    target,
    test_size=0.2,
    shuffle=True,
    random_state=10,
)

In [17]:
# Print the (features, labels) shapes of the training split, then of the test split.
for features_part, labels_part in ((X_train, Y_train), (X_test, Y_test)):
    print(features_part.shape, labels_part.shape)

(560, 14) (560,)
(140, 14) (140,)


## Model

In [18]:
# Baseline: logistic regression with scikit-learn's default settings.
model = LogisticRegression()

model.fit(X=X_train, y=Y_train)
# Accuracy on the training rows themselves (not a held-out estimate).
model.score(X=X_train, y=Y_train)



0.7821428571428571

### GridSearch

In [19]:
model = LogisticRegression()

# Solvers compared (see the scikit-learn LogisticRegression documentation):
# - newton-cg uses second-derivative (Hessian) information; lbfgs is a quasi-Newton
#   method that approximates it from the last few updates, which keeps memory use low.
#   Both can be slow on large datasets.
# - liblinear uses coordinate descent, i.e. it moves toward the minimum along one
#   coordinate of the feature space at a time.
# - sag (Stochastic Average Gradient) reuses stored past gradient values; fast on big datasets.
# - saga is a variant of sag that additionally supports L1 regularization.
#   sag and saga are not robust to unscaled data.

solvers = ['newton-cg', 'lbfgs', 'liblinear','sag','saga']
c_values = [100,10,1,0.1,0.01]

# Search space: every (solver, C) combination.
param = dict(solver=solvers,C=c_values)

# 5-fold cross-validated accuracy for each combination, using all available cores.
grid_search = GridSearchCV(estimator=model,param_grid=param,cv=5,n_jobs=-1,scoring='accuracy')

result = grid_search.fit(X_train,Y_train)

print("Best: %f using %s" % (result.best_score_, result.best_params_))
means = result.cv_results_['mean_test_score']
stds = result.cv_results_['std_test_score']
params = result.cv_results_['params']
# The loop variable has its own name so it no longer overwrites the `param` grid above.
for mean, stdev, candidate in zip(means, stds, params):
    print("Mean score : %f Std div : %f with params : %r" % (mean, stdev, candidate))

Best: 0.767857 using {'C': 100, 'solver': 'liblinear'}
Mean score : 0.766071 Std div : 0.038940 with params : {'C': 100, 'solver': 'newton-cg'}
Mean score : 0.766071 Std div : 0.038940 with params : {'C': 100, 'solver': 'lbfgs'}
Mean score : 0.767857 Std div : 0.038873 with params : {'C': 100, 'solver': 'liblinear'}
Mean score : 0.764286 Std div : 0.039330 with params : {'C': 100, 'solver': 'sag'}
Mean score : 0.760714 Std div : 0.044586 with params : {'C': 100, 'solver': 'saga'}
Mean score : 0.764286 Std div : 0.039330 with params : {'C': 10, 'solver': 'newton-cg'}
Mean score : 0.764286 Std div : 0.039330 with params : {'C': 10, 'solver': 'lbfgs'}
Mean score : 0.764286 Std div : 0.039330 with params : {'C': 10, 'solver': 'liblinear'}
Mean score : 0.760714 Std div : 0.044586 with params : {'C': 10, 'solver': 'sag'}
Mean score : 0.753571 Std div : 0.034599 with params : {'C': 10, 'solver': 'saga'}
Mean score : 0.753571 Std div : 0.025195 with params : {'C': 1, 'solver': 'newton-cg'}
Mea



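The grid search reports its best score, 0.767857, for {'C': 100, 'solver': 'liblinear'}. The setting {'C': 100, 'solver': 'newton-cg'} scored almost the same (0.766071). We will use C=100 with the newton-cg solver and the 'l2' penalty for our model.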

In [20]:
# Refit on the training split with the chosen hyper-parameters.
model = LogisticRegression(C=100, solver='newton-cg', penalty='l2')
model.fit(X=X_train, y=Y_train)
# Training-set accuracy of the refitted model.
model.score(X=X_train, y=Y_train)

0.7928571428571428

In [21]:
# Fitted intercept (bias term) of the logistic regression.
model.intercept_

array([-6.12389665])

In [22]:
# Feature names (in input column order) and the fitted weights, kept for the summary table.
features = inputs_unscaled.columns.to_numpy()
coefficients = model.coef_

In [23]:
# One row per feature: its name next to its fitted coefficient.
summary_table = pd.DataFrame({'Feature name': features})
summary_table['Coefficients'] = coefficients.T  # coef_ is one row of weights; transpose to one per row
summary_table

Unnamed: 0,Feature name,Coefficients
0,Reason_1,7.415129
1,Reason_2,5.686193
2,Reason_3,8.163692
3,Reason_4,5.376639
4,month,0.093617
5,Day of the Week,-0.153674
6,Transportation Expense,0.805741
7,Distance to Work,0.009114
8,Age,-0.342009
9,Daily Work Load Average,0.020886


In [24]:
#Let's add the intercept into the dataframe.
# Shift every existing row label up by one so that label 0 becomes free.
summary_table.index = summary_table.index + 1
# Insert the intercept under the freed label 0.
summary_table.loc[0] = ['Intercept',model.intercept_[0]]
# Sorting by label moves the newly inserted row 0 to the top.
summary_table = summary_table.sort_index()
# NOTE(review): this cell modifies summary_table in place; running it a second time
# shifts the labels again and adds another 'Intercept' row.
summary_table

Unnamed: 0,Feature name,Coefficients
0,Intercept,-6.123897
1,Reason_1,7.415129
2,Reason_2,5.686193
3,Reason_3,8.163692
4,Reason_4,5.376639
5,month,0.093617
6,Day of the Week,-0.153674
7,Transportation Expense,0.805741
8,Distance to Work,0.009114
9,Age,-0.342009


In [25]:
# exp(coefficient) turns each log-odds weight into an odds ratio; list the largest first.
summary_table['odds'] = np.exp(summary_table['Coefficients'].to_numpy())
summary_table = summary_table.sort_values('odds', ascending=False)
summary_table

Unnamed: 0,Feature name,Coefficients,odds
3,Reason_3,8.163692,3511.125923
1,Reason_1,7.415129,1660.922909
2,Reason_2,5.686193,294.769417
4,Reason_4,5.376639,216.294079
7,Transportation Expense,0.805741,2.238356
13,Children,0.526218,1.692519
11,Body Mass Index,0.257762,1.294031
12,Education,0.142199,1.152806
5,month,0.093617,1.098139
10,Daily Work Load Average,0.020886,1.021106


##### Based on the coefficient values, the features "Daily Work Load Average" and "Distance to Work" add almost no value to the model, as their coefficients are close to 0. These features can be removed when deploying to a production system, since a simpler model is preferable.

### Testing The Model

In [26]:
# Class predictions for the held-out rows, followed by the accuracy on that split.
predictions = model.predict(X=X_test)
print(f"Testing accuracy : {model.score(X_test,Y_test)}")

Testing accuracy : 0.7285714285714285


In [27]:
# Per-class probabilities for every test row (one column per class).
probabilities = model.predict_proba(X=X_test)

In [28]:
# Class probabilities of the first test row.
probabilities[0]

array([0.8368732, 0.1631268])

This array holds the probabilities of the example belonging to class 0 and class 1, in that order. We only need the probability of the example being class 1.

In [29]:
# Second column of predict_proba's output: the probability of class 1 for each test row.
Probability_excessive_abesteeism = probabilities[:,1]

In [30]:
# First five class-1 probabilities.
Probability_excessive_abesteeism[:5]

array([0.1631268 , 0.19540981, 0.6970641 , 0.70664527, 0.44540377])

## Save the Model

In [31]:
import pickle

In [32]:
# Persist the fitted logistic regression (including its coefficients) with pickle.

with open('Absenteeism_Model', 'wb') as model_file:
    pickle.dump(model, model_file)

In [33]:
# Persist the fitted custom scaler with pickle.
# Pickle stores a reference to the class, not its code, so the MyScaler
# definition must be importable wherever this file is loaded.

with open('Custom_Scaler', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)