# Creating a Logistic Regression to Predict Absenteeism

## Import the relevant libraries

In [1]:
import numpy as np
import pandas as pd

## Load preprocessed data

In [2]:
# Read the preprocessed absenteeism data from a CSV file addressed relative to
# the notebook's working directory.
data_preprocessed = pd.read_csv("Absenteeism_preprocessed.csv")

In [3]:
# Preview the first ten rows to check the columns that were loaded.
data_preprocessed.head(10)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2
5,0,0,0,1,10,2,179,51,38,239.554,31,0,0,0,2
6,0,0,0,1,7,4,361,52,28,239.554,27,0,1,4,8
7,0,0,0,1,7,4,260,50,36,239.554,23,0,4,0,4
8,0,0,1,0,6,6,155,12,34,239.554,25,0,2,0,40
9,0,0,0,1,7,0,235,11,37,239.554,29,1,1,1,8


## Create the targets

In [4]:
# Median of the hours column; it is the cut-off used to build the binary target.
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

If an employee has been absent for 4 hours or more (i.e. more than the median of 3 hours), we label the observation as Excessively Absent.

If an employee has been absent for 3 hours or fewer, we label the observation as Moderately Absent.

In [5]:
# Binary target: 1 where the hours of absence are strictly greater than the
# column median, 0 otherwise.
targets = np.where(
    data_preprocessed['Absenteeism Time in Hours'].gt(
        data_preprocessed['Absenteeism Time in Hours'].median()
    ),
    1,
    0,
)

In [6]:
# Inspect the generated 0/1 labels.
# NOTE(review): this displays the whole array; a slice such as targets[:20]
# would keep the output short.
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [7]:
# Attach the labels as a new column (this modifies data_preprocessed in place).
data_preprocessed['Excessive Absenteeism'] = targets

In [8]:
# Confirm that 'Excessive Absenteeism' now appears as the last column.
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


Let us check the percentage of each class present in the dataset

In [9]:
# Fraction of observations labelled 1 (excessive absenteeism).
targets.sum() / targets.size

0.45571428571428574

So about 46% of the observations are cases of excessive absenteeism.

For logistic regression, a class balance of up to roughly 60-40 is good enough, although for other algorithms the acceptable balance might differ.

We can proceed because our 2 groups are distributed roughly equally.

In [10]:
# Remove the raw hours column (the target was derived from it) together with
# four features that are excluded from the model inputs.
data_with_targets = data_preprocessed.drop(
    columns=[
        'Absenteeism Time in Hours',
        'Month Value',
        'Day of the Week',
        'Distance to Work',
        'Daily Work Load Average',
    ]
)

In [11]:
# Check which columns remain after the drop.
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,289,33,30,0,2,1,1
1,0,0,0,0,118,50,31,0,1,0,0
2,0,0,0,1,179,38,31,0,0,0,0
3,1,0,0,0,279,39,24,0,2,0,1
4,0,0,0,1,289,33,30,0,2,1,0


## Creating Inputs for Logistic Regression

In [12]:
# (rows, columns) of the reduced frame; the last column is the target.
data_with_targets.shape

(700, 11)

In [13]:
# NOTE(review): the frame has only 11 columns, so the stop of 14 is past the
# end and this selects every column, including the target.
data_with_targets.iloc[:, :14]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,289,33,30,0,2,1,1
1,0,0,0,0,118,50,31,0,1,0,0
2,0,0,0,1,179,38,31,0,0,0,0
3,1,0,0,0,279,39,24,0,2,0,1
4,0,0,0,1,289,33,30,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,179,40,22,1,2,0,1
696,1,0,0,0,225,28,24,0,1,2,0
697,1,0,0,0,330,28,25,1,0,0,1
698,0,0,0,1,235,32,25,1,0,0,0


In [14]:
# All columns except the last one, i.e. the frame without the target.
data_with_targets.iloc[:, :-1]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,289,33,30,0,2,1
1,0,0,0,0,118,50,31,0,1,0
2,0,0,0,1,179,38,31,0,0,0
3,1,0,0,0,279,39,24,0,2,0
4,0,0,0,1,289,33,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,179,40,22,1,2,0
696,1,0,0,0,225,28,24,0,1,2
697,1,0,0,0,330,28,25,1,0,0
698,0,0,0,1,235,32,25,1,0,0


In [15]:
# Model inputs: every column but the last ('Excessive Absenteeism').
unscaled_inputs = data_with_targets.iloc[:, :-1]

## Standardize the data

In [16]:
# NOTE(review): leftover from using StandardScaler directly. `absenteeism_scaler`
# is created from CustomScaler in a later cell, so this cell can be deleted.
# from sklearn.preprocessing import StandardScaler

# absenteeism_scaler = StandardScaler()

In [17]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and pass the rest through.

    A ``StandardScaler`` is fitted on ``columns`` only. ``transform`` returns
    a DataFrame with the same columns, in the same order, and the same row
    index as its input; only the selected columns are replaced by their
    standardized values.

    Parameters
    ----------
    columns : list of str
        Labels of the columns to standardize.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler that holds the fitted statistics.
    mean_ : pandas.Series or None
        Per-column mean of ``columns`` seen during ``fit`` (None before fit).
    var_ : pandas.Series or None
        Per-column population variance of ``columns`` seen during ``fit``
        (None before fit).
    """

    def __init__(self, columns):
        self.scaler = StandardScaler()
        self.columns = columns
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        self.scaler.fit(X[self.columns], y)
        # Take the statistics from the fitted scaler rather than calling
        # np.mean / np.var on a DataFrame: those delegate to DataFrame
        # reductions with axis=None, whose result (per-column Series vs. a
        # single scalar) depends on the installed pandas version.
        self.mean_ = pd.Series(self.scaler.mean_, index=self.columns)
        self.var_ = pd.Series(self.scaler.var_, index=self.columns)
        return self

    def transform(self, X, y=None, copy=None):
        """Return a copy of ``X`` with ``self.columns`` standardized.

        ``y`` and ``copy`` are accepted for signature compatibility and are
        not used.
        """
        init_col_order = X.columns
        # Reuse X's index: pd.concat(axis=1) aligns rows by label, so a fresh
        # RangeIndex here would misalign (and NaN-fill) any input whose index
        # is not 0..n-1, such as a subset produced by train_test_split.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_scaled, X_not_scaled], axis=1)[init_col_order]

In [18]:
# Names of the input columns that are candidates for scaling.
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [19]:
# Columns to exclude from scaling (they are filtered out of columns_to_scale).
# NOTE(review): all four names were already dropped when data_with_targets was
# built, so nothing is actually excluded and every input column, including the
# 0/1-valued Reason_* and Education columns, ends up standardized. Confirm
# whether those columns were meant to be listed here instead.
columns_to_omit = ['Month Value', 'Day of the Week', 'Distance to Work', 'Daily Work Load Average']

In [20]:
# Keep every input column that is not explicitly excluded from scaling.
columns_to_scale = [
    column
    for column in unscaled_inputs.columns.values
    if column not in columns_to_omit
]
columns_to_scale

['Reason_1',
 'Reason_2',
 'Reason_3',
 'Reason_4',
 'Transportation Expense',
 'Age',
 'Body Mass Index',
 'Education',
 'Children',
 'Pets']

In [21]:
# Scaler that standardizes only the columns listed in columns_to_scale.
absenteeism_scaler = CustomScaler(columns_to_scale)

In [22]:
# Fit the scaler on the input columns.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so the test rows contribute to the scaling statistics.
absenteeism_scaler.fit(unscaled_inputs)

CustomScaler(columns=['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
                      'Transportation Expense', 'Age', 'Body Mass Index',
                      'Education', 'Children', 'Pets'])

In [23]:
# Apply the fitted scaler to the same inputs it was fitted on.
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [24]:
# Inspect the standardized inputs.
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,-0.577350,-0.092981,-0.314485,0.821365,1.005844,-0.536062,0.767431,-0.447980,0.880469,0.268487
1,-0.577350,-0.092981,-0.314485,-1.217485,-1.574681,2.130803,1.002633,-0.447980,-0.019280,-0.589690
2,-0.577350,-0.092981,-0.314485,0.821365,-0.654143,0.248310,1.002633,-0.447980,-0.919030,-0.589690
3,1.732051,-0.092981,-0.314485,-1.217485,0.854936,0.405184,-0.643782,-0.447980,0.880469,-0.589690
4,-0.577350,-0.092981,-0.314485,0.821365,1.005844,-0.536062,0.767431,-0.447980,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...
695,1.732051,-0.092981,-0.314485,-1.217485,-0.654143,0.562059,-1.114186,2.232242,0.880469,-0.589690
696,1.732051,-0.092981,-0.314485,-1.217485,0.040034,-1.320435,-0.643782,-0.447980,-0.019280,1.126663
697,1.732051,-0.092981,-0.314485,-1.217485,1.624567,-1.320435,-0.408580,2.232242,-0.919030,-0.589690
698,-0.577350,-0.092981,-0.314485,0.821365,0.190942,-0.692937,-0.408580,2.232242,-0.919030,-0.589690


In [25]:
# Scaling must not change the number of rows or columns.
scaled_inputs.shape

(700, 10)

## Split the data into training and testing

### Import train_test_split

In [26]:
from sklearn.model_selection import train_test_split

### Split the dataset

In [27]:
# Demonstration call only: the result is displayed but not stored.
# NOTE(review): no random_state is given, so the displayed rows change on every
# run; the split actually used for modelling is made in the next cell.
train_test_split(scaled_inputs, targets)

[     Reason_1  Reason_2  Reason_3  Reason_4  Transportation Expense       Age  \
 639 -0.577350 -0.092981 -0.314485  0.821365                1.005844 -0.536062   
 297 -0.577350 -0.092981 -0.314485  0.821365                2.213108 -0.849811   
 398 -0.577350 -0.092981 -0.314485  0.821365               -0.654143  0.248310   
 103 -0.577350 -0.092981 -0.314485  0.821365                0.568211 -0.065439   
 451 -0.577350 -0.092981 -0.314485  0.821365                0.190942  1.032682   
 ..        ...       ...       ...       ...                     ...       ...   
 150 -0.577350 -0.092981 -0.314485  0.821365                0.040034 -1.320435   
 666 -0.577350 -0.092981 -0.314485  0.821365               -0.654143 -1.006686   
 436  1.732051 -0.092981 -0.314485 -1.217485                1.624567 -1.320435   
 477 -0.577350 -0.092981 -0.314485  0.821365                0.356940  0.718933   
 458  1.732051 -0.092981 -0.314485 -1.217485                1.624567 -1.320435   
 
      Body Mas

In [28]:
# Use 80% of the rows for training and the rest for testing; the fixed
# random_state makes the shuffle repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    train_size=0.8,
    random_state=20,
)

In [29]:
# Shapes of the training inputs and training targets.
print(x_train.shape, y_train.shape)

(560, 10) (560,)


In [30]:
# Shapes of the test inputs and test targets.
print(x_test.shape, y_test.shape)

(140, 10) (140,)


## Logistic Regression using sklearn

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

### Training the model

In [32]:
# Logistic regression with scikit-learn's default settings.
reg = LogisticRegression()

In [33]:
# Fit the model on the training portion only.
reg.fit(x_train, y_train)

LogisticRegression()

In [34]:
# Accuracy on the training data (the manual check further down reproduces this value).
reg.score(x_train, y_train)

0.7857142857142857

### Manually checking the accuracy

In [35]:
# Predicted class labels for the training inputs.
model_outputs = reg.predict(x_train)

In [36]:
# Element-wise comparison: True where the prediction matches the target.
model_outputs == y_train

array([ True,  True, False,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [37]:
# Number of correct predictions (True counts as 1).
np.sum((model_outputs == y_train))

440

In [38]:
# Total number of predictions.
model_outputs.shape[0]

560

In [39]:
# Accuracy = correct predictions / all predictions; equals reg.score on the training data.
np.sum((model_outputs == y_train)) / model_outputs.shape[0]

0.7857142857142857

### Finding the intercept and coefficients

In [40]:
# Fitted intercept (a one-element array).
reg.intercept_

array([-0.21783308])

In [41]:
# Fitted coefficients: one row holding one value per input column.
reg.coef_

array([[ 2.05206197,  0.32868197,  1.55227493,  1.29338028,  0.71605253,
        -0.20130691,  0.33021608, -0.13937001,  0.37936309, -0.31843246]])

In [42]:
# Input column names, in the same order as the coefficients above.
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [43]:
# Keep the column names to label the coefficients in the summary table.
feature_names = unscaled_inputs.columns.values

In [44]:
# Pair each input column with its fitted coefficient. reg.coef_ holds a single
# row, so its first row is the flat vector of coefficients.
summary_table = pd.DataFrame(
    {
        "Feature name": feature_names,
        "Coefficient": reg.coef_[0],
    }
)
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,2.052062
1,Reason_2,0.328682
2,Reason_3,1.552275
3,Reason_4,1.29338
4,Transportation Expense,0.716053
5,Age,-0.201307
6,Body Mass Index,0.330216
7,Education,-0.13937
8,Children,0.379363
9,Pets,-0.318432


In [45]:
# Put the intercept in front of the feature coefficients.
# The table is rebuilt from the feature rows and the two base columns only, so
# re-running this cell (even after later cells have added columns) always
# yields exactly one 'Intercept' row at index 0, instead of shifting the index
# again and inserting a duplicate row.
feature_rows = summary_table.loc[
    summary_table['Feature name'] != 'Intercept',
    ['Feature name', 'Coefficient'],
]
intercept_row = pd.DataFrame(
    {'Feature name': ['Intercept'], 'Coefficient': [reg.intercept_[0]]}
)
summary_table = pd.concat([intercept_row, feature_rows], ignore_index=True)
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-0.217833
1,Reason_1,2.052062
2,Reason_2,0.328682
3,Reason_3,1.552275
4,Reason_4,1.29338
5,Transportation Expense,0.716053
6,Age,-0.201307
7,Body Mass Index,0.330216
8,Education,-0.13937
9,Children,0.379363


In [46]:
# Exponentiate each coefficient to express it as an odds ratio.
summary_table['Odds ratio'] = np.exp(summary_table['Coefficient'])

In [47]:
# Display the table ordered from largest to smallest odds ratio (summary_table itself is not reordered).
summary_table.sort_values('Odds ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds ratio
1,Reason_1,2.052062,7.783935
3,Reason_3,1.552275,4.722201
4,Reason_4,1.29338,3.645087
5,Transportation Expense,0.716053,2.046339
9,Children,0.379363,1.461354
7,Body Mass Index,0.330216,1.391269
2,Reason_2,0.328682,1.389136
8,Education,-0.13937,0.869906
6,Age,-0.201307,0.817661
0,Intercept,-0.217833,0.80426


## Testing the Model

In [48]:
# Accuracy on the held-out test data.
reg.score(x_test, y_test)

0.7285714285714285

In [49]:
# Predicted class probabilities for the test inputs: one row per observation,
# one column per class.
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.758545  , 0.241455  ],
       [0.5976238 , 0.4023762 ],
       [0.45325916, 0.54674084],
       [0.758545  , 0.241455  ],
       [0.06662528, 0.93337472],
       [0.27741784, 0.72258216],
       [0.29041286, 0.70958714],
       [0.06973283, 0.93026717],
       [0.74663336, 0.25336664],
       [0.758545  , 0.241455  ],
       [0.48118081, 0.51881919],
       [0.15611381, 0.84388619],
       [0.03473093, 0.96526907],
       [0.72488652, 0.27511348],
       [0.22259107, 0.77740893],
       [0.50919699, 0.49080301],
       [0.48118081, 0.51881919],
       [0.48118081, 0.51881919],
       [0.3622581 , 0.6377419 ],
       [0.03487047, 0.96512953],
       [0.74663336, 0.25336664],
       [0.758545  , 0.241455  ],
       [0.47583639, 0.52416361],
       [0.47583639, 0.52416361],
       [0.15363876, 0.84636124],
       [0.74663336, 0.25336664],
       [0.48918374, 0.51081626],
       [0.90003675, 0.09996325],
       [0.16214423, 0.83785577],
       [0.758545  , 0.241455  ],
       [0.

In [50]:
# (test observations, classes)
predicted_proba.shape

(140, 2)

In [51]:
# Second column only; presumably the probability of label 1 (excessive
# absenteeism), following the order of reg.classes_ - confirm.
predicted_proba[:, 1]

array([0.241455  , 0.4023762 , 0.54674084, 0.241455  , 0.93337472,
       0.72258216, 0.70958714, 0.93026717, 0.25336664, 0.241455  ,
       0.51881919, 0.84388619, 0.96526907, 0.27511348, 0.77740893,
       0.49080301, 0.51881919, 0.51881919, 0.6377419 , 0.96512953,
       0.25336664, 0.241455  , 0.52416361, 0.52416361, 0.84636124,
       0.25336664, 0.51081626, 0.09996325, 0.83785577, 0.241455  ,
       0.4023762 , 0.73522355, 0.72258216, 0.51881919, 0.241455  ,
       0.63030915, 0.25336664, 0.84889987, 0.4510874 , 0.6312766 ,
       0.241455  , 0.47611629, 0.25336664, 0.10916376, 0.83758399,
       0.68606432, 0.73903231, 0.241455  , 0.24290643, 0.241455  ,
       0.47611629, 0.06453712, 0.72258216, 0.24290643, 0.84637544,
       0.4023762 , 0.94595567, 0.25710307, 0.07947527, 0.07947527,
       0.71348339, 0.72258216, 0.25710307, 0.84636124, 0.22994078,
       0.241455  , 0.01172492, 0.25336664, 0.8389273 , 0.27511348,
       0.25336664, 0.07973541, 0.90800671, 0.4510874 , 0.63030

## Save the Model

In [52]:
import pickle

In [53]:
# Serialize the fitted logistic regression to a file named 'model' in the
# working directory.
with open('model', 'wb') as file:
    pickle.dump(reg, file)

In [54]:
# Serialize the fitted scaler to a file named 'scaler' in the working directory.
# NOTE(review): CustomScaler is defined inside this notebook; code that
# unpickles this file presumably needs the same class definition available.
with open('scaler', 'wb') as file:
    pickle.dump(absenteeism_scaler, file)