# Creating a logistic regression to predict absenteeism

## Importing the libraries

In [3]:
import pandas as pd
import numpy as np

## Loading the data

In [31]:
data_preprocessed = pd.read_csv('preprocessed.csv')

In [32]:
data_preprocessed = data_preprocessed.drop(['Unnamed: 0'], axis=1)
data_preprocessed.head(5)

Unnamed: 0,reason_type1,reason_type2,reason_type3,reason_type4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,1,0,0,0,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,1,0,0,0,7,2,179,51,38,239.554,31,0,0,0,2
3,1,1,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,1,0,0,0,7,3,289,36,33,239.554,30,0,2,1,2


In [10]:
# Reason_type1, Reason_type2, Reason_type3, Reason_type4,
# Distance to work, Daily Workload Average, Children, Pets
# are the reason for absenteeism to work

## Creating the targets

In [33]:
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [34]:
# Classes
# Moderately absent (<= 3 hours): Assign 0 for absent time less than 3 hours
# Excessively absent (>= 4 hours) : Assign 0 for absent time greater than  hours

# Absenteeism time greater than 3, return 1 otherwise return 0
targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > data_preprocessed['Absenteeism Time in Hours'].median(), 1, 0)
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [35]:
data_preprocessed['Excessive Absenteeism'] = targets
data_preprocessed.head(10)

Unnamed: 0,reason_type1,reason_type2,reason_type3,reason_type4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,1,0,0,0,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,1,0,0,0,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,1,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,1,0,0,0,7,3,289,36,33,239.554,30,0,2,1,2,0
5,1,0,0,0,7,4,179,51,38,239.554,31,0,0,0,2,0
6,1,0,0,0,7,4,361,52,28,239.554,27,0,1,4,8,1
7,1,0,0,0,7,4,260,50,36,239.554,23,0,4,0,4,1
8,1,0,0,1,7,0,155,12,34,239.554,25,0,2,0,40,1
9,1,0,0,0,7,0,235,11,37,239.554,29,1,1,1,8,1


## Select the inputs for the regression

In [37]:
targets.sum()

319

In [38]:
targets.shape[0]

700

In [39]:
targets.sum() / targets.shape[0]

0.45571428571428574

In [40]:
 data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours'], axis=1)

In [41]:
data_with_targets.head(10)

Unnamed: 0,reason_type1,reason_type2,reason_type3,reason_type4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,1,0,0,0,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,1,0,0,0,7,2,179,51,38,239.554,31,0,0,0,0
3,1,1,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,1,0,0,0,7,3,289,36,33,239.554,30,0,2,1,0
5,1,0,0,0,7,4,179,51,38,239.554,31,0,0,0,0
6,1,0,0,0,7,4,361,52,28,239.554,27,0,1,4,1
7,1,0,0,0,7,4,260,50,36,239.554,23,0,4,0,1
8,1,0,0,1,7,0,155,12,34,239.554,25,0,2,0,1
9,1,0,0,0,7,0,235,11,37,239.554,29,1,1,1,1


## Next checkpoint

In [42]:
data_with_targets is data_preprocessed

False

## Selecting inputs for the regression

In [44]:
data_with_targets.shape

(700, 15)

In [46]:
## Selecting inputs for our model except the 'Excessive absenteeism'
## DateFrame.iloc[row indices, column indices]
## iloc excludes the last index

data_with_targets.iloc[:, :14]

Unnamed: 0,reason_type1,reason_type2,reason_type3,reason_type4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,1,0,0,0,7,1,289,36,33,239.554,30,0,2,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0
2,1,0,0,0,7,2,179,51,38,239.554,31,0,0,0
3,1,1,0,0,7,3,279,5,39,239.554,24,0,2,0
4,1,0,0,0,7,3,289,36,33,239.554,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,1,0,0,5,2,179,22,40,237.656,22,1,2,0
696,1,1,0,0,5,2,225,26,28,237.656,24,0,1,2
697,1,1,0,0,5,3,330,16,28,237.656,25,1,0,0
698,1,0,0,0,5,3,235,16,32,237.656,25,1,0,0


In [47]:
data_with_targets.iloc[:, :-1]

Unnamed: 0,reason_type1,reason_type2,reason_type3,reason_type4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,1,0,0,0,7,1,289,36,33,239.554,30,0,2,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0
2,1,0,0,0,7,2,179,51,38,239.554,31,0,0,0
3,1,1,0,0,7,3,279,5,39,239.554,24,0,2,0
4,1,0,0,0,7,3,289,36,33,239.554,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,1,0,0,5,2,179,22,40,237.656,22,1,2,0
696,1,1,0,0,5,2,225,26,28,237.656,24,0,1,2
697,1,1,0,0,5,3,330,16,28,237.656,25,1,0,0
698,1,0,0,0,5,3,235,16,32,237.656,25,1,0,0


In [49]:
unscaled_inputs = data_with_targets.iloc[:, :-1]
unscaled_inputs

Unnamed: 0,reason_type1,reason_type2,reason_type3,reason_type4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,1,0,0,0,7,1,289,36,33,239.554,30,0,2,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0
2,1,0,0,0,7,2,179,51,38,239.554,31,0,0,0
3,1,1,0,0,7,3,279,5,39,239.554,24,0,2,0
4,1,0,0,0,7,3,289,36,33,239.554,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,1,0,0,5,2,179,22,40,237.656,22,1,2,0
696,1,1,0,0,5,2,225,26,28,237.656,24,0,1,2
697,1,1,0,0,5,3,330,16,28,237.656,25,1,0,0
698,1,0,0,0,5,3,235,16,32,237.656,25,1,0,0


## Standardizing the data

In [50]:
# absenteeism_scaler will be used to subtract the mean and 
# divide by the standard deviation variablewise(featurewise)

from sklearn.preprocessing import StandardScaler
absenteeism_scaler = StandardScaler()

In [51]:
absenteeism_scaler.fit(unscaled_inputs)

StandardScaler()

In [52]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)
scaled_inputs

array([[ 0.23958675, -0.57735027, -0.09298136, ..., -0.44798003,
         0.88046927,  0.26848661],
       [-4.17385345, -0.57735027, -0.09298136, ..., -0.44798003,
        -0.01928035, -0.58968976],
       [ 0.23958675, -0.57735027, -0.09298136, ..., -0.44798003,
        -0.91902997, -0.58968976],
       ...,
       [ 0.23958675,  1.73205081, -0.09298136, ...,  2.23224237,
        -0.91902997, -0.58968976],
       [ 0.23958675, -0.57735027, -0.09298136, ...,  2.23224237,
        -0.91902997, -0.58968976],
       [ 0.23958675, -0.57735027, -0.09298136, ..., -0.44798003,
        -0.01928035,  0.26848661]])

In [53]:
scaled_inputs.shape    

(700, 14)

## Splitting the data into train, test and shuffle

### importing relevant module 

In [54]:
from sklearn.model_selection import train_test_split

### Splitting data

In [55]:
train_test_split(scaled_inputs, targets)

[array([[ 0.23958675, -0.57735027, -0.09298136, ..., -0.44798003,
          0.88046927,  0.26848661],
        [ 0.23958675, -0.57735027, -0.09298136, ...,  2.23224237,
         -0.91902997, -0.58968976],
        [ 0.23958675, -0.57735027, -0.09298136, ..., -0.44798003,
          0.88046927, -0.58968976],
        ...,
        [ 0.23958675, -0.57735027, -0.09298136, ..., -0.44798003,
          0.88046927,  0.26848661],
        [ 0.23958675,  1.73205081, -0.09298136, ..., -0.44798003,
         -0.91902997, -0.58968976],
        [ 0.23958675, -0.57735027, -0.09298136, ..., -0.44798003,
         -0.01928035,  1.12666297]]),
 array([[ 0.23958675,  1.73205081, -0.09298136, ..., -0.44798003,
         -0.01928035,  0.26848661],
        [ 0.23958675, -0.57735027, -0.09298136, ..., -0.44798003,
         -0.91902997, -0.58968976],
        [ 0.23958675, -0.57735027, -0.09298136, ..., -0.44798003,
         -0.01928035,  1.12666297],
        ...,
        [ 0.23958675, -0.57735027, -0.09298136, ...,  

In [77]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size = 0.8)

In [78]:
# 80% of the data for testing
print(x_train.shape, y_train.shape)

(560, 14) (560,)


In [79]:
# 20% of the data for testing
print(x_test.shape, y_test.shape)

(140, 14) (140,)


## Logistic regression with sklearn

In [80]:
from sklearn.linear_model import LogisticRegression

### Training the model

In [81]:
reg = LogisticRegression()

In [82]:
reg.fit(x_train, y_train)

LogisticRegression()

In [84]:
reg.score(x_train, y_train)

0.7767857142857143

### Manually checking the accuracy

In [86]:
model_outputs = reg.predict(x_train)
model_outputs  

array([0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,

In [87]:
y_train

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,

In [88]:
model_outputs == y_train

array([ True,  True,  True,  True,  True, False,  True, False,  True,
        True,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True, False,  True,
        True,  True, False, False,  True,  True, False, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True, False, False,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True, False,
        True, False,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True, False,  True,  True,
       False,  True,  True, False,  True,  True,  True, False, False,
       False,  True,  True,  True, False,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True, False,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,

In [89]:
# number of correct predictions (true entires)
np.sum(model_outputs == y_train)

435

In [90]:
# accuracy = correct predictions / number of observations

np.sum(model_outputs == y_train) / model_outputs.shape[0]

0.7767857142857143

### Finding the intercept and coefficients

In [91]:
reg.intercept_

array([-0.30776681])

In [92]:
reg.coef_

array([[ 1.12864254,  0.85810645,  0.07654382,  0.73928197,  0.05322407,
        -0.171442  ,  0.65056433, -0.02779615, -0.35013092, -0.00467756,
         0.33087312,  0.00770604,  0.50858921, -0.30941585]])

In [93]:
unscaled_inputs.columns.values

array(['reason_type1', 'reason_type2', 'reason_type3', 'reason_type4',
       'Month Value', 'Day of the Week', 'Transportation Expense',
       'Distance to Work', 'Age', 'Daily Work Load Average',
       'Body Mass Index', 'Education', 'Children', 'Pets'], dtype=object)

In [94]:
feature_name = unscaled_inputs.columns.values

In [95]:
summary_table = pd.DataFrame(columns=['Feature name'], data = feature_name)

summary_table['coefficient'] = np.transpose(reg.coef_)

summary_table

Unnamed: 0,Feature name,coefficient
0,reason_type1,1.128643
1,reason_type2,0.858106
2,reason_type3,0.076544
3,reason_type4,0.739282
4,Month Value,0.053224
5,Day of the Week,-0.171442
6,Transportation Expense,0.650564
7,Distance to Work,-0.027796
8,Age,-0.350131
9,Daily Work Load Average,-0.004678


In [97]:
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['intercept', reg.intercept_[0]]
summary_table

Unnamed: 0,Feature name,coefficient
2,reason_type1,1.128643
3,reason_type2,0.858106
4,reason_type3,0.076544
5,reason_type4,0.739282
6,Month Value,0.053224
7,Day of the Week,-0.171442
8,Transportation Expense,0.650564
9,Distance to Work,-0.027796
10,Age,-0.350131
11,Daily Work Load Average,-0.004678
