### Creating a logistic regression to predict absenteeism

#### Import relevant libraries

In [1]:
import pandas as pd
import numpy as np

#### Load the data

In [2]:
# Read the preprocessed absenteeism data; the path is relative to the notebook's working directory.
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')

In [3]:
# Preview the first rows to confirm the file loaded with the expected columns.
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month_value,Day of the week
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,0,2,1,4,7,1
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,0,1,0,0,7,1
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,0,0,0,2,7,2
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,0,2,0,4,7,3
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,0,2,1,2,7,3


#### Create the targets

In [4]:
# Create two classes: 'Moderately absent' (0) and 'Excessively absent' (1).
# The median of 'Absenteeism Time in Hours' is used as the cut-off so that the
# two classes end up roughly balanced.
median_absence_hours = data_preprocessed['Absenteeism Time in Hours'].median()

# Rows at or below the median are 'Moderately absent' (0);
# rows strictly above it are 'Excessively absent' (1).
targets = np.where(
    data_preprocessed['Absenteeism Time in Hours'] > median_absence_hours, 1, 0
)

In [5]:
# Inspect the resulting array of 0/1 class labels.
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [6]:
# Attach the class labels as a new column. Note that this modifies data_preprocessed in place.
data_preprocessed['Excessive Absenteeism'] = targets

In [7]:
# Preview again to confirm the 'Excessive Absenteeism' column was added.
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month_value,Day of the week,Excessive Absenteeism
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,0,2,1,4,7,1,1
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,0,1,0,0,7,1,0
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,0,0,0,2,7,2,0
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,0,2,0,4,7,3,1
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,0,2,1,2,7,3,0


In [8]:
# Class balance: the share of observations labelled 1.
# A value close to 0.5 means the two classes are roughly balanced.
targets.mean()

0.45571428571428574

In [9]:
# Checkpoint: build a separate frame for modelling that leaves out the raw
# 'Absenteeism Time in Hours' column (the target was derived from it) and the 'Date' column.
data_with_targets = data_preprocessed.drop(columns=['Absenteeism Time in Hours', 'Date'])

In [10]:
# Display the checkpoint frame.
data_with_targets

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Month_value,Day of the week,Excessive Absenteeism
0,0,0,0,1,289,36,33,239.554,30,0,2,1,7,1,1
1,0,0,0,0,118,13,50,239.554,31,0,1,0,7,1,0
2,0,0,0,1,179,51,38,239.554,31,0,0,0,7,2,0
3,1,0,0,0,279,5,39,239.554,24,0,2,0,7,3,1
4,0,0,0,1,289,36,33,239.554,30,0,2,1,7,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,179,22,40,237.656,22,1,2,0,5,2,1
696,1,0,0,0,225,26,28,237.656,24,0,1,2,5,2,0
697,1,0,0,0,330,16,28,237.656,25,1,0,0,5,3,1
698,0,0,0,1,235,16,32,237.656,25,1,0,0,5,3,0


#### Select the inputs for the regression

In [11]:
# (rows, columns) of the checkpoint frame; the last column is the target 'Excessive Absenteeism'.
data_with_targets.shape

(700, 15)

In [12]:
# Every column except the last one (the target): these are the candidate model inputs.
data_with_targets.iloc[: , :-1]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Month_value,Day of the week
0,0,0,0,1,289,36,33,239.554,30,0,2,1,7,1
1,0,0,0,0,118,13,50,239.554,31,0,1,0,7,1
2,0,0,0,1,179,51,38,239.554,31,0,0,0,7,2
3,1,0,0,0,279,5,39,239.554,24,0,2,0,7,3
4,0,0,0,1,289,36,33,239.554,30,0,2,1,7,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,179,22,40,237.656,22,1,2,0,5,2
696,1,0,0,0,225,26,28,237.656,24,0,1,2,5,2
697,1,0,0,0,330,16,28,237.656,25,1,0,0,5,3
698,0,0,0,1,235,16,32,237.656,25,1,0,0,5,3


In [13]:
# Store the inputs (all columns but the target) as they are, before any scaling is applied.
unscaled_inputs =  data_with_targets.iloc[: , :-1]

In [14]:
# Confirm the stored inputs no longer contain the target column.
unscaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Month_value,Day of the week
0,0,0,0,1,289,36,33,239.554,30,0,2,1,7,1
1,0,0,0,0,118,13,50,239.554,31,0,1,0,7,1
2,0,0,0,1,179,51,38,239.554,31,0,0,0,7,2
3,1,0,0,0,279,5,39,239.554,24,0,2,0,7,3
4,0,0,0,1,289,36,33,239.554,30,0,2,1,7,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,179,22,40,237.656,22,1,2,0,5,2
696,1,0,0,0,225,26,28,237.656,24,0,1,2,5,2
697,1,0,0,0,330,16,28,237.656,25,1,0,0,5,3
698,0,0,0,1,235,16,32,237.656,25,1,0,0,5,3


#### Standardize the data

In [15]:
from sklearn.preprocessing import StandardScaler
absenteeism_scaler = StandardScaler()

# absenteeism_scaler standardizes each column separately: it subtracts the column mean and divides by the column's standard deviation.

In [16]:
# Learn the per-column mean and standard deviation from the inputs.
# NOTE(review): this fits on all rows, before the train/test split further down, so statistics
# from the rows that later become the test set influence the scaling. Consider fitting on the
# training split only.
# NOTE(review): columns that appear to be 0/1 dummies (Reason_1..Reason_4, Education) are
# standardized as well, which makes their coefficients harder to interpret - confirm this is intended.
absenteeism_scaler.fit(unscaled_inputs)

StandardScaler()

In [17]:
# Apply the learned scaling. The result is a NumPy array, so the column names are no longer attached.
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [18]:
# Inspect the standardized inputs.
scaled_inputs

array([[-0.57735027, -0.09298136, -0.31448545, ...,  0.26848661,
         0.18272635, -0.68370352],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.58968976,
         0.18272635, -0.68370352],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.58968976,
         0.18272635, -0.00772546],
       ...,
       [ 1.73205081, -0.09298136, -0.31448545, ..., -0.58968976,
        -0.3882935 ,  0.66825259],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.58968976,
        -0.3882935 ,  0.66825259],
       [-0.57735027, -0.09298136, -0.31448545, ...,  0.26848661,
        -0.3882935 ,  0.66825259]])

In [19]:
# Same number of rows as before and one column per input feature (the target is not included).
scaled_inputs.shape

(700, 14)

#### Split the data into train & test and shuffle

##### Import the relevant module

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
# Demonstration only: the result is not assigned, and no random_state is set,
# so the shuffle (and therefore the output) differs on every run.
train_test_split(scaled_inputs,targets)

# Returns four arrays, in this order: training inputs, test inputs, training targets, test targets.

[array([[ 1.73205081, -0.09298136, -0.31448545, ..., -0.58968976,
          0.18272635, -0.00772546],
        [-0.57735027, -0.09298136, -0.31448545, ...,  3.70119207,
         -0.67380342, -1.35968157],
        [-0.57735027, -0.09298136, -0.31448545, ..., -0.58968976,
          1.61027597, -0.00772546],
        ...,
        [-0.57735027, -0.09298136, -0.31448545, ..., -0.58968976,
          1.32476605, -1.35968157],
        [-0.57735027, -0.09298136, -0.31448545, ..., -0.58968976,
         -1.24482327,  1.34423065],
        [ 1.73205081, -0.09298136, -0.31448545, ..., -0.58968976,
          0.18272635, -1.35968157]]),
 array([[-0.57735027, -0.09298136, -0.31448545, ..., -0.58968976,
          0.18272635, -1.35968157],
        [-0.57735027, -0.09298136, -0.31448545, ..., -0.58968976,
         -1.24482327,  0.66825259],
        [-0.57735027, -0.09298136, -0.31448545, ...,  1.12666297,
         -0.95931334,  1.34423065],
        ...,
        [-0.57735027, -0.09298136, -0.31448545, ...,  

In [22]:
# Keep 80% of the rows for training (train_size=0.8); the remaining 20% form the test set.
# Fixing random_state makes the shuffle, and therefore the split, reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    train_size=0.8,
    random_state=20,
)

In [23]:
# Sanity check: inputs and targets must have the same number of rows within each split.
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(560, 14) (560,)
(140, 14) (140,)


#### Logistic regression with sklearn

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

##### Training the model

In [25]:
# Create the classifier with scikit-learn's default settings.
reg = LogisticRegression()

In [26]:
reg.fit(x_train, y_train)

# Fits the model (intercept and coefficients) using the training split only

LogisticRegression()

In [27]:
# Evaluating the model: mean accuracy on the training data (not on the held-out test data)
reg.score(x_train, y_train)

# ~78% of the training observations are classified correctly

0.7839285714285714

##### Manually check the accuracy
Accuracy is the percentage of model outputs that match the targets.

In [28]:
# Predicted class (0 or 1) for every observation in the training split.
model_outputs = reg.predict(x_train)
model_outputs

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [29]:
# Element-wise comparison: True where the prediction equals the target.
model_outputs == y_train

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [30]:
# Number of correct predictions (True counts as 1 when summed)
print((model_outputs == y_train).sum())
# Total number of observations in the training split
print(len(model_outputs))

439
560


In [31]:
# Accuracy = correct predictions / total observations, i.e. the mean of the boolean match mask.
(model_outputs == y_train).mean()

0.7839285714285714

#### Finding the intercept and coefficients

In [32]:
# Intercept (bias term) of the fitted model, returned as a one-element array.
reg.intercept_

array([-0.22206736])

In [33]:
# One coefficient per input column, returned as a 2-D array with a single row.
reg.coef_

array([[ 2.07601767,  0.33504757,  1.56162303,  1.32927434,  0.70639316,
        -0.03986811, -0.20089491, -0.00456366,  0.31933564, -0.135508  ,
         0.38172443, -0.3332426 ,  0.18793677, -0.07062253]])

#### Creating a summary table to show the intercept, coefficients and variables

In [34]:
# Column names of the inputs; these are used to label the coefficients.
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Month_value', 'Day of the week'], dtype=object)

In [35]:
# Keep the input column names in their own variable; they become the row labels of the summary table.
feature_name = unscaled_inputs.columns.values

In [36]:
# Start the summary table with a single column holding the feature names.
summary_table = pd.DataFrame({'Feature_name': feature_name})

In [37]:
# reg.coef_ is a 2-D array with a single row; flatten it so that each coefficient
# lines up with the feature name in the same position.
summary_table['Coefficient'] = reg.coef_.ravel()

In [38]:
# Show each feature next to its coefficient.
summary_table

Unnamed: 0,Feature_name,Coefficient
0,Reason_1,2.076018
1,Reason_2,0.335048
2,Reason_3,1.561623
3,Reason_4,1.329274
4,Transportation Expense,0.706393
5,Distance to Work,-0.039868
6,Age,-0.200895
7,Daily Work Load Average,-0.004564
8,Body Mass Index,0.319336
9,Education,-0.135508


In [39]:
# Put the intercept in the first row (index 0), followed by the feature coefficients.
# The table is rebuilt rather than shifted in place so that this cell is idempotent:
# re-running it does not shift the index again or add a second 'Intercept' row.
intercept_row = pd.DataFrame(
    {'Feature_name': ['Intercept'], 'Coefficient': [reg.intercept_[0]]}
)
# Drop any 'Intercept' row left over from a previous run of this cell.
feature_rows = summary_table[summary_table['Feature_name'] != 'Intercept']
summary_table = pd.concat([intercept_row, feature_rows], ignore_index=True)
summary_table

Unnamed: 0,Feature_name,Coefficient
0,Intercept,-0.222067
1,Reason_1,2.076018
2,Reason_2,0.335048
3,Reason_3,1.561623
4,Reason_4,1.329274
5,Transportation Expense,0.706393
6,Distance to Work,-0.039868
7,Age,-0.200895
8,Daily Work Load Average,-0.004564
9,Body Mass Index,0.319336


#### Interpreting the coefficients