## Creating a logistic regression to predict absenteeism

#### Import the relevant libraries

In [1]:
import pandas as pd 
import numpy as np 

Load the data

In [2]:
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the week,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,36,33,239.554,30,0,2,1,2


### Create the targets

#### Take the median value of the 'Absenteeism Time in Hours' and use it as a cut off line

In [3]:
targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > 
                   data_preprocessed['Absenteeism Time in Hours'].median(), 1, 0)

In [4]:
#targets
data_preprocessed['Excessive Absenteeism'] = targets
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the week,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,36,33,239.554,30,0,2,1,2,0


### Observation
#### By using the median we implicitly balanced the dataset
#### One half below the median, the other above

In [5]:
targets.sum() / targets.shape[0]

0.45571428571428574

In [6]:
data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours'],axis=1)

##### Using 'is' to check if data_with_targets points to the same memory zone as data_with_targets

In [7]:
data_with_targets is data_with_targets

True

#### Select the inputs for the regression

In [8]:
#data_with_targets.iloc[:,:14] The same as below
data_with_targets.iloc[:,:-1]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the week,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,1,36,33,239.554,30,0,2,1
1,0,0,0,0,7,1,13,50,239.554,31,0,1,0
2,0,0,0,1,7,2,51,38,239.554,31,0,0,0
3,1,0,0,0,7,3,5,39,239.554,24,0,2,0
4,0,0,0,1,7,3,36,33,239.554,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,22,40,237.656,22,1,2,0
696,1,0,0,0,5,2,26,28,237.656,24,0,1,2
697,1,0,0,0,5,3,16,28,237.656,25,1,0,0
698,0,0,0,1,5,3,16,32,237.656,25,1,0,0


In [9]:
unscaled_inputs = data_with_targets.iloc[:,:-1]

### Standardize the data

<p><b>IMPORTANT NOTE</p></b>
When we standardize we lose the whole interpretability of a dummy
<p>IF they were left as 0 and 1 -> for a unit change it is 7.92 times more likely that a person wil lbe excessively absent </p>
<p>A unit change in the dummy variable -> from disregarding to taking only this dummy into account </p>
<p>If the reason given is Reason_1, it could say that it is 7.4 times more likely that a person will be excessively absent </p>
<p>However the reasons were standardized </p>


In [10]:
# In this case also the dummies were standardized
#from sklearn.preprocessing import StandardScaler
# abseenteism_scaler wii be used to substract the mean and divide by the standard deviation variablewise(featurewise)
#absenteeim_scaler = StandardScaler()

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

class CustomScaler(BaseEstimator, TransformerMixin):
    """Custom scaler based on the StandardScaler
    It will only standardize the inputs given -> preserve the dummies intact
    Args:
        BaseEstimator (_type_): _description_
        TransformerMixin (_type_): _description_
    """
    
    def __init__(self,columns,copy=True,with_mean=True,with_std=True):
        self.scaler = StandardScaler(copy,with_mean,with_std)
        self.columns = columns
        self.mean_ = None
        self.var_ = None
        
    def fit(self,X,y=None):
        self.scaler.fit(X[self.columns],y)
    
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        
    def transform(self,X,y=None,copy=None):
        print("transform")
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [12]:
unscaled_inputs.columns.values
columns_to_scale = unscaled_inputs.columns.values[4:] # Omit the Reason features(dummy variables)
columns_to_scale

array(['Month Value', 'Day of the week', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [13]:
absenteeism_scaler = CustomScaler(columns_to_scale)



##### Fit the input
###### Calculate the mean and standard deviation for each feature on scale_inputs
###### Preparing the scaling mechanism

In [14]:
absenteeism_scaler.fit(unscaled_inputs)

###### Apply the scaling mechanism

In [15]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

transform


In [16]:
scaled_inputs
#print(scaled_inputs.shape)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the week,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,-0.683704,0.412816,-0.536062,-0.806331,0.767431,-0.447980,0.880469,0.268487
1,0,0,0,0,0.182726,-0.683704,-1.141882,2.130803,-0.806331,1.002633,-0.447980,-0.019280,-0.589690
2,0,0,0,1,0.182726,-0.007725,1.426749,0.248310,-0.806331,1.002633,-0.447980,-0.919030,-0.589690
3,1,0,0,0,0.182726,0.668253,-1.682647,0.405184,-0.806331,-0.643782,-0.447980,0.880469,-0.589690
4,0,0,0,1,0.182726,0.668253,0.412816,-0.536062,-0.806331,0.767431,-0.447980,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.388293,-0.007725,-0.533522,0.562059,-0.853789,-1.114186,2.232242,0.880469,-0.589690
696,1,0,0,0,-0.388293,-0.007725,-0.263140,-1.320435,-0.853789,-0.643782,-0.447980,-0.019280,1.126663
697,1,0,0,0,-0.388293,0.668253,-0.939096,-1.320435,-0.853789,-0.408580,2.232242,-0.919030,-0.589690
698,0,0,0,1,-0.388293,0.668253,-0.939096,-0.692937,-0.853789,-0.408580,2.232242,-0.919030,-0.589690


## Split the data into train & test and shuffle
<b> Import relevant module

In [17]:
from sklearn.model_selection import train_test_split

<b>Split </b>
##### Return 4 arrays
1. Training dataset with inputs
2. training dataset with targets
3. test dataset with inputs
4. test dataset with targets

<p> train_test_split has the shuffle argument by default set to True
<p> random_state parameter takes an index and shuffle in the same random way, it makes it pseudo-random

In [18]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs,targets)

In [19]:
print("Train shapes: ", x_train.shape, y_train.shape)
print("Test shapes: ", x_test.shape, y_test.shape)
print("75 % training, 25 % test")

Train shapes:  (525, 13) (525,)
Test shapes:  (175, 13) (175,)
75 % training, 25 % test


In [20]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs,targets,train_size=0.8)

In [21]:
print("Train shapes: ", x_train.shape, y_train.shape)
print("Test shapes: ", x_test.shape, y_test.shape)
print("80 % training, 20 % test")

Train shapes:  (560, 13) (560,)
Test shapes:  (140, 13) (140,)
80 % training, 20 % test


## Split the data into train & test and shuffle Logistic regression with sklearn 

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

<b> Training the model </b>

In [23]:
reg = LogisticRegression() 
reg.fit(x_train, y_train)

LogisticRegression()

In [24]:
reg.score(x_train, y_train) # -> Mode

0.7357142857142858

<b> Manually check the accuracy </b>
<p> Building manually the score method </p>

In [25]:
model_outputs = reg.predict(x_train) # Find the predicted outputs
#model_outputs

In [26]:
np.sum((model_outputs==y_train))

412

In [27]:
#Accuracy = Correct prediction / # obeservations
print("Accuracy %.2f %%" % (np.sum((model_outputs==y_train)) / model_outputs.shape[0]))

Accuracy 0.74 %


<b> Finding the intercept and coefficients

In [28]:
print("Intercept: ", reg.intercept_)
print("Coefficients: ", reg.coef_)

Intercept:  [-1.46358626]
Coefficients:  [[ 2.46861459e+00  7.66847002e-01  2.66338115e+00  6.13098951e-01
   1.50625729e-01 -1.64967355e-01  8.99374509e-02 -4.50403911e-01
   2.43108222e-03  4.29232703e-01  4.34508535e-02  6.70653260e-01
  -9.59871070e-02]]


In [29]:
feature_name = unscaled_inputs.columns.values

<b> Create summary table </b>

In [30]:
summary_table = pd.DataFrame(columns=['Feature name'], data = feature_name)
summary_table['Coefficient'] = np.transpose(reg.coef_)
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,2.468615
1,Reason_2,0.766847
2,Reason_3,2.663381
3,Reason_4,0.613099
4,Month Value,0.150626
5,Day of the week,-0.164967
6,Distance to Work,0.089937
7,Age,-0.450404
8,Daily Work Load Average,0.002431
9,Body Mass Index,0.429233


In [31]:
summary_table.index = summary_table.index + 1 # Shift up all indices by 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]] # Specify the 0 element -> extract the float
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-1.463586
1,Reason_1,2.468615
2,Reason_2,0.766847
3,Reason_3,2.663381
4,Reason_4,0.613099
5,Month Value,0.150626
6,Day of the week,-0.164967
7,Distance to Work,0.089937
8,Age,-0.450404
9,Daily Work Load Average,0.002431


<b> Interpreting the coefficients

In [32]:
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,-1.463586,0.231405
1,Reason_1,2.468615,11.806079
2,Reason_2,0.766847,2.152967
3,Reason_3,2.663381,14.344709
4,Reason_4,0.613099,1.846144
5,Month Value,0.150626,1.162561
6,Day of the week,-0.164967,0.847921
7,Distance to Work,0.089937,1.094106
8,Age,-0.450404,0.637371
9,Daily Work Load Average,0.002431,1.002434


<h> Interpreting the coefficients </h>
A feature is not particularly important:
-   if its coefficient is around 0
- if ts odds ratio is around 1

<p>A weight(coefficient) of 0 implies that no matter the feature value, we will multiply it by 0(in the model)</p>
<p>For a unit change in the standardized feature, the odds increase by a multiple equal to the odds ratio(1 = no change)</p>

<p>ODDS X ODDS RATIO = NEW ODDS</p>
<p>5:1  X      2       = 10:1</p>
<p>5:1  X     0.2      = 1:1</p>
<p>5:1  X      1       = 5:1</p>

In [33]:
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
3,Reason_3,2.663381,14.344709
1,Reason_1,2.468615,11.806079
2,Reason_2,0.766847,2.152967
12,Children,0.670653,1.955514
4,Reason_4,0.613099,1.846144
10,Body Mass Index,0.429233,1.536078
5,Month Value,0.150626,1.162561
7,Distance to Work,0.089937,1.094106
11,Education,0.043451,1.044409
9,Daily Work Load Average,0.002431,1.002434
