### Create a logistic regression to predict absenteeism

In [1]:
import pandas as pd
import numpy as np

### Load the preprocessed Data

In [2]:
# Relative path: the CSV is expected in the notebook's working directory.
df = pd.read_csv("Absenteeism_preprocessed.csv")

In [3]:
df.head()

Unnamed: 0,R1,R2,R3,R4,Month_value,weekday,Transportation Expense,Distance to Work,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Age1,Age2,Age3,Age4
0,0,0,0,1,7,1,289,36,239.554,30,0,2,1,4,0,1,0,0
1,0,0,0,0,7,1,118,13,239.554,31,0,1,0,0,0,0,0,1
2,0,0,0,1,7,2,179,51,239.554,31,0,0,0,2,0,1,0,0
3,1,0,0,0,7,3,279,5,239.554,24,0,2,0,4,0,1,0,0
4,0,0,0,1,7,3,289,36,239.554,30,0,2,1,2,0,1,0,0


### Create the target:

In [4]:
# Median of the absence durations. The next cell uses it as the threshold that
# separates the two classes (strictly above the median -> 1, otherwise 0).
df['Absenteeism Time in Hours'].median()

3.0

In [5]:
# Binary label: 1 when a row's absence time is strictly above the median, else 0.
absence_hours = df['Absenteeism Time in Hours']
median_hours = absence_hours.median()
target = np.where(absence_hours > median_hours, 1, 0)

In [6]:
target[:5]

array([1, 0, 0, 1, 0])

In [7]:
# Store the label next to the features. This adds a column to df in place, so any
# df.head() output rendered before this cell no longer shows the current columns.
df['Excessive absenteeism'] = target

In [8]:
df.head()

Unnamed: 0,R1,R2,R3,R4,Month_value,weekday,Transportation Expense,Distance to Work,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Age1,Age2,Age3,Age4,Excessive absenteeism
0,0,0,0,1,7,1,289,36,239.554,30,0,2,1,4,0,1,0,0,1
1,0,0,0,0,7,1,118,13,239.554,31,0,1,0,0,0,0,0,1,0
2,0,0,0,1,7,2,179,51,239.554,31,0,0,0,2,0,1,0,0,0
3,1,0,0,0,7,3,279,5,239.554,24,0,2,0,4,0,1,0,0,1
4,0,0,0,1,7,3,289,36,239.554,30,0,2,1,2,0,1,0,0,0


### Comments on the target:

Using the median as the cut-off is numerically stable and robust: it implicitly balances the two classes (roughly half of the observations fall on each side of it), and it is less sensitive to outliers than the mean.

In [9]:
# Share of rows labelled 1 (excessive absenteeism): about 46%, so roughly 54% are 0.
# The two classes are therefore close to balanced.
target.sum()/target.shape[0]

0.45571428571428574

In [10]:
# Drop the raw hours column: the target was derived from it by thresholding at the
# median, so keeping it as a feature would leak the label into the inputs
# (this is target leakage, not a multicollinearity concern).
dat = df.drop(columns = ['Absenteeism Time in Hours'])

In [11]:
dat.head()

Unnamed: 0,R1,R2,R3,R4,Month_value,weekday,Transportation Expense,Distance to Work,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Age1,Age2,Age3,Age4,Excessive absenteeism
0,0,0,0,1,7,1,289,36,239.554,30,0,2,1,0,1,0,0,1
1,0,0,0,0,7,1,118,13,239.554,31,0,1,0,0,0,0,1,0
2,0,0,0,1,7,2,179,51,239.554,31,0,0,0,0,1,0,0,0
3,1,0,0,0,7,3,279,5,239.554,24,0,2,0,0,1,0,0,1
4,0,0,0,1,7,3,289,36,239.554,30,0,2,1,0,1,0,0,0


### Select the inputs for the model:

In [12]:
dat.shape

(700, 18)

In [13]:
# Feature matrix = every column except the label. The label is dropped by name
# (rather than by slicing off "the last column" by position) so the selection
# stays correct even if the column order of `dat` changes upstream.
data = dat.drop(columns = ['Excessive absenteeism'])

In [14]:
data.head()

Unnamed: 0,R1,R2,R3,R4,Month_value,weekday,Transportation Expense,Distance to Work,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Age1,Age2,Age3,Age4
0,0,0,0,1,7,1,289,36,239.554,30,0,2,1,0,1,0,0
1,0,0,0,0,7,1,118,13,239.554,31,0,1,0,0,0,0,1
2,0,0,0,1,7,2,179,51,239.554,31,0,0,0,0,1,0,0
3,1,0,0,0,7,3,279,5,239.554,24,0,2,0,0,1,0,0
4,0,0,0,1,7,3,289,36,239.554,30,0,2,1,0,1,0,0


### Standardizing the data

In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Unfitted scaler. It is placed as the first step of the Pipeline created later in
# this notebook, so it is fitted by pipeline.fit on the training split only.
scaler = StandardScaler()

In [16]:
from sklearn.model_selection import train_test_split as tts
# 80% train / 20% test with a fixed seed so the split is reproducible.
# NOTE(review): no stratify= is passed, so the class ratio in each part is left to chance.
X_train, X_test, y_train, y_test = tts(data, target, train_size = 0.8, random_state = 42)

In [17]:
from sklearn.linear_model import LogisticRegression
# Classifier created with the library's default settings (no arguments are passed).
lor = LogisticRegression()
# Chain scaling and classification so both steps are fitted and applied together.
# The step names are the keys used with named_steps[...] later in the notebook.
pipeline = Pipeline([('scaler', scaler), ('estimator', lor)])

In [18]:
# Fit both steps (scaler first, then the classifier) on the training rows only.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('estimator', LogisticRegression())])

In [19]:
pipeline.predict(X_test)

array([0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 1, 0, 0])

In [20]:
pipeline.score(X_train, y_train)

0.7696428571428572

In [21]:
pipeline.score(X_test, y_test)

0.7571428571428571

In [22]:
# Keep the hard 0/1 predictions for the held-out rows so they can be compared with y_test.
pred = pipeline.predict(X_test)

In [23]:
y_test

array([1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 1, 0])

In [24]:
y_test == pred

array([False, False, False, False,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False, False,  True,  True, False,  True,  True, False,
        True, False, False,  True, False, False, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False, False,  True,  True, False,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True, False,  True,  True,
        True, False,  True, False,  True,  True, False, False, False,
        True, False,

In [25]:
# Intercept of the fitted classifier. The classifier sits behind the scaler in the
# pipeline, so this value applies to the scaled inputs, not to the raw columns.
print('The intercept is {}'.format(pipeline.named_steps['estimator'].intercept_))

The intercept is [-0.17085964]


In [26]:
# Fitted weights of the classifier, printed as a 2-D array with a single row.
# Like the intercept, they refer to the scaled inputs produced by the 'scaler' step.
print('The coefficients are {}'.format(pipeline.named_steps['estimator'].coef_))

The coefficients are [[ 2.18772623  0.2963265   1.60727423  1.51405767  0.11713899 -0.15734047
   0.93810109 -0.16885314  0.0413442   0.22765636 -0.02266835  0.36053384
  -0.24484589 -0.07165879  0.21345911 -0.1457344  -0.08117841]]


In [27]:
data.columns.values

array(['R1', 'R2', 'R3', 'R4', 'Month_value', 'weekday',
       'Transportation Expense', 'Distance to Work',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Age1', 'Age2', 'Age3', 'Age4'], dtype=object)

In [28]:
# Column names of the feature frame, in the same order as the columns the model was trained on.
feature_name = data.columns.values

In [29]:
# Pair every input column with its fitted weight in a single table.
coef = pipeline.named_steps['estimator'].coef_
# coef holds one row of weights; flatten it so it lines up with the feature names.
summary = pd.DataFrame({'feature name': feature_name, 'coefficient': coef.ravel()})
summary

Unnamed: 0,feature name,coefficient
0,R1,2.187726
1,R2,0.296327
2,R3,1.607274
3,R4,1.514058
4,Month_value,0.117139
5,weekday,-0.15734
6,Transportation Expense,0.938101
7,Distance to Work,-0.168853
8,Daily Work Load Average,0.041344
9,Body Mass Index,0.227656


In [30]:
## Put the intercept in the first row of the summary table.
inter = pipeline.named_steps['estimator'].intercept_
# Start from the feature rows only: if this cell is executed again, the intercept
# row added by the previous run is discarded instead of being duplicated.
feature_rows = summary[summary['feature name'] != 'Intercept']
intercept_row = pd.DataFrame({'feature name': ['Intercept'], 'coefficient': [inter[0]]})
# ignore_index renumbers the rows 0..n, with the intercept at position 0.
summary = pd.concat([intercept_row, feature_rows], ignore_index=True)

In [31]:
summary

Unnamed: 0,feature name,coefficient
0,Intercept,-0.17086
1,R1,2.187726
2,R2,0.296327
3,R3,1.607274
4,R4,1.514058
5,Month_value,0.117139
6,weekday,-0.15734
7,Transportation Expense,0.938101
8,Distance to Work,-0.168853
9,Daily Work Load Average,0.041344


In [32]:
# Class probabilities for the test rows: one row per sample, one column per class.
# Column 1 is the one this notebook reads as the probability of excessive absenteeism.
predicted_proba = pipeline.predict_proba(X_test)

In [33]:
predicted_proba[:5]

array([[0.6370662 , 0.3629338 ],
       [0.84519907, 0.15480093],
       [0.83840556, 0.16159444],
       [0.49297319, 0.50702681],
       [0.50325267, 0.49674733]])

In [34]:
### Probability of excessive absenteeism (second column of predicted_proba):
predicted_proba[:,1]

array([0.3629338 , 0.15480093, 0.16159444, 0.50702681, 0.49674733,
       0.95005825, 0.44137192, 0.73284962, 0.03437153, 0.30105227,
       0.13219923, 0.5264083 , 0.76335294, 0.47919081, 0.16575926,
       0.62106185, 0.01070527, 0.81312215, 0.12558568, 0.39667231,
       0.33177659, 0.24198894, 0.23908889, 0.53926951, 0.11063551,
       0.82503232, 0.3729442 , 0.11674831, 0.21664261, 0.40169096,
       0.11792213, 0.1661445 , 0.6564127 , 0.55048641, 0.16733901,
       0.62541936, 0.20291469, 0.16404833, 0.9389583 , 0.08080723,
       0.64393406, 0.24931352, 0.62751773, 0.18380704, 0.18610254,
       0.64091353, 0.7444342 , 0.88914066, 0.24083907, 0.17165759,
       0.23902462, 0.23791199, 0.44033333, 0.9513638 , 0.17402442,
       0.23870468, 0.98789901, 0.19875245, 0.89488193, 0.65437278,
       0.65745073, 0.13849571, 0.49416059, 0.73046583, 0.13219923,
       0.11575799, 0.72144674, 0.00663437, 0.19901274, 0.4212165 ,
       0.30105227, 0.22834582, 0.75292187, 0.23791199, 0.08916

### Save the pipeline

In [35]:
import pickle

# Persist the whole fitted pipeline (scaler + classifier) so new data can be scored
# without refitting. The file is written to the working directory under the name 'pipeline'.
# NOTE(review): pickle files must only be loaded from trusted sources, and presumably
# need a matching scikit-learn version when loaded — confirm before deploying.
with open('pipeline', 'wb') as file:
    pickle.dump(pipeline, file)