In [46]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
import pickle

### Load data

In [40]:
# Load the preprocessed absenteeism data from a CSV file in the working directory.
data_pro = pd.read_csv('Absenteeism_preprocessed.csv')
data_pro

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month Values,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0,8
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2,3
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0,8
698,0,0,0,1,5,3,235,16,32,237.656,25,1,0,0,2


### Create targets

In [3]:
# Median absence duration; used below as the cut-off between the two target classes.
data_pro['Absenteeism Time in Hours'].median()

3.0

In [4]:
# Binary labels: 1 where the absence duration is strictly above the median, else 0.
absence_hours = data_pro['Absenteeism Time in Hours']
median_hours = absence_hours.median()
targets = np.where(absence_hours > median_hours, 1, 0)

In [5]:
# Attach the labels as a new column (this modifies data_pro in place), then display the frame.
data_pro['Excessive Absenteeism'] = targets
data_pro

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month Values,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0,8,1
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2,3,0
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0,8,1
698,0,0,0,1,5,3,235,16,32,237.656,25,1,0,0,2,0


### Comment on targets

In [6]:
# Fraction of rows labelled 1; a value near 0.5 means the two classes are roughly balanced.
targets.mean()

0.45571428571428574

In [7]:
# Remove the raw hours column (the labels were derived from it) together with
# three input columns that are not used by the model.
dropped_columns = [
    'Absenteeism Time in Hours',
    'Day of the week',
    'Daily Work Load Average',
    'Distance to Work',
]
data_prepro = data_pro.drop(columns=dropped_columns)
data_prepro

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month Values,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,22,1,2,0,1
696,1,0,0,0,5,225,28,24,0,1,2,0
697,1,0,0,0,5,330,28,25,1,0,0,1
698,0,0,0,1,5,235,32,25,1,0,0,0


### Select inputs

In [8]:
# (rows, columns) of the frame that remains after the column drop.
data_prepro.shape

(700, 12)

In [9]:
# Keep every column except the last one, which is the 'Excessive Absenteeism' target.
unscaled_inputs = data_prepro.iloc[:,:-1]

In [41]:
# Display the model inputs before any scaling is applied.
unscaled_inputs

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month Values,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,0,0,0,1,7,179,38,31,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0
4,0,0,0,1,7,289,33,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,22,1,2,0
696,1,0,0,0,5,225,28,24,0,1,2
697,1,0,0,0,5,330,28,25,1,0,0
698,0,0,0,1,5,235,32,25,1,0,0


### Standardize data

In [10]:
class CustomScaler(BaseEstimator,TransformerMixin):
    """Standardize only a chosen subset of DataFrame columns.

    The columns named in ``columns`` are passed through a ``StandardScaler``;
    every other column is returned unchanged. The column order and the row
    index of the input frame are preserved in the output.

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardize.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler; fitted by ``fit``.
    mean_ : pandas.Series or None
        Per-column mean of the fitted columns (``None`` before ``fit``).
    var_ : pandas.Series or None
        Per-column population variance (``ddof=0``) of the fitted columns
        (``None`` before ``fit``).
    """

    def __init__(self,columns):
        self.scaler = StandardScaler()
        self.columns = columns
        self.mean_ = None
        self.var_ = None

    def fit(self,X,y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        subset = X[self.columns]
        self.scaler.fit(subset,y)
        # Use explicit per-column reductions: np.mean / np.var on a DataFrame
        # forward axis=None, whose meaning differs across pandas versions.
        self.mean_ = subset.mean(axis=0)
        self.var_ = subset.var(axis=0,ddof=0)
        return self

    def transform(self,X,y=None):
        """Return a copy of ``X`` with ``self.columns`` standardized.

        ``y`` is accepted for API compatibility and is not used.
        """
        init_col_order = X.columns
        # Carry over X's index: pd.concat(axis=1) aligns rows by index label,
        # so a fresh RangeIndex would misalign rows (and introduce NaNs) for
        # any frame whose index is not 0..n-1, e.g. a shuffled split.
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]),
                                columns=self.columns,
                                index=X.index)
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled,X_scaled],axis=1)[init_col_order]


In [11]:
# List the input column names, to choose which of them to scale.
unscaled_inputs.columns.values

array(['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4', 'Month Values',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [12]:
# Columns left unscaled; in the rows displayed in this notebook they hold only 0/1 values.
columns_to_omit = ['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4']

In [13]:
# Every input column that is not explicitly omitted, in the original column order.
columns_to_scale = [
    column_name
    for column_name in unscaled_inputs.columns.values
    if column_name not in columns_to_omit
]

In [14]:
# Scaler restricted to the columns selected for standardization.
abs_scaler = CustomScaler(columns_to_scale)

In [15]:
# Learn the scaling statistics for the selected columns.
# NOTE(review): this fits on the full dataset, before the train/test split
# further down, so test rows contribute to the scaling statistics. Consider
# fitting on the training split only.
abs_scaler.fit(unscaled_inputs)

In [16]:
# Apply the fitted scaler; columns not selected for scaling pass through unchanged.
scaled_inputs = abs_scaler.transform(unscaled_inputs)
scaled_inputs

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month Values,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,-0.447980,0.880469,0.268487
1,0,0,0,0,0.182726,-1.574681,2.130803,1.002633,-0.447980,-0.019280,-0.589690
2,0,0,0,1,0.182726,-0.654143,0.248310,1.002633,-0.447980,-0.919030,-0.589690
3,1,0,0,0,0.182726,0.854936,0.405184,-0.643782,-0.447980,0.880469,-0.589690
4,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,-0.447980,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.388293,-0.654143,0.562059,-1.114186,2.232242,0.880469,-0.589690
696,1,0,0,0,-0.388293,0.040034,-1.320435,-0.643782,-0.447980,-0.019280,1.126663
697,1,0,0,0,-0.388293,1.624567,-1.320435,-0.408580,2.232242,-0.919030,-0.589690
698,0,0,0,1,-0.388293,0.190942,-0.692937,-0.408580,2.232242,-0.919030,-0.589690


### Split and shuffle

In [17]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the shuffle repeatable.
# Naming note: `train_test` holds the *test inputs* and `targets_inputs` holds
# the *training targets*.
train_inputs, train_test, targets_inputs, targets_test = train_test_split(
    scaled_inputs,
    targets,
    random_state=20,
    shuffle=True,
    train_size=0.8,
)

In [18]:
# Shapes of the training and test input frames.
print(f"{train_inputs.shape} {train_test.shape}")

(560, 11) (140, 11)


In [19]:
# Shapes of the training and test target arrays.
print(f"{targets_inputs.shape} {targets_test.shape}")

(560,) (140,)


### Training model

In [20]:
# Fit a logistic regression with scikit-learn's default settings on the training split.
reg = LogisticRegression()
reg.fit(train_inputs,targets_inputs)

In [21]:
# Accuracy on the training split.
reg.score(train_inputs,targets_inputs)

0.7732142857142857

### Manually check accuracy

In [22]:
# Predicted class labels for the training inputs.
model_outputs = reg.predict(train_inputs)

In [23]:
# Boolean mask: True where the prediction matches the training target.
x = np.equal(model_outputs, targets_inputs)

In [24]:
# Number of training samples the model got wrong.
z = (~x).sum()

In [25]:
# (number of mismatches, number of samples, number of matches)
z,x.shape[0],x.sum()

(127, 560, 433)

In [26]:
# Manual accuracy: the share of matching predictions; should equal reg.score on the training split.
x.mean()

0.7732142857142857

### Find intercept and coefficients

In [27]:
# Fitted intercept (bias) term of the logistic regression.
reg.intercept_

array([-1.68218645])

In [28]:
# Fitted coefficients, one per input column.
reg.coef_

array([[ 2.80050879,  0.9550102 ,  3.11773817,  0.83794575,  0.15857206,
         0.60562747, -0.17008141,  0.27770724, -0.08455641,  0.34701569,
        -0.27797034]])

In [29]:
# Input column names, used to label the coefficients in the summary table.
feature_name = unscaled_inputs.columns.values

In [30]:
# One row per input feature, paired with its fitted coefficient.
summary_table = pd.DataFrame({'Feature name': feature_name})
summary_table['Coefficient'] = reg.coef_.T

In [31]:
# Put the intercept in row 0, followed by the feature rows.
# The table is rebuilt rather than shifted in place so the cell is idempotent:
# re-running it no longer shifts the index again or inserts a second
# 'Intercept' row.
intercept_row = pd.DataFrame({'Feature name': ['Intercept'],
                              'Coefficient': [reg.intercept_[0]]})
feature_rows = summary_table.loc[summary_table['Feature name'] != 'Intercept',
                                 ['Feature name', 'Coefficient']]
summary_table = pd.concat([intercept_row, feature_rows], ignore_index=True)
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-1.682186
1,Reason 1,2.800509
2,Reason 2,0.95501
3,Reason 3,3.117738
4,Reason 4,0.837946
5,Month Values,0.158572
6,Transportation Expense,0.605627
7,Age,-0.170081
8,Body Mass Index,0.277707
9,Education,-0.084556


### Interpreting coefficients

In [32]:
# Exponentiate each coefficient to express it as an odds ratio.
summary_table['Odds ratio'] = np.exp(summary_table['Coefficient'])

In [33]:
# Show the rows ordered from the largest odds ratio to the smallest.
summary_table.sort_values(by='Odds ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds ratio
3,Reason 3,3.117738,22.595215
1,Reason 1,2.800509,16.453016
2,Reason 2,0.95501,2.598697
4,Reason 4,0.837946,2.311613
6,Transportation Expense,0.605627,1.832402
10,Children,0.347016,1.414839
8,Body Mass Index,0.277707,1.3201
5,Month Values,0.158572,1.171836
9,Education,-0.084556,0.91892
7,Age,-0.170081,0.843596


### Test model

In [35]:
# Accuracy on the held-out split (`train_test` holds the test inputs).
reg.score(train_test,targets_test)

0.75

In [37]:
# Predicted class probabilities for the held-out inputs:
# column 0 is P(class 0), column 1 is P(class 1).
pred_proba = reg.predict_proba(train_test)
pred_proba

array([[0.71325648, 0.28674352],
       [0.58656313, 0.41343687],
       [0.44112564, 0.55887436],
       [0.78133175, 0.21866825],
       [0.08392008, 0.91607992],
       [0.3342291 , 0.6657709 ],
       [0.2993341 , 0.7006659 ],
       [0.13086501, 0.86913499],
       [0.78564217, 0.21435783],
       [0.7488237 , 0.2511763 ],
       [0.49371246, 0.50628754],
       [0.22445984, 0.77554016],
       [0.07098565, 0.92901435],
       [0.73468839, 0.26531161],
       [0.30807191, 0.69192809],
       [0.54979463, 0.45020537],
       [0.55013546, 0.44986454],
       [0.53890721, 0.46109279],
       [0.40098283, 0.59901717],
       [0.05349724, 0.94650276],
       [0.6997517 , 0.3002483 ],
       [0.78133175, 0.21866825],
       [0.41957064, 0.58042936],
       [0.41957064, 0.58042936],
       [0.24661288, 0.75338712],
       [0.74506925, 0.25493075],
       [0.50946316, 0.49053684],
       [0.85800128, 0.14199872],
       [0.20285591, 0.79714409],
       [0.78133175, 0.21866825],
       [0.

In [45]:
# Probability of class 1 (excessive absenteeism) for each held-out sample.
pred_proba[:,1]

array([0.28674352, 0.41343687, 0.55887436, 0.21866825, 0.91607992,
       0.6657709 , 0.7006659 , 0.86913499, 0.21435783, 0.2511763 ,
       0.50628754, 0.77554016, 0.92901435, 0.26531161, 0.69192809,
       0.45020537, 0.44986454, 0.46109279, 0.59901717, 0.94650276,
       0.3002483 , 0.21866825, 0.58042936, 0.58042936, 0.75338712,
       0.25493075, 0.49053684, 0.14199872, 0.79714409, 0.21866825,
       0.37031375, 0.68010538, 0.68560828, 0.52889259, 0.21866825,
       0.53509531, 0.2220809 , 0.73802547, 0.4045697 , 0.60531851,
       0.21103167, 0.45055818, 0.2381166 , 0.39884737, 0.82740777,
       0.56944188, 0.69190691, 0.28674352, 0.22033975, 0.20359228,
       0.57434404, 0.32720156, 0.6657709 , 0.27050851, 0.83379804,
       0.43555357, 0.88424932, 0.23169099, 0.33361085, 0.34375056,
       0.69699469, 0.65562212, 0.29279038, 0.79300401, 0.20776315,
       0.26858753, 0.08760278, 0.2220809 , 0.73421318, 0.302073  ,
       0.2220809 , 0.29150074, 0.90416916, 0.46006255, 0.60162

### Save model

In [47]:
# Serialize the fitted logistic regression to a file named 'model'.
with open('model', 'wb') as model_file:
    pickle.dump(reg, model_file)

In [48]:
# Serialize the fitted scaler to a file named 'scaler'.
# NOTE(review): pickle stores classes by reference, so loading this file
# presumably requires CustomScaler to be importable/defined in the loading
# process. Confirm in the consuming code.
with open('scaler', 'wb') as scaler_file:
    pickle.dump(abs_scaler, scaler_file)