In [64]:
# importing models
import pandas as pd
import numpy as np 
import pickle
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

In [65]:
# reading data 
df = pd.read_csv("Absenteeism_preprocessed.csv")
df.head(5)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


In [66]:
# checking the median of hrs of absenteeism
df["Absenteeism Time in Hours"].median()

3.0

In [67]:
# applying  np.where to mask hrs more than the median as 1 and not as 0


In [68]:
targets = np.where(df["Absenteeism Time in Hours"] > df["Absenteeism Time in Hours"].median(), 1, 0)
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [69]:
# checking percentage of data with higher hrs of absenteesism
print("precent of more hrs is ", targets.sum()/targets.shape[0])

precent of more hrs is  0.45571428571428574


In [70]:
# drop irrelevant predictors
df_with_targets = df.drop(['Absenteeism Time in Hours','Day of the Week',
                                            'Daily Work Load Average','Distance to Work'],axis=1)
df_with_targets.head(5)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pet
0,0,0,0,1,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,0,0,0,1,7,179,38,31,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0
4,0,0,0,1,7,289,33,30,0,2,1


In [71]:
# add targets to df 
df_with_targets["Level of Absenteesim"] = targets 
df_with_targets.shape
#df_with_targets.head(5)

(700, 12)

In [72]:
df = df_with_targets.copy() # copying the data to avoid modification

In [79]:
# selecting all rows and columns to check the data 
df.iloc[:, :12]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pet,Level of Absenteesim
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,22,1,2,0,1
696,1,0,0,0,5,225,28,24,0,1,2,0
697,1,0,0,0,5,330,28,25,1,0,0,1
698,0,0,0,1,5,235,32,25,1,0,0,0


In [80]:
# selecting all data without targets 
df.iloc[:,:-1]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pet
0,0,0,0,1,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,0,0,0,1,7,179,38,31,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0
4,0,0,0,1,7,289,33,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,22,1,2,0
696,1,0,0,0,5,225,28,24,0,1,2
697,1,0,0,0,5,330,28,25,1,0,0
698,0,0,0,1,5,235,32,25,1,0,0


In [81]:
unscaled_inputs = df.iloc[:,:-1]

In [82]:
# standardizing datasets (since its not recommended to standardized
# one hot encoders), I create a custom standardizer for this data

class CustomScaler(BaseEstimator, TransformerMixin):
    
    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        self.copy = copy 
        self.with_mean = with_mean
        self.with_std = with_std
        self.scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
        self.columns = columns 
        self.mean_= None 
        self.var_ = None
        
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    def transform(self, X, y=None, copy=None):
        init_columns = X.columns
        
        #scale colums in definece with the class instance 
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]),columns=self.columns)
        
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        
        X = pd.concat([X_not_scaled, X_scaled], axis=1)[init_columns]
        
        return X

In [83]:
unscaled_inputs.columns.values 

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pet'], dtype=object)

In [84]:
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4','Education']
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]

In [89]:
absenteeism_scale = CustomScaler(columns_to_scale)

In [90]:
absenteeism_scale.fit(predictors)

CustomScaler(columns=['Month Value', 'Transportation Expense', 'Age',
                      'Body Mass Index', 'Children', 'Pet'])

In [91]:
scaled_inputs = absenteeism_scale.transform(unscaled_inputs)

In [92]:
#scaled inputs
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pet
0,0,0,0,1,0.030796,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.030796,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,0.030796,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.030796,0.854936,0.405184,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,0.030796,1.005844,-0.536062,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.568019,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,-0.568019,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,-0.568019,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690
698,0,0,0,1,-0.568019,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690


In [93]:
#split data into train and test
from sklearn.model_selection import train_test_split

In [94]:
train_X, test_X, train_y, test_y = train_test_split(scaled_inputs, targets, test_size=0.2, random_state=49)

In [95]:
#check shape
print(train_X.shape, test_X.shape)

(560, 11) (140, 11)


In [96]:
# using logistic regression for the algorithm
from sklearn.linear_model import LogisticRegression 
from sklearn import metrics 

In [98]:
reg = LogisticRegression()

In [99]:
reg.fit(train_X, train_y)

LogisticRegression()

In [100]:
#model accuracy on training data
reg.score(train_X, train_y)

0.7625

In [101]:
feature_name = unscaled_inputs.columns.values

In [102]:
# organize optimized transfer function into a dataframe
summary_table = pd.DataFrame(columns=["Feature name"], data=feature_name)
summary_table["Coefficient"] = np.transpose(reg.coef_)
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,2.891344
1,Reason_2,0.960707
2,Reason_3,3.000494
3,Reason_4,0.970079
4,Month Value,-0.025297
5,Transportation Expense,0.653419
6,Age,-0.216115
7,Body Mass Index,0.255729
8,Education,-0.059861
9,Children,0.377697


In [104]:
# add one more row for model intercept
summary_table.index = summary_table.index +1
summary_table.loc[0] = ["Intercept", reg.intercept_[0]]
summary_table

Unnamed: 0,Feature name,Coefficient
2,Reason_1,2.891344
3,Reason_2,0.960707
4,Reason_3,3.000494
5,Reason_4,0.970079
6,Month Value,-0.025297
7,Transportation Expense,0.653419
8,Age,-0.216115
9,Body Mass Index,0.255729
10,Education,-0.059861
11,Children,0.377697


In [105]:
# sort table to display 0 index at the top 
summary_table = summary_table.sort_index()
summary_table 

Unnamed: 0,Feature name,Coefficient
0,Intercept,-1.816319
1,Intercept,-1.816319
2,Reason_1,2.891344
3,Reason_2,0.960707
4,Reason_3,3.000494
5,Reason_4,0.970079
6,Month Value,-0.025297
7,Transportation Expense,0.653419
8,Age,-0.216115
9,Body Mass Index,0.255729


In [107]:
# adding odds ratio to give more sense to the coefficients
summary_table["Odds_ratio"] = np.exp(summary_table.Coefficient)
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
4,Reason_3,3.000494,20.095465
2,Reason_1,2.891344,18.017512
5,Reason_4,0.970079,2.638153
3,Reason_2,0.960707,2.613544
7,Transportation Expense,0.653419,1.922101
11,Children,0.377697,1.458921
9,Body Mass Index,0.255729,1.291403
6,Month Value,-0.025297,0.97502
10,Education,-0.059861,0.941896
8,Age,-0.216115,0.805643


In [108]:
# testing model
predict_proba = reg.predict_proba(test_X)

In [110]:
predict_proba[:,1] 

array([0.86899308, 0.59438735, 0.25883693, 0.81725985, 0.26175359,
       0.12887766, 0.21879261, 0.26707054, 0.26616816, 0.56895312,
       0.21750077, 0.2559415 , 0.81972289, 0.32008794, 0.31214315,
       0.76820334, 0.05348306, 0.682262  , 0.29531955, 0.7054395 ,
       0.05234439, 0.8159515 , 0.22932561, 0.32008794, 0.23066698,
       0.91662876, 0.48664669, 0.37605266, 0.5495191 , 0.67188346,
       0.12887766, 0.66922025, 0.85091528, 0.23066698, 0.70616213,
       0.508095  , 0.12020842, 0.5101593 , 0.78065254, 0.22139282,
       0.4921701 , 0.53636287, 0.70397175, 0.40880129, 0.61584424,
       0.62120559, 0.92028533, 0.68577086, 0.37427716, 0.73424411,
       0.54242509, 0.23066698, 0.38139903, 0.12181972, 0.22932561,
       0.29061296, 0.2227012 , 0.2983098 , 0.89102222, 0.22401508,
       0.1486461 , 0.22008996, 0.23201383, 0.46654453, 0.61521879,
       0.67232985, 0.49279307, 0.05464507, 0.47152603, 0.22008996,
       0.48522271, 0.25883693, 0.11940969, 0.12718653, 0.77673

In [111]:
# saving model 
import pickle

In [112]:
with open("model", "wb") as file:
    pickle.dump(reg, file)

In [113]:
with open("scaler", "wb") as file:
    pickle.dump(absenteeism_scale, file)