In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

from  sklearn.model_selection import train_test_split

In [4]:
data_preprocessed = pd.read_csv('./Absenteeism_preprocessed1.csv')
data_preprocessed['Absenteeism Time in Hours'].median()


3.0

In [None]:
targets = np.where(data_preprocessed['Absenteeism Time in Hours']>3,1,0)



In [7]:
data_preprocessed['Excessive Absentees'] = targets

data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours','Day of the Week',
                                            'Daily Work Load Average','Distance to Work'],axis=1)

data_with_targets

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pet,Excessive Absentees
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,22,1,2,0,1
696,1,0,0,0,5,225,28,24,0,1,2,0
697,1,0,0,0,5,330,28,25,1,0,0,1
698,0,0,0,1,5,235,32,25,1,0,0,0


In [8]:
unscaled_inputs = data_with_targets.iloc[:,:-1]


#Start with model

In [9]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator,TransformerMixin): 
    
    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std


    def fit(self, X, y=None):
        self.scaler = StandardScaler(copy=self.copy, with_mean=self.with_mean, with_std=self.with_std)
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self

    def transform(self, X, y=None, copy=None):
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [10]:

columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4','Education']

In [11]:
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]

In [12]:
absenteeism_scaler = CustomScaler(columns_to_scale)
absenteeism_scaler.fit(unscaled_inputs)


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


In [13]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)
scaled_inputs.shape

(700, 11)

In [14]:
x_train,x_test,y_train,y_test =train_test_split(scaled_inputs,targets,train_size=0.8)


##REGRESSION MODEL

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
reg = LogisticRegression()


In [16]:
reg.fit(x_train,y_train)
reg.score(x_train,y_train)

0.7535714285714286

In [17]:
reg.coef_,reg.intercept_

(array([[ 2.48348965,  0.71172747,  2.98905209,  0.79295743,  0.08916246,
          0.5518094 , -0.18843429,  0.29547943,  0.11552566,  0.4680575 ,
         -0.24772328]]),
 array([-1.5032264]))

In [18]:
feature_name = unscaled_inputs.columns.values

summary_table = pd.DataFrame(columns=['Feature name'],data=feature_name)

summary_table['Coefficient'] =  np.transpose(reg.coef_)
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,2.48349
1,Reason_2,0.711727
2,Reason_3,2.989052
3,Reason_4,0.792957
4,Month Value,0.089162
5,Transportation Expense,0.551809
6,Age,-0.188434
7,Body Mass Index,0.295479
8,Education,0.115526
9,Children,0.468058


In [19]:
summary_table.index = summary_table.index + 1

summary_table.loc[0] = ['Intercept',reg.intercept_[0]]
summary_table = summary_table.sort_index() 

In [20]:
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,-1.503226,0.222411
1,Reason_1,2.48349,11.983008
2,Reason_2,0.711727,2.037508
3,Reason_3,2.989052,19.866842
4,Reason_4,0.792957,2.209922
5,Month Value,0.089162,1.093258
6,Transportation Expense,0.551809,1.736392
7,Age,-0.188434,0.828255
8,Body Mass Index,0.295479,1.34377
9,Education,0.115526,1.122463


#Save The model

In [21]:
import pickle

In [24]:
with open('model','wb') as file:
    pickle.dump(reg,file)

    
with open('scaler','wb') as file:
    pickle.dump(absenteeism_scaler, file)
    

In [23]:
reg.score(x_test,y_test)

0.8214285714285714