### The aim of this model is to predict if an employee will be absent for more than 3 hours during work hours

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [2]:
processed_data = pd.read_csv('df_preprocessed.csv')


In [3]:
target = processed_data['Absenteeism']

### Feature Scaling

In [4]:
processed_data.columns.values

array(['Unnamed: 0', 'reason_type1', 'reason_type2', 'reason_type3',
       'reason_type4', 'Month value', 'Day of the week',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism'], dtype=object)

In [26]:
# Backward elimination...
# column Day of the week','Daily Work Load Average','Distance to Work' ,
# have very low coefficient. that is they have less effect on the Absenteeism
# so it will be dropped


In [6]:
unscaled_features = processed_data.drop(['Unnamed: 0', 'Absenteeism','Day of the week',
                                            'Daily Work Load Average','Distance to Work' ], axis = 1) 

In [7]:
# scalling only numerical features

feature_scaler = ColumnTransformer(
[('scaler', StandardScaler(),[ 'Month value',
       'Transportation Expense', 'Age',
        'Body Mass Index',
       'Children', 'Pets'])], remainder='passthrough'
)

scaled_input = feature_scaler.fit_transform(unscaled_features)

### Train-Test split

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
x_train,x_test,y_train,y_test = train_test_split(scaled_input,target, train_size=0.8,random_state=20)

### Logistic Model

In [10]:
from sklearn.linear_model import LogisticRegression

In [11]:
reg = LogisticRegression()

In [12]:
reg.fit(x_train,y_train)

LogisticRegression()

In [25]:
train_accuracy = reg.score(x_train,y_train)
train_accuracy

0.7732142857142857

### Summary Table

In [14]:
# Table with weight and bias of features

In [16]:
summary = pd.DataFrame({'features': unscaled_features.columns.values})

In [17]:
summary['Coefficient'] = np.transpose(reg.coef_)

In [18]:
summary.index = summary.index + 1
summary.loc[0] = ['intercept', reg.intercept_[0]]


In [19]:
summary['Log odds'] = np.exp(summary['Coefficient'])

In [20]:
summary.sort_index()

Unnamed: 0,features,Coefficient,Log odds
0,intercept,-1.647455,0.192539
1,reason_type1,0.15893,1.172256
2,reason_type2,0.605284,1.831773
3,reason_type3,-0.169891,0.843757
4,reason_type4,0.279811,1.32288
5,Month value,0.348262,1.416604
6,Transportation Expense,-0.277396,0.757754
7,Age,2.800197,16.447892
8,Body Mass Index,0.951884,2.590585
9,Education,3.115553,22.545903


### Testing the model

In [21]:
test_accuracy = reg.score(x_test,y_test)
test_accuracy

0.75

### Saving the model

In [22]:
import pickle

In [23]:
with open('model', 'wb') as file:
    pickle.dump(reg, file)

In [24]:
with open('scaler','wb') as file:
    pickle.dump(feature_scaler, file)