# Model Building

In [76]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier 
from sklearn.svm import SVC
from sklearn import metrics

In [20]:
df = pd.read_csv('Absenteeism_preprocessed.csv')

In [21]:
df.head()

Unnamed: 0.1,Unnamed: 0,reason_1,reason_2,reason_3,reason_4,month,Week_of_the_Day,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


In [22]:
#deriving target column
#checking median to classify the targets
df['Absenteeism Time in Hours'].median()

3.0

In [23]:
target = np.where(df['Absenteeism Time in Hours'] > df['Absenteeism Time in Hours'].median(),1,0)

In [60]:
target

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [24]:
df['Absenteeism'] = target

In [25]:
df.head()

Unnamed: 0.1,Unnamed: 0,reason_1,reason_2,reason_3,reason_4,month,Week_of_the_Day,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Absenteeism
0,0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


In [27]:
target.sum() / target.shape[0]

0.45571428571428574

In [28]:
df = df.drop(['Absenteeism Time in Hours'], axis = 1)

In [29]:
df.head()

Unnamed: 0.1,Unnamed: 0,reason_1,reason_2,reason_3,reason_4,month,Week_of_the_Day,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism
0,0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0


### Creating a checkpoint

In [51]:
#Creating a checkpoint
df_target = df

In [52]:
df_target is df

True

In [53]:
#Creating input for model building
df_input = df_target.iloc[:,:-1]

### Standardizing the input dataframe

In [54]:
df_scaler = StandardScaler()

In [55]:
df_scaler.fit(df_input)

In [56]:
scaled_data = df_scaler.transform(df_input)

In [57]:
scaled_data

array([[-1.72957821, -0.57735027, -0.09298136, ..., -0.44798003,
         0.88046927,  0.26848661],
       [-1.72462949, -0.57735027, -0.09298136, ..., -0.44798003,
        -0.01928035, -0.58968976],
       [-1.71968077, -0.57735027, -0.09298136, ..., -0.44798003,
        -0.91902997, -0.58968976],
       ...,
       [ 1.71968077,  1.73205081, -0.09298136, ...,  2.23224237,
        -0.91902997, -0.58968976],
       [ 1.72462949, -0.57735027, -0.09298136, ...,  2.23224237,
        -0.91902997, -0.58968976],
       [ 1.72957821, -0.57735027, -0.09298136, ..., -0.44798003,
        -0.01928035,  0.26848661]])

In [58]:
scaled_data.shape

(700, 15)

### Test train split

In [65]:
x_train,x_test,y_train,y_test = train_test_split(scaled_data, target, test_size = 0.25, train_size = 0.75, random_state = 10)

In [66]:
print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)

(525, 15) (175, 15) (525,) (175,)


## Building Model

In [80]:
#Logistic Regression model
lor = LogisticRegression()

In [81]:
lor.fit(x_train,y_train)

In [82]:
lor.score(x_train,y_train)

0.780952380952381

In [83]:
 #Random forest model
rfc= RandomForestClassifier(n_estimators= 10, criterion="entropy")  
rfc.fit(x_train, y_train)  

In [84]:
rfc.score(x_train,y_train)

0.9885714285714285

In [88]:
#Support Vector classifier
svc = SVC()
svc.fit(x_train,y_train)

In [89]:
svc.score(x_train,y_train)

0.8247619047619048