# Absenteeism Project - Machine Learning Model Notebook

### Importing required libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split

### Viewing preprocessed absenteeism dataset

In [2]:
# Load the already-preprocessed absenteeism records, then preview the first five rows.
data_preprocessed = pd.read_csv(filepath_or_buffer='Absenteeism_preprocessed.csv')
data_preprocessed.head(n=5)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


### Preprocessed dataset manipulation

In [3]:
# Median absence duration in hours; the cut-off used to build the targets in the next cell equals this value.
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [4]:
# Binary target: 1 when an absence lasts longer than the median number of hours, 0 otherwise.
# The threshold is computed from the column itself rather than repeating the literal 3,
# so the labels stay consistent if the underlying data (and therefore its median) changes.
absence_hours = data_preprocessed['Absenteeism Time in Hours']
targets = np.where(absence_hours > absence_hours.median(), 1, 0)

In [5]:
# Attach the binary labels as a new last column; this mutates data_preprocessed in place.
# The feature/target split further down relies on this column being the last one.
data_preprocessed['Excessive Absenteeism'] = targets

In [6]:
# Share of observations labelled 1 — a quick class-balance check
# (the mean of a 0/1 array is the fraction of ones).
targets.mean()

0.45571428571428574

In [7]:
# Remove the raw hours column (it has been replaced by the binary target) together with
# three features that are excluded from the model.
# NOTE(review): the reason for excluding those three features is not visible here — confirm.
# `columns=` already names the axis, so no separate `axis` argument is needed.
data_with_targets = data_preprocessed.drop(
    columns=[
        'Absenteeism Time in Hours',
        'Distance to Work',
        'Day of the Week',
        'Daily Work Load Average',
    ]
)

In [8]:
# Sanity check on (rows, columns) after the drop above.
data_with_targets.shape

(700, 12)

### Splitting the data into inputs (X) and target (y)

In [9]:
# The label is the 'Excessive Absenteeism' column (appended last above);
# every remaining column is a model input.
y = data_with_targets['Excessive Absenteeism']
X = data_with_targets.drop(columns=['Excessive Absenteeism'])

### Standardisation of the data

In [10]:
class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardise a chosen subset of DataFrame columns and pass the rest through.

    A ``StandardScaler`` is applied to ``columns`` only; all other columns are
    returned unchanged. The output keeps the input's column order and row index.

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardise.
    copy, with_mean, with_std : bool, default=True
        Forwarded to the wrapped ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler; fitted by ``fit``.
    mean_ : pandas.Series or None
        Per-column mean of the scaled columns, set by ``fit``.
    var_ : pandas.Series or None
        Per-column population variance (``ddof=0``) of the scaled columns, set by ``fit``.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Every constructor argument is stored under its own name so that
        # BaseEstimator.get_params / repr / clone can read it back.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Learn the scaling statistics of ``self.columns`` from ``X``.

        Returns
        -------
        self : CustomScaler
            Returned so calls can be chained and ``fit_transform`` works.
        """
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Explicit axis/ddof: np.mean/np.var on a DataFrame pass axis=None, whose meaning
        # differs between pandas versions (and triggers a warning for var).
        self.mean_ = selected.mean(axis=0)
        self.var_ = selected.var(axis=0, ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with ``self.columns`` standardised and the other columns untouched.

        ``y`` is accepted for API compatibility and ignored; ``copy`` is forwarded
        to the wrapped scaler's ``transform``.
        """
        init_col_order = X.columns
        # Reuse X's index: the scaler returns a bare array, and a fresh RangeIndex would
        # make the concat below misalign rows whenever X is not indexed 0..n-1.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns], copy=copy),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [11]:
# List the feature names to decide which ones should be standardised.
X.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pet'], dtype=object)

In [12]:
# Features left out of standardisation.
# NOTE(review): in the previews these columns only hold 0/1 values, so they look like
# indicator variables — confirm against the preprocessing step.
columns_to_omit = [
    'Reason_1',
    'Reason_2',
    'Reason_3',
    'Reason_4',
    'Education',
]

In [13]:
# Every feature that is not explicitly excluded gets standardised; original column order is kept.
columns_to_scale = [name for name in X.columns if name not in columns_to_omit]

In [14]:
# Scaler that standardises only the selected columns and leaves the others as they are.
absenteeism_scaler = CustomScaler(columns=columns_to_scale)

In [15]:
# Learn the scaling statistics for the selected columns.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test split further
# down, so the held-out rows influence the scaling statistics (data leakage). Consider
# splitting first and fitting on the training rows only.
absenteeism_scaler.fit(X)

  return var(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)


In [16]:
# Apply the fitted scaler; columns not listed in columns_to_scale are passed through unchanged.
X_scaled = absenteeism_scaler.transform(X)

In [17]:
# Preview the result of scaling.
X_scaled.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pet
0,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.182726,-1.574681,2.130803,1.002633,0,-0.01928,-0.58969
2,0,0,0,1,0.182726,-0.654143,0.24831,1.002633,0,-0.91903,-0.58969
3,1,0,0,0,0.182726,0.854936,0.405184,-0.643782,0,0.880469,-0.58969
4,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487


### Splitting the dataset into training and test data

In [18]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=1,
)

In [19]:
# Report the dimensions of both partitions.
for label, frame in (('Train shape:', X_train), ('Test shape:', X_test)):
    print(label, frame.shape)

Train shape: (560, 11)
Test shape: (140, 11)


### Create a Logistic Regression Model

In [20]:
# Logistic regression with the library's default settings, trained on the training partition.
reg = LogisticRegression()
reg.fit(X=X_train, y=y_train)

#### Checking the model's accuracy on the training dataset

In [21]:
# Accuracy on the rows the model was trained on (compare with the test accuracy further down).
reg.score(X_train, y_train)

0.7732142857142857

#### Intercept of Logistic Regression (bias)

In [22]:
# Fitted bias term, returned as a one-element array.
reg.intercept_

array([-1.69309236])

#### Coefficients of Logistic Regression (weights)

In [23]:
# Fitted weights: a single row with one value per input column, in the column order of X.
reg.coef_

array([[ 2.80286002,  0.99463187,  3.09750083,  0.84501681,  0.10626722,
         0.57134194, -0.25139403,  0.28515017, -0.04535719,  0.46516189,
        -0.29218653]])

In [24]:
# Column names of the inputs, used to label the coefficients in the summary table.
feature_names = X.columns.values

#### Create a summary table to view the intercept and coefficients

In [25]:
# One row per feature: its name next to its fitted weight.
summary_table = pd.DataFrame({'Feature Names': feature_names})
# coef_ is a single row; transposing it gives one value per table row.
summary_table['Coefficients'] = np.transpose(reg.coef_)
summary_table

Unnamed: 0,Feature Names,Coefficients
0,Reason_1,2.80286
1,Reason_2,0.994632
2,Reason_3,3.097501
3,Reason_4,0.845017
4,Month Value,0.106267
5,Transportation Expense,0.571342
6,Age,-0.251394
7,Body Mass Index,0.28515
8,Education,-0.045357
9,Children,0.465162


In [26]:
# Put the intercept in row 0, above the per-feature coefficients (rows 1..n).
# The table is rebuilt from the feature rows only, so re-running this cell cannot shift the
# index again or add a second 'Intercept' row.
feature_rows = summary_table.loc[
    summary_table['Feature Names'] != 'Intercept',
    ['Feature Names', 'Coefficients'],
]
intercept_row = pd.DataFrame(
    {'Feature Names': ['Intercept'], 'Coefficients': [reg.intercept_[0]]}
)
summary_table = pd.concat([intercept_row, feature_rows], ignore_index=True)
summary_table

Unnamed: 0,Feature Names,Coefficients
0,Intercept,-1.693092
1,Reason_1,2.80286
2,Reason_2,0.994632
3,Reason_3,3.097501
4,Reason_4,0.845017
5,Month Value,0.106267
6,Transportation Expense,0.571342
7,Age,-0.251394
8,Body Mass Index,0.28515
9,Education,-0.045357


#### Compute the odds ratio for the intercept and each coefficient

In [27]:
# Exponentiate each coefficient (log-odds scale) to get a multiplicative effect on the odds.
summary_table['odds_ratio'] = np.exp(summary_table['Coefficients'])
summary_table

Unnamed: 0,Feature Names,Coefficients,odds_ratio
0,Intercept,-1.693092,0.18395
1,Reason_1,2.80286,16.491746
2,Reason_2,0.994632,2.703729
3,Reason_3,3.097501,22.142544
4,Reason_4,0.845017,2.328017
5,Month Value,0.106267,1.112119
6,Transportation Expense,0.571342,1.770642
7,Age,-0.251394,0.777716
8,Body Mass Index,0.28515,1.329962
9,Education,-0.045357,0.955656


In [28]:
# Rank rows from the largest odds ratio to the smallest (display only; summary_table itself is not modified).
summary_table.sort_values('odds_ratio', ascending=False)

Unnamed: 0,Feature Names,Coefficients,odds_ratio
3,Reason_3,3.097501,22.142544
1,Reason_1,2.80286,16.491746
2,Reason_2,0.994632,2.703729
4,Reason_4,0.845017,2.328017
6,Transportation Expense,0.571342,1.770642
10,Children,0.465162,1.592272
8,Body Mass Index,0.28515,1.329962
5,Month Value,0.106267,1.112119
9,Education,-0.045357,0.955656
7,Age,-0.251394,0.777716


### Testing the model and comparing its accuracy on the training and test data

In [29]:
# Accuracy on the held-out rows; compare with the training accuracy above to gauge overfitting.
reg.score(X_test, y_test)

0.75

In [30]:
# Per-row class probabilities for the test set, one column per class.
# NOTE(review): this variable shares its name with the estimator method it stores the result of.
predict_proba = reg.predict_proba(X_test)
# Second column only — presumably the probability of class 1 (excessive absenteeism); verify via reg.classes_.
predict_proba[:,1]

array([0.73698322, 0.24658015, 0.24735738, 0.14943394, 0.43914455,
       0.12431729, 0.66978766, 0.8465907 , 0.23548107, 0.86586826,
       0.29437846, 0.86255739, 0.38648354, 0.67063373, 0.64619699,
       0.15739087, 0.23080392, 0.61830484, 0.13830111, 0.20724059,
       0.29437846, 0.48905875, 0.55448172, 0.35153598, 0.48168735,
       0.16659986, 0.28811572, 0.240987  , 0.12945352, 0.37255451,
       0.80279225, 0.43043521, 0.5017523 , 0.48625241, 0.78878055,
       0.37967378, 0.32439299, 0.200076  , 0.22020742, 0.42301318,
       0.25304908, 0.77237948, 0.2688731 , 0.85725919, 0.56185401,
       0.64935467, 0.74892491, 0.63923001, 0.79340095, 0.20996461,
       0.23548107, 0.24735738, 0.23623429, 0.81693607, 0.13814236,
       0.200076  , 0.12780653, 0.23548107, 0.25225996, 0.23623429,
       0.26295077, 0.64619699, 0.63541604, 0.85725919, 0.47678349,
       0.74120508, 0.72021722, 0.36080821, 0.30052411, 0.13456957,
       0.22546154, 0.28811572, 0.19526431, 0.87119601, 0.66671

### Saving the Model

In [31]:
import pickle 

In [32]:
# Persist the trained classifier to a binary file named 'model'.
with open('model', 'wb') as model_file:
    pickle.dump(reg, model_file)

In [33]:
# Persist the fitted scaler to a binary file named 'scaler'.
# Pickle stores a reference to the class, not its code, so whatever loads this file
# must be able to import a matching CustomScaler definition.
with open('scaler', 'wb') as scaler_file:
    pickle.dump(absenteeism_scaler, scaler_file)