In [1]:
import numpy as np
import pandas as pd

# Load the data

In [2]:
data = pd.read_csv('Absenteeism_preprocessed.csv')

# Creating the targets

In [3]:
median = data['Absenteeism Time in Hours'].median()

In [4]:
data['Excessive Absenteeism'] = np.where(data['Absenteeism Time in Hours'] > median, 1, 0)

In [5]:
data_with_targets = data.drop(['Absenteeism Time in Hours', 'Day of the Week', 'Daily Work Load Average', 'Distance to Work'], axis=1)
data_with_targets = data_with_targets[['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Education', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index',
       'Children', 'Pets', 'Excessive Absenteeism']]

In [6]:
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Education,Month Value,Transportation Expense,Age,Body Mass Index,Children,Pets,Excessive Absenteeism
0,0,0,0,1,0,7,289,33,30,2,1,1
1,0,0,0,0,0,7,118,50,31,1,0,0
2,0,0,0,1,0,7,179,38,31,0,0,0
3,1,0,0,0,0,7,279,39,24,2,0,1
4,0,0,0,1,0,7,289,33,30,2,1,0


# Logistic Regression

In [7]:
columns_to_scale = data_with_targets.iloc[: , 5:-1].columns

inputs = data_with_targets[columns_to_scale]
targets = data_with_targets['Excessive Absenteeism']

In [8]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

In [9]:
scaler.fit(inputs)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [10]:
inputs_scaled = data_with_targets.iloc[:, :-1]
inputs_scaled[columns_to_scale] = scaler.transform(inputs)
inputs_scaled

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Education,Month Value,Transportation Expense,Age,Body Mass Index,Children,Pets
0,0,0,0,1,0,0.182726,1.005844,-0.536062,0.767431,0.880469,0.268487
1,0,0,0,0,0,0.182726,-1.574681,2.130803,1.002633,-0.019280,-0.589690
2,0,0,0,1,0,0.182726,-0.654143,0.248310,1.002633,-0.919030,-0.589690
3,1,0,0,0,0,0.182726,0.854936,0.405184,-0.643782,0.880469,-0.589690
4,0,0,0,1,0,0.182726,1.005844,-0.536062,0.767431,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,1,-0.388293,-0.654143,0.562059,-1.114186,0.880469,-0.589690
696,1,0,0,0,0,-0.388293,0.040034,-1.320435,-0.643782,-0.019280,1.126663
697,1,0,0,0,1,-0.388293,1.624567,-1.320435,-0.408580,-0.919030,-0.589690
698,0,0,0,1,1,-0.388293,0.190942,-0.692937,-0.408580,-0.919030,-0.589690


In [11]:
from sklearn.model_selection import train_test_split as tts

In [12]:
x_train, x_test, y_train, y_test = tts(inputs_scaled, targets, train_size=0.8, random_state=20)

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [14]:
reg = LogisticRegression()

In [15]:
reg.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
reg.score(x_train, y_train)

0.775

In [17]:
model_outputs = reg.predict(x_train)

In [18]:
reg.intercept_

array([-1.46547112])

In [19]:
reg.coef_

array([[ 2.62749942,  0.86338637,  2.96050661,  0.66390745, -0.23452541,
         0.15493732,  0.59979822, -0.17245127,  0.27568526,  0.34249662,
        -0.2775137 ]])

In [20]:
feature_names = inputs_scaled.columns.values

In [21]:
summary_table = pd.DataFrame(columns=['Feature Name'], data=feature_names)
summary_table['Coefficient'] = np.transpose(reg.coef_)
summary_table = pd.concat([pd.DataFrame(columns=['Feature Name', 'Coefficient'], data=[['Intercept', reg.intercept_[0]]]), summary_table], ignore_index=True)
summary_table

Unnamed: 0,Feature Name,Coefficient
0,Intercept,-1.465471
1,Reason_1,2.627499
2,Reason_2,0.863386
3,Reason_3,2.960507
4,Reason_4,0.663907
5,Education,-0.234525
6,Month Value,0.154937
7,Transportation Expense,0.599798
8,Age,-0.172451
9,Body Mass Index,0.275685


In [22]:
summary_table.set_index('Feature Name')

Unnamed: 0_level_0,Coefficient
Feature Name,Unnamed: 1_level_1
Intercept,-1.465471
Reason_1,2.627499
Reason_2,0.863386
Reason_3,2.960507
Reason_4,0.663907
Education,-0.234525
Month Value,0.154937
Transportation Expense,0.599798
Age,-0.172451
Body Mass Index,0.275685


In [23]:
summary_table['Odds Ratio'] = np.exp(summary_table.Coefficient)

In [24]:
summary_table.sort_values('Odds Ratio', ascending=False)

Unnamed: 0,Feature Name,Coefficient,Odds Ratio
3,Reason_3,2.960507,19.307751
1,Reason_1,2.627499,13.839121
2,Reason_2,0.863386,2.371177
4,Reason_4,0.663907,1.942367
7,Transportation Expense,0.599798,1.821751
10,Children,0.342497,1.40846
9,Body Mass Index,0.275685,1.317433
6,Month Value,0.154937,1.167585
8,Age,-0.172451,0.841599
5,Education,-0.234525,0.790946


# Testing 

In [25]:
reg.score(x_test, y_test)

0.75

# Saving the model

In [26]:
import pickle

In [27]:
with open('model', 'wb') as file:
    pickle.dump(reg, file)

In [28]:
with open('scaler', 'wb') as file:
    pickle.dump(scaler, file)