In [1]:
import numpy as np
import pandas as pd

# Load the data

In [2]:
# Read the preprocessed absenteeism data from the CSV file and preview the first rows
csv_path = 'Absenteeism_preprocessed.csv'
data_preprocessed = pd.read_csv(csv_path)
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,0,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,0,7,3,289,36,33,239.554,30,0,2,1,2


# Create the targets

In [6]:
# Use the median of "Absenteeism Time in Hours" as the cut-off line:
# strictly above the median -> 1 (excessive absenteeism), otherwise -> 0 (normal)
absence_hours = data_preprocessed['Absenteeism Time in Hours']
median_hours = absence_hours.median()
targets = np.where(absence_hours > median_hours, 1, 0)

# Store the binary targets as a new column of data_preprocessed
data_preprocessed['Excessive Absentism'] = targets
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absentism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,0,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,0,7,3,289,36,33,239.554,30,0,2,1,2,0


In [11]:
# Remove the raw hours column (the binary target column was derived from it);
# drop() returns a new frame, so data_preprocessed itself is left unchanged
hours_column = 'Absenteeism Time in Hours'
data_with_targets = data_preprocessed.drop(columns=[hours_column])
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Excessive Absentism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,0,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,0,0,7,3,289,36,33,239.554,30,0,2,1,0


# Select the inputs

In [15]:
# Keep every column except the last one as the model inputs
# (the target column was appended last, so it is the one left out)
n_columns = data_with_targets.shape[1]
unscaled_inputs = data_with_targets.iloc[:, :n_columns - 1]
unscaled_inputs.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0
2,0,0,0,0,7,2,179,51,38,239.554,31,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0
4,0,0,0,0,7,3,289,36,33,239.554,30,0,2,1


# Standardise the inputs

In [22]:
from sklearn.preprocessing import StandardScaler

# Create the scaler and let it learn the scaling statistics of every input column.
# NOTE(review): the scaler is fitted on all rows, before the train/test split,
# so rows that later land in the test set also contribute to these statistics --
# confirm this is intended, or fit on the training rows only.
absenteeism_scaler = StandardScaler()
absenteeism_scaler.fit(X=unscaled_inputs)

StandardScaler()

In [24]:
# Standardise the unscaled inputs with the fitted scaler
scaled_inputs = absenteeism_scaler.transform(X=unscaled_inputs)
scaled_inputs

array([[-0.54212562, -0.0758098 , -0.34381807, ..., -0.44798003,
         0.88046927,  0.26848661],
       [-0.54212562, -0.0758098 , -0.34381807, ..., -0.44798003,
        -0.01928035, -0.58968976],
       [-0.54212562, -0.0758098 , -0.34381807, ..., -0.44798003,
        -0.91902997, -0.58968976],
       ...,
       [ 1.84459094, -0.0758098 , -0.34381807, ...,  2.23224237,
        -0.91902997, -0.58968976],
       [-0.54212562, -0.0758098 , -0.34381807, ...,  2.23224237,
        -0.91902997, -0.58968976],
       [-0.54212562, -0.0758098 , -0.34381807, ..., -0.44798003,
        -0.01928035,  0.26848661]])

# Split the dataset into train and test

In [26]:
from sklearn.model_selection import train_test_split  # used to split the data set

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
# The four resulting arrays are the train/test inputs followed by the train/test targets.
split_arrays = train_test_split(scaled_inputs, targets, test_size=0.2, random_state=20)
x_train, x_test, y_train, y_test = split_arrays

In [27]:
# Size of the training inputs as (rows, columns)
x_train.shape

(560, 14)

In [31]:
# Size of the test inputs as (rows, columns)
x_test.shape

(140, 14)

# Logistic Regression

In [33]:
# Bring in the logistic regression model from sklearn
from sklearn.linear_model import LogisticRegression

# Instantiate the model with its default settings
reg = LogisticRegression()

# Train the model on the training inputs and targets
reg.fit(X=x_train, y=y_train)

LogisticRegression()

In [35]:
# Accuracy of the fitted model on the same data it was trained on
reg.score(X=x_train, y=y_train)

0.7535714285714286

# Finding the intercept and the coefficients

In [37]:
reg.intercept_  # intercept (bias term) of the fitted logistic regression

array([-0.14350803])

In [38]:
# Get the weights: one coefficient per input column of the fitted model
reg.coef_

array([[ 0.80240677,  0.00259014,  0.92157509,  0.04746096,  0.22819116,
        -0.1319634 ,  0.53124685, -0.01778622, -0.06990101,  0.0209045 ,
         0.17599292, -0.00961032,  0.30841893, -0.37283611]])

In [47]:
# Names of the input columns, in the same order as the columns fed to the model
feature_name = unscaled_inputs.columns.to_numpy()

In [52]:
# Pair every feature name with the coefficient the model learned for it.
# reg.coef_ is a single-row 2-D array here, so it is flattened to one value per feature.
summary_table = pd.DataFrame({
    'Feature_name': feature_name,
    'Coefficient': reg.coef_.ravel(),
})
summary_table

Unnamed: 0,Feature_name,Coefficient
0,Reason_1,0.802407
1,Reason_2,0.00259
2,Reason_3,0.921575
3,Reason_4,0.047461
4,Month Value,0.228191
5,Day of the week,-0.131963
6,Transportation Expense,0.531247
7,Distance to Work,-0.017786
8,Age,-0.069901
9,Daily Work Load Average,0.020905
