## Importing the Data

In [2]:
#Importing libraries

import pandas as pd
import numpy as np
import calendar
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import pickle

In [3]:
# Create a DataFrame from the CSV file.
file = 'Absenteeism_data.csv'
raw_data = pd.read_csv(filepath_or_buffer=file)

# Preview the first five rows.
raw_data.head(5)

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,11,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,36,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,3,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,11,23,23/07/2015,289,36,33,239.554,30,1,2,1,2


In [4]:
# Work on an independent (deep) copy so the raw data stays untouched
# and can always be referred back to.
df = raw_data.copy(deep=True)

In [5]:
#Inspecting the dataset

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID                         700 non-null    int64  
 1   Reason for Absence         700 non-null    int64  
 2   Date                       700 non-null    object 
 3   Transportation Expense     700 non-null    int64  
 4   Distance to Work           700 non-null    int64  
 5   Age                        700 non-null    int64  
 6   Daily Work Load Average    700 non-null    float64
 7   Body Mass Index            700 non-null    int64  
 8   Education                  700 non-null    int64  
 9   Children                   700 non-null    int64  
 10  Pets                       700 non-null    int64  
 11  Absenteeism Time in Hours  700 non-null    int64  
dtypes: float64(1), int64(10), object(1)
memory usage: 65.8+ KB


## Data Preprocessing

In [6]:
# The 'ID' column only identifies the employee, so it is removed.
df = df.drop(columns=['ID'])

In [7]:
# Another approach worth noting: the pandas function get_dummies.
# It produces one indicator column per distinct reason code, so each row
# shows which category that observation belongs to.
dummy_df = pd.get_dummies(data=df['Reason for Absence'])
dummy_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,21,22,23,24,25,26,27,28
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [8]:
# Each row must fall in exactly one reason category, i.e. its dummies sum to 1.
# 'Check' itself is excluded from the row sum so that re-running this cell
# does not count the previous result.
reason_dummy_columns = [col for col in dummy_df.columns if col != 'Check']
dummy_df['Check'] = dummy_df[reason_dummy_columns].sum(axis=1)

# A grand total of 700 alone would not catch a row of 0 offset by a row of 2,
# so verify the per-row condition explicitly.
assert dummy_df['Check'].eq(1).all(), 'every row must belong to exactly one reason category'

# Furthermore, the sum of the check column should equal the number of rows (700)
dummy_df['Check'].sum(axis=0)

700

In [9]:
# Collapse the individual reason codes into 4 indicator columns.
# Each entry maps the new column to the inclusive range of codes it covers:
#   Reason 1 -> codes 1 to 14
#   Reason 2 -> codes 15 to 17
#   Reason 3 -> codes 18 to 21
#   Reason 4 -> codes 22 to 28
# Code 0 belongs to no group, so all four indicators are 0 for it.
reason_groups = {
    'Reason 1': (1, 14),
    'Reason 2': (15, 17),
    'Reason 3': (18, 21),
    'Reason 4': (22, 28),
}

for column, (low, high) in reason_groups.items():
    df[column] = np.where(df['Reason for Absence'].between(low, high), 1, 0)

df.head()

Unnamed: 0,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Reason 1,Reason 2,Reason 3,Reason 4
0,26,07/07/2015,289,36,33,239.554,30,1,2,1,4,0,0,0,1
1,0,14/07/2015,118,13,50,239.554,31,1,1,0,0,0,0,0,0
2,23,15/07/2015,179,51,38,239.554,31,1,0,0,2,0,0,0,1
3,7,16/07/2015,279,5,39,239.554,24,1,2,0,4,1,0,0,0
4,23,23/07/2015,289,36,33,239.554,30,1,2,1,2,0,0,0,1


In [10]:
# Dropping 'Reason for Absence' column now that the grouped reason columns exist.
# The column is named through the `columns=` keyword: passing the axis
# positionally (drop(label, 1)) is deprecated and fails on pandas >= 2.0.
df = df.drop(columns=['Reason for Absence'])

In [11]:
# How to change the pandas display settings so that the output shows all rows of the dataframe

#pd.options.display.max_rows = None

In [12]:
# 'Date' is read in as text (day/month/year). It has to become a datetime
# column before the month and weekday can be extracted from it.
df = df.assign(Date=pd.to_datetime(df['Date'], format='%d/%m/%Y'))
df.dtypes

Date                         datetime64[ns]
Transportation Expense                int64
Distance to Work                      int64
Age                                   int64
Daily Work Load Average             float64
Body Mass Index                       int64
Education                             int64
Children                              int64
Pets                                  int64
Absenteeism Time in Hours             int64
Reason 1                              int32
Reason 2                              int32
Reason 3                              int32
Reason 4                              int32
dtype: object

In [13]:
# Derive the month number and the weekday number from the parsed date
# (in the preview below, Tuesday 2015-07-07 shows as weekday 1).
# The numeric codes are kept; calendar.month_name / calendar.day_name could
# translate them to labels if readable names were wanted.
df['Month'] = df['Date'].dt.month
df['Weekday'] = df['Date'].dt.weekday

df[['Date', 'Month', 'Weekday']].head()

Unnamed: 0,Date,Month,Weekday
0,2015-07-07,7,1
1,2015-07-14,7,1
2,2015-07-15,7,2
3,2015-07-16,7,3
4,2015-07-23,7,3


In [14]:
# Sanity check of the extracted values: print how often each one occurs.
for column in ('Weekday', 'Month'):
    print(df[column].value_counts())

2    154
0    141
1    141
4    132
3    119
6      9
5      4
Name: Weekday, dtype: int64
3     87
2     72
10    71
11    63
5     58
7     55
8     54
4     53
9     53
1     50
12    49
6     35
Name: Month, dtype: int64


In [15]:
# Removing the 'Date' column now that 'Month' and 'Weekday' have been extracted.
# The column is named through the `columns=` keyword: passing the axis
# positionally (drop(label, 1)) is deprecated and fails on pandas >= 2.0.
df = df.drop(columns=['Date'])

In [16]:
# Turning the 'Education' column into binary: level 1 -> 0, any other level -> 1.
# The values are read from the untouched raw_data copy rather than from df,
# so re-running this cell cannot flip an already-binarised column
# (0 -> 1 and 1 -> 0). df has the same rows as raw_data in the same order;
# only columns have been added or dropped so far.
df['Education'] = np.where(raw_data['Education'] == 1, 0, 1)
df['Education'].value_counts()

0    583
1    117
Name: Education, dtype: int64

In [16]:
# NOTE: you can also use the .map() method to reassign values in a pandas dataframe column; the mapping is passed as a dictionary

# df['Education'] = df['Education'].map({1:0, 2:1, 3:1, 4:1})

# The format of the dictionary is saying to replace the first number (left of colon) with the second number (right of colon)

In [17]:
# Put the columns in their final order: reason indicators first,
# then the date features, the remaining inputs, and the hours column last.
reordered_columns = [
    'Reason 1', 'Reason 2', 'Reason 3', 'Reason 4',
    'Month', 'Weekday',
    'Transportation Expense', 'Distance to Work', 'Age',
    'Daily Work Load Average', 'Body Mass Index', 'Education',
    'Children', 'Pets',
    'Absenteeism Time in Hours',
]

df = df.loc[:, reordered_columns]
df.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month,Weekday,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


In [18]:
# Saving preprocessed data to CSV file

#df.to_csv('Preprocessed_DF.csv')

## Machine Learning

They say machine learning is 90% preprocessing and 10% modelling

We will take a logistic regression approach to analysing the dataset on absenteeism. The variables we will look at are:

Reason for absence

Daily work load average

Children

Pets

Distance from work

### Creating Targets

In [19]:
df.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month,Weekday,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


In [20]:
# The target is built from the median of 'Absenteeism Time in Hours'.
# Observations are split into two classes:
#   1 = 'Excessively absent' -> strictly more hours than the median
#   0 = 'Moderately absent'  -> hours at or below the median
# The new column 'Excessive Absenteeism' stores that class label.

median = df['Absenteeism Time in Hours'].median()
df['Excessive Absenteeism'] = np.where(df['Absenteeism Time in Hours'].gt(median), 1, 0)

In [21]:
# The raw hours column is no longer needed once 'Excessive Absenteeism' exists.
df = df.drop(columns=['Absenteeism Time in Hours'])

# NOTE added after completing standardisation and inspecting the coefficients:
# variables judged unimportant are removed to simplify the model, namely
# 'Weekday', 'Daily Work Load Average' and 'Distance to Work'.
df = df.drop(columns=['Weekday', 'Daily Work Load Average', 'Distance to Work'])

### Selecting Inputs for the Regression

In [22]:
# Selecting all columns excluding the target 'Excessive Absenteeism'.
# The target is excluded by name rather than by position (df.iloc[:, :-1]),
# so the inputs stay correct even if the target is not the last column.
unscaled_inputs = df.drop(columns=['Excessive Absenteeism'])

In [23]:
unscaled_inputs.columns.values

array(['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4', 'Month',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [24]:
# The Standard Scaler will calculate the mean and standard deviation
# It will then subtract the mean and divide by the standard deviation

# scaler = StandardScaler()

# THERE IS A PROBLEM WITH USING STANDARD SCALER ON ALL VARIABLES
# A dummy variable is a variable that only possesses the value of 1 or 0
# to indicate the absence or presence of a categorical variable
# We made reason 1,2,3,4 and education column dummy variables
# When using standard scaler we scaled all the input variables including the dummies
# We do not want to standardise the dummy variables
# Hence we must make a custom scaler that omits the dummies from standardisation

from sklearn.base import BaseEstimator, TransformerMixin

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardise only the selected columns of a DataFrame.

    The columns named in ``columns`` are passed through a ``StandardScaler``;
    every other column (e.g. the dummy variables) is returned unchanged.

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardise.

    Attributes
    ----------
    scaler : StandardScaler
        The underlying scaler, fitted on ``columns`` only.
    mean_ : pandas.Series
        Per-column mean of the fitted columns (set by ``fit``).
    var_ : pandas.Series
        Per-column population variance (ddof=0) of the fitted columns
        (set by ``fit``).
    """

    def __init__(self, columns):
        self.scaler = StandardScaler()
        self.columns = columns

    def fit(self, X, y=None):
        """Fit the underlying scaler on ``X[self.columns]`` and return ``self``."""
        self.scaler.fit(X[self.columns], y)
        # Explicit per-column statistics: calling np.mean / np.var on a
        # DataFrame collapses to a single scalar on recent pandas versions.
        self.mean_ = X[self.columns].mean(axis=0)
        self.var_ = X[self.columns].var(axis=0, ddof=0)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``self.columns`` standardised.

        The original column order and row index of ``X`` are preserved.
        """
        init_col_order = X.columns
        # Reuse X's index. Without it the scaled frame gets a fresh 0..n-1
        # index and pd.concat(axis=1) aligns on labels, which misaligns rows
        # and introduces NaNs for any input whose index is not the default
        # range (e.g. a shuffled subset produced by train_test_split).
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]


In [25]:
columns_to_scale = ['Month', 'Transportation Expense', 'Age', 'Body Mass Index', 'Children', 'Pets']

In [26]:
# Calculating the mean and standard deviation of the data, preparing for scaling
# the '.fit' method trains the scaler
# the '.transform' method applies the scaler to the data

# CANNOT USE STANDARD SCALER AS THE DUMMY VARIABLES WILL BE SCALED
#scaler.fit(unscaled_inputs)

scaler = CustomScaler(columns_to_scale)

In [27]:
scaler.fit(unscaled_inputs)

CustomScaler(columns=['Month', 'Transportation Expense', 'Age',
                      'Body Mass Index', 'Children', 'Pets'])

In [28]:
# Now we need to scale the data by applying the mean and standard deviation calculation

scaled_inputs = scaler.transform(unscaled_inputs)

In [29]:
scaled_inputs.head()

# Here we can see all the variables have been standardised
# However the dummy variables remain untouched :)

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.182726,-1.574681,2.130803,1.002633,0,-0.01928,-0.58969
2,0,0,0,1,0.182726,-0.654143,0.24831,1.002633,0,-0.91903,-0.58969
3,1,0,0,0,0.182726,0.854936,0.405184,-0.643782,0,0.880469,-0.58969
4,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487


In [30]:
scaled_inputs.shape

(700, 11)

### Addressing Overfitting and Underfitting with train test split

In [31]:
# Hold out part of the scaled data (80% train / 20% test) so the model can be
# evaluated on observations it was not trained on, to check for overfitting.
# NOTE(review): the scaler above was fitted on all rows before this split,
# so the test rows contributed to the scaling statistics.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    df['Excessive Absenteeism'],
    train_size=0.8,
    random_state=7,
)

### Training Logistic Regression Model with sklearn

In [32]:
# Creating a logistic regression object

reg = LogisticRegression()

In [33]:
# Training the model using the specified data

reg.fit(x_train, y_train)

LogisticRegression()

In [34]:
reg.score(x_train, y_train)

# This means roughly 78% of the predictions on the training data match the targets

0.7785714285714286

### Manually Checking the Accuracy

In [35]:
model_predictions = reg.predict(x_train)
model_predictions.shape

(560,)

In [36]:
# Element-wise check of which training predictions equal the true targets.
comparison = model_predictions == y_train

n_correct = comparison.sum()
n_total = len(comparison)
print(n_correct)
print(n_total)

# Accuracy = correct predictions / all predictions
n_correct / n_total

436
560


0.7785714285714286

### Finding the coefficients and the intercept of the model

In [37]:
print(reg.coef_)
print(reg.intercept_)

[[ 3.00398751  0.59137145  3.12755008  0.92234023  0.06678678  0.61619444
  -0.17884299  0.32343708  0.11237994  0.40757103 -0.3091114 ]]
[-1.73223505]


In [38]:
# Summary table of the fitted model: the intercept in row 0, followed by one
# row per input feature with its coefficient.

feature_name = unscaled_inputs.columns.transpose()
coefficients = reg.coef_

# reg.coef_ has shape (1, n_features); its single row holds the weights.
summary_df = pd.DataFrame({
    'Feature': ['Intercept', *feature_name],
    'Coefficient': [reg.intercept_[0], *coefficients[0]],
})

summary_df

# The closer a weight is to zero, the less that feature contributes to the prediction

Unnamed: 0,Feature,Coefficient
0,Intercept,-1.732235
1,Reason 1,3.003988
2,Reason 2,0.591371
3,Reason 3,3.12755
4,Reason 4,0.92234
5,Month,0.066787
6,Transportation Expense,0.616194
7,Age,-0.178843
8,Body Mass Index,0.323437
9,Education,0.11238


In [39]:
# The coefficients of a logistic regression are log-odds; the exponential of a coefficient is the odds ratio
# Make a column in the summary table to hold the odds ratios

summary_df['Odds Ratio'] = np.exp(summary_df['Coefficient'])
summary_df

Unnamed: 0,Feature,Coefficient,Odds Ratio
0,Intercept,-1.732235,0.176889
1,Reason 1,3.003988,20.165788
2,Reason 2,0.591371,1.806464
3,Reason 3,3.12755,22.818009
4,Reason 4,0.92234,2.51517
5,Month,0.066787,1.069068
6,Transportation Expense,0.616194,1.851867
7,Age,-0.178843,0.836237
8,Body Mass Index,0.323437,1.381869
9,Education,0.11238,1.118938


In [40]:
summary_df.sort_values('Odds Ratio', ascending=False)

# If a coefficient is close to zero, or its odds ratio is around 1, the feature is not particularly important

Unnamed: 0,Feature,Coefficient,Odds Ratio
3,Reason 3,3.12755,22.818009
1,Reason 1,3.003988,20.165788
4,Reason 4,0.92234,2.51517
6,Transportation Expense,0.616194,1.851867
2,Reason 2,0.591371,1.806464
10,Children,0.407571,1.503162
8,Body Mass Index,0.323437,1.381869
9,Education,0.11238,1.118938
5,Month,0.066787,1.069068
7,Age,-0.178843,0.836237


In [43]:
# We can remove certain variables to simplify the model
# If the p-value for a coefficient is greater than 5% then it is removed
# WE WILL GO BACK TO 'CREATING TARGETS' SECTION TO OMIT THESE CERTAIN COLUMNS

### Testing the Model

In [41]:
# We must test the model with data it has not seen before
# this will be done using the test datasets from the train test split function

reg.score(x_test,y_test)

# This score tells us the model predicts about 74% of the unseen test cases correctly

0.7428571428571429

In [46]:
# We can also predict the probability of the outcome being 0 or 1 for each observation

predicted_probability = reg.predict_proba(x_test)
predicted_probability[0:10]

array([[0.77003724, 0.22996276],
       [0.8802694 , 0.1197306 ],
       [0.46409482, 0.53590518],
       [0.30929935, 0.69070065],
       [0.22130703, 0.77869297],
       [0.74555486, 0.25444514],
       [0.75404173, 0.24595827],
       [0.64064383, 0.35935617],
       [0.67210105, 0.32789895],
       [0.17222137, 0.82777863]])

In [47]:
# The first column is the predicted probability of the observation being 0
# The second column is the predicted probability of the observation being 1

predicted_probability.shape

# Why are there 140 rows? The test set holds the 20% of the 700 observations left out by train_size = 0.8

(140, 2)

### Saving the Model

 We need to save all the machine learning information such as:
 
 the type of regression
 
 the relevant coefficients
 
 the intercept
 
 the random state used in the train test split
 
 ...etc
 
 This can be done by looking at the LogisticRegression object earlier on in the code

In [48]:
# In order to save the model we must import the python library 'pickle'

import pickle

In [49]:
# Serialise the fitted logistic regression to the file 'model'.
with open('model', mode='wb') as file:
    pickle.dump(reg, file)

In [50]:
# The fitted scaler is saved too, so that new data can be given
# the same preprocessing as the training data.
with open('scaler', mode='wb') as file:
    pickle.dump(scaler, file)

Both these stages are important because when the model is used on new data, the same preprocessing is conducted.

The second step of model deployment is to create a way to load the saved model and make predictions

This can be done by storing the code in a module. This will allow us to reuse it with different datasets without any trouble.