In [1]:
import numpy as np
import pandas as pd

# Load the data

In [2]:
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,0,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,0,7,3,289,36,33,239.554,30,0,2,1,2


# Create the targets

In [3]:
# Get the median of "Absenteeism Time in Hours" 
# and set it as the cuttoff-line between excessive absenteeism and normal absenteeism
targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > data_preprocessed['Absenteeism Time in Hours'].median(), 1,0)

# Add the excessive Absenteeism column to data preprocessed
data_preprocessed['Excessive Absentism'] = targets
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absentism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,0,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,0,7,3,289,36,33,239.554,30,0,2,1,2,0


In [4]:
# Drop the 'Absenteeism time in hours' column
data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours','Education'],axis=1)
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Children,Pets,Excessive Absentism
0,0,0,0,1,7,1,289,36,33,239.554,30,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,1,0,0
2,0,0,0,0,7,2,179,51,38,239.554,31,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,2,0,1
4,0,0,0,0,7,3,289,36,33,239.554,30,2,1,0


# Select the inputs

In [5]:
unscaled_inputs = data_with_targets.iloc[:, :-1]
unscaled_inputs.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Children,Pets
0,0,0,0,1,7,1,289,36,33,239.554,30,2,1
1,0,0,0,0,7,1,118,13,50,239.554,31,1,0
2,0,0,0,0,7,2,179,51,38,239.554,31,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,2,0
4,0,0,0,0,7,3,289,36,33,239.554,30,2,1


# Standardise the inputs

In [6]:
#from sklearn.preprocessing import StandardScaler

#absenteeism_scaler = StandardScaler()

#absenteeism_scaler.fit(unscaled_inputs)

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

# create the Custom Scaler class

class CustomScaler(BaseEstimator,TransformerMixin): 
    
    # init or what information we need to declare a CustomScaler object
    # and what is calculated/declared as we do
    
    def __init__(self,columns,copy=True,with_mean=True,with_std=True):
        
        # scaler is nothing but a Standard Scaler object
        self.scaler = StandardScaler(copy,with_mean,with_std)
        # with some columns 'twist'
        self.columns = columns
        self.mean_ = None
        self.var_ = None
        
    
    # the fit method, which, again based on StandardScale
    
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    # the transform method which does the actual scaling

    def transform(self, X, y=None, copy=None):
        
        # record the initial order of the columns
        init_col_order = X.columns
        
        # scale all features that you chose when creating the instance of the class
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        
        # declare a variable containing all information that was not scaled
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        
        # return a data frame which contains all scaled features and all 'not scaled' features
        # use the original order (that you recorded in the beginning)
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [8]:
unscaled_inputs.columns.values


array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Day of the week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Children',
       'Pets'], dtype=object)

In [9]:
# Choose the columns to scale
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4']

# create the columns to scale, based on the columns to omit
# use list comprehension to iterate over the list
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]
columns_to_scale

['Month Value',
 'Day of the week',
 'Transportation Expense',
 'Distance to Work',
 'Age',
 'Daily Work Load Average',
 'Body Mass Index',
 'Children',
 'Pets']

In [10]:
# Declare the scaler object
absenteeism_scaler = CustomScaler(columns_to_scale)

#fit the data in the scaler object
absenteeism_scaler.fit(unscaled_inputs)



CustomScaler(columns=['Month Value', 'Day of the week',
                      'Transportation Expense', 'Distance to Work', 'Age',
                      'Daily Work Load Average', 'Body Mass Index', 'Children',
                      'Pets'],
             copy=None, with_mean=None, with_std=None)

In [11]:
# Transform the scaled inputs
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Children,Pets
0,0,0,0,1,0.182726,-0.683704,1.005844,0.412816,-0.536062,-0.806331,0.767431,0.880469,0.268487
1,0,0,0,0,0.182726,-0.683704,-1.574681,-1.141882,2.130803,-0.806331,1.002633,-0.019280,-0.589690
2,0,0,0,0,0.182726,-0.007725,-0.654143,1.426749,0.248310,-0.806331,1.002633,-0.919030,-0.589690
3,1,0,0,0,0.182726,0.668253,0.854936,-1.682647,0.405184,-0.806331,-0.643782,0.880469,-0.589690
4,0,0,0,0,0.182726,0.668253,1.005844,0.412816,-0.536062,-0.806331,0.767431,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.388293,-0.007725,-0.654143,-0.533522,0.562059,-0.853789,-1.114186,0.880469,-0.589690
696,1,0,0,0,-0.388293,-0.007725,0.040034,-0.263140,-1.320435,-0.853789,-0.643782,-0.019280,1.126663
697,1,0,0,0,-0.388293,0.668253,1.624567,-0.939096,-1.320435,-0.853789,-0.408580,-0.919030,-0.589690
698,0,0,0,0,-0.388293,0.668253,0.190942,-0.939096,-0.692937,-0.853789,-0.408580,-0.919030,-0.589690


# Split the dataset into train and test

In [12]:
from sklearn.model_selection import train_test_split # import the train_test_split to spli the data set

#Declare 4 variables to store the train and test data
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, test_size = 0.2, random_state =20 )

In [13]:
x_train.shape

(560, 13)

In [14]:
x_test.shape

(140, 13)

# Logistic Regression

In [15]:
# import the logistic regression model from sklearn
from sklearn.linear_model import LogisticRegression

from sklearn import metrics

# Create the logistic regression object
reg = LogisticRegression()

# Fit the model with training data
reg.fit(x_train, y_train)

LogisticRegression()

In [16]:
# Assess the training accuracy of the model
reg.score(x_train, y_train)

0.7589285714285714

# Finding the intercept and the coefficients

In [17]:
reg.intercept_ # get the intercept

array([-0.82105103])

In [18]:
# Get the weigths
reg.coef_

array([[ 1.74342262, -0.04225537,  2.48931531, -0.00410181,  0.21113033,
        -0.13643367,  0.52054435, -0.00995161, -0.07015608,  0.02804595,
         0.17348033,  0.31034941, -0.35317961]])

In [19]:
feature_name = unscaled_inputs.columns.values

In [20]:
summary_table = pd.DataFrame(columns = ['Feature_name'], data = feature_name)
summary_table['Coefficient'] = np.transpose(reg.coef_)
summary_table

Unnamed: 0,Feature_name,Coefficient
0,Reason_1,1.743423
1,Reason_2,-0.042255
2,Reason_3,2.489315
3,Reason_4,-0.004102
4,Month Value,0.21113
5,Day of the week,-0.136434
6,Transportation Expense,0.520544
7,Distance to Work,-0.009952
8,Age,-0.070156
9,Daily Work Load Average,0.028046


In [21]:
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]

# Sort the dataframe by index
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature_name,Coefficient
0,Intercept,-0.821051
1,Reason_1,1.743423
2,Reason_2,-0.042255
3,Reason_3,2.489315
4,Reason_4,-0.004102
5,Month Value,0.21113
6,Day of the week,-0.136434
7,Transportation Expense,0.520544
8,Distance to Work,-0.009952
9,Age,-0.070156


# intepreting  the coefficients

In [22]:
summary_table['Odds_ratio'] = np.exp(summary_table['Coefficient'])
summary_table

Unnamed: 0,Feature_name,Coefficient,Odds_ratio
0,Intercept,-0.821051,0.439969
1,Reason_1,1.743423,5.716877
2,Reason_2,-0.042255,0.958625
3,Reason_3,2.489315,12.053021
4,Reason_4,-0.004102,0.995907
5,Month Value,0.21113,1.235073
6,Day of the week,-0.136434,0.872464
7,Transportation Expense,0.520544,1.682944
8,Distance to Work,-0.009952,0.990098
9,Age,-0.070156,0.932248


In [23]:
# sort the odds_ratio in decending order
sorted_summary = summary_table.sort_values('Odds_ratio', ascending = False)
sorted_summary

Unnamed: 0,Feature_name,Coefficient,Odds_ratio
3,Reason_3,2.489315,12.053021
1,Reason_1,1.743423,5.716877
7,Transportation Expense,0.520544,1.682944
12,Children,0.310349,1.363902
5,Month Value,0.21113,1.235073
11,Body Mass Index,0.17348,1.189437
10,Daily Work Load Average,0.028046,1.028443
4,Reason_4,-0.004102,0.995907
8,Distance to Work,-0.009952,0.990098
2,Reason_2,-0.042255,0.958625


# Testing the model

In [24]:
reg.score(x_test, y_test)

0.7357142857142858

In [25]:
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.73229562, 0.26770438],
       [0.6239295 , 0.3760705 ],
       [0.41482169, 0.58517831],
       [0.82068926, 0.17931074],
       [0.3647582 , 0.6352418 ],
       [0.35539569, 0.64460431],
       [0.39463388, 0.60536612],
       [0.1201861 , 0.8798139 ],
       [0.8338429 , 0.1661571 ],
       [0.74633845, 0.25366155],
       [0.11093484, 0.88906516],
       [0.02662334, 0.97337666],
       [0.0720725 , 0.9279275 ],
       [0.15068779, 0.84931221],
       [0.75349086, 0.24650914],
       [0.59235779, 0.40764221],
       [0.67268056, 0.32731944],
       [0.16170886, 0.83829114],
       [0.35869643, 0.64130357],
       [0.04575004, 0.95424996],
       [0.71413691, 0.28586309],
       [0.80671398, 0.19328602],
       [0.36400986, 0.63599014],
       [0.40768355, 0.59231645],
       [0.26552387, 0.73447613],
       [0.7796031 , 0.2203969 ],
       [0.52487139, 0.47512861],
       [0.81939801, 0.18060199],
       [0.14631573, 0.85368427],
       [0.79192563, 0.20807437],
       [0.

In [26]:
predicted_proba[:,1]

array([0.26770438, 0.3760705 , 0.58517831, 0.17931074, 0.6352418 ,
       0.64460431, 0.60536612, 0.8798139 , 0.1661571 , 0.25366155,
       0.88906516, 0.97337666, 0.9279275 , 0.84931221, 0.24650914,
       0.40764221, 0.32731944, 0.83829114, 0.64130357, 0.95424996,
       0.28586309, 0.19328602, 0.63599014, 0.59231645, 0.73447613,
       0.2203969 , 0.47512861, 0.18060199, 0.85368427, 0.20807437,
       0.86884958, 0.61389891, 0.67559884, 0.90083436, 0.19328602,
       0.89209131, 0.17023983, 0.24883542, 0.41991561, 0.54633328,
       0.23452956, 0.46939953, 0.18064642, 0.57521689, 0.7553792 ,
       0.14335664, 0.64710444, 0.29246402, 0.21063538, 0.17114517,
       0.66859675, 0.58911546, 0.64292267, 0.33837098, 0.82823687,
       0.49504072, 0.85619411, 0.27140103, 0.4895202 , 0.4764303 ,
       0.7333078 , 0.61594911, 0.34176472, 0.2532011 , 0.17182542,
       0.27742495, 0.19820376, 0.20286383, 0.79523418, 0.87728279,
       0.17407112, 0.53650529, 0.88354162, 0.40833506, 0.55482

# Save the model

In [27]:
import pickle

In [28]:
with open('model', 'wb') as file:
    pickle.dump(reg, file)

In [29]:
with open('scaler', 'wb') as file:
    pickle.dump(absenteeism_scaler, file)