### Creating a logistic regression to predict absenteeism

### Import the relevant libraries

In [1]:
import numpy as np
import pandas as pd
import sklearn

### Load the data

In [2]:
data_preprocessed = pd.read_csv('df_preprocessed.csv')

In [3]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


### Create the targets

In [4]:
meidan = data_preprocessed['Absenteeism Time in Hours'].median()

In [5]:
# Binary target: 1 when the hours absent are strictly above the median, 0 otherwise.
is_above_median = data_preprocessed['Absenteeism Time in Hours'] > meidan
targets = np.where(is_above_median, 1, 0)

In [6]:
np.unique(targets)

array([0, 1])

In [7]:
data_preprocessed['Excessive_Absenteeism'] = targets

In [8]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive_Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


### A comment on the targets

In [9]:
targets.sum() / targets.shape[0]

0.45571428571428574

In [10]:
# Backward elimination:
# come back here to drop the columns 'Daily Work Load Average', 'Day of the Week', 'Distance to Work',
# then rerun the notebook.

In [11]:
# The raw hours column is replaced by the binary 'Excessive_Absenteeism' target;
# the other three columns are the ones removed by backward elimination.
columns_to_drop = ['Absenteeism Time in Hours', 'Daily Work Load Average',
                   'Day of the Week', 'Distance to Work']
data_with_targets = data_preprocessed.drop(columns=columns_to_drop)

In [12]:
data_with_targets is data_preprocessed

False

In [13]:
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive_Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0


### Select the inputs for the regression

In [14]:
data_with_targets.shape

(700, 12)

In [15]:
data_with_targets.iloc[:,:-1]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,0,0,0,1,7,179,38,31,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0
4,0,0,0,1,7,289,33,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,22,1,2,0
696,1,0,0,0,5,225,28,24,0,1,2
697,1,0,0,0,5,330,28,25,1,0,0
698,0,0,0,1,5,235,32,25,1,0,0


In [16]:
# Inputs = every column except the last one. This relies on the target column
# ('Excessive_Absenteeism') being the last column of data_with_targets —
# re-check this if the column order ever changes.
unscaled_inputs = data_with_targets.iloc[:,:-1]

### Standardize the data

In [17]:
from sklearn.preprocessing import StandardScaler

In [18]:
# The standard scaler would also scale the dummy variables,
#     which would make the dummy variables uninterpretable in the model.

# absenteeism_scaler = StandardScaler()  # obtian the mean and std

#### Custom scaler

In [19]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize only a chosen subset of a DataFrame's columns.

    The listed columns are scaled with a ``StandardScaler``; every other
    column is passed through unchanged. The output keeps the input's column
    order and row index.

    Parameters
    ----------
    columns : list
        Labels of the columns to standardize.

    Attributes
    ----------
    scaler : StandardScaler
        The underlying scaler, fitted on ``columns`` only.
    columns : list
        The column labels given at construction.
    mean_ : pandas.Series or None
        Per-column mean of the fitted columns (``None`` before ``fit``).
    var_ : pandas.Series or None
        Per-column population variance (``ddof=0``) of the fitted columns
        (``None`` before ``fit``).
    """

    def __init__(self, columns):
        # The scaler is a plain StandardScaler; the only 'twist' is that it is
        # applied to a subset of the columns.
        self.scaler = StandardScaler()
        self.columns = columns
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the underlying scaler on ``X[self.columns]`` and return ``self``."""
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Explicit per-column reductions. np.mean / np.var on a DataFrame
        # forward axis=None, which raises a warning on some pandas versions
        # and reduces over both axes (a single scalar) on newer ones.
        self.mean_ = selected.mean(axis=0)
        self.var_ = selected.var(axis=0, ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return a copy of ``X`` with ``self.columns`` standardized.

        ``y`` and ``copy`` are accepted for signature compatibility and are
        not used.
        """
        # Record the initial order of the columns so it can be restored.
        init_col_order = X.columns

        # Scale the selected columns. The result must carry X's own index:
        # with a default RangeIndex the column-wise concat below would align
        # on mismatching labels and fill with NaN whenever X is not indexed
        # 0..n-1 (e.g. a row subset).
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )

        # Everything that was not selected for scaling is kept untouched.
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]

        # Stitch both parts together and restore the original column order.
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [20]:
# select columns that we don't want to scale
# Columns that must stay unscaled (the Reason_* dummies and Education).
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Education']

# Every remaining input column gets standardized; the original order is kept.
columns_to_scale = list(
    filter(lambda name: name not in columns_to_omit, unscaled_inputs.columns.values)
)

In [21]:
absenteeism_scaler = CustomScaler(columns_to_scale)

In [22]:
absenteeism_scaler.fit(unscaled_inputs)

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


CustomScaler(columns=['Month Value', 'Transportation Expense', 'Age',
                      'Body Mass Index', 'Children', 'Pets'])

In [23]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [24]:
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.182726,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,0.182726,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.182726,0.854936,0.405184,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.388293,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,-0.388293,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,-0.388293,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690
698,0,0,0,1,-0.388293,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690


## Split the data into train & test and shuffle

### import the relevant module

In [25]:
from sklearn.model_selection import train_test_split

### Split

In [26]:
train_test_split(scaled_inputs, targets)

[     Reason_1  Reason_2  Reason_3  Reason_4  Month Value  \
 526         1         0         0         0     1.039256   
 686         1         0         0         0    -0.388293   
 598         0         0         0         1    -1.244823   
 602         0         0         0         1    -1.244823   
 395         0         0         0         1    -0.959313   
 ..        ...       ...       ...       ...          ...   
 414         0         0         0         1    -0.673803   
 151         0         0         1         0    -1.244823   
 7           0         0         0         1     0.182726   
 503         0         0         0         1     0.753746   
 236         0         0         1         0     0.182726   
 
      Transportation Expense       Age  Body Mass Index  Education  Children  \
 526                0.040034 -1.320435        -0.643782          0 -0.019280   
 686               -1.574681  2.130803         1.002633          0 -0.019280   
 598               -0.6541

In [27]:
# 80/20 shuffled split; the fixed random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    train_size=0.8,
    shuffle=True,
    random_state=20,
)

In [28]:
print(x_train.shape,y_train.shape)
print(x_test.shape, y_test.shape)

(560, 11) (560,)
(140, 11) (140,)


## Logistic regression with sklearn

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

### Training the model

In [30]:
# reg =  LogisticRegression()
reg = LogisticRegression(solver='liblinear')


In [31]:
reg.fit(x_train, y_train)

LogisticRegression(solver='liblinear')

In [32]:
reg.score(x_train, y_train)

0.775

## Manually check the accuracy

In [33]:
model_output =  reg.predict(x_train)
model_output

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [34]:
model_output == y_train

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [35]:
np.sum(model_output == y_train)

434

In [36]:
model_output.shape[0]

560

In [37]:
np.sum(model_output==y_train)/ model_output.shape[0]

0.775

### Finding the intercept and coefficients

In [38]:
reg.intercept_

array([-1.46547112])

In [39]:
reg.coef_

array([[ 2.62749942,  0.86338637,  2.96050661,  0.66390745,  0.15493732,
         0.59979822, -0.17245127,  0.27568526, -0.23452541,  0.34249662,
        -0.2775137 ]])

In [40]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [41]:
feature_name = unscaled_inputs.columns.values

In [42]:
# One row per input feature, paired with its fitted coefficient.
# reg.coef_ has shape (1, n_features), so it is flattened to line up with the names.
summary_table = pd.DataFrame({
    'Feature_name': feature_name,
    'Coef': reg.coef_.ravel(),
})
summary_table

Unnamed: 0,Feature_name,Coef
0,Reason_1,2.627499
1,Reason_2,0.863386
2,Reason_3,2.960507
3,Reason_4,0.663907
4,Month Value,0.154937
5,Transportation Expense,0.599798
6,Age,-0.172451
7,Body Mass Index,0.275685
8,Education,-0.234525
9,Children,0.342497


In [43]:
# Put the intercept in row 0, ahead of the feature coefficients.
# The table is rebuilt with concat, and any 'Intercept' row left by a previous
# run is filtered out first, so re-running this cell yields the same table
# instead of shifting the index again and adding a second intercept row.
intercept_row = pd.DataFrame({'Feature_name': ['Intercept'],
                              'Coef': [reg.intercept_[0]]})
feature_rows = summary_table[summary_table['Feature_name'] != 'Intercept']
summary_table = pd.concat([intercept_row, feature_rows], ignore_index=True)
summary_table

Unnamed: 0,Feature_name,Coef
0,Intercept,-1.465471
1,Reason_1,2.627499
2,Reason_2,0.863386
3,Reason_3,2.960507
4,Reason_4,0.663907
5,Month Value,0.154937
6,Transportation Expense,0.599798
7,Age,-0.172451
8,Body Mass Index,0.275685
9,Education,-0.234525


### Interpreting the coefficients

In [44]:
summary_table['Odds_ratio'] = np.exp(summary_table.Coef)

In [45]:
summary_table

Unnamed: 0,Feature_name,Coef,Odds_ratio
0,Intercept,-1.465471,0.230969
1,Reason_1,2.627499,13.839121
2,Reason_2,0.863386,2.371177
3,Reason_3,2.960507,19.307751
4,Reason_4,0.663907,1.942367
5,Month Value,0.154937,1.167585
6,Transportation Expense,0.599798,1.821751
7,Age,-0.172451,0.841599
8,Body Mass Index,0.275685,1.317433
9,Education,-0.234525,0.790946


In [46]:
summary_table.sort_values('Odds_ratio', ascending = False)

Unnamed: 0,Feature_name,Coef,Odds_ratio
3,Reason_3,2.960507,19.307751
1,Reason_1,2.627499,13.839121
2,Reason_2,0.863386,2.371177
4,Reason_4,0.663907,1.942367
6,Transportation Expense,0.599798,1.821751
10,Children,0.342497,1.40846
8,Body Mass Index,0.275685,1.317433
5,Month Value,0.154937,1.167585
7,Age,-0.172451,0.841599
9,Education,-0.234525,0.790946


     A feature is not particularly important if its coefficient is around 0,
     or equivalently if its odds ratio is around 1.

     A weight (coefficient) of 0 implies that no matter the feature value, we will multiply it by 0 (in the model).
     For a unit change in the standardized feature, the odds are multiplied by the odds ratio (1 = no change).
     
     Example: odds = 5:1, odds_ratio = 2; a unit change makes the new odds 10:1.
     5:1 * 2 = 10:1


    Upon reviewing the summary table, it appears that the 'Daily Work Load Average' feature is not significant to our model. Additionally, the features 'Distance to Work' and 'Day of the Week' do not seem to be important either.
    
        They are not necessarily useless; at this time, they simply make no difference.

### Backward elimination

    The goal is that we can simplify the model by removing all features which have close to no contribution to the model.

    When we have the p-value, we get rid of all coefficients with p-value>0.05
    
    With sklearn package, if the weight is small enough, it won't make a difference anyway. In other words, if we remove the Daily Work Load Average, Distance to Work, Day of the Week features, the model should not change a lot.

### Testing the model


In [47]:
reg.score(x_test, y_test)

0.75

In [48]:
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.71221976, 0.28778024],
       [0.58760009, 0.41239991],
       [0.44337438, 0.55662562],
       [0.77903962, 0.22096038],
       [0.08458343, 0.91541657],
       [0.33103371, 0.66896629],
       [0.29792496, 0.70207504],
       [0.12956221, 0.87043779],
       [0.78307821, 0.21692179],
       [0.74708659, 0.25291341],
       [0.49514969, 0.50485031],
       [0.22640297, 0.77359703],
       [0.07030984, 0.92969016],
       [0.73504052, 0.26495948],
       [0.30533085, 0.69466915],
       [0.55035881, 0.44964119],
       [0.55027426, 0.44972574],
       [0.53930442, 0.46069558],
       [0.40117774, 0.59882226],
       [0.05320682, 0.94679318],
       [0.69874615, 0.30125385],
       [0.77903962, 0.22096038],
       [0.41634563, 0.58365437],
       [0.41634563, 0.58365437],
       [0.2412915 , 0.7587085 ],
       [0.74317087, 0.25682913],
       [0.51065194, 0.48934806],
       [0.85703303, 0.14296697],
       [0.19934235, 0.80065765],
       [0.77903962, 0.22096038],
       [0.

In [49]:
predicted_proba.shape

(140, 2)

In [50]:
predicted_proba[:,1]

array([0.28778024, 0.41239991, 0.55662562, 0.22096038, 0.91541657,
       0.66896629, 0.70207504, 0.87043779, 0.21692179, 0.25291341,
       0.50485031, 0.77359703, 0.92969016, 0.26495948, 0.69466915,
       0.44964119, 0.44972574, 0.46069558, 0.59882226, 0.94679318,
       0.30125385, 0.22096038, 0.58365437, 0.58365437, 0.7587085 ,
       0.25682913, 0.48934806, 0.14296697, 0.80065765, 0.22096038,
       0.37028423, 0.68316787, 0.68825755, 0.52694241, 0.22096038,
       0.53492642, 0.22453007, 0.74389237, 0.40329273, 0.60301627,
       0.21343976, 0.45483346, 0.2403088 , 0.4388431 , 0.82622935,
       0.57857132, 0.69461059, 0.28778024, 0.22209028, 0.2061074 ,
       0.57577123, 0.36438663, 0.66896629, 0.27128561, 0.83334736,
       0.43399232, 0.88600663, 0.23396355, 0.37170685, 0.38209505,
       0.69796139, 0.65909803, 0.29392197, 0.79686146, 0.20956093,
       0.2699923 , 0.10399887, 0.22453007, 0.73944244, 0.30081832,
       0.22453007, 0.32688766, 0.90337554, 0.45745729, 0.59997

### Save the model

In [51]:
import pickle

In [52]:
# Serialize the fitted regression to the file 'model'.
# Mode 'wb' = write bytes (the matching mode for loading is 'rb' = read bytes).
with open('model', 'wb') as model_file:
    pickle.dump(reg, model_file)

In [53]:
# Serialize the fitted CustomScaler to the file 'scaler'.
# NOTE(review): pickle stores a class by reference, not its code, so whatever
# loads this file must have a CustomScaler definition available — confirm the
# consumer provides one.
with open('scaler', 'wb') as scaler_file:
    pickle.dump(absenteeism_scaler, scaler_file)