# Creating a logistic regression to predict absenteeism

## Importing the libraries

In [1]:
import pandas as pd
import numpy as np

## Loading the data

In [2]:
data_preprocessed = pd.read_csv('preprocessed.csv')

In [3]:
data_preprocessed = data_preprocessed.drop(['Unnamed: 0'], axis=1)
data_preprocessed.head(5)

Unnamed: 0,reason_type1,reason_type2,reason_type3,reason_type4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,1,0,0,0,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,1,0,0,0,7,2,179,51,38,239.554,31,0,0,0,2
3,1,1,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,1,0,0,0,7,3,289,36,33,239.554,30,0,2,1,2


In [4]:
# Reason_type1, Reason_type2, Reason_type3, Reason_type4,
# Distance to work, Daily Workload Average, Children, Pets
# are the reason for absenteeism to work

## Creating the targets

In [5]:
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [6]:
# Classes
# Moderately absent (<= 3 hours): Assign 0 for absent time less than 3 hours
# Excessively absent (>= 4 hours) : Assign 0 for absent time greater than  hours

# Absenteeism time greater than 3, return 1 otherwise return 0
targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > data_preprocessed['Absenteeism Time in Hours'].median(), 1, 0)
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [7]:
data_preprocessed['Excessive Absenteeism'] = targets
data_preprocessed.head(10)

Unnamed: 0,reason_type1,reason_type2,reason_type3,reason_type4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,1,0,0,0,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,1,0,0,0,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,1,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,1,0,0,0,7,3,289,36,33,239.554,30,0,2,1,2,0
5,1,0,0,0,7,4,179,51,38,239.554,31,0,0,0,2,0
6,1,0,0,0,7,4,361,52,28,239.554,27,0,1,4,8,1
7,1,0,0,0,7,4,260,50,36,239.554,23,0,4,0,4,1
8,1,0,0,1,7,0,155,12,34,239.554,25,0,2,0,40,1
9,1,0,0,0,7,0,235,11,37,239.554,29,1,1,1,8,1


## Select the inputs for the regression

In [8]:
targets.sum()

319

In [9]:
targets.shape[0]

700

In [10]:
targets.sum() / targets.shape[0]

0.45571428571428574

In [11]:
data_preprocessed.columns

Index(['reason_type1', 'reason_type2', 'reason_type3', 'reason_type4',
       'Month Value', 'Day of the Week', 'Transportation Expense',
       'Distance to Work', 'Age', 'Daily Work Load Average', 'Body Mass Index',
       'Education', 'Children', 'Pets', 'Absenteeism Time in Hours',
       'Excessive Absenteeism'],
      dtype='object')

In [12]:
 data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours', 'Day of the Week',
       'Distance to Work', 'Daily Work Load Average'], axis=1)

In [13]:
data_with_targets.head(10)

Unnamed: 0,reason_type1,reason_type2,reason_type3,reason_type4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,1,0,0,0,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,1,0,0,0,7,179,38,31,0,0,0,0
3,1,1,0,0,7,279,39,24,0,2,0,1
4,1,0,0,0,7,289,33,30,0,2,1,0
5,1,0,0,0,7,179,38,31,0,0,0,0
6,1,0,0,0,7,361,28,27,0,1,4,1
7,1,0,0,0,7,260,36,23,0,4,0,1
8,1,0,0,1,7,155,34,25,0,2,0,1
9,1,0,0,0,7,235,37,29,1,1,1,1


## Next checkpoint

In [14]:
data_with_targets is data_preprocessed

False

## Selecting inputs for the regression

In [15]:
data_with_targets.shape

(700, 12)

In [16]:
## Selecting inputs for our model except the 'Excessive absenteeism'
## DateFrame.iloc[row indices, column indices]
## iloc excludes the last index

data_with_targets.iloc[:, :14]

Unnamed: 0,reason_type1,reason_type2,reason_type3,reason_type4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,1,0,0,0,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,1,0,0,0,7,179,38,31,0,0,0,0
3,1,1,0,0,7,279,39,24,0,2,0,1
4,1,0,0,0,7,289,33,30,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,1,0,0,5,179,40,22,1,2,0,1
696,1,1,0,0,5,225,28,24,0,1,2,0
697,1,1,0,0,5,330,28,25,1,0,0,1
698,1,0,0,0,5,235,32,25,1,0,0,0


In [17]:
data_with_targets.iloc[:, :-1]

Unnamed: 0,reason_type1,reason_type2,reason_type3,reason_type4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,1,0,0,0,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,1,0,0,0,7,179,38,31,0,0,0
3,1,1,0,0,7,279,39,24,0,2,0
4,1,0,0,0,7,289,33,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
695,1,1,0,0,5,179,40,22,1,2,0
696,1,1,0,0,5,225,28,24,0,1,2
697,1,1,0,0,5,330,28,25,1,0,0
698,1,0,0,0,5,235,32,25,1,0,0


In [18]:
unscaled_inputs = data_with_targets.iloc[:, :-1]
unscaled_inputs

Unnamed: 0,reason_type1,reason_type2,reason_type3,reason_type4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,1,0,0,0,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,1,0,0,0,7,179,38,31,0,0,0
3,1,1,0,0,7,279,39,24,0,2,0
4,1,0,0,0,7,289,33,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
695,1,1,0,0,5,179,40,22,1,2,0
696,1,1,0,0,5,225,28,24,0,1,2
697,1,1,0,0,5,330,28,25,1,0,0
698,1,0,0,0,5,235,32,25,1,0,0


## Standardizing the data

In [19]:
# absenteeism_scaler will be used to subtract the mean and 
# divide by the standard deviation variablewise(featurewise)

# from sklearn.preprocessing import StandardScaler
# absenteeism_scaler = StandardScaler()

In [20]:
# Standard scaler stanadardizes all the features including the 
# dummies. So we use customscaler for standardizing the particular
# column 

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    
    def __init__(self, columns):#, copy=True, with_mean = True, with_std = True):
        self.scaler = StandardScaler()#(copy, with_mean, with_std)
        self.columns = columns
        self.mean = None
        self.mean = None
        
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    def transform(self, X, y=None, copy=None):
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [21]:
unscaled_inputs.columns

Index(['reason_type1', 'reason_type2', 'reason_type3', 'reason_type4',
       'Month Value', 'Transportation Expense', 'Age', 'Body Mass Index',
       'Education', 'Children', 'Pets'],
      dtype='object')

In [22]:
# columns_to_scale = ['Month Value', 'Day of the Week', 'Transportation Expense',
#        'Distance to Work', 'Age', 'Daily Work Load Average', 'Body Mass Index', 'Children', 'Pets']
columns_to_omit = ['reason_type1', 'reason_type2', 'reason_type3', 'reason_type4']

In [23]:
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]

In [24]:
absenteeism_scaler = CustomScaler(columns_to_scale)

In [25]:
absenteeism_scaler.fit(unscaled_inputs)

CustomScaler(columns=['Month Value', 'Transportation Expense', 'Age',
                      'Body Mass Index', 'Education', 'Children', 'Pets'])

In [26]:
# reason_type1,2,3&4 are untouched and not standardized since these 
# are dummies

scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)
scaled_inputs

Unnamed: 0,reason_type1,reason_type2,reason_type3,reason_type4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,1,0,0,0,0.182726,1.005844,-0.536062,0.767431,-0.447980,0.880469,0.268487
1,0,0,0,0,0.182726,-1.574681,2.130803,1.002633,-0.447980,-0.019280,-0.589690
2,1,0,0,0,0.182726,-0.654143,0.248310,1.002633,-0.447980,-0.919030,-0.589690
3,1,1,0,0,0.182726,0.854936,0.405184,-0.643782,-0.447980,0.880469,-0.589690
4,1,0,0,0,0.182726,1.005844,-0.536062,0.767431,-0.447980,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1,1,0,0,-0.388293,-0.654143,0.562059,-1.114186,2.232242,0.880469,-0.589690
696,1,1,0,0,-0.388293,0.040034,-1.320435,-0.643782,-0.447980,-0.019280,1.126663
697,1,1,0,0,-0.388293,1.624567,-1.320435,-0.408580,2.232242,-0.919030,-0.589690
698,1,0,0,0,-0.388293,0.190942,-0.692937,-0.408580,2.232242,-0.919030,-0.589690


In [27]:
scaled_inputs.shape    

(700, 11)

## Splitting the data into train, test and shuffle

### importing relevant module 

In [28]:
from sklearn.model_selection import train_test_split

### Splitting data

In [29]:
train_test_split(scaled_inputs, targets)

[     reason_type1  reason_type2  reason_type3  reason_type4  Month Value  \
 307             1             0             0             0     1.039256   
 419             1             0             0             0    -0.673803   
 70              1             0             0             0     1.039256   
 413             1             0             0             0    -0.673803   
 395             1             0             0             0    -0.959313   
 ..            ...           ...           ...           ...          ...   
 270             1             1             0             0     0.753746   
 684             1             0             0             0    -0.388293   
 392             1             0             0             0    -0.959313   
 249             1             0             0             1     0.182726   
 502             1             0             0             0     0.753746   
 
      Transportation Expense       Age  Body Mass Index  Education  Childr

In [30]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size = 0.8)

In [31]:
# 80% of the data for testing
print(x_train.shape, y_train.shape)

(560, 11) (560,)


In [32]:
# 20% of the data for testing
print(x_test.shape, y_test.shape)

(140, 11) (140,)


## Logistic regression with sklearn

In [33]:
from sklearn.linear_model import LogisticRegression

### Training the model

In [34]:
reg = LogisticRegression()

In [35]:
reg.fit(x_train, y_train)

LogisticRegression()

In [36]:
reg.score(x_train, y_train)

0.7857142857142857

### Manually checking the accuracy

In [37]:
model_outputs = reg.predict(x_train)
model_outputs

array([1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0,

In [38]:
y_train

array([0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0,

In [39]:
model_outputs == y_train

array([False,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False, False,  True,  True,
        True,  True,  True,  True,  True,  True, False, False,  True,
        True,  True,  True, False, False,  True, False, False,  True,
        True,  True,  True, False,  True,  True,  True, False,  True,
        True, False,  True,  True, False,  True,  True,  True, False,
        True, False,  True,  True,  True,  True, False, False,  True,
        True,  True, False,  True,  True, False,  True,  True,  True,
        True,  True, False,  True,  True, False,  True,  True, False,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True, False,  True,  True,  True, False,  True,  True,  True,
       False, False, False,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True, False,

In [40]:
# number of correct predictions (true entires)
np.sum(model_outputs == y_train)

440

In [41]:
# accuracy = correct predictions / number of observations

np.sum(model_outputs == y_train) / model_outputs.shape[0]

0.7857142857142857

### Finding the intercept and coefficients

***For bais/intercept whichever weights is bigger, its corresponding feature is important***

In [42]:
reg.intercept_

array([-3.21565388])

In [43]:
reg.coef_

array([[ 2.51037721,  2.0329187 ,  0.29523719,  2.24160749,  0.08608655,
         0.66437728, -0.2432369 ,  0.32774313,  0.02451868,  0.48161556,
        -0.31048257]])

In [44]:
unscaled_inputs.columns.values

array(['reason_type1', 'reason_type2', 'reason_type3', 'reason_type4',
       'Month Value', 'Transportation Expense', 'Age', 'Body Mass Index',
       'Education', 'Children', 'Pets'], dtype=object)

In [45]:
feature_name = unscaled_inputs.columns.values

In [46]:
summary_table = pd.DataFrame(columns=['Feature name'], data = feature_name)

summary_table['coefficient'] = np.transpose(reg.coef_)

summary_table

Unnamed: 0,Feature name,coefficient
0,reason_type1,2.510377
1,reason_type2,2.032919
2,reason_type3,0.295237
3,reason_type4,2.241607
4,Month Value,0.086087
5,Transportation Expense,0.664377
6,Age,-0.243237
7,Body Mass Index,0.327743
8,Education,0.024519
9,Children,0.481616


In [47]:
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['intercept', reg.intercept_[0]]
summary_table

Unnamed: 0,Feature name,coefficient
1,reason_type1,2.510377
2,reason_type2,2.032919
3,reason_type3,0.295237
4,reason_type4,2.241607
5,Month Value,0.086087
6,Transportation Expense,0.664377
7,Age,-0.243237
8,Body Mass Index,0.327743
9,Education,0.024519
10,Children,0.481616


### Interpreting the coefficients

In [48]:
summary_table['Odds_ratio'] = np.exp(summary_table.coefficient)
summary_table.head(5)

Unnamed: 0,Feature name,coefficient,Odds_ratio
1,reason_type1,2.510377,12.309572
2,reason_type2,2.032919,7.636342
3,reason_type3,0.295237,1.343445
4,reason_type4,2.241607,9.408443
5,Month Value,0.086087,1.089901


* **reason_1 : Various diseases**
* **reason_2 : Pregnancy and giving birth**
* **reason_3 : Posioning**
* **reason_4 ; light diseases**

In [49]:
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Feature name,coefficient,Odds_ratio
1,reason_type1,2.510377,12.309572
4,reason_type4,2.241607,9.408443
2,reason_type2,2.032919,7.636342
6,Transportation Expense,0.664377,1.94328
10,Children,0.481616,1.618687
8,Body Mass Index,0.327743,1.387832
3,reason_type3,0.295237,1.343445
5,Month Value,0.086087,1.089901
9,Education,0.024519,1.024822
7,Age,-0.243237,0.784086


* If the coefficient is around 0 or if the odds_ratio is around 1, this means, the feature is not particularly important.
* A weight(coefficient) of 0 implies that no matter the feature value, we will multiply it by 0(in the model)
* In terms of odds_ratio, for a unit change in the standardized feature, the odds the odds increase by a multiple equal to the odds ratio(1=no change)
    - Odds x odds_ratio = new odds
    - For example: 
        - **5:1 x 2 = 10:1**
        - **5:1 x 0.2 = 1:1**
            

 * 'Transportation expense' is one of our standardized variables
 * Standardized models (almost) always yield higher accuracy
 *  The intercept or the BIAS 'calibrates' the model

## Backward Elimination

* Simplifying our model by removing all features which have close to no contribution to the model
* When we have the p-values, we get rid of all the co-efficients with p-values > 0.5

## Testing the model

#### Testing our model with test data

In [51]:
reg.score(x_test, y_test)

0.7642857142857142

So based on data that the model has NEVER seen before, in 76.4% of the cases,the model will predict [correctly] if the person is going to be excessively absent.
**Test accuracy is always less than train accuracy by definition**

Often the test accuracy is 10-20% lower than train accuracy(due to overfitting). Instead of 0 and 1, we can get the probability, of an output being 0 or 1.

**Next Method**

In [55]:
# Returns the probability estimates for all possible outputs(classes)
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.06601918, 0.93398082],
       [0.55568447, 0.44431553],
       [0.78141336, 0.21858664],
       [0.72267376, 0.27732624],
       [0.31576289, 0.68423711],
       [0.26510435, 0.73489565],
       [0.45933977, 0.54066023],
       [0.08277948, 0.91722052],
       [0.7365544 , 0.2634456 ],
       [0.73519823, 0.26480177],
       [0.69519712, 0.30480288],
       [0.77718616, 0.22281384],
       [0.13778246, 0.86221754],
       [0.86152043, 0.13847957],
       [0.29790144, 0.70209856],
       [0.85914932, 0.14085068],
       [0.73175748, 0.26824252],
       [0.45918418, 0.54081582],
       [0.82601039, 0.17398961],
       [0.79871595, 0.20128405],
       [0.68996413, 0.31003587],
       [0.21066178, 0.78933822],
       [0.31425495, 0.68574505],
       [0.74598143, 0.25401857],
       [0.24416488, 0.75583512],
       [0.22998631, 0.77001369],
       [0.98628458, 0.01371542],
       [0.36011269, 0.63988731],
       [0.31776473, 0.68223527],
       [0.51903937, 0.48096063],
       [0.

In [53]:
predicted_proba.shape

(140, 2)

Total observations = **140** and **2** columns
* The first column = probability of being **0**
* The second column = probability of being **1**

***Note: The sum of each corresponding column is equal to 1. E.g: 0.06601918 + 0.93398082 = 1, 0.55568447 + 0.44431553 = 1***

In [57]:
# value of second column
predicted_proba[:,1]

array([0.93398082, 0.44431553, 0.21858664, 0.27732624, 0.68423711,
       0.73489565, 0.54066023, 0.91722052, 0.2634456 , 0.26480177,
       0.30480288, 0.22281384, 0.86221754, 0.13847957, 0.70209856,
       0.14085068, 0.26824252, 0.54081582, 0.17398961, 0.20128405,
       0.31003587, 0.78933822, 0.68574505, 0.25401857, 0.75583512,
       0.77001369, 0.01371542, 0.63988731, 0.68223527, 0.48096063,
       0.94940271, 0.79799417, 0.15063085, 0.95171259, 0.02505259,
       0.26004451, 0.09586227, 0.69101736, 0.22281384, 0.70721365,
       0.137486  , 0.17503742, 0.22281384, 0.26004451, 0.30782829,
       0.98183146, 0.09560789, 0.24481675, 0.31003587, 0.29745446,
       0.71662319, 0.72158779, 0.42512943, 0.67148434, 0.28227904,
       0.6660398 , 0.12662498, 0.30480288, 0.30480288, 0.25534303,
       0.63559974, 0.87882906, 0.16250578, 0.5979252 , 0.16935577,
       0.77643789, 0.50701148, 0.4876197 , 0.26004451, 0.15805153,
       0.25534303, 0.24030118, 0.24756751, 0.91906766, 0.72224

In reality, logistic regression models calculate these probabilities in the background. If the probability is:
* below 0.5, it places a 0
* above 0.5, it places a 1

## Saving the model

**We'll use pickle(module) to save the following detail in a model**
* Logistic regression
* coefficient values
* intercept values
* random state values

*Pickle is a python modules used to convert a Python Object into a character stream*

In [58]:
import pickle

In [59]:
with open('model', 'wb') as file: # converting into a binary data
    pickle.dump(reg, file)  # pickling data into a file

We need to pickle the absenteeism scaler too. It was used to standardize all the numerical variables. **The information in the absenteeism_scaler is used to preprocess new data**

In [60]:

with open('scaler', 'wb') as file:
    pickle.dump(absenteeism_scaler, file)
