## Import libraries

In [28]:
import pandas as pd
import numpy as np


In [29]:
# Load the preprocessed absenteeism records and preview the first rows
data = pd.read_csv("Absenteeism_preprocessed.csv")
data.head()

Unnamed: 0,Diseases,Pregnancy,Poisoning,Light_reasons,Months_values,Day_of_week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


### Target

In [30]:
# This is treated as a classification problem, but the raw target takes many distinct values
data["Absenteeism Time in Hours"].value_counts() 

# The target therefore has to be reduced to 0/1; a naive split at the median is used for that

8      195
2      149
3      106
1       87
4       57
0       39
16      18
24      15
5        7
40       7
32       6
64       3
56       2
80       2
120      2
112      2
7        1
104      1
48       1
Name: Absenteeism Time in Hours, dtype: int64

In [31]:
# Mean absence in hours, for comparison with the median used as the class threshold
data["Absenteeism Time in Hours"].mean()

6.761428571428572

In [32]:
# Binarize the target: 1 when the absence is strictly above the median, 0 otherwise
absence_hours = data["Absenteeism Time in Hours"]
target = np.where(absence_hours > absence_hours.median(), 1, 0)
target

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [35]:
# Append the binary target as the last column, then remove the raw target
# together with the features that are not used as model inputs
columns_to_drop = [
    "Absenteeism Time in Hours",
    'Day_of_week',
    'Daily Work Load Average',
    'Distance to Work',
]
data = data.assign(Excessive_absenteeism=target).drop(columns=columns_to_drop)
data.head()

Unnamed: 0,Diseases,Pregnancy,Poisoning,Light_reasons,Months_values,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive_absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0


In [36]:
# Share of rows labelled 1. The median split keeps the two classes roughly
# balanced; splitting at the mean would not.
target.mean()

0.45571428571428574

# Select the inputs and the target

In [37]:
# The target already lives in the `target` array, so only the inputs are
# selected here: every column except the last one (the appended target column)
input_columns = data.columns[:-1]
inputs = data.loc[:, input_columns]


### Standardize the inputs

In [38]:
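# Superseded approach, kept for reference only: fitting a plain StandardScaler
# on `inputs` would also standardize the dummy columns. The CustomScaler
# defined below scales only the selected columns.
# from sklearn.preprocessing import StandardScaler
# scalar = StandardScaler()

# scalar.fit(inputs)
# scaled_inputs = scalar.transform(inputs)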

In [39]:
# scaled_inputs.shape

In [45]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

# create the Custom Scaler class

class CustomScaler(BaseEstimator,TransformerMixin): 
    """Standardize only a chosen subset of DataFrame columns.

    Wraps a ``StandardScaler`` that is fitted on ``columns`` only; every other
    column of the input frame (e.g. dummy variables) is returned unchanged by
    ``transform``.

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardize.

    Attributes
    ----------
    scaler : StandardScaler
        The underlying scaler, fitted on ``columns`` by ``fit``.
    mean_ : pandas.Series or None
        Per-column mean of the scaled columns (``None`` before ``fit``).
    var_ : pandas.Series or None
        Per-column population variance of the scaled columns
        (``None`` before ``fit``).
    """

    def __init__(self,columns):
        # The actual scaling is delegated to a regular StandardScaler ...
        self.scaler = StandardScaler()
        # ... restricted to this subset of columns
        self.columns = columns
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Learn the mean and variance of ``self.columns`` from ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing at least the columns listed in ``self.columns``.
        y : ignored
            Accepted for scikit-learn API compatibility.

        Returns
        -------
        CustomScaler
            The fitted instance itself.
        """
        self.scaler.fit(X[self.columns], y)
        # Reuse the statistics StandardScaler has just computed instead of
        # calling np.mean / np.var on a DataFrame: that call warns and, with
        # newer pandas, may reduce over both axes instead of per column.
        self.mean_ = pd.Series(self.scaler.mean_, index=self.columns)
        self.var_ = pd.Series(self.scaler.var_, index=self.columns)
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with ``self.columns`` standardized.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing at least the columns listed in ``self.columns``.
        y : ignored
            Accepted for scikit-learn API compatibility.
        copy : ignored
            Accepted for signature compatibility; not used.

        Returns
        -------
        pandas.DataFrame
            A frame with the same index and column order as ``X`` in which
            the selected columns are scaled and all others are untouched.
        """
        # Remember the original column order so it can be restored at the end
        init_col_order = X.columns

        # Scale the selected columns. The result is built on X's own index:
        # with a fresh 0..n-1 index, the concat below would align rows by
        # label and produce NaNs / extra rows for any X whose index is not
        # the default RangeIndex (e.g. a shuffled or split frame).
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )

        # Everything that was not selected for scaling passes through as-is
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]

        # Recombine both parts and restore the original column order
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [46]:
# List the input column names to decide which ones should be scaled
inputs.columns.values

array(['Diseases', 'Pregnancy', 'Poisoning', 'Light_reasons',
       'Months_values', 'Transportation Expense', 'Age',
       'Body Mass Index', 'Education', 'Children', 'Pets'], dtype=object)

In [47]:
# Columns that must NOT be standardized: the dummy (indicator) features
columns_to_omit = ['Diseases', 'Pregnancy', 'Poisoning', 'Light_reasons','Education']

# Every remaining input column is scaled; the original column order is kept
columns_to_scale = [
    column for column in inputs.columns
    if column not in columns_to_omit
]

In [48]:
# Show the columns that will be standardized
columns_to_scale

['Months_values',
 'Transportation Expense',
 'Age',
 'Body Mass Index',
 'Children',
 'Pets']

In [50]:
# Build the scaler for the chosen columns and fit it in one step
# (fit returns the scaler itself), then apply it to the inputs
absenteeism_scaler = CustomScaler(columns_to_scale).fit(inputs)
scaled_inputs = absenteeism_scaler.transform(inputs)


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


### Split the data into train and test and shuffle the data

In [51]:
from sklearn.model_selection import train_test_split

# 80% of the rows go to training, the rest to testing; the fixed random_state
# makes the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    target,
    train_size=0.8,
    random_state=1,
)

# Sanity check: shapes of the features and labels of each split
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(features.shape, labels.shape)

(560, 11) (560,)
(140, 11) (140,)


# The algorithm (model)

In [52]:
# Libraries for the model
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Create the logistic regression and train it on the training split
# (fit returns the estimator itself)
reg = LogisticRegression().fit(x_train, y_train)

# Accuracy on the data the model was trained on
reg.score(x_train, y_train)

0.7732142857142857

### Manually checking the Error

In [53]:
# Find which training rows the model predicts incorrectly
model_output = reg.predict(x_train)

# Element-wise comparison: True where the prediction equals the label
differences = model_output == y_train

# Positions (within the training split) of all mismatches
wrong_output = np.flatnonzero(~differences).tolist()
print(wrong_output)

# Inspect these rows manually to understand why the model got them wrong (error analysis)

[2, 9, 14, 19, 24, 36, 39, 53, 60, 61, 63, 67, 78, 79, 83, 85, 87, 88, 89, 90, 91, 101, 107, 111, 113, 117, 121, 127, 128, 138, 139, 141, 151, 154, 155, 157, 158, 161, 172, 173, 175, 179, 180, 181, 189, 190, 198, 200, 206, 215, 223, 224, 225, 227, 236, 242, 243, 245, 254, 255, 257, 270, 276, 281, 287, 294, 311, 312, 321, 323, 324, 328, 334, 337, 338, 341, 347, 356, 357, 358, 361, 363, 364, 369, 379, 382, 384, 396, 400, 407, 409, 410, 414, 418, 429, 436, 437, 440, 442, 450, 451, 462, 463, 467, 468, 473, 474, 475, 479, 486, 501, 502, 509, 518, 519, 520, 522, 534, 535, 540, 541, 543, 545, 547, 550, 552, 557]


### Summary table of the model coefficients

In [54]:
# Summary table: one row per model term, the intercept first (index 0),
# followed by the input features in their column order (index 1, 2, ...)
feature_name = inputs.columns.values

summary_table = pd.DataFrame({
    'Feature name': ['Intercept', *feature_name],
    # the fitted model has a single intercept and a single row of coefficients
    'Coefficient': [reg.intercept_[0], *reg.coef_[0]],
})
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-1.694377
1,Diseases,2.804044
2,Pregnancy,0.996562
3,Poisoning,3.101896
4,Light_reasons,0.846441
5,Months_values,0.106526
6,Transportation Expense,0.57204
7,Age,-0.251107
8,Body Mass Index,0.284989
9,Education,-0.045855


### Interpreting the coefficients

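The odds ratio of a feature is `exp(coefficient)`: the factor by which the odds of excessive absenteeism are multiplied when that feature increases by one unit (for the standardized features, one unit is one standard deviation).

If a feature's coefficient is close to 0, its odds ratio is close to 1, so the feature barely changes the prediction; such a feature is not important to the model and is a candidate for dropping.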

In [55]:
# Add an 'Odds_ratio' column: the exponential of each coefficient
coefficients = summary_table['Coefficient']
summary_table['Odds_ratio'] = np.exp(coefficients)

# Display the table ordered from the largest odds ratio to the smallest
summary_table.sort_values(by='Odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
3,Poisoning,3.101896,22.240082
1,Diseases,2.804044,16.511285
2,Pregnancy,0.996562,2.708951
4,Light_reasons,0.846441,2.331334
6,Transportation Expense,0.57204,1.771878
10,Children,0.465138,1.592234
8,Body Mass Index,0.284989,1.329747
5,Months_values,0.106526,1.112406
9,Education,-0.045855,0.95518
7,Age,-0.251107,0.777939


# Testing the model 

In [56]:
# Score of the trained model on the held-out test split
reg.score(x_test, y_test)

0.75

In [57]:
# Predicted class probabilities for every test row: one row per observation,
# one column per class
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.26316067, 0.73683933],
       [0.75352471, 0.24647529],
       [0.75263033, 0.24736967],
       [0.8504347 , 0.1495653 ],
       [0.56074809, 0.43925191],
       [0.87577811, 0.12422189],
       [0.33003037, 0.66996963],
       [0.15332554, 0.84667446],
       [0.76464709, 0.23535291],
       [0.13406616, 0.86593384],
       [0.70561623, 0.29438377],
       [0.13745303, 0.86254697],
       [0.61342902, 0.38657098],
       [0.32918386, 0.67081614],
       [0.35397769, 0.64602231],
       [0.84266371, 0.15733629],
       [0.76922364, 0.23077636],
       [0.38109653, 0.61890347],
       [0.86179186, 0.13820814],
       [0.79262112, 0.20737888],
       [0.70561623, 0.29438377],
       [0.51066833, 0.48933167],
       [0.44532234, 0.55467766],
       [0.64823281, 0.35176719],
       [0.51842515, 0.48157485],
       [0.83350224, 0.16649776],
       [0.71189416, 0.28810584],
       [0.75912973, 0.24087027],
       [0.87061106, 0.12938894],
       [0.62715666, 0.37284334],
       [0.

# Saving the model 

In [58]:
import pickle

# Persist the trained logistic regression to the file "model"
with open("model", "wb") as model_file:
    pickle.dump(reg, model_file)

In [59]:
# Persist the fitted scaler to the file 'scaler'.
# NOTE: pickle stores only a reference to the CustomScaler class, so the class
# definition must be available wherever this file is loaded.
with open('scaler','wb') as scaler_file:
    pickle.dump(absenteeism_scaler, scaler_file)