In [1]:
import pandas as pd 
import numpy as np 
import ml 

In [2]:
cleaned_data = ml.file2df(file_ = "Absenteeism_data_cleaned.csv")
safe_copy = cleaned_data.copy() 
cleaned_data.head(5)

Unnamed: 0,transp exp,distance to work,age,avg work load,bmi,education,kids,pets,absence/h,reason_1,reason_2,reason_3,reason_4,weekday,month
0,289,36,33,239.554,30,0,2,1,4,0,0,0,1,1,7
1,118,13,50,239.554,31,0,1,0,0,0,0,0,0,1,7
2,179,51,38,239.554,31,0,0,0,2,0,0,0,1,2,7
3,279,5,39,239.554,24,0,2,0,4,1,0,0,0,3,7
4,289,36,33,239.554,30,0,2,1,2,0,0,0,1,3,7


## Classification of ABSENTEEISM
In this example, the study of the data and further prediction would be based in the MEDIAN value of *absence hours*.

        1. If a person is EQUAL OR ABOVE the MEDIAN it would be consider excessively absent.
        2. If a person is UNDER the MEDIAN, it would be consider moderately absent.

For making such classification, we first need to calculate the MEDIAN value, and assign a value to each record of the df

In [3]:
cleaned_data["target"] = ml.target(cleaned_data["absence/h"], cleaned_data["absence/h"].median())
cleaned_data.drop(["absence/h"], axis = 1, inplace = True)
cleaned_data_until_target = cleaned_data.copy()
cleaned_data.head()

Unnamed: 0,transp exp,distance to work,age,avg work load,bmi,education,kids,pets,reason_1,reason_2,reason_3,reason_4,weekday,month,target
0,289,36,33,239.554,30,0,2,1,0,0,0,1,1,7,1
1,118,13,50,239.554,31,0,1,0,0,0,0,0,1,7,0
2,179,51,38,239.554,31,0,0,0,0,0,0,1,2,7,0
3,279,5,39,239.554,24,0,2,0,1,0,0,0,3,7,1
4,289,36,33,239.554,30,0,2,1,0,0,0,1,3,7,0


In [4]:
X = cleaned_data.iloc[:, :-1]
y = cleaned_data.iloc[:, -1]

## Standardization of data
When data are dispersed in dimensions, we need to run an **Standardization-Step** prior to the ML process. WE also NEED to exclude the categorical features, previously transformed with .get_dummies() from the standardization thats why we divided the features into categorical and numerical.

In [5]:
categorical = ["education", "reason_1", "reason_2", "reason_3", "reason_4"]
numerical = ["transp exp", "distance to work", "age", "avg work load", "bmi", "kids", "pets", "weekday", "month"]

In [6]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_numeric_std = pd.DataFrame(data = scaler.fit_transform(X[numerical]), columns = numerical)
X_scaled = pd.merge(X_numeric_std, X[categorical], left_index = True, right_index = True)
X_scaled.head()

Unnamed: 0,transp exp,distance to work,age,avg work load,bmi,kids,pets,weekday,month,education,reason_1,reason_2,reason_3,reason_4
0,1.005844,0.412816,-0.536062,-0.806331,0.767431,0.880469,0.268487,-0.683704,0.182726,0,0,0,0,1
1,-1.574681,-1.141882,2.130803,-0.806331,1.002633,-0.01928,-0.58969,-0.683704,0.182726,0,0,0,0,0
2,-0.654143,1.426749,0.24831,-0.806331,1.002633,-0.91903,-0.58969,-0.007725,0.182726,0,0,0,0,1
3,0.854936,-1.682647,0.405184,-0.806331,-0.643782,0.880469,-0.58969,0.668253,0.182726,0,1,0,0,0
4,1.005844,0.412816,-0.536062,-0.806331,0.767431,0.880469,0.268487,0.668253,0.182726,0,0,0,0,1


## Train/Test spliting of data

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.20, random_state = 42)

## Machine Learning model generation and evaluation

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [9]:
model = LogisticRegression(random_state = 42, n_jobs = -1)
model.fit(X_train, y_train)
print("Our model has an accuracy of", round(model.score(X_train, y_train) * 100, 2), "%")

Our model has an accuracy of 76.79 %


### Summary information about the ML model

In [10]:
model_summary = pd.DataFrame()
model_summary["variable"] = X.columns.values
model_summary["coef"] = np.transpose(model.coef_)
model_summary.index = model_summary.index + 1
model_summary.loc[0] = ["Intercept", model.intercept_[0]]
model_summary["odds ratio"] = np.exp(model_summary.coef)
model_summary = model_summary.sort_index()
model_summary = model_summary.sort_values(by = "odds ratio", axis = 0, ascending = False)
model_summary

Unnamed: 0,variable,coef,odds ratio
13,weekday,3.071973,21.584445
11,reason_3,2.928834,18.70581
14,month,0.994322,2.70289
12,reason_4,0.732911,2.08113
1,transp exp,0.674526,1.963103
6,education,0.41818,1.519193
5,bmi,0.24536,1.278081
9,reason_1,0.079737,1.083002
4,avg work load,-0.020752,0.979462
2,distance to work,-0.056784,0.944798


## Backward elimination
Coefficients with values close to 0 do not contribute much to the ML model, so can be eliminate in a process called **backward elimination**. In this case there are 3 features with values < 0.10:

        1. avg work load
        2. distance to work
        3. pets

For that, we go back to the **cleaned_data_until_target** safe copy, redo the standardization, eliminate the 3 features and then recalculate the ML model and it's parameters.        

In [11]:
mod_data = cleaned_data_until_target.drop(["avg work load", "distance to work", "pets"], axis = 1)
X_mod = mod_data.iloc[:, :-1]
y_mod = mod_data.iloc[:, -1]

In [12]:
categorical = ["education", "reason_1", "reason_2", "reason_3", "reason_4"]
mod_numerical = ["transp exp", "age", "bmi", "kids", "weekday", "month"]

X_numeric_std = pd.DataFrame(data = scaler.fit_transform(X[numerical]), columns = numerical) #using same scaler than before
X_scaled_mod = pd.merge(X_numeric_std, X[categorical], left_index = True, right_index = True)
X_scaled_mod.head()

Unnamed: 0,transp exp,distance to work,age,avg work load,bmi,kids,pets,weekday,month,education,reason_1,reason_2,reason_3,reason_4
0,1.005844,0.412816,-0.536062,-0.806331,0.767431,0.880469,0.268487,-0.683704,0.182726,0,0,0,0,1
1,-1.574681,-1.141882,2.130803,-0.806331,1.002633,-0.01928,-0.58969,-0.683704,0.182726,0,0,0,0,0
2,-0.654143,1.426749,0.24831,-0.806331,1.002633,-0.91903,-0.58969,-0.007725,0.182726,0,0,0,0,1
3,0.854936,-1.682647,0.405184,-0.806331,-0.643782,0.880469,-0.58969,0.668253,0.182726,0,1,0,0,0
4,1.005844,0.412816,-0.536062,-0.806331,0.767431,0.880469,0.268487,0.668253,0.182726,0,0,0,0,1


In [13]:
X_train_mod, X_test_mod, y_train_mod, y_test_mod = train_test_split(X_scaled_mod, y_mod, test_size = 0.20, random_state = 42)

In [14]:
model_mod = LogisticRegression(random_state = 42, n_jobs = -1)
model_mod.fit(X_train_mod, y_train_mod)
print("Our modiffied model has an accuracy of", round(model_mod.score(X_train_mod, y_train_mod) * 100, 2), "%")

Our modiffied model has an accuracy of 76.79 %


### Summary information about the modified ML model

In [16]:
model_summary = pd.DataFrame()
model_summary["variable"] = X_train_mod.columns.values
model_summary["coef"] = np.transpose(model_mod.coef_)
model_summary.index = model_summary.index + 1
model_summary.loc[0] = ["Intercept", model_mod.intercept_[0]]
model_summary["odds ratio"] = np.exp(model_summary.coef)
model_summary = model_summary.sort_index()
model_summary = model_summary.sort_values(by = "odds ratio", axis = 0, ascending = False)
model_summary

Unnamed: 0,variable,coef,odds ratio
13,reason_3,3.071973,21.584445
11,reason_1,2.928834,18.70581
14,reason_4,0.994322,2.70289
12,reason_2,0.732911,2.08113
1,transp exp,0.674526,1.963103
6,kids,0.41818,1.519193
5,bmi,0.24536,1.278081
9,month,0.079737,1.083002
4,avg work load,-0.020752,0.979462
2,distance to work,-0.056784,0.944798


### Testing of the ML modified model with the test_data (X_test, y_test)

In [17]:
score = model_mod.score(X_test_mod, y_test_mod)
print("Our modiffied model has an accuracy of", round(score * 100, 2), "%")

Our modiffied model has an accuracy of 77.86 %


## Saving the BEST model
In this case the best model is the modified model, so that is the one that we are going to save for later. We will save it at a pickle file together with the scaler method we used for the numerical data, so if we have to use this ML model with further data, we can proceed with the same data treatment.

In [19]:
import pickle

with open("model_mod", "wb") as file:
    pickle.dump(model_mod, file) #saving the modified model at the current directory

with open("scaler", "wb") as file:
    pickle.dump(scaler, file) #saving the scaler at the current directory