In [3]:
# importing libraries
import pickle

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

In [5]:
# Load the preprocessed absenteeism dataset and preview its first five rows
df = pd.read_csv(filepath_or_buffer="Absenteeism_preprocessed.csv")
df.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


In [6]:
# Median of the absenteeism hours column (serves as the cut-off for the binary target)
df.loc[:, "Absenteeism Time in Hours"].median()

3.0

In [None]:
# Use np.where to label rows whose absenteeism hours exceed the median as 1 and all other rows as 0


In [7]:
# Binary label: 1 when the hours absent are strictly above the median, otherwise 0
targets = np.where(
    df["Absenteeism Time in Hours"].gt(df["Absenteeism Time in Hours"].median()),
    1,
    0,
)
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [16]:
# Share of rows labelled 1 (above-median absenteeism); note the value is a fraction, not a percentage
print("precent of more hrs is ", targets.mean())

precent of more hrs is  0.45571428571428574


In [18]:
# Remove the raw hours column (it is replaced by the binary label) together with
# the predictors judged irrelevant, then preview the remaining columns
df_with_targets = df.drop(
    columns=[
        'Absenteeism Time in Hours',
        'Day of the Week',
        'Daily Work Load Average',
        'Distance to Work',
    ]
)
df_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pet
0,0,0,0,1,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,0,0,0,1,7,179,38,31,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0
4,0,0,0,1,7,289,33,30,0,2,1


In [22]:
# Append the binary labels held in `targets` as a new "Level of Absenteesim" column.
# NOTE: this assignment mutates df_with_targets in place; it assumes `targets`
# contains exactly one label per row, in row order.
df_with_targets["Level of Absenteesim"] = targets 
# Display (rows, columns) to confirm the column was added
df_with_targets.shape

(700, 12)

In [None]:
# Rebind df to an independent deep copy, so later edits to df leave df_with_targets untouched
df = df_with_targets.copy(deep=True)

In [30]:
# Predictors are every column except the last one, which holds the target label.
# Slicing relative to the end (instead of hard-coding the feature count) keeps the
# selection correct, and consistent with `targets = df.iloc[:, -1]`, if feature
# columns are added or removed upstream.
predictors = df.iloc[:, :-1]

In [32]:
# The target is whichever column sits last in df; show its shape as a sanity check
targets = df[df.columns[-1]]
targets.shape

(700,)

In [35]:
# Standardizing the dataset: because one-hot-encoded (dummy) columns should not be
# standardized, define a custom scaler that standardizes only the chosen columns.

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize only a chosen subset of a DataFrame's columns.

    The selected columns are scaled with a wrapped ``StandardScaler``; every
    other column (e.g. dummy / one-hot columns) is passed through unchanged.
    The output keeps the input's column order and row index.

    Parameters
    ----------
    columns : list-like of column labels
        Columns of ``X`` to standardize.
    copy, with_mean, with_std : bool, default True
        Forwarded unchanged to ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler, fitted on ``X[columns]`` by ``fit``.
    mean_, var_ : pandas.Series or None
        Per-column mean and population variance (``ddof=0``) of the selected
        columns, computed in ``fit``; ``None`` before fitting.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Store every constructor argument under its own name: BaseEstimator's
        # get_params() / clone() / repr look the parameters up as attributes.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std

        # StandardScaler's parameters are keyword-only in current scikit-learn,
        # so they must not be passed positionally.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and record column statistics.

        Returns ``self`` so calls can be chained.
        """
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Use explicit per-column reductions: np.mean / np.var applied to a
        # DataFrame depend on the pandas version (newer releases may reduce
        # over both axes). ddof=0 matches np.var's population variance.
        self.mean_ = selected.mean(axis=0)
        self.var_ = selected.var(axis=0, ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with the selected columns standardized.

        ``y`` is accepted for API compatibility and ignored; ``copy`` is
        forwarded to the wrapped scaler's ``transform``.
        """
        init_columns = X.columns

        # Scale the columns this instance was configured with. Reuse X's index
        # so the column-wise concat below aligns row by row even when X does
        # not carry a default 0..n-1 index (e.g. after a shuffled split);
        # otherwise the concat would misalign rows and introduce NaNs.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns], copy=copy),
            columns=self.columns,
            index=X.index,
        )

        # Everything that was not selected is passed through untouched.
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]

        # Reassemble and restore the original column order.
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_columns]