# In [1]:
#import all libraries needed for the module
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

#create the custom scaler class
class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and leave the rest untouched.

    Wraps a ``StandardScaler`` that is fitted on and applied to ``columns``
    only. ``transform`` returns a DataFrame with the same column order and the
    same index as its input.

    Parameters
    ----------
    columns : list of column labels to standardize.
    copy, with_mean, with_std : forwarded to ``StandardScaler``.
    """

    def __init__(self, columns, copy = True, with_mean = True, with_std = True):
        # Constructor arguments are stored under their own names because
        # BaseEstimator.get_params() (used by repr and clone) reads them back
        # with getattr.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        # Pass by keyword: StandardScaler's parameters are keyword-only in
        # current scikit-learn releases, so positional passing raises TypeError.
        self.scaler = StandardScaler(copy = copy, with_mean = with_mean, with_std = with_std)
        # Legacy placeholders kept for backward compatibility; the fitted
        # statistics are stored by fit() in ``mean_`` and ``var_``.
        self.mean = None
        self.var = None

    def fit(self, X, y = None):
        """Fit the wrapped scaler on ``X[self.columns]`` and record column statistics.

        Sets ``mean_`` and ``var_`` to the per-column mean and population
        variance (ddof=0) of the selected columns. Returns ``self``.
        """
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Reduce explicitly along axis 0 so the result is one value per column
        # regardless of how the installed pandas treats np.mean(DataFrame).
        self.mean_ = np.asarray(selected.mean(axis = 0))
        self.var_ = np.asarray(selected.var(axis = 0, ddof = 0))
        return self

    def transform(self, X, y = None, copy = None):
        """Return ``X`` with ``self.columns`` standardized.

        ``y`` and ``copy`` are accepted for signature compatibility and are
        not used. Column order and index of ``X`` are preserved.
        """
        init_col_order = X.columns
        # Reuse X's index so the concat below aligns row-for-row even when X
        # does not carry a default RangeIndex.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns = self.columns,
            index = X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis = 1)[init_col_order]
    
#create the absenteeism model class that will be used on new data for predictions
class absenteeism_model():
    """Apply a pickled model and scaler to new absenteeism data.

    Typical use: construct with the two pickle paths, call
    ``load_and_clean_data`` with a CSV, then call one of the ``predicted_*``
    methods. The unpickled model must provide ``predict`` and
    ``predict_proba``; the unpickled scaler must provide ``transform``.
    """

    #This function accepts the model and scaler file arguments and unpickles them
    def __init__(self, model_file, scaler_file):
        """Load the model and scaler from the given pickle file paths.

        Parameters
        ----------
        model_file : path of the pickled model.
        scaler_file : path of the pickled scaler.
        """
        # NOTE(security): pickle.load can execute arbitrary code while loading;
        # only pass files that come from a trusted source.
        with open(model_file, "rb") as model_handle, open(scaler_file, "rb") as scaler_handle:
            self.reg = pickle.load(model_handle)
            self.scaler = pickle.load(scaler_handle)
        #scaled model inputs; stays None until load_and_clean_data is called
        self.data = None
        #unscaled preprocessed features; stays None until load_and_clean_data is called
        self.preprocessed_data = None

    #This function applies the same preprocessing to the new data as done in the training data
    def load_and_clean_data(self, data_file):
        """Read a CSV, preprocess it and store the results on the instance.

        Sets ``df_with_predictions`` (copy of the raw CSV),
        ``preprocessed_data`` (cleaned, unscaled features) and ``data``
        (output of ``self.scaler.transform``). Returns None.

        After dropping "ID" and "Reason for Absence", the remaining columns are
        renamed by position, so the CSV must keep the column order given in
        ``column_names`` below.
        """
        #load the new data file into a dataframe
        df = pd.read_csv(data_file, delimiter = ",")
        #create a copy of the new dataframe
        self.df_with_predictions = df.copy()
        #drop the "ID" column from dataframe
        df = df.drop(["ID"], axis = 1)
        #placeholder target column so the positional rename below lines up;
        #a real missing value is used instead of the string "NaN"
        df["Absenteeism Time in Hours"] = np.nan

        #split and categorize the reasons into 4 types based on data dictionary.
        #The flags are computed from the code ranges directly, so the result does
        #not depend on which reason codes happen to be present in the new data
        #(get_dummies with drop_first would drop a real reason when code 0 is absent).
        reason = df["Reason for Absence"]
        reason_flags = pd.DataFrame({
            "Reason_1": reason.between(1, 14),
            "Reason_2": reason.between(15, 17),
            "Reason_3": reason.between(18, 21),
            "Reason_4": reason >= 22,
        }).astype(int)

        #drop the "Reason for Absence" column to avoid multicollinearity
        df = df.drop(["Reason for Absence"], axis = 1)

        #concatenate the dataframe and the reason flags
        df = pd.concat([df, reason_flags], axis = 1)

        #assign names to the columns (by position)
        column_names = ["Date", "Transportation Expense", "Distance to Work", "Age",
                        "Daily Work Load Average", "Body Mass Index", "Education", "Children",
                        "Pet", "Absenteeism Time in Hours", "Reason_1", "Reason_2", "Reason_3", "Reason_4"]

        df.columns = column_names

        #convert the "Date" column into datetime
        df["Date"] = pd.to_datetime(df["Date"], format = "%d/%m/%Y")

        #extract the month and the weekday from the "Date" column
        df["Month Value"] = df["Date"].dt.month
        df["Day of the Week"] = df["Date"].dt.weekday

        #drop the "Date" column from df
        df = df.drop(["Date"], axis = 1)

        #map the "Education" variables; the result is a dummy
        df["Education"] = df["Education"].map({1:0, 2:1, 3:1, 4:1})

        #reorder the columns in the dataframe
        column_names_upd = ["Reason_1", "Reason_2", "Reason_3", "Reason_4", "Month Value", "Day of the Week",
                            "Transportation Expense", "Distance to Work", "Age", "Daily Work Load Average",
                            "Body Mass Index", "Education", "Children", "Pet", "Absenteeism Time in Hours"]

        df = df[column_names_upd]

        #Replace the Nan values
        df = df.fillna(value = 0)

        #drop the original absentee time
        df = df.drop(["Absenteeism Time in Hours"], axis = 1)

        #drop the variables we don't need
        df = df.drop(["Day of the Week", "Daily Work Load Average", "Distance to Work"], axis = 1)

        #copying the dataframe to a checkpoint signifying data has been preprocessed
        self.preprocessed_data = df.copy()

        #scale the values in the dataframe
        self.data = self.scaler.transform(df)

    #this function outputs the probability of a datapoint to be 1
    def predicted_probability(self):
        """Return column 1 of ``predict_proba`` for the loaded data, or None if no data is loaded."""
        if self.data is None:
            return None
        return self.reg.predict_proba(self.data)[:, 1]

    #this function outputs 0 or 1 based on the model
    def predicted_output_category(self):
        """Return ``predict`` for the loaded data, or None if no data is loaded."""
        if self.data is None:
            return None
        return self.reg.predict(self.data)

    #this function predicts the outputs and probabilities.
    #These are added as new columns "Probability" and "Prediction" to the end of the data
    def predicted_outputs(self):
        """Add "Probability" and "Prediction" to ``preprocessed_data`` and return it.

        Returns None if no data is loaded. The stored ``preprocessed_data``
        frame is modified in place.
        """
        if self.data is None:
            return None
        self.preprocessed_data["Probability"] = self.reg.predict_proba(self.data)[:, 1]
        self.preprocessed_data["Prediction"] = self.reg.predict(self.data)
        return self.preprocessed_data
        
    