#### Imports

In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

### Main Class
###### Read and Clean Data


In [5]:
class absenteeism_model():
    """Load a raw absenteeism CSV and turn it into a cleaned feature table.

    After ``load_clean_data`` has run, two attributes are available:

    * ``df_with_prediction`` -- an untouched copy of the CSV as it was read.
    * ``preprocessed_data``  -- the cleaned feature DataFrame.
    """

    def __init__(self):
        pass

    def load_clean_data(self, data_file):
        """Read ``data_file`` and store the raw and the preprocessed frames.

        Parameters
        ----------
        data_file : str, path object or file-like object
            Comma-separated input accepted by ``pandas.read_csv``. It must
            contain the columns ``ID`` and ``Reason for Absence``. After
            ``ID`` and ``Reason for Absence`` are removed, the remaining
            columns are renamed *by position*, so they have to appear in
            this order: date (``%d/%m/%Y``), transportation expense,
            distance to work, age, daily work load average, body mass
            index, education, children, pets.

        Returns
        -------
        None
            Results are stored on ``self.df_with_prediction`` and
            ``self.preprocessed_data``; ``"Finished"`` is printed at the end.
        """
        df = pd.read_csv(data_file, delimiter=',')
        # Keep an untouched copy of the input.
        self.df_with_prediction = df.copy()
        df = df.drop(["ID"], axis=1)

        # Placeholder target column: it guarantees that the column exists so
        # the positional renaming below lines up. It is dropped again before
        # the result is stored.
        df['Absenteeism Time in Hours'] = 'NaN'

        # Convert the categorical "Reason for Absence" into dummy variables.
        # A dummy variable is a binary indicator that equals
        #   1 if a certain categorical effect is present,
        #   0 if the same effect is absent.
        # drop_first=True removes the first category from the dummy columns.
        reason_columns = pd.get_dummies(df['Reason for Absence'], drop_first=True)

        # Collapse the dummy columns into 4 groups (label-based column slices):
        # 1:14  -> group 1 -> Related to diseases
        # 15:17 -> group 2 -> Related to pregnancy
        # 18:21 -> group 3 -> Poisoning
        # 22:   -> group 4 -> Light reasons for absence
        reason_type_1 = reason_columns.loc[:, 1:14].max(axis=1)
        reason_type_2 = reason_columns.loc[:, 15:17].max(axis=1)
        # The comma matters here: without it the expression is a stepped
        # *row* slice instead of a column slice.
        reason_type_3 = reason_columns.loc[:, 18:21].max(axis=1)
        reason_type_4 = reason_columns.loc[:, 22:].max(axis=1)

        # To avoid multicollinearity drop "Reason for Absence".
        df = df.drop(['Reason for Absence'], axis=1)

        # Append the 4 reason groups to the remaining columns.
        df = pd.concat(
            [df, reason_type_1, reason_type_2, reason_type_3, reason_type_4],
            axis=1,
        )

        # Name all columns by position; the last four are the reason groups.
        column_names = ['Date', 'Transportation Expense', 'Distance to Work', 'Age',
                        'Daily Work Load Average', 'Body Mass Index', 'Education', 'Children',
                        'Pet', 'Absenteeism Time in Hours', 'Reason_1', 'Reason_2', 'Reason_3', 'Reason_4']
        df.columns = column_names

        # Move the reason groups to the front.
        column_names_reordered = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Date', 'Transportation Expense',
                                  'Distance to Work', 'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
                                  'Children', 'Pet', 'Absenteeism Time in Hours']
        df = df[column_names_reordered]

        # Convert "Date" into datetime.
        df['Date'] = pd.to_datetime(df["Date"], format='%d/%m/%Y')

        # Derive calendar features with the vectorised datetime accessor.
        df["Month Value"] = df['Date'].dt.month
        df['Day of the Week'] = df['Date'].dt.weekday

        # The raw date is no longer needed once the features are extracted.
        df = df.drop(['Date'], axis=1)

        # Re-order the columns so the calendar features follow the reasons.
        column_names_upd = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value', 'Day of the Week',
                            'Transportation Expense', 'Distance to Work', 'Age',
                            'Daily Work Load Average', 'Body Mass Index', 'Education', 'Children',
                            'Pet', 'Absenteeism Time in Hours']
        df = df[column_names_upd]

        # Binarise "Education": level 1 -> 0, levels 2-4 -> 1.
        # Values outside 1-4 become NaN here and are zero-filled below.
        df['Education'] = df['Education'].map({1: 0, 2: 1, 3: 1, 4: 1})

        # Replace the NaN values.
        df = df.fillna(value=0)

        # Drop the placeholder target and the features that are not used.
        df = df.drop(['Absenteeism Time in Hours'], axis=1)
        df = df.drop(['Day of the Week', 'Daily Work Load Average', 'Distance to Work'], axis=1)

        self.preprocessed_data = df.copy()

        print("Finished")
        

# Build the preprocessor and run it on the raw absenteeism CSV; the call
# stores its results on `model` and prints "Finished" when it completes.
model = absenteeism_model()
model.load_clean_data(data_file="Absenteeism_data.csv")

Finished
