#### Imports

In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

### Main Class
###### Read and Clean Data


In [5]:
class absenteeism_model():
    """Load raw absenteeism records and turn them into a feature table.

    Attributes (both are ``None`` until ``load_clean_data`` has been called):
        df_with_prediction: unmodified copy of the CSV contents as read.
        preprocessed_data: cleaned feature DataFrame produced by
            ``load_clean_data``.
    """

    def __init__(self):
        self.df_with_prediction = None
        self.preprocessed_data = None

    def load_clean_data(self, data_file,
                        preprocessed_file='Absenteeism_preprocessed.csv'):
        """Read ``data_file`` and build the preprocessed feature table.

        Args:
            data_file: path or file-like object accepted by ``pd.read_csv``.
                It must provide the columns 'ID', 'Reason for Absence',
                'Date' (formatted as dd/mm/YYYY) and every non-derived column
                named in ``column_names_upd`` below.
            preprocessed_file: where the intermediate table (taken before NaN
                filling and feature dropping) is written as CSV. Pass ``None``
                to skip writing the file.

        Returns:
            None. The results are stored on ``self.df_with_prediction`` and
            ``self.preprocessed_data``.

        Raises:
            KeyError: if a required column is missing from the input.
            ValueError: if a 'Date' value does not match dd/mm/YYYY.
        """
        df = pd.read_csv(data_file, delimiter=',')
        self.df_with_prediction = df.copy()
        df = df.drop(["ID"], axis=1)

        # Placeholder target column so the frame has the same layout as the
        # training data; it is dropped again before the method finishes.
        df['Absenteeism Time in Hours'] = 'NaN'

        # Collapse the individual reason codes into four binary group flags:
        #   1-14  -> Reason_1 -> related to diseases
        #   15-17 -> Reason_2 -> related to pregnancy
        #   18-21 -> Reason_3 -> poisoning
        #   22+   -> Reason_4 -> light reasons for absence
        # A flag is 1 when the row's reason code lies in that group, else 0
        # (code 0 therefore leaves all four flags at 0).
        #
        # The flags are computed directly from the codes instead of going
        # through get_dummies(drop_first=True): drop_first removes whichever
        # code happens to be the smallest one present in *this* file, so a
        # file without any reason 0 would silently lose a real reason code.
        reason = df['Reason for Absence']
        reason_flags = pd.DataFrame({
            'Reason_1': reason.between(1, 14),
            'Reason_2': reason.between(15, 17),
            'Reason_3': reason.between(18, 21),
            'Reason_4': reason >= 22,
        }).astype(int)

        # Drop the raw reason code (it is fully described by the flags, and
        # keeping it would introduce multicollinearity) and put the four
        # flags in front of the remaining columns.
        df = pd.concat([reason_flags, df.drop(['Reason for Absence'], axis=1)],
                       axis=1)
        print("Column Names Reordered")
        print(df.head())

        # Parse 'Date' with an explicit format so day and month are never
        # swapped, then derive the calendar features from it.
        df['Date'] = pd.to_datetime(df["Date"], format='%d/%m/%Y')
        df["Month Value"] = df['Date'].dt.month
        df['Day of the Week'] = df['Date'].dt.weekday  # Monday == 0
        df = df.drop(['Date'], axis=1)

        # Fix the final column order.
        column_names_upd = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value', 'Day of the Week',
                            'Transportation Expense', 'Distance to Work', 'Age',
                            'Daily Work Load Average', 'Body Mass Index', 'Education', 'Children',
                            'Pets', 'Absenteeism Time in Hours']
        # .copy() so the column assignment below works on an independent frame.
        df = df[column_names_upd].copy()

        # Few people have an education above high school, so reduce the
        # feature to two classes: 0 = code 1, 1 = codes 2-4.
        df['Education'] = df['Education'].map({1: 0, 2: 1, 3: 1, 4: 1})
        if preprocessed_file is not None:
            df.to_csv(preprocessed_file, index=False)

        # Replace the NaN values
        df = df.fillna(value=0)

        # Drop the placeholder target and the features that are not used.
        df = df.drop(['Absenteeism Time in Hours'], axis=1)
        df = df.drop(['Day of the Week', 'Daily Work Load Average', 'Distance to Work'], axis=1)

        self.preprocessed_data = df.copy()
        


# Instantiate the preprocessing helper and run it on the raw data set.
model = absenteeism_model()
model.load_clean_data(data_file="Absenteeism_data.csv")

Column Names Reordered
   Reason_1  Reason_2  Reason_3  Reason_4        Date  Transportation Expense  \
0         0         0         0         1  07/07/2015                     289   
1         0         0         0         0  14/07/2015                     118   
2         0         0         0         1  15/07/2015                     179   
3         1         0         0         0  16/07/2015                     279   
4         0         0         0         1  23/07/2015                     289   

   Distance to Work  Age  Daily Work Load Average  Body Mass Index  Education  \
0                36   33                  239.554               30          1   
1                13   50                  239.554               31          1   
2                51   38                  239.554               31          1   
3                 5   39                  239.554               24          1   
4                36   33                  239.554               30          1   

   