<a href="https://colab.research.google.com/github/Ayman947/HR-LogisticRegression-PredictingAbsenteeism/blob/main/3_Deployment_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Importing All Necessary **Libraries**

In [94]:
# importing the relevant libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import itertools
import pickle

# create the Custom Scaler class to only scale predetermined columns

class CustomScaler(BaseEstimator,TransformerMixin): 
    
    # init or what information we need to declare a CustomScaler object
    # and what is calculated/declared as we do
    
    def __init__(self,columns,copy=True,with_mean=True,with_std=True):
        
        # scaler is nothing but a Standard Scaler object
        self.scaler = StandardScaler(copy,with_mean,with_std)
        # with some columns 'twist'
        self.columns = columns
        self.mean_ = None
        self.var_ = None
        
    
    # the fit method, which, again based on StandardScale
    
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    # the transform method which does the actual scaling

    def transform(self, X, y=None, copy=None):
        
        # record the initial order of the columns
        init_col_order = X.columns
        
        # scale all features that you chose when creating the instance of the class
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        
        # declare a variable containing all information that was not scaled
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        
        # return a data frame which contains all scaled features and all 'not scaled' features
        # use the original order (that you recorded in the beginning)
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

### **Unpickling** The Model & Scaler


In [95]:
with open('reg', 'rb') as model_file, open('scaler', 'rb') as scaler_file:
  reg  = pickle.load(model_file)
  scaler = pickle.load(scaler_file)

### Importing The **New Data**

In [96]:
raw_new = pd.read_csv('3_absenteeism_new_data.csv')
Raw_new_data = raw_new.copy()

### Data **Cleansning & Preprocessing**

In [97]:
Raw_new_data = Raw_new_data.drop(['ID'], axis = 1)
reason_columns = pd.get_dummies(Raw_new_data['Reason for Absence'], drop_first = True)
Raw_new_data = Raw_new_data.drop(['Reason for Absence'], axis = 1)
reason_type_1 = reason_columns.loc[:, 1:14].max(axis=1)
reason_type_2 = reason_columns.loc[:, 15:17].max(axis=1)
reason_type_3 = reason_columns.loc[:, 18:21].max(axis=1)
reason_type_4 = reason_columns.loc[:, 22:].max(axis=1)
Raw_new_data = pd.concat([Raw_new_data, reason_type_1, reason_type_2, reason_type_3, reason_type_4], axis = 1)
Raw_new_data.columns = ['Date', 'Transportation Expense', 'Distance to Work', 'Age','Daily Work Load Average', 'Body Mass Index', 'Education','Children', 'Pets', 'Reason_1', 'Reason_2', 'Reason_3', 'Reason_4']
column_names_reordered = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Date', 'Transportation Expense', 'Distance to Work', 'Age','Daily Work Load Average', 'Body Mass Index', 'Education','Children', 'Pets']
Raw_new_data = Raw_new_data[column_names_reordered]
Raw_new_data['Reason_2'] = 0
df_reason_mod = Raw_new_data
df_reason_mod['Date'] = pd.to_datetime(df_reason_mod['Date'])    
df_reason_mod['Month'] = df_reason_mod['Date'].apply(lambda x: x.month)
df_reason_mod['Day of the Week'] = df_reason_mod['Date'].apply(lambda x: x.weekday())
column_names_upd = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month', 'Day of the Week','Transportation Expense', 'Distance to Work', 'Age','Daily Work Load Average', 'Body Mass Index', 'Education', 'Children','Pets']
df_reason_date_mod = df_reason_mod[column_names_upd]
df_reason_date_mod['Education'] = df_reason_date_mod['Education'].map({1:0, 2:1, 3:1, 4:1})
data_preprocessed = df_reason_date_mod.copy()
data_preprocessed.drop(['Daily Work Load Average', 'Distance to Work', 'Day of the Week'], axis = 1, inplace=True)
unscaled_inputs = data_preprocessed.iloc[:,:]
scaled_inputs = scaler.transform(unscaled_inputs)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### **Predicting**

In [98]:
Raw_new_data['Prediction'] = reg.predict(scaled_inputs)
yhat_prob = reg.predict_proba(scaled_inputs)
Raw_new_data['Probability_No'] = yhat_prob[:,0]
Raw_new_data['Probability_Yes'] = yhat_prob[:,1]

### **Exporting The Predictions**


In [99]:
Raw_new_data.to_csv('Predictions.csv')