## Importing Packages

In [5]:
# import packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import XGBClassifier
from scipy.stats import randint
from sklearn.metrics import make_scorer, recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
!pip install xgboost
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
!pip install imblearn
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import get_scorer_names
from sklearn.neural_network import MLPClassifier


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Load data

In [6]:
from google.colab import drive 
drive.mount('/content/gdrive')

train_X =pd.read_csv('gdrive/My Drive/aaw_train_X.csv')
train_y =pd.read_csv('gdrive/My Drive/aaw_train_y.csv')
test_X =pd.read_csv('gdrive/My Drive/aaw_test_X.csv')
test_y =pd.read_csv('gdrive/My Drive/aaw_test_y.csv')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Model the data

In [7]:
performance = pd.DataFrame({"model": [], "Accuracy": [], "Precision": [], "Recall": [], "F1": []})

## LOGISTIC REGRESSION 

### LOGISTIC REGRESSION - RANDOM SEARCH

In [None]:
score_measure = "accuracy"
kfolds = 5

param_grid = {
    'max_iter' : np.arange(300,1000),
    'penalty': ['none','l1','l2','elasticnet'],
    'solver':['saga','liblinear']
}

logistic_reg = LogisticRegression()
rand_search = RandomizedSearchCV(estimator = logistic_reg, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1, 
                           return_train_score=True)

_ = rand_search.fit(train_X, train_y)

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
The best accuracy score is 1.0
... with parameters: {'solver': 'liblinear', 'penalty': 'l1', 'max_iter': 464}


### LOGISTIC REGRESSION - GRID SEARCH

In [None]:
score_measure = "accuracy"
kfolds = 5
max_iter = rand_search.best_params_['max_iter']
penalty = rand_search.best_params_['penalty']
solver = rand_search.best_params_['solver']

param_grid = {  
    'max_iter' : np.arange(max_iter-5,max_iter+5),
    'penalty': [penalty],
    'solver': [solver]
}

logistic_reg = LogisticRegression()
grid_search = GridSearchCV(estimator = logistic_reg, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,
                           return_train_score=True)

_ = grid_search.fit(train_X, train_y)

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

bestRecallLogistic = grid_search.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
The best accuracy score is 1.0
... with parameters: {'max_iter': 460, 'penalty': 'l1', 'solver': 'liblinear'}


In [None]:
c_matrix = confusion_matrix(test_y, grid_search.predict(test_X))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"Logistic Regression", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

## SVM

### SVM CLASSIFIER - LINEAR - RBF - POLY - RANDOM SEARCH

In [None]:
score_measure = "accuracy"
kfolds = 5

param_grid = {
    'C': np.arange(1,20),   
    'gamma': ['scale','auto'],
    'kernel':['linear','rbf','poly']
}

svm = SVC()
rand_search = RandomizedSearchCV(estimator = svm, param_distributions=param_grid, cv=kfolds, n_iter=300,
                           scoring=score_measure, verbose=1, n_jobs=-1, 
                           return_train_score=True)

_ = rand_search.fit(train_X, train_y)

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

Fitting 5 folds for each of 114 candidates, totalling 570 fits
The best accuracy score is 0.9984375
... with parameters: {'kernel': 'linear', 'gamma': 'scale', 'C': 1}


### SVM CLASSIFIER - LINEAR - RBF - POLY - GRID SEARCH

In [None]:
score_measure = "accuracy"
kfolds = 5

C = rand_search.best_params_['C']
gamma = rand_search.best_params_['gamma']
kernel = rand_search.best_params_['kernel']

param_grid = {
    'C': np.arange(C-3,C+3),  
    'gamma': [gamma],
    'kernel': [kernel]
    
}

svm = SVC()
grid_search = GridSearchCV(estimator = svm, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,
                           return_train_score=True)

_ = grid_search.fit(train_X, train_y)

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

bestRecallSVM = grid_search.best_estimator_

Fitting 5 folds for each of 6 candidates, totalling 30 fits
The best accuracy score is 0.9984375
... with parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}


In [None]:
c_matrix = confusion_matrix(test_y, grid_search.predict(test_X))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"SVM", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

## DECISION TREE

### DECISION TREE - RANDOM SEARCH

In [None]:
print(get_scorer_names())

['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'completeness_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted', 'matthews_corrcoef', 'max_error', 'mutual_info_score', 'neg_brier_score', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_gamma_deviance', 'neg_mean_poisson_deviance', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'neg_negative_likelihood_ratio', 'neg_root_mean_squared_error', 'normalized_mutual_info_score', 'positive_likelihood_ratio', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'rand_score', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'roc_auc_ovo', 'roc_auc_ovo_weight

In [None]:
score_measure = "accuracy"
kfolds = 5

param_grid = {
    'min_samples_split': np.arange(1,50),  
    'min_samples_leaf': np.arange(1,50),
    'min_impurity_decrease': np.arange(0.0001, 0.01, 0.0005),
    'max_leaf_nodes': np.arange(5, 50), 
    'max_depth': np.arange(1,50), 
    'criterion': ['entropy', 'gini'],
}

dtree = DecisionTreeClassifier()
rand_search = RandomizedSearchCV(estimator = dtree, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1, 
                           return_train_score=True)

_ = rand_search.fit(train_X, train_y)

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
The best accuracy score is 1.0
... with parameters: {'min_samples_split': 39, 'min_samples_leaf': 49, 'min_impurity_decrease': 0.0026, 'max_leaf_nodes': 17, 'max_depth': 8, 'criterion': 'gini'}


### DECISION TREE - GRID SEARCH

In [None]:
score_measure = "accuracy"
kfolds = 5
min_samples_split = rand_search.best_params_['min_samples_split']
min_samples_leaf = rand_search.best_params_['min_samples_leaf']
min_impurity_decrease = rand_search.best_params_['min_impurity_decrease']
max_leaf_nodes = rand_search.best_params_['max_leaf_nodes']
max_depth = rand_search.best_params_['max_depth']
criterion = rand_search.best_params_['criterion']

param_grid = {
    'min_samples_split': np.arange(min_samples_split-2,min_samples_split+2),  
    'min_samples_leaf': np.arange(min_samples_leaf-2,min_samples_leaf+2),
    'min_impurity_decrease': np.arange(min_impurity_decrease-0.0001, min_impurity_decrease+0.0001, 0.00005),
    'max_leaf_nodes': np.arange(max_leaf_nodes-2,max_leaf_nodes+2), 
    'max_depth': np.arange(max_depth-2,max_depth+2), 
    'criterion': [criterion]
}

dtree = DecisionTreeClassifier()
grid_search = GridSearchCV(estimator = dtree, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,
                           return_train_score=True)

_ = grid_search.fit(train_X, train_y)

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

bestRecallTree = grid_search.best_estimator_

Fitting 5 folds for each of 1024 candidates, totalling 5120 fits
The best accuracy score is 1.0
... with parameters: {'criterion': 'gini', 'max_depth': 6, 'max_leaf_nodes': 15, 'min_impurity_decrease': 0.0025, 'min_samples_leaf': 47, 'min_samples_split': 37}


In [None]:
c_matrix = confusion_matrix(test_y, grid_search.predict(test_X))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"Decision Tree", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

## NEURAL NETWORKS

In [8]:
%%time

ann = MLPClassifier(hidden_layer_sizes=(60,50,40), solver='adam', max_iter=200)
_ = ann.fit(train_X, train_y)

CPU times: user 967 ms, sys: 940 ms, total: 1.91 s
Wall time: 1.95 s


In [9]:
%%time
y_pred = ann.predict(test_X)

CPU times: user 3.81 ms, sys: 0 ns, total: 3.81 ms
Wall time: 7.64 ms


In [10]:
print(classification_report(test_y, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       141
           1       0.99      0.98      0.98        81

    accuracy                           0.99       222
   macro avg       0.99      0.98      0.99       222
weighted avg       0.99      0.99      0.99       222



### NEURAL NETWORKS - RANDOM SEARCH

In [14]:
%%time

score_measure = "accuracy"
kfolds = 5

param_grid = {
    'hidden_layer_sizes': [ (50,), (70,),(50,30), (40,20), (60,40, 20), (70,50,40)],
    'activation': ['logistic', 'tanh', 'relu'],
    'solver': ['adam', 'sgd'],
    'alpha': [0, .2, .5, .7, 1],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'learning_rate_init': [0.001, 0.01, 0.1, 0.2, 0.5],
    'max_iter': [5000]
}

ann = MLPClassifier()
grid_search = RandomizedSearchCV(estimator = ann, param_distributions=param_grid, cv=kfolds, n_iter=100,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(train_X, train_y)

bestRecallTree = grid_search.best_estimator_

print(grid_search.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
{'solver': 'sgd', 'max_iter': 5000, 'learning_rate_init': 0.01, 'learning_rate': 'adaptive', 'hidden_layer_sizes': (40, 20), 'alpha': 0.2, 'activation': 'logistic'}
CPU times: user 3.82 s, sys: 227 ms, total: 4.04 s
Wall time: 4min 43s


In [15]:
%%time
y_pred = bestRecallTree.predict(test_X)

print(classification_report(test_y, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       141
           1       1.00      0.96      0.98        81

    accuracy                           0.99       222
   macro avg       0.99      0.98      0.99       222
weighted avg       0.99      0.99      0.99       222

CPU times: user 16.8 ms, sys: 742 µs, total: 17.6 ms
Wall time: 18.1 ms


### NEURAL NETWOKS - GRID SEARCH

In [16]:
%%time

score_measure = "accuracy"
kfolds = 5

param_grid = {
    'hidden_layer_sizes': [ (30,), (50,), (70,), (90,)],
    'activation': ['tanh', 'relu'],
    'solver': ['adam'],
    'alpha': [.5, .7, 1],
    'learning_rate': ['adaptive', 'invscaling'],
    'learning_rate_init': [0.005, 0.01, 0.15],
    'max_iter': [5000]
}

ann = MLPClassifier()
grid_search = GridSearchCV(estimator = ann, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(train_X, train_y)

bestRecallTree = grid_search.best_estimator_

print(grid_search.best_params_)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
{'activation': 'tanh', 'alpha': 0.5, 'hidden_layer_sizes': (30,), 'learning_rate': 'adaptive', 'learning_rate_init': 0.005, 'max_iter': 5000, 'solver': 'adam'}
CPU times: user 2.55 s, sys: 252 ms, total: 2.8 s
Wall time: 2min 36s


In [17]:
%%time
y_pred = bestRecallTree.predict(test_X)

print(classification_report(test_y, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       141
           1       1.00      0.98      0.99        81

    accuracy                           0.99       222
   macro avg       0.99      0.99      0.99       222
weighted avg       0.99      0.99      0.99       222

CPU times: user 14.9 ms, sys: 0 ns, total: 14.9 ms
Wall time: 15.9 ms


In [18]:
c_matrix = confusion_matrix(test_y, grid_search.predict(test_X))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"NN_grid", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

## PERFORMANCE OF ALL MODELS

In [None]:
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,Logistic Regression,0.990991,1.0,0.975309,0.9875
0,SVM,1.0,1.0,1.0,1.0
0,Decision Tree,1.0,1.0,1.0,1.0


In [None]:
performance.sort_values(by=['Accuracy'])

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,Logistic Regression,0.990991,1.0,0.975309,0.9875
0,SVM,1.0,1.0,1.0,1.0
0,Decision Tree,1.0,1.0,1.0,1.0


In [19]:
# Neural networks performance which is added 
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,NN_grid,0.990991,1.0,0.975309,0.9875


## ANALYSIS OF MODELS 

##Summary and Analysis: 

The problem statement is related to the Human Resources(HR) Analytics where the employee's absence can be validated using various reasons which can potentially impact the weightage for the decision of promotion and  elimination at the company.

Here, the False Negative (where we predict the employee is not absent but in actuality, he is absent) and False Positive (where we predict the employee is absent but in actuality, he is not absent) both of them have the equal priority as it may help in adding weightage at the decision making process for the promotion or elimination in the company.
Hence, The **Accuracy** is considered here as the main parameter for the decision of the best fitting model for the data chosen.

The performance of three distinct machine learning models—Logistic Regression, Support Vector Machine (SVM), and Decision Tree are evaluated.

**Impact from Preprocessing data:**
The 3 predictors are label encoded and 7 predictors are hot encoded leading to many zeroes and ones in the data while processing. The resulted data from pre processing is encoded, splitted, balanced, regularized and early stopped for better training and reduce overfitting while fitting the data set into these models. The regularization and early stopping is introduced for the reduction of over fitting as mentioned in the python cheatsheet from the ppt given in the modules.The data contains 741 rows(limited dataset) and is very well trained and contains more zeros and ones because of the encoding and scaling. Hence the scoring parameters accuracy is high for the model fitted. If the dataset had many entries, then the training and testing can be effective compatred to a limited entry dataset.

Accuracy:
The percentage of properly identified cases relative to all instances is how accuracy is calculated.
The SVM and Decision Tree models both get a perfect score of 1.0, and all three models have exceptional accuracy.

Precision:
The percentage of accurate positive predictions compared to all positive forecasts is known as precision.
All three models have an accuracy score of 1.0, which means that all successful predictions are made.

Recall:
Recall quantifies the percentage of correct positive predictions among all actual positive occurrences.
With the SVM scoring a perfect 1.0, all three models exhibit good recall. 

F1 Score:
The F1 score, which is a balanced indicator of the model's performance, is the harmonic mean of accuracy and recall.
The Decision Tree and Logistic Regression models each received a score of 1.0, while the SVM model also received a score of 1.0, giving all three models strong F1 scores.


Overall, based on their excellent accuracy scores on the provided dataset, we can say that all three models have done quite well. The data is trained good but more analysis and testing may be necessary to determine the real effectiveness of these models.

### ANALYSIS FOR NEURAL NETWORK ADDED:
The three predictors are label encoded and 7 predictors are hot encoded leading to many zeroes and ones in the data while processing. The resulted data from pre processing is encoded, splitted, balanced, regularized and early stopped for better training and reduce overfitting while fitting the data set into these models. The regularization and early stopping is introduced for the reduction of over fitting as mentioned in the python cheatsheet from the ppt given in the modules.The data contains 741 rows(limited dataset) and is very well trained and contains more zeros and ones because of the encoding and scaling. Hence the scoring parameters accuracy is high for the model fitted. 

Hence, The data set fits neural networks also very well and generated maximum of 99% of accuracy almost similar to the other models and there is no much drastic difference observed because of the type and size of the dataset chosen which has trained well. Neural network also have good f1 score which is 0.98 and good recall score of 0.97. If the dataset had many entries, then the training and testing can be effective compared to a limited entry dataset.