<a href="https://colab.research.google.com/github/21Ovi/Employee-Performance-Analysis/blob/main/train_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [49]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from scipy.stats import skew, kurtosis
from sklearn.preprocessing import LabelEncoder,StandardScaler,MinMaxScaler
from imblearn.over_sampling import RandomOverSampler,SMOTENC,SMOTE
from sklearn.feature_selection import SelectKBest,chi2
from sklearn import svm, tree, model_selection, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, plot_confusion_matrix
from yellowbrick.classifier import ROCAUC

#!pip install optuna
import optuna
import matplotlib

# To avoid warnings
import warnings
warnings.filterwarnings('ignore')

In [75]:
# Load the preprocessed employee dataset and preview its first rows.
DATA_PATH = '/content/preprocessed_data.xlsx'
data = pd.read_excel(DATA_PATH)
data.head()

Unnamed: 0,Age,Distance From Home,Employee Hourly Rate,Number of Companies Worked,Employee Last Salary Hike Percent,Total Work Experience In Years,Training Times Last Year,Experience Years At This Company,Experience Years In Current Role,Years Since Last Promotion,...,Employee Work Life Balance,Gender,Education Background,Marital Status,Employees Department,Employees Job Role,Business Travel Frequency,Over Time,Attrition,Performance Rating
0,32,10,55,1,12,10,2,10,7,0.0,...,2,1,2,2,5,13,2,0,0,3
1,47,14,42,2,12,20,2,7,7,1.0,...,3,1,2,2,5,13,2,0,0,3
2,40,5,48,5,21,20,2,18,13,1.0,...,3,1,1,1,5,13,1,1,0,4
3,41,10,73,3,15,23,2,21,6,3.464102,...,2,1,0,0,3,8,2,0,0,3
4,60,16,84,8,14,10,1,2,2,1.414214,...,3,1,2,2,5,13,2,0,0,3


# Feature Selection

In [76]:
# Separate the predictors (X, independent variables) from the target (y, dependent variable)

y = data['Performance Rating']
X = data.drop(columns='Performance Rating')

In [77]:
# Score every predictor against the target using the chi-squared statistic.
bestfeatures = SelectKBest(score_func=chi2, k=15)
fit = bestfeatures.fit(X, y)

# One single-column frame for the names, one for the scores, joined side by side.
dfcolumns = pd.DataFrame(X.columns)
dfscores = pd.DataFrame(fit.scores_)
featurescores = pd.concat([dfcolumns, dfscores], axis=1)
featurescores.columns = ['feature_names', 'Scores']

# Show the five features with the largest scores.
top_five = featurescores.nlargest(5, 'Scores')
print(top_five)

                        feature_names       Scores
4   Employee Last Salary Hike Percent  1251.369985
12  Employee Environment Satisfaction   456.884207
9          Years Since Last Promotion   352.363603
7    Experience Years At This Company   347.864952
8    Experience Years In Current Role   347.401176


These are the `Top 5 features` that have the greatest impact on the `target variable` and will be used to `predict the rating`.

## Checking which Machine Learning Algorithms perform well on the given dataset

In [90]:
# Candidate predictors: every column except the target. Dropping the target by
# name (rather than slicing off the last column) keeps this correct even if the
# column order changes. This cell must not touch `test`, which is only defined
# later in the notebook and would raise NameError on a fresh kernel.
features = data.columns.drop('Performance Rating')

In [54]:
# Fixed seed handed to every estimator that accepts `random_state`, so the
# comparison table can be reproduced from a fresh kernel.
MLA_SEED = 0

# Candidate classifiers to compare, all with default hyperparameters.
MLA = [
    # Ensemble Methods
    ensemble.AdaBoostClassifier(random_state=MLA_SEED),
    ensemble.BaggingClassifier(random_state=MLA_SEED),
    ensemble.ExtraTreesClassifier(random_state=MLA_SEED),
    ensemble.GradientBoostingClassifier(random_state=MLA_SEED),
    ensemble.RandomForestClassifier(random_state=MLA_SEED),

    # General Linear Models
    linear_model.LogisticRegression(random_state=MLA_SEED),
    linear_model.PassiveAggressiveClassifier(random_state=MLA_SEED),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(random_state=MLA_SEED),
    linear_model.Perceptron(random_state=MLA_SEED),

    # Naive Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),

    # Nearest Neighbor
    neighbors.KNeighborsClassifier(),

    # SVM
    svm.SVC(probability=True, random_state=MLA_SEED),
    svm.NuSVC(probability=True, random_state=MLA_SEED),
    svm.LinearSVC(random_state=MLA_SEED),

    # Trees
    tree.DecisionTreeClassifier(random_state=MLA_SEED),
    tree.ExtraTreeClassifier(random_state=MLA_SEED),

    # Discriminant Analysis
    discriminant_analysis.LinearDiscriminantAnalysis(),
    discriminant_analysis.QuadraticDiscriminantAnalysis(),

    # XGBOOST
    XGBClassifier(random_state=MLA_SEED)
]

In [81]:
# Cross-validation splitter shared by every candidate model
# note: this is an alternative to train_test_split
# 10 random shuffles, each using 70% of the rows for training and the remaining 30% for testing
cv_split = model_selection.ShuffleSplit(n_splits = 10, test_size = 0.3, train_size = 0.7, random_state = 0 )


# Create table to compare MLA metrics
MLA_columns = ['MLA Name', 'MLA Parameters', 'MLA Test Accuracy Mean', 'MLA Test Accuracy 3*STD' ,'MLA Time']
MLA_compare = pd.DataFrame(columns = MLA_columns)

# Create table to compare MLA predictions. It starts as an independent
# one-column DataFrame holding the true ratings, so each algorithm's predictions
# can be added as a new column without touching `data`.
MLA_predict = data[['Performance Rating']].copy()

In [87]:
# Cross-validate every candidate algorithm and collect one summary record each.
# Records are gathered in a list and turned into a DataFrame once at the end,
# and `data` is only read here (never rebound), so the cell is safe to re-run.
target_column = 'Performance Rating'
mla_records = []
for alg in MLA:
    # Set name and parameters
    MLA_name = alg.__class__.__name__
    print('Executing ',MLA_name)

    # Score model with cross validation
    cv_results = model_selection.cross_validate(alg, data[features], data[target_column], cv = cv_split)
    test_scores = cv_results['test_score']

    mla_records.append({
        'MLA Name': MLA_name,
        'MLA Parameters': str(alg.get_params()),
        'MLA Test Accuracy Mean': test_scores.mean(),
        # If this is a non-biased random sample, +/-3 standard deviations from the
        # mean should statistically capture 99.7% of the subsets (worst case view)
        'MLA Test Accuracy 3*STD': test_scores.std()*3,
        'MLA Time': cv_results['fit_time'].mean(),
    })

    # Refit on the full dataset and store this algorithm's in-sample predictions under its name
    alg.fit(data[features], data[target_column])
    MLA_predict[MLA_name] = alg.predict(data[features])

# Best mean test accuracy first
MLA_compare = pd.DataFrame(mla_records, columns = MLA_columns).sort_values(by = ['MLA Test Accuracy Mean'], ascending = False)
MLA_compare

Executing  AdaBoostClassifier
Executing  BaggingClassifier
Executing  ExtraTreesClassifier
Executing  GradientBoostingClassifier
Executing  RandomForestClassifier
Executing  LogisticRegression
Executing  PassiveAggressiveClassifier
Executing  RidgeClassifierCV
Executing  SGDClassifier
Executing  Perceptron
Executing  BernoulliNB
Executing  GaussianNB
Executing  KNeighborsClassifier
Executing  SVC
Executing  NuSVC
Executing  LinearSVC
Executing  DecisionTreeClassifier
Executing  ExtraTreeClassifier
Executing  LinearDiscriminantAnalysis
Executing  QuadraticDiscriminantAnalysis
Executing  XGBClassifier


Unnamed: 0,MLA Name,MLA Parameters,MLA Test Accuracy Mean,MLA Test Accuracy 3*STD,MLA Time
4,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.98615,0.015573,0.308539
2,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...",0.985133,0.018285,0.241554
3,GradientBoostingClassifier,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'...",0.974968,0.017137,1.388374
1,BaggingClassifier,"{'base_estimator': None, 'bootstrap': True, 'b...",0.972173,0.023588,0.07448
16,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",0.970648,0.028853,0.010627
20,XGBClassifier,"{'base_score': 0.5, 'booster': 'gbtree', 'cols...",0.968361,0.013472,0.43402
17,ExtraTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",0.931766,0.03414,0.004897
19,QuadraticDiscriminantAnalysis,"{'priors': None, 'reg_param': 0.0, 'store_cova...",0.860356,0.044567,0.010779
18,LinearDiscriminantAnalysis,"{'covariance_estimator': None, 'n_components':...",0.803431,0.028681,0.015817
7,RidgeClassifierCV,"{'alphas': array([ 0.1, 1. , 10. ]), 'class_w...",0.802922,0.027827,0.016934


## Model training with all features
* All feature columns are passed as X and the performance rating (target variable) as y.
* On the test data, the model gets `Accuracy score : 98%`.

When all features are passed, the model reaches `98% accuracy`, but it might be overfitting, and using every feature increases the complexity of the model. So, we will pass only the features chosen by the feature selection technique.

In [93]:
# Rebuild the predictor matrix and the target vector from the full dataset
y = data['Performance Rating']
X = data.drop(columns=['Performance Rating'])

In [94]:
# Restrict the model inputs to the five selected columns; the target is passed through unchanged.
selected_columns = [
    'Employee Last Salary Hike Percent',
    'Employee Environment Satisfaction',
    'Experience Years At This Company',
    'Experience Years In Current Role',
    'Years Since Last Promotion',
]
train = X.loc[:, selected_columns]
test = y

# Splitting the dataset for training and testing

In [95]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    test,
    test_size=0.30,
    random_state=42,
)

In [96]:
# Row counts of the feature table and the target (they must match)
train.shape[0], test.shape[0]

(2622, 2622)

# Machine Learning Algorithms

## ExtraTreesClassifier

In [97]:
# Initialize model: extremely randomized trees with default hyperparameters.
# A fixed random_state makes the reported scores reproducible across runs.
ext_clf = ExtraTreesClassifier(random_state=42)
ext_clf.fit(X_train,y_train)
y_predict = ext_clf.predict(X_test)

# Class balance of the training split
y_train.value_counts()

3    624
4    615
2    596
Name: Performance Rating, dtype: int64

In [98]:
# Accuracy (as a percentage, two decimals) and per-class metrics on the held-out test split
test_accuracy_pct = round(accuracy_score(y_test, y_predict) * 100, 2)
print(f"\033[1mTest accuracy_score : {test_accuracy_pct} %\n")
print("Test Classification Report\n")
test_report = classification_report(y_test, y_predict)
print(test_report)

[1mTest accuracy_score : 95.81 %

Test Classification Report

              precision    recall  f1-score   support

           2       0.93      0.99      0.96       278
           3       0.98      0.88      0.93       250
           4       0.97      1.00      0.98       259

    accuracy                           0.96       787
   macro avg       0.96      0.96      0.96       787
weighted avg       0.96      0.96      0.96       787



### Checking train accuracy to observe overfitting

In [99]:
# Score the fitted ExtraTrees model on the rows it was trained on, for comparison with the test score
train_predict = ext_clf.predict(X_train)
train_accuracy_pct = round(accuracy_score(y_train, train_predict) * 100, 2)
print(f"\033[1mTrain accuracy_score : {train_accuracy_pct} %\n")
print("Train Classification Report\n")
train_report = classification_report(y_train, train_predict)
print(train_report)

[1mTrain accuracy_score : 98.69 %

Train Classification Report

              precision    recall  f1-score   support

           2       0.98      1.00      0.99       596
           3       0.99      0.97      0.98       624
           4       0.99      1.00      0.99       615

    accuracy                           0.99      1835
   macro avg       0.99      0.99      0.99      1835
weighted avg       0.99      0.99      0.99      1835



Here, the difference between train accuracy and test accuracy is `small`. So we can say that the model is `not overfitted`.

## Evaluation

In [100]:
# Report accuracy and per-class metrics on the held-out test split.
# The report header says "Test" because y_test / y_predict are test-set values.
print(f"\033[1mTest accuracy_score : {round(accuracy_score(y_test,y_predict)*100,2)} %\n")
print("Test Classification Report\n")
print(classification_report(y_test,y_predict))

[1mTest accuracy_score : 95.81 %

Train Classification Report

              precision    recall  f1-score   support

           2       0.93      0.99      0.96       278
           3       0.98      0.88      0.93       250
           4       0.97      1.00      0.98       259

    accuracy                           0.96       787
   macro avg       0.96      0.96      0.96       787
weighted avg       0.96      0.96      0.96       787



## RandomForestClassifier

In [103]:
# Initilize model
rf = RandomForestClassifier() # random forest with default parameters
rf.fit(X_train,y_train)
y_predict = rf.predict(X_test)
y_train.value_counts()

3    624
4    615
2    596
Name: Performance Rating, dtype: int64

### Evaluation

In [104]:
# Report the random forest's accuracy and per-class metrics on the held-out test split.
# The report header says "Test" because y_test / y_predict are test-set values.
print(f"\033[1mTest accuracy_score : {round(accuracy_score(y_test,y_predict)*100,2)} %\n")
print("Test Classification Report\n")
print(classification_report(y_test,y_predict))

[1mTest accuracy_score : 95.43 %

Train Classification Report

              precision    recall  f1-score   support

           2       0.93      0.99      0.96       278
           3       0.98      0.87      0.92       250
           4       0.96      1.00      0.98       259

    accuracy                           0.95       787
   macro avg       0.96      0.95      0.95       787
weighted avg       0.96      0.95      0.95       787



In [105]:
# Score the fitted random forest on the rows it was trained on, for comparison with the test score
train_predict = rf.predict(X_train)
train_accuracy_pct = round(accuracy_score(y_train, train_predict) * 100, 2)
print(f"\033[1mTrain accuracy_score : {train_accuracy_pct} %\n")
print("Train Classification Report\n")
train_report = classification_report(y_train, train_predict)
print(train_report)

[1mTrain accuracy_score : 98.69 %

Train Classification Report

              precision    recall  f1-score   support

           2       0.98      1.00      0.99       596
           3       0.99      0.97      0.98       624
           4       0.99      1.00      0.99       615

    accuracy                           0.99      1835
   macro avg       0.99      0.99      0.99      1835
weighted avg       0.99      0.99      0.99      1835



* Here, the difference between train accuracy & test accuracy is `small`, so we can say that the model is `not overfitted`.
* The accuracy of the `RandomForestClassifier model` is `lower` than that of the `ExtraTreesClassifier model`, so we should focus on the `ExtraTreesClassifier model`.