# Libraries for ML models 

In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import rcParams
from matplotlib.pyplot import figure
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

## Importing processed data for ML models

In [2]:
# Load the already pre-processed dataset from the working directory.
df = pd.read_csv('processed.csv')
# Bare last expression: display the frame as the cell output.
df

Unnamed: 0,EmpNumber,Age,Gender,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,...,EmpRelationshipSatisfaction,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,PerformanceRating
0,0,32,1,2,2,5,13,2,10,3,...,4,10,2,2,10,7,0,8,0,3
1,1,47,1,2,2,5,13,2,14,4,...,4,20,2,3,7,7,1,7,0,3
2,2,40,1,1,1,5,13,1,5,4,...,3,20,2,3,18,13,1,12,0,4
3,3,41,1,0,0,3,8,2,10,4,...,2,23,2,2,21,6,12,6,0,3
4,4,60,1,2,2,5,13,2,16,4,...,4,10,1,3,2,2,2,2,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,1195,27,0,3,0,5,13,1,3,1,...,2,6,3,3,6,5,0,4,0,4
1196,1196,37,1,1,2,1,15,2,10,2,...,1,4,2,3,1,0,0,0,0,3
1197,1197,50,1,3,1,1,15,2,28,1,...,3,20,3,3,20,8,3,8,0,3
1198,1198,34,0,3,2,0,1,2,9,3,...,2,9,3,4,8,7,7,7,0,3


In [3]:
df.info()  # summary of every column: non-null count and dtype

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 28 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   EmpNumber                     1200 non-null   int64
 1   Age                           1200 non-null   int64
 2   Gender                        1200 non-null   int64
 3   EducationBackground           1200 non-null   int64
 4   MaritalStatus                 1200 non-null   int64
 5   EmpDepartment                 1200 non-null   int64
 6   EmpJobRole                    1200 non-null   int64
 7   BusinessTravelFrequency       1200 non-null   int64
 8   DistanceFromHome              1200 non-null   int64
 9   EmpEducationLevel             1200 non-null   int64
 10  EmpEnvironmentSatisfaction    1200 non-null   int64
 11  EmpHourlyRate                 1200 non-null   int64
 12  EmpJobInvolvement             1200 non-null   int64
 13  EmpJobLevel                   120

- Using `PerformanceRating` as the target variable.
- Using all other columns as independent variables except `EmpNumber`: it is only an identifier, and since we already have every other detail about the employee we don't need it.

## Encoding target variable

- I got the following error while running the models; to solve it, I'm encoding the target variable:
- ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2], got [2 3 4]

In [4]:
# Re-map the rating labels onto consecutive integers starting at 0,
# then overwrite the original column with the encoded values.
LE = LabelEncoder()
encoded_rating = LE.fit_transform(df['PerformanceRating'])
df['PerformanceRating'] = encoded_rating

# Show which encoded classes are present after the transformation.
df['PerformanceRating'].unique()

array([1, 2, 0], dtype=int64)

## Assigning Independent and Dependent Variables


In [5]:
# Separate the target (dependent variable) from the predictors (independent variables).
target_column = 'PerformanceRating'

# y: the variable we want to predict
y = df[target_column]

# X: every remaining column except the employee identifier and the target itself
X = df.drop(columns=['EmpNumber', target_column])

## Scaling Independent Variables


In [6]:
# A Random Forest classifier does not need scaled inputs, but the features
# are standardised anyway in case another model requires it.
scalar = StandardScaler()          # scaler object
scalar.fit(X)                      # fit the scaler on the independent variables
X_scaled = scalar.transform(X)     # standardised copy of the independent variables

## Splitting Independent and Dependent Variables into Training and Testing Sets


In [7]:
## dividing data into 80% for training and 20% testing
# Split the *unscaled* features and fit the scaler on the training rows only.
# Fitting the scaler on the full dataset before splitting lets the test rows
# influence the scaling statistics applied to the training data (data leakage).
# The same random_state on the same number of rows yields the same row split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

split_scaler = StandardScaler().fit(X_train_raw)   # statistics from training rows only
X_train = split_scaler.transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)        # test rows reuse the training statistics

## Creating a function to test multiple models

In [8]:
def evaluate_classification_models(X_train, X_test, y_train, y_test):
    """Fit a fixed set of classifiers and tabulate their evaluation metrics.

    Each candidate model is fitted on the training data and scored on the
    test data. It is also cross-validated (5 folds) on the training data, and
    finally wrapped in a ``BaggingClassifier`` whose test accuracy is recorded.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and testing feature matrices.
    y_train, y_test : array-like
        Training and testing target labels.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns 'Model', 'Accuracy Score',
        'Confusion Matrix', 'Classification Report' (dict form),
        'Cross Validation Score' (mean over folds) and 'Bagging Score'.
    """
    # Name -> estimator in one mapping so the two can never get out of sync.
    candidates = {
        'Bagging Classifier': BaggingClassifier(random_state=42),
        'Random Forest Classifier': RandomForestClassifier(random_state=42),
        'K-Nearest Neighbors Classifier': KNeighborsClassifier(),
        'XGBoost Classifier': XGBClassifier(random_state=42),
        'Support Vector Classifier': SVC(random_state=42),
        'Decision Tree Classifier': DecisionTreeClassifier(random_state=42),
    }
    columns = ['Model', 'Accuracy Score', 'Confusion Matrix', 'Classification Report',
               'Cross Validation Score', 'Bagging Score']

    records = []
    for model_name, model in candidates.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # metrics on the hold-out test set
        accuracy = accuracy_score(y_test, y_pred)
        conf_matrix = confusion_matrix(y_test, y_pred)
        class_report = classification_report(y_test, y_pred, output_dict=True)

        # mean accuracy over 5-fold cross validation on the training data
        cv_score = cross_val_score(model, X_train, y_train, cv=5).mean()

        # test accuracy of a bagging ensemble built on top of this model
        bagging_model = BaggingClassifier(model, random_state=42)
        bagging_model.fit(X_train, y_train)
        y_pred_bagging = bagging_model.predict(X_test)
        bagging_accuracy = accuracy_score(y_test, y_pred_bagging)

        records.append({'Model': model_name, 'Accuracy Score': accuracy,
                        'Confusion Matrix': conf_matrix,
                        'Classification Report': class_report,
                        'Cross Validation Score': cv_score,
                        'Bagging Score': bagging_accuracy})

    # DataFrame.append was removed in pandas 2.0 (and copies the whole frame on
    # every call), so the rows are collected first and the frame is built once.
    return pd.DataFrame(records, columns=columns)


In [9]:
# Compare all candidate models on the same train/test split.
evaluate_classification_models(X_train=X_train, X_test=X_test,
                               y_train=y_train, y_test=y_test)

Unnamed: 0,Model,Accuracy Score,Confusion Matrix,Classification Report,Cross Validation Score,Bagging Score
0,Bagging Classifier,0.929167,"[[25, 4, 0], [4, 178, 2], [0, 7, 20]]","{'0': {'precision': 0.8620689655172413, 'recal...",0.917708,0.95
1,Random Forest Classifier,0.945833,"[[26, 3, 0], [2, 181, 1], [0, 7, 20]]","{'0': {'precision': 0.9285714285714286, 'recal...",0.929167,0.95
2,K-Nearest Neighbors Classifier,0.7875,"[[11, 18, 0], [13, 170, 1], [2, 17, 8]]","{'0': {'precision': 0.4230769230769231, 'recal...",0.717708,0.791667
3,XGBoost Classifier,0.941667,"[[25, 4, 0], [3, 181, 0], [0, 7, 20]]","{'0': {'precision': 0.8928571428571429, 'recal...",0.928125,0.941667
4,Support Vector Classifier,0.841667,"[[9, 20, 0], [4, 180, 0], [0, 14, 13]]","{'0': {'precision': 0.6923076923076923, 'recal...",0.775,0.841667
5,Decision Tree Classifier,0.8875,"[[25, 4, 0], [8, 167, 9], [0, 6, 21]]","{'0': {'precision': 0.7575757575757576, 'recal...",0.883333,0.929167


- Since the Random Forest Classifier has a good accuracy score, bagging score, and cross-validation score, I'll tune its hyperparameters, then find its feature importances and visualize them.

## Hyperparameter tuning the Random Forest Classifier
- Using RandomizedSearchCV for hyperparameter tuning

In [10]:
# Candidate values to sample from for each hyperparameter
param_dist = {'max_depth': [30, 40, 50, 70, None],
              'min_samples_leaf': [1, 2, 3],
              'min_samples_split': [2, 3, 6],
              'n_estimators': [100, 300, 500, 800, 900]}

# Initialize the classifier.
# The forest is seeded so that a fresh "Restart & Run All" reproduces the same
# search result; without it only the sampling of candidates was deterministic.
rfc = RandomForestClassifier(random_state=42)

# Use RandomizedSearchCV to try 10 sampled combinations with 5-fold cross validation
rs = RandomizedSearchCV(rfc, param_distributions=param_dist, n_iter=10, cv=5, random_state=42)

# Fit the search on the training data
rs.fit(X_train, y_train)

# Print the best combination of hyperparameters
print(rs.best_params_)

{'n_estimators': 900, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_depth': 30}


# Function to get the feature importance of a model

In [11]:
def prediction(model):
    """Evaluate ``model`` and plot its feature importances.

    The function relies on the notebook-level variables ``X_train``,
    ``X_test``, ``y_train``, ``y_test``, ``X_scaled``, ``X`` and ``y``.

    Steps:
      1. Fit ``model`` on the training split and print accuracy, confusion
         matrix and classification report for the test split.
      2. Print the 5-fold cross-validation scores (and their mean) of
         ``model`` on the training split.
      3. Print the 5-fold cross-validation scores (and their mean) of a
         ``BaggingClassifier`` wrapping ``model``, computed on ``X_scaled``/``y``.
      4. If the fitted model exposes ``feature_importances_``, refit it on
         ``X``/``y`` and draw a horizontal bar chart of the importances,
         most important feature at the top.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn style classifier. It is fitted in place.

    Returns
    -------
    None
    """
    bold, reset = '\033[1m', '\033[0m'   # ANSI codes used to embolden the headings

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # metrics
    print('')
    print(bold + 'accuracy score :' + reset + ' \n\n\t\t', accuracy_score(y_test, y_pred))
    print('')
    print(bold + 'confusion matrix score :' + reset + ' \n\n', confusion_matrix(y_test, y_pred))
    print('')
    print(bold + 'classification report:' + reset + ' \n\n', classification_report(y_test, y_pred))
    print('')
    # Run the cross validation once and reuse it: running it twice doubles the
    # cost and, for an unseeded model, the printed mean may not match the scores.
    cv_scores = cross_val_score(model, X_train, y_train, cv=5)
    print(bold + 'Cross validation scores:' + reset + ' \n\n', cv_scores)
    print('')
    print(bold + 'Cross validation scores mean:' + reset + ' \n\n', cv_scores.mean())
    print('')
    # The estimator is passed positionally because the keyword was renamed from
    # `base_estimator` to `estimator` across scikit-learn releases.
    # No explicit fit is needed: cross_val_score fits its own clones.
    bag_model = BaggingClassifier(model, random_state=42)
    print('')
    # NOTE(review): this score is cross-validated on the full scaled dataset
    # (X_scaled, y), unlike the scores above which only use the training split.
    bag_score = cross_val_score(bag_model, X_scaled, y, cv=5)
    print('')
    print(bold + 'bagging scores : ' + reset + ' ', bag_score)
    print('')
    print(bold + 'bagging scores mean : ' + reset + ' ', bag_score.mean())

    # to visualize feature importance
    if not hasattr(model, 'feature_importances_'):
        # e.g. SVC or KNN: nothing to plot, so stop here instead of raising
        print('')
        print('feature importances are not available for this model')
        return

    model.fit(X, y)   # refit on all rows, with the original column names
    ranked = sorted(zip(X.columns, model.feature_importances_),
                    key=lambda pair: pair[1], reverse=True)
    features = [name for name, _ in ranked]
    importances = [score for _, score in ranked]

    plt.figure(figsize=(10, 6))
    plt.barh(range(len(features)), importances, align='center')
    plt.yticks(range(len(features)), features)
    plt.gca().invert_yaxis()   # barh draws index 0 at the bottom; put the top feature first
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")

## Best model: hyperparameter-tuned Random Forest Classifier with feature importance

In [None]:
# Hyperparameters chosen for the final Random Forest, plus a fixed seed.
tuned_rf_params = {
    'n_estimators': 900,
    'min_samples_split': 3,
    'min_samples_leaf': 1,
    'max_depth': 30,
    'random_state': 42,
}
prediction(RandomForestClassifier(**tuned_rf_params))


[1maccuracy score :[0m 

		 0.9541666666666667

[1mconfusion matrix score :[0m 

 [[ 26   3   0]
 [  1 183   0]
 [  0   7  20]]

[1mclassification report:[0m 

               precision    recall  f1-score   support

           0       0.96      0.90      0.93        29
           1       0.95      0.99      0.97       184
           2       1.00      0.74      0.85        27

    accuracy                           0.95       240
   macro avg       0.97      0.88      0.92       240
weighted avg       0.96      0.95      0.95       240


[1mCross validation scores:[0m 

 [0.89583333 0.93229167 0.94791667 0.94791667 0.921875  ]

[1mCross validation scores mean:[0m 

 0.9291666666666666




#### Top 3 important features for predicting employee performance
    1. Employee Last Salary Hike Percentage
    2. Employment Environment Satisfaction
    3. Years Since Last Promotion

# End