In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

In [2]:
# Read the pre-encoded dataset from the CSV in the working directory.
# The bare name on the last line renders the frame as the cell's output.
data = pd.read_csv(filepath_or_buffer=r"Encoded_Data.csv")
data

Unnamed: 0,Age,Gender,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,EmpEnvironmentSatisfaction,...,EmpRelationshipSatisfaction,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,PerformanceRating
0,32,1,2,2,5,13,2,10,3,4,...,4,10,2,2,10,7,0,8,0,1
1,47,1,2,2,5,13,2,14,4,4,...,4,20,2,3,7,7,1,7,0,1
2,40,1,1,1,5,13,1,5,4,4,...,3,20,2,3,18,13,1,12,0,2
3,41,1,0,0,3,8,2,10,4,2,...,2,23,2,2,5,6,1,6,0,1
4,60,1,2,2,5,13,2,16,4,1,...,4,10,1,3,2,2,2,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,27,0,3,0,5,13,1,3,1,4,...,2,6,3,3,6,5,0,4,0,2
1196,37,1,1,2,1,15,2,10,2,4,...,1,4,2,3,1,0,0,0,0,1
1197,50,1,3,1,1,15,2,28,1,4,...,3,20,3,3,20,8,3,8,0,1
1198,34,0,3,2,0,1,2,9,3,4,...,2,9,3,4,8,7,7,7,0,1


In [3]:
# Report the frame's dimensions, then the number of rows per
# PerformanceRating value (the prediction target).
print("Dataset Shape:", data.shape)
print(
    "\nTarget Variable Distribution:",
    data['PerformanceRating'].value_counts(),
    sep="\n",
)

Dataset Shape: (1200, 27)

Target Variable Distribution:
PerformanceRating
1    874
0    194
2    132
Name: count, dtype: int64


In [4]:
# Split the data into Features and Target (X=Features, y=Target).
# 'Unnamed: 0' looks like the row index that was written into the CSV when it
# was saved (its values simply count 0, 1, 2, ... alongside the row labels), so
# it is a row identifier rather than an employee attribute and must not be fed
# to the models as a feature. errors="ignore" keeps this cell working if the
# CSV is ever loaded without that column; a missing target still fails loudly
# on the next line.
X = data.drop(columns=['PerformanceRating', 'Unnamed: 0'], errors="ignore")
y = data['PerformanceRating']

In [5]:
# Hold out 20% of the rows as a test set. The split is stratified on y and
# uses a fixed seed so that it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(
    f"\nTraining set shape: {X_train.shape}",
    f"Test set shape: {X_test.shape}",
    sep="\n",
)


Training set shape: (960, 26)
Test set shape: (240, 26)


In [6]:
# For summary storage: each entry is [model name, mean CV F1, best params].
results_summary = []


# --------------------
# Logistic Regression
# --------------------
# Scale -> oversample with SMOTE -> project onto 10 principal components -> classify.
log_reg_pipeline = ImbPipeline([
    ("scaler", StandardScaler()),
    ("smote", SMOTE(random_state=42)),
    ("pca", PCA(n_components=10)),
    # random_state pins the data shuffling of the stochastic "saga" solver that
    # is part of the grid below, so repeated runs produce the same scores.
    ("clf", LogisticRegression(max_iter=1000, random_state=42))
])


# Candidate regularisation strengths and solvers (4 x 2 = 8 combinations).
log_reg_params = {
    "clf__C": [0.01, 0.1, 1, 10],
    "clf__solver": ["lbfgs", "saga"]
}


# 5-fold grid search on the training split, selecting by macro-averaged F1.
grid_log = GridSearchCV(log_reg_pipeline, log_reg_params, cv=5, scoring="f1_macro", n_jobs=-1)
grid_log.fit(X_train, y_train)


# Predictions of the best pipeline found by the search on the held-out test split.
log_y_pred = grid_log.predict(X_test)


print("\n===== Logistic Regression =====")
print("Training F1 Score:", f1_score(y_train, grid_log.predict(X_train), average="macro"))
print("\nTest Classification Report:\n", classification_report(y_test, log_y_pred))


# The grid search has already scored the winning candidate on each of its CV
# folds, so read those per-fold scores from cv_results_ instead of re-fitting
# the same pipeline on the same folds again with cross_val_score.
cv_scores_log = np.array([
    grid_log.cv_results_[f"split{fold}_test_score"][grid_log.best_index_]
    for fold in range(grid_log.n_splits_)
])
print("All folds CV F1:", cv_scores_log)
print("Mean CV F1:", np.mean(cv_scores_log))
print("Best Params:", grid_log.best_params_)
print("Best Score:", grid_log.best_score_)


results_summary.append(["Logistic Regression", np.mean(cv_scores_log), grid_log.best_params_])


===== Logistic Regression =====
Training F1 Score: 0.6341607740551063

Test Classification Report:
               precision    recall  f1-score   support

           0       0.38      0.69      0.49        39
           1       0.90      0.68      0.78       175
           2       0.51      0.73      0.60        26

    accuracy                           0.69       240
   macro avg       0.60      0.70      0.62       240
weighted avg       0.77      0.69      0.71       240

All folds CV F1: [0.64602812 0.58561472 0.63523078 0.60642656 0.60642126]
Mean CV F1: 0.6159442882789005
Best Params: {'clf__C': 1, 'clf__solver': 'lbfgs'}
Best Score: 0.6159442882789005


In [7]:
# --------------------
# Random Forest
# --------------------
# Single-step pipeline: the forest is given class_weight="balanced" and no
# scaling or resampling steps are used for this model.
rf_pipeline = ImbPipeline([
    ("clf", RandomForestClassifier(class_weight="balanced", random_state=42))
])


# Candidate forest sizes and tree depths (2 x 3 = 6 combinations).
rf_params = {
    "clf__n_estimators": [100, 200],
    "clf__max_depth": [5, 10, None]
}


# 5-fold grid search on the training split, selecting by macro-averaged F1.
grid_rf = GridSearchCV(rf_pipeline, rf_params, cv=5, scoring="f1_macro", n_jobs=-1)
grid_rf.fit(X_train, y_train)


# Predictions of the best pipeline found by the search on the held-out test split.
rf_y_pred = grid_rf.predict(X_test)


print("\n===== Random Forest =====")
print("Training F1 Score:", f1_score(y_train, grid_rf.predict(X_train), average="macro"))
print("\nTest Classification Report:\n", classification_report(y_test, rf_y_pred))


# The grid search has already scored the winning candidate on each of its CV
# folds, so read those per-fold scores from cv_results_ instead of re-fitting
# the same forest on the same folds again with cross_val_score.
cv_scores_rf = np.array([
    grid_rf.cv_results_[f"split{fold}_test_score"][grid_rf.best_index_]
    for fold in range(grid_rf.n_splits_)
])
print("All folds CV F1:", cv_scores_rf)
print("Mean CV F1:", np.mean(cv_scores_rf))
print("Best Params:", grid_rf.best_params_)
print("Best Score:", grid_rf.best_score_)


# Top 5 important features
# The pipeline has no transforming steps, so the importances line up
# one-to-one with the columns of X.
importances = grid_rf.best_estimator_.named_steps['clf'].feature_importances_
feat_importances = pd.Series(importances, index=X.columns).sort_values(ascending=False)
print("\nTop 5 Important Features that affect Performance:")
print(feat_importances.head(5))


results_summary.append(["Random Forest", np.mean(cv_scores_rf), grid_rf.best_params_])


===== Random Forest =====
Training F1 Score: 0.9859337671195442

Test Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.85      0.89        39
           1       0.93      0.99      0.96       175
           2       1.00      0.73      0.84        26

    accuracy                           0.94       240
   macro avg       0.96      0.86      0.90       240
weighted avg       0.94      0.94      0.94       240

All folds CV F1: [0.88084366 0.8868551  0.8837963  0.86124418 0.91971917]
Mean CV F1: 0.8864916828841652
Best Params: {'clf__max_depth': 10, 'clf__n_estimators': 200}
Best Score: 0.8864916828841652

Top 5 Important Features:
EmpLastSalaryHikePercent      0.243165
EmpEnvironmentSatisfaction    0.221864
YearsSinceLastPromotion       0.086998
EmpJobRole                    0.039647
EmpHourlyRate                 0.036271
dtype: float64


In [8]:
# --------------------
# MLP Classifier
# --------------------
# Scale -> oversample with SMOTE -> multi-layer perceptron.
mlp_pipeline = ImbPipeline([
    ("scaler", StandardScaler()),
    ("smote", SMOTE(random_state=42)),
    ("clf", MLPClassifier(max_iter=500, random_state=42))
])


# Candidate architectures, activations and L2 penalties (3 x 2 x 2 = 12 combinations).
mlp_params = {
    "clf__hidden_layer_sizes": [(50,), (100,), (100,50)],
    "clf__activation": ["relu", "tanh"],
    "clf__alpha": [0.0001, 0.001]
}


# 5-fold grid search on the training split, selecting by macro-averaged F1.
grid_mlp = GridSearchCV(mlp_pipeline, mlp_params, cv=5, scoring="f1_macro", n_jobs=-1)
grid_mlp.fit(X_train, y_train)


# Predictions of the best pipeline found by the search on the held-out test split.
mlp_y_pred = grid_mlp.predict(X_test)


print("\n===== MLP Classifier =====")
print("Training F1 Score:", f1_score(y_train, grid_mlp.predict(X_train), average="macro"))
print("\nTest Classification Report:\n", classification_report(y_test, mlp_y_pred))


# The grid search has already scored the winning candidate on each of its CV
# folds, so read those per-fold scores from cv_results_ instead of training
# the same network on the same folds again with cross_val_score.
cv_scores_mlp = np.array([
    grid_mlp.cv_results_[f"split{fold}_test_score"][grid_mlp.best_index_]
    for fold in range(grid_mlp.n_splits_)
])
print("All folds CV F1:", cv_scores_mlp)
print("Mean CV F1:", np.mean(cv_scores_mlp))
print("Best Params:", grid_mlp.best_params_)
print("Best Score:", grid_mlp.best_score_)


results_summary.append(["MLP Classifier", np.mean(cv_scores_mlp), grid_mlp.best_params_])


===== MLP Classifier =====
Training F1 Score: 1.0

Test Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.56      0.66        39
           1       0.87      0.93      0.90       175
           2       0.62      0.58      0.60        26

    accuracy                           0.83       240
   macro avg       0.76      0.69      0.72       240
weighted avg       0.83      0.83      0.83       240

All folds CV F1: [0.78347636 0.73955482 0.66261145 0.73265073 0.66754974]
Mean CV F1: 0.7171686204411287
Best Params: {'clf__activation': 'relu', 'clf__alpha': 0.001, 'clf__hidden_layer_sizes': (100,)}
Best Score: 0.7171686204411287


In [9]:
# --------------------
# Summary Table
# --------------------
# Turn the collected [model, score, params] rows into a frame so the models
# can be compared side by side.
summary_df = pd.DataFrame(
    data=results_summary,
    columns=["Model", "Mean CV F1", "Best Params"],
)
print("\n===== Summary Table =====", summary_df, sep="\n")


===== Summary Table =====
                 Model  Mean CV F1  \
0  Logistic Regression    0.615944   
1        Random Forest    0.886492   
2       MLP Classifier    0.717169   

                                         Best Params  
0              {'clf__C': 1, 'clf__solver': 'lbfgs'}  
1   {'clf__max_depth': 10, 'clf__n_estimators': 200}  
2  {'clf__activation': 'relu', 'clf__alpha': 0.00...  


In [11]:
# --------------------
# Best Model Selection
# --------------------
# Order the summary by mean CV F1, highest first, and read the value in the
# first row / first column, which is the model name.
best_model_name = summary_df.sort_values(by="Mean CV F1", ascending=False).iat[0, 0]
print("\nBest Model:", best_model_name)


# Map the winning name back to its fitted grid search. Any name other than the
# first two falls through to the MLP search.
best_model = (
    grid_log if best_model_name == "Logistic Regression"
    else grid_rf if best_model_name == "Random Forest"
    else grid_mlp
).best_estimator_


Best Model: Random Forest


In [12]:
# --------------------
# Test best model on 20 random samples
# --------------------
# Draw 20 rows from the test split with a fixed seed, look up their true
# labels by index label, and predict them with the selected model.
random_samples = X_test.sample(n=20, random_state=42)
random_indices = random_samples.index
y_actual = y_test.loc[random_indices]
y_pred = best_model.predict(random_samples)


# One row per sampled employee: original row label, true class, predicted class.
final_results = pd.DataFrame(
    {
        "Index(from csv)": random_indices,
        "Actual": y_actual,
        "Predicted": y_pred,
    }
)


# Renumber the rows 0..19 for display; the original labels stay in the first column.
print(
    "\n===== Testing Best Model on 20 Random Samples =====",
    final_results.reset_index(drop=True),
    sep="\n",
)


===== Testing Best Model on 20 Random Samples =====
    Index(from csv)  Actual  Predicted
0                43       1          1
1               479       0          0
2               289       1          1
3               292       1          1
4               321       1          1
5               668       1          1
6               263       1          1
7               994       1          1
8              1136       1          1
9               452       0          0
10               60       2          2
11              448       1          1
12              394       1          1
13              302       2          1
14              290       1          1
15              860       1          1
16              567       1          1
17              390       1          1
18              726       1          1
19             1101       1          1


# Recommendations (based on the analysis and model insights)

1. FOCUS ON KEY PERFORMANCE DRIVERS:
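   - Monitor and improve the top 3 factors identified by the Random Forest feature importances: EmpLastSalaryHikePercent, EmpEnvironmentSatisfaction, and YearsSinceLastPromotion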
   - Implement targeted interventions for these critical areas

2. DEPARTMENT-SPECIFIC STRATEGIES:
   - Implement tailored improvement programs for underperforming departments
   - Share best practices from high-performing departments

3. EMPLOYEE DEVELOPMENT:
   - Provide training programs focused on identified weak areas
   - Implement mentoring programs for employees with lower performance ratings
   
4. WORK ENVIRONMENT OPTIMIZATION:
   - Improve job satisfaction factors identified as important
   - Address work-life balance issues if identified as significant
   
5. DATA-DRIVEN MONITORING:
   - Implement regular performance monitoring using the identified key factors
   - Create dashboards for real-time performance tracking

6. RETENTION STRATEGIES:
   - Address factors contributing to attrition if correlated with performance
   - Implement proactive measures for employees predicted to have declining performance

7. CONTINUOUS IMPROVEMENT:
   - Regularly retrain models with new data
   - Update performance prediction strategies based on evolving patterns
   - Conduct periodic analysis to identify new performance drivers

**End of Notebook**