In [10]:
import pandas as pd

from sklearn.model_selection import train_test_split

import xgboost as xgb
from catboost import CatBoostClassifier
import lightgbm as lgb 

from sklearn.metrics import accuracy_score

### TASK : Make a final decision on the model to be employed for this purpose
Based on your work done in the previous components and tasks in this Menternship, you must now make a final decision on which model must be used to predict employee performance.

You will be required to document the analysis you have from the application of the models in the precious component, and as to why the algorithm you have chosen makes the most sense.


In [24]:
scaled_data_standard = pd.read_csv('scaled_data_standard.csv', index_col='Unnamed: 0')


In [23]:
def acc_score(df):
    X = df.drop(['KPIs_met >80%'], axis=1)
    y = df['KPIs_met >80%']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the XGBoost model
    xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    xgb_model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred_xgb = xgb_model.predict(X_test)
    accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
    


    # Train the CatBoost model
    catboost_model = CatBoostClassifier(silent=True)
    catboost_model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred_catboost = catboost_model.predict(X_test)
    accuracy_catboost = accuracy_score(y_test, y_pred_catboost)
    


    # Train the LightGBM model
    lgb_model = lgb.LGBMClassifier()
    lgb_model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred_lgb = lgb_model.predict(X_test)
    accuracy_lgb = accuracy_score(y_test, y_pred_lgb)
    
    #accuracy scores
    print(f"XGBoost Accuracy: {accuracy_xgb}")
    print(f"CatBoost Accuracy: {accuracy_catboost}")
    print(f"LightGBM Accuracy: {accuracy_lgb}")


In [21]:
acc_score(scaled_data_standard)

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 6715, number of negative: 12077
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001622 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 206
[LightGBM] [Info] Number of data points in the train set: 18792, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357333 -> initscore=-0.586959
[LightGBM] [Info] Start training from score -0.586959
XGBoost Accuracy: 0.6998722860791826
CatBoost Accuracy: 0.7041294167730949
LightGBM Accuracy: 0.7075351213282248


---
# Employee Performance Prediction

## Introduction
This document outlines the process and decision-making involved in selecting the best machine learning model to predict employee performance. The target variable is the employee's performance rating, and the dataset includes various demographic and job-related features.

## Data Preparation
- **Cleaning:** Handled missing values and removed duplicates.
- **Encoding:** Used Label Encoding for the `region` column due to its high cardinality and Ordinal Encoding to `education`.
- **Scaling:** Applied Standard Scaling to numerical features.

## Model Evaluation
Three models were evaluated:
1. **XGBoost**
2. **CatBoost**
3. **LightGBM**

### Accuracy Results
- **XGBoost Accuracy:** `0.70`
- **CatBoost Accuracy:** `0.70`
- **LightGBM Accuracy:** `0.71`

## Comparison and Decision
### XGBoost
- **Pros:** Robust performance with tuning, handles various data types.
- **Cons:** Slower training, requires more tuning.

### CatBoost
- **Pros:** Natively handles categorical data, less tuning required, robust performance.
- **Cons:** Slower on extremely large datasets.

### LightGBM
- **Pros:** Fast training, excellent scalability.
- **Cons:** Sensitive to overfitting, requires careful tuning.

### Final Decision
CatBoost is chosen due to its ease of use, robust performance, and reduced risk of overfitting. It is expected to provide reliable predictions for employee performance with minimal preprocessing and tuning efforts.

## Conclusion
CatBoost is selected as the optimal model for predicting employee performance. Its advantages in handling categorical data and robust performance make it well-suited for this task.
