In [3]:
import pandas as pd
import numpy as np

In [4]:
df = pd.read_csv('df_cleaned.csv', index_col='Unnamed: 0')

In [5]:

df.head()

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,Technology,region_26,Bachelor's,m,sourcing,1,24,3.0,1,1,0,77
1,HR,region_4,Bachelor's,f,other,1,31,3.0,5,0,0,51
2,Sales & Marketing,region_13,Bachelor's,m,other,1,31,1.0,4,0,0,47
3,Procurement,region_2,Bachelor's,f,other,3,31,2.0,9,0,0,65
4,Finance,region_29,Bachelor's,m,sourcing,1,30,4.0,7,0,0,61


### TASK : Data Scaling
Scaling is a technique to standardize the independent features present in the data in a fixed range. It is performed during the data pre-processing.

In this task, We use the below-mentioned Scalers to check their effect on our data and choose the most appropriate Scaler based on the results. More on Scalers and their purpose has been provided as a resource in the resource hub.

RobustScaler, StandardScaler, MinMaxScaler, MaxAbsScaler

Create a function to run the given four scalers on your dataset and return the resultant scaled dataset. Make changes to parameters in such a way that you can iterate over various scalers to get datasets.

In [6]:
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, MaxAbsScaler

def scale_data(data, scaler_type='StandardScaler'):
    """
    Scales the data using the specified scaler type.
    
    Parameters:
    - data (pd.DataFrame): The input data to scale.
    - scaler_type (str): The type of scaler to use. One of ['RobustScaler', 'StandardScaler', 'MinMaxScaler', 'MaxAbsScaler'].
    
    Returns:
    - scaled_data (pd.DataFrame): The scaled data.
    """
    scalers = {
        'RobustScaler': RobustScaler(),
        'StandardScaler': StandardScaler(),
        'MinMaxScaler': MinMaxScaler(),
        'MaxAbsScaler': MaxAbsScaler()
    }
    
    if scaler_type not in scalers:
        raise ValueError(f"Invalid scaler_type. Expected one of {list(scalers.keys())}")
    
    scaler = scalers[scaler_type]
    
    # Selecting only numerical columns for scaling
    numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
    data_to_scale = data[numerical_cols]
    
    # Scaling the data
    scaled_data = scaler.fit_transform(data_to_scale)
    scaled_df = pd.DataFrame(scaled_data, columns=numerical_cols)
    
    # Replacing the original columns with scaled columns
    data_scaled = data.copy()
    data_scaled[numerical_cols] = scaled_df
    
    return data_scaled

In [7]:
scaled_data_standard = scale_data(df, scaler_type='StandardScaler')
scaled_data_robust = scale_data(df, scaler_type='RobustScaler')
scaled_data_minmax = scale_data(df, scaler_type='MinMaxScaler')
scaled_data_maxabs = scale_data(df, scaler_type='MaxAbsScaler')


In [8]:
print("Standard Scaled Data:")
print(scaled_data_standard.head())


Standard Scaled Data:
          department     region   education gender recruitment_channel  \
0         Technology  region_26  Bachelor's      m            sourcing   
1                 HR   region_4  Bachelor's      f               other   
2  Sales & Marketing  region_13  Bachelor's      m               other   
3        Procurement   region_2  Bachelor's      f               other   
4            Finance  region_29  Bachelor's      m            sourcing   

   no_of_trainings       age  previous_year_rating  length_of_service  \
0        -0.423094 -1.404150             -0.257191          -1.143200   
1        -0.423094 -0.492612             -0.257191          -0.192590   
2        -0.423094 -0.492612             -1.900667          -0.430243   
3         2.905264 -0.492612             -1.078929           0.758019   
4        -0.423094 -0.622832              0.564547           0.282714   

   KPIs_met >80%  awards_won?  avg_training_score  
0       1.336715    -0.152665            1

In [9]:
print("\nRobust Scaled Data:")
print(scaled_data_robust.head())


Robust Scaled Data:
          department     region   education gender recruitment_channel  \
0         Technology  region_26  Bachelor's      m            sourcing   
1                 HR   region_4  Bachelor's      f               other   
2  Sales & Marketing  region_13  Bachelor's      m               other   
3        Procurement   region_2  Bachelor's      f               other   
4            Finance  region_29  Bachelor's      m            sourcing   

   no_of_trainings  age  previous_year_rating  length_of_service  \
0              0.0 -0.9                   0.0              -1.00   
1              0.0 -0.2                   0.0               0.00   
2              0.0 -0.2                  -2.0              -0.25   
3              2.0 -0.2                  -1.0               1.00   
4              0.0 -0.3                   1.0               0.50   

   KPIs_met >80%  awards_won?  avg_training_score  
0            1.0          0.0                0.68  
1            0.0     

In [10]:
print("\nMinMax Scaled Data:")
print(scaled_data_minmax.head())


MinMax Scaled Data:
          department     region   education gender recruitment_channel  \
0         Technology  region_26  Bachelor's      m            sourcing   
1                 HR   region_4  Bachelor's      f               other   
2  Sales & Marketing  region_13  Bachelor's      m               other   
3        Procurement   region_2  Bachelor's      f               other   
4            Finance  region_29  Bachelor's      m            sourcing   

   no_of_trainings    age  previous_year_rating  length_of_service  \
0             0.00  0.100                  0.50           0.000000   
1             0.00  0.275                  0.50           0.121212   
2             0.00  0.275                  0.00           0.090909   
3             0.25  0.275                  0.25           0.242424   
4             0.00  0.250                  0.75           0.181818   

   KPIs_met >80%  awards_won?  avg_training_score  
0            1.0          0.0            0.633333  
1        

In [11]:
print("\nMaxAbs Scaled Data:")
print(scaled_data_maxabs.head())


MaxAbs Scaled Data:
          department     region   education gender recruitment_channel  \
0         Technology  region_26  Bachelor's      m            sourcing   
1                 HR   region_4  Bachelor's      f               other   
2  Sales & Marketing  region_13  Bachelor's      m               other   
3        Procurement   region_2  Bachelor's      f               other   
4            Finance  region_29  Bachelor's      m            sourcing   

   no_of_trainings       age  previous_year_rating  length_of_service  \
0         0.111111  0.400000                   0.6           0.029412   
1         0.111111  0.516667                   0.6           0.147059   
2         0.111111  0.516667                   0.2           0.117647   
3         0.333333  0.516667                   0.4           0.264706   
4         0.111111  0.500000                   0.8           0.205882   

   KPIs_met >80%  awards_won?  avg_training_score  
0            1.0          0.0            0.

### Handling categorical values


In [60]:
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

le = LabelEncoder()
oe = OrdinalEncoder()

In [42]:
df['department'] = le.fit_transform(df['department'])
df['region'] = le.fit_transform(df['region'])
df['gender'] = le.fit_transform(df['gender'])
df['recruitment_channel'] = le.fit_transform(df['recruitment_channel'])

In [63]:
df['education'] = oe.fit_transform(df[['education']])

In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23490 entries, 0 to 23489
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   department            23490 non-null  int32  
 1   region                23490 non-null  int32  
 2   education             23490 non-null  float64
 3   gender                23490 non-null  int32  
 4   recruitment_channel   23490 non-null  int32  
 5   no_of_trainings       23490 non-null  int64  
 6   age                   23490 non-null  int64  
 7   previous_year_rating  23490 non-null  float64
 8   length_of_service     23490 non-null  int64  
 9   KPIs_met >80%         23490 non-null  int64  
 10  awards_won?           23490 non-null  int64  
 11  avg_training_score    23490 non-null  int64  
dtypes: float64(2), int32(4), int64(6)
memory usage: 2.0 MB


In [78]:
df.head()

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,8,18,0.0,1,2,1,24,3.0,1,1,0,77
1,2,28,0.0,0,0,1,31,3.0,5,0,0,51
2,7,4,0.0,1,0,1,31,1.0,4,0,0,47
3,5,11,0.0,0,0,3,31,2.0,9,0,0,65
4,1,21,0.0,1,2,1,30,4.0,7,0,0,61


### TASK : Create Baseline ML Model for Binary Classification Problem
In this task:

We will be using 3 models for this problem -  XGBoost, CatBoost, LightGBM
- Run the dataset through XGBoost to predict the target variable and check the accuracy
- Run the dataset through CatBoost to predict the target variable and check the accuracy
- Run the dataset through LightGBM to predict the target variable and check the accuracy


In [73]:
X = df.drop(['KPIs_met >80%'], axis=1)
y = df['KPIs_met >80%']

In [74]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [75]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Train the XGBoost model
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_xgb = xgb_model.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"XGBoost Accuracy: {accuracy_xgb}")


Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.6998722860791826


In [76]:
from catboost import CatBoostClassifier

# Train the CatBoost model
catboost_model = CatBoostClassifier(silent=True)
catboost_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_catboost = catboost_model.predict(X_test)
accuracy_catboost = accuracy_score(y_test, y_pred_catboost)
print(f"CatBoost Accuracy: {accuracy_catboost}")


CatBoost Accuracy: 0.7041294167730949


In [77]:
import lightgbm as lgb

# Train the LightGBM model
lgb_model = lgb.LGBMClassifier()
lgb_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_lgb = lgb_model.predict(X_test)
accuracy_lgb = accuracy_score(y_test, y_pred_lgb)
print(f"LightGBM Accuracy: {accuracy_lgb}")


[LightGBM] [Info] Number of positive: 6715, number of negative: 12077
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002392 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 18792, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357333 -> initscore=-0.586959
[LightGBM] [Info] Start training from score -0.586959
LightGBM Accuracy: 0.7075351213282248


TASK
Report Results
In this task, you need to check for various scalers and check which gives better accuracy.

You can follow the below steps:

Iterate over the data scaling functions and ML Models to understand which model under which scaling technique is giving better results using the function in Task 1 and the baseline models in Task 2.
Document the accuracy using Confusion matrices and determine the best Scaling Function and best model for your dataset.

In [None]:
# Train the XGBoost model
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_xgb = xgb_model.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"XGBoost Accuracy: {accuracy_xgb}")


# Train the CatBoost model
catboost_model = CatBoostClassifier(silent=True)
catboost_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_catboost = catboost_model.predict(X_test)
accuracy_catboost = accuracy_score(y_test, y_pred_catboost)
print(f"CatBoost Accuracy: {accuracy_catboost}")


# Train the LightGBM model
lgb_model = lgb.LGBMClassifier()
lgb_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_lgb = lgb_model.predict(X_test)
accuracy_lgb = accuracy_score(y_test, y_pred_lgb)
print(f"LightGBM Accuracy: {accuracy_lgb}")


In [90]:
scaled_data_standard = scale_data(df, scaler_type='StandardScaler')
scaled_data_minmax = scale_data(df, scaler_type='MinMaxScaler')
scaled_data_robust = scale_data(df, scaler_type='RobustScaler')
scaled_data_maxabs = scale_data(df, scaler_type='MaxAbsScaler')


In [92]:
df.head(2)

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,8,18,0.0,1,2,1,24,3.0,1,1,0,77
1,2,28,0.0,0,0,1,31,3.0,5,0,0,51


In [103]:
scaled_data_standard['KPIs_met >80%'] = df['KPIs_met >80%']


### 1. StandardScaler

In [104]:
scaled_data_standard.head(2)

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,8,18,-0.637366,1,2,-0.423094,-1.40415,-0.257191,-1.1432,1,-0.152665,1.024263
1,2,28,-0.637366,0,0,-0.423094,-0.492612,-0.257191,-0.19259,0,-0.152665,-0.914377


In [84]:
def scaled_modelling(df):
    X = df.drop(['KPIs_met >80%'], axis=1)
    y = df['KPIs_met >80%']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the XGBoost model
    xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    xgb_model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred_xgb = xgb_model.predict(X_test)
    accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
    print(f"XGBoost Accuracy: {accuracy_xgb}")


    # Train the CatBoost model
    catboost_model = CatBoostClassifier(silent=True)
    catboost_model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred_catboost = catboost_model.predict(X_test)
    accuracy_catboost = accuracy_score(y_test, y_pred_catboost)
    print(f"CatBoost Accuracy: {accuracy_catboost}")


    # Train the LightGBM model
    lgb_model = lgb.LGBMClassifier()
    lgb_model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred_lgb = lgb_model.predict(X_test)
    accuracy_lgb = accuracy_score(y_test, y_pred_lgb)
    print(f"LightGBM Accuracy: {accuracy_lgb}")


In [105]:
scaled_modelling(scaled_data_standard)

Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.6998722860791826
CatBoost Accuracy: 0.7041294167730949
[LightGBM] [Info] Number of positive: 6715, number of negative: 12077
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000657 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 206
[LightGBM] [Info] Number of data points in the train set: 18792, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357333 -> initscore=-0.586959
[LightGBM] [Info] Start training from score -0.586959
LightGBM Accuracy: 0.7075351213282248


### 2. MinMaxScaler

In [108]:
scaled_data_minmax.head(2)

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,8,18,0.0,1,2,0.0,0.1,0.5,0.0,1.0,0.0,0.633333
1,2,28,0.0,0,0,0.0,0.275,0.5,0.121212,0.0,0.0,0.2


In [109]:
scaled_modelling(scaled_data_minmax)

Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.6998722860791826
CatBoost Accuracy: 0.7041294167730949
[LightGBM] [Info] Number of positive: 6715, number of negative: 12077
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001204 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 200
[LightGBM] [Info] Number of data points in the train set: 18792, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357333 -> initscore=-0.586959
[LightGBM] [Info] Start training from score -0.586959
LightGBM Accuracy: 0.7075351213282248


### 3. RobustScaler

In [111]:
scaled_data_robust.head(2)

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,8,18,0.0,1,2,0.0,-0.9,0.0,-1.0,1.0,0.0,0.68
1,2,28,0.0,0,0,0.0,-0.2,0.0,0.0,0.0,0.0,-0.36


In [112]:
scaled_modelling(scaled_data_robust)

Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.6998722860791826
CatBoost Accuracy: 0.7041294167730949
[LightGBM] [Info] Number of positive: 6715, number of negative: 12077
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001096 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 199
[LightGBM] [Info] Number of data points in the train set: 18792, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357333 -> initscore=-0.586959
[LightGBM] [Info] Start training from score -0.586959
LightGBM Accuracy: 0.7075351213282248


### 4. MaxAbsScaler

In [114]:
scaled_data_maxabs.head(2)

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,8,18,0.0,1,2,0.111111,0.4,0.6,0.029412,1.0,0.0,0.777778
1,2,28,0.0,0,0,0.111111,0.516667,0.6,0.147059,0.0,0.0,0.515152


In [115]:
scaled_modelling(scaled_data_maxabs)

Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.6998722860791826
CatBoost Accuracy: 0.7041294167730949
[LightGBM] [Info] Number of positive: 6715, number of negative: 12077
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000657 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 204
[LightGBM] [Info] Number of data points in the train set: 18792, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.357333 -> initscore=-0.586959
[LightGBM] [Info] Start training from score -0.586959
LightGBM Accuracy: 0.7075351213282248
