### LOAD, CLEAN, AND PREPROCESS

In [91]:
import pandas as pd
import numpy as np

# Load the three raw CSV files used throughout the notebook.
TRAINING = pd.read_csv("train.csv")
TESTING = pd.read_csv("test.csv") # Test set; it has no label column
AUGMENTED = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv") # Additional data to augment training

In [92]:
# Summary statistics of the numeric columns in the raw training data.
TRAINING.describe()

Unnamed: 0,id,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
count,1677.0,1677.0,1677.0,1677.0,1677.0,1677.0,1677.0,1677.0,1677.0,1677.0,...,1677.0,1677.0,1677.0,1677.0,1677.0,1677.0,1677.0,1677.0,1677.0,1677.0
mean,838.0,36.036971,892.749553,8.683959,2.937984,1.0,2.757901,67.79845,2.782349,1.998807,...,80.0,0.721527,10.7096,2.732856,2.769827,6.806798,4.144305,1.978533,4.186643,0.119261
std,484.252517,8.507112,374.496259,7.826143,1.039078,0.0,1.086835,19.435928,0.650664,1.083065,...,0.0,0.771259,7.255135,1.145271,0.647721,5.883282,3.583398,3.045716,3.576189,0.324192
min,0.0,18.0,107.0,1.0,1.0,1.0,1.0,30.0,1.0,1.0,...,80.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,419.0,30.0,589.0,2.0,2.0,1.0,2.0,51.0,2.0,1.0,...,80.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0,0.0
50%,838.0,35.0,890.0,7.0,3.0,1.0,3.0,69.0,3.0,2.0,...,80.0,1.0,9.0,3.0,3.0,5.0,3.0,1.0,3.0,0.0
75%,1257.0,41.0,1223.0,12.0,4.0,1.0,4.0,84.0,3.0,2.0,...,80.0,1.0,14.0,3.0,3.0,9.0,7.0,2.0,7.0,0.0
max,1676.0,60.0,3921.0,29.0,15.0,1.0,4.0,100.0,4.0,7.0,...,80.0,3.0,41.0,6.0,4.0,41.0,18.0,15.0,17.0,1.0


### DROP IDS AND DUPLICATES

In [93]:
# Drop the ID column from the training data
TRAINING = TRAINING.drop(columns=['id'])

# Drop exact duplicate rows (if any), keeping the first occurrence
TRAINING = TRAINING.drop_duplicates(subset=None, keep='first', inplace=False)
AUGMENTED = AUGMENTED.drop_duplicates(subset=None, keep='first', inplace=False)

# Encode the augmented label as 0/1 so it matches TRAINING['Attrition'].
# `map` + an explicit integer cast is used instead of `replace`: `replace`
# relies on implicit object -> int downcasting, which pandas has deprecated.
# Any value other than 'Yes'/'No' becomes NaN and makes the cast fail loudly
# instead of slipping an unexpected label into the training data.
AUGMENTED['Attrition'] = AUGMENTED['Attrition'].map({'Yes': 1, 'No': 0}).astype('int64')

### DROP CONSTANT (SINGLE-VALUED) COLUMNS

In [94]:
def dropOneDim(data: pd.DataFrame, Label: str) -> (pd.DataFrame, list):
    """Remove columns that hold a single distinct value.

    Args:
        data: Frame to filter; it is not modified.
        Label: Name of the label column. It is always kept and placed last
            in the returned frame.

    Returns:
        A pair ``(filtered_frame, feature_columns)``: the frame restricted to
        the surviving feature columns followed by ``Label``, and the list of
        those feature column names (label excluded).
    """
    one_dimensional_cols = []
    surviving = []
    # Single pass: route each column into "constant" or "surviving".
    for name in data.columns:
        if data[name].nunique() == 1:
            one_dimensional_cols.append(name)
        else:
            surviving.append(name)
    print("Dropped columns:", one_dimensional_cols)  # Print dropped columns

    # The label is re-appended at the end, so keep it out of the feature list.
    feature_columns = [name for name in surviving if name != Label]

    return data[feature_columns + [Label]], feature_columns


In [95]:
# Drop one-dimensional features.
# Each result is a column selection of the original frame; `.copy()` makes it
# an independent frame so the column assignments in later cells do not hit
# pandas' SettingWithCopy semantics.
TRAINING, remaining_columns = dropOneDim(TRAINING, "Attrition")
TRAINING = TRAINING.copy()

# Keep exactly the training feature columns in the test set (this also
# removes 'id', which was dropped from TRAINING earlier).
TESTING = TESTING[remaining_columns].copy()

# AUGMENTED's own column list goes in a separate name so `remaining_columns`
# keeps describing the TRAINING/TESTING feature set.
AUGMENTED, augmented_columns = dropOneDim(AUGMENTED, "Attrition")
AUGMENTED = AUGMENTED.copy()

Dropped columns: ['EmployeeCount', 'Over18', 'StandardHours']
Dropped columns: ['EmployeeCount', 'Over18', 'StandardHours']


### REORDER AND ENCODE THE ORDINAL FEATURE IN THE DATASET

In [96]:
ORDER = ["Non-Travel", "Travel_Rarely", "Travel_Frequently"]


def _encode_business_travel(column: pd.Series) -> pd.Series:
    """Map BusinessTravel labels to their position in ``ORDER`` (0, 1, 2).

    Values that are missing or not listed in ``ORDER`` get categorical code
    -1, which is converted to NaN so the median imputation in a later cell
    can handle it. ``replace(-1, None)`` must not be used for this: depending
    on the pandas version it either forward-fills from the previous row or
    stores a Python ``None`` and turns the column into object dtype.
    """
    ordered = pd.Categorical(column, categories=ORDER, ordered=True)
    codes = pd.Series(ordered.codes, index=column.index)
    return codes.replace(-1, np.nan)


# Encode as integers (NaN where the category is unknown)
TRAINING["BusinessTravel"] = _encode_business_travel(TRAINING["BusinessTravel"])
AUGMENTED["BusinessTravel"] = _encode_business_travel(AUGMENTED["BusinessTravel"])
TESTING["BusinessTravel"] = _encode_business_travel(TESTING["BusinessTravel"])


### SPECIFY FEATURE TYPES

In [97]:
# Split the TRAINING columns by dtype. These lists are computed from TRAINING
# only; CATEGORICALS is reused later as the column list for one-hot encoding.
# NOTE: NUMERICS is built from every numeric column, so it also contains the
# 'Attrition' label and the integer-encoded 'BusinessTravel'.
NUMERICS = TRAINING.select_dtypes(include=["number"]).columns.tolist()
CATEGORICALS = TRAINING.select_dtypes(include=["object", "category"]).columns.tolist()
print (f'Categorical features: {CATEGORICALS}')
print (f'Numeric Features: {NUMERICS}')

Categorical features: ['Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'OverTime']
Numeric Features: ['Age', 'BusinessTravel', 'DailyRate', 'DistanceFromHome', 'Education', 'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'Attrition']


### FILL IN MISSING OBSERVATIONS IN NUMERIC FEATURES

In [98]:
def impute_median(df, exclude_columns=None):
    """Fill missing values in the numeric columns of ``df`` with the column median.

    The frame is modified in place and nothing is returned.

    Args:
        df: DataFrame to impute. Only columns selected by
            ``select_dtypes(include=['number'])`` are touched.
        exclude_columns: Optional collection of column names to leave
            untouched (for example the label). Defaults to no exclusions.
    """
    # Use None as the default rather than a shared mutable list.
    excluded = [] if exclude_columns is None else exclude_columns
    numeric_features = [
        col for col in df.select_dtypes(include=['number']).columns
        if col not in excluded
    ]
    for column in numeric_features:
        median_value = df[column].median()
        # Assign back instead of `df[column].fillna(..., inplace=True)`: the
        # chained in-place form works on an intermediate object, is deprecated,
        # and has no effect on `df` once pandas Copy-on-Write is enabled.
        df[column] = df[column].fillna(median_value)

# Impute every frame with its own column medians. The label column is
# excluded for the two labelled frames; TESTING has no label to exclude.
impute_median(TRAINING, exclude_columns=['Attrition'])
impute_median(TESTING)
impute_median(AUGMENTED, exclude_columns=['Attrition'])

### HANDLE OUTLIERS BY CAPPING AT THE 1ST AND 99TH PERCENTILES

In [99]:
def cap_outliers(df, exclude_columns=None, lower_quantile=0.01, upper_quantile=0.99):
    """Clip each numeric column of ``df`` to its own quantile range.

    The frame is modified in place and nothing is returned.

    Args:
        df: DataFrame to cap. Only columns selected by
            ``select_dtypes(include=['number'])`` are touched.
        exclude_columns: Optional collection of column names to leave
            untouched (for example the label). Defaults to no exclusions.
        lower_quantile: Quantile used as the lower cap for every column.
        upper_quantile: Quantile used as the upper cap for every column.
    """
    # Use None as the default rather than a shared mutable list.
    excluded = [] if exclude_columns is None else exclude_columns
    numeric_features = [
        col for col in df.select_dtypes(include=['number']).columns
        if col not in excluded
    ]
    for column in numeric_features:
        # Bounds are computed per column from the frame being capped.
        lower_bound = df[column].quantile(lower_quantile)
        upper_bound = df[column].quantile(upper_quantile)
        df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)

# Cap every frame using its own quantiles. The label column is named
# 'Attrition' (there is no 'Label' column), so that is the name that must be
# excluded; otherwise the target itself gets clipped.
cap_outliers(TRAINING, exclude_columns=['Attrition'])
cap_outliers(TESTING)
cap_outliers(AUGMENTED, exclude_columns=['Attrition'])

In [100]:
# Summary statistics again, to compare against the raw-data summary now that
# the imputation and capping cells have run.
TRAINING.describe()

Unnamed: 0,Age,BusinessTravel,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,...,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
count,1677.0,1677.0,1677.0,1677.0,1677.0,1677.0,1677.0,1677.0,1677.0,1677.0,...,1677.0,1677.0,1677.0,1677.0,1677.0,1677.0,1677.0,1677.0,1677.0,1677.0
mean,36.034586,1.080501,891.352415,8.683959,2.932021,2.757901,67.806202,2.782349,1.997615,2.79189,...,2.751342,0.721527,10.690519,2.732856,2.769827,6.77102,4.126416,1.971377,4.166369,0.119261
std,8.466838,0.473733,367.02888,7.826143,0.997685,1.086835,19.421036,0.650664,1.078647,1.097396,...,1.077905,0.771259,7.15188,1.145271,0.647721,5.728028,3.523125,3.016129,3.512727,0.324192
min,19.0,0.0,130.0,1.0,1.0,1.0,31.0,1.0,1.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,30.0,1.0,589.0,2.0,2.0,2.0,51.0,2.0,1.0,2.0,...,2.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0,0.0
50%,35.0,1.0,890.0,7.0,3.0,3.0,69.0,3.0,2.0,3.0,...,3.0,1.0,9.0,3.0,3.0,5.0,3.0,1.0,3.0,0.0
75%,41.0,1.0,1223.0,12.0,4.0,4.0,84.0,3.0,2.0,4.0,...,4.0,1.0,14.0,3.0,3.0,9.0,7.0,2.0,7.0,0.0
max,58.0,2.0,1485.0,29.0,5.0,4.0,100.0,4.0,5.0,4.0,...,4.0,3.0,33.0,6.0,4.0,29.0,15.0,14.0,14.0,1.0


### CHECK FOR EXTRA FEATURES PRESENT ONLY IN AUGMENTED AND DROP THEM

In [101]:
# Compare the column sets in both directions. A notebook cell only displays
# its last bare expression, so both differences are named and printed
# explicitly instead of silently discarding the first one.
only_in_training = set(TRAINING.columns) - set(AUGMENTED.columns)
only_in_augmented = set(AUGMENTED.columns) - set(TRAINING.columns)
print("Only in TRAINING:", only_in_training)
print("Only in AUGMENTED:", only_in_augmented)

{'EmployeeNumber'}

In [102]:
# Drop EmployeeNumber from AUGMENTED: it is the only column reported above as
# present in AUGMENTED but not in TRAINING.
AUGMENTED = AUGMENTED.drop(columns=['EmployeeNumber'])

### EXTRACT FEATURES

In [103]:
# Derive new features
def feature_extraction(df: pd.DataFrame) -> pd.DataFrame:
    """Add derived features to ``df`` and return it.

    The new columns are written onto ``df`` itself; the returned object is the
    same frame, not a copy.

    Args:
        df: Frame containing ``Age``, ``YearsAtCompany``, ``MonthlyIncome``,
            ``JobSatisfaction``, ``TrainingTimesLastYear``, ``MonthlyRate``,
            ``YearsInCurrentRole`` and ``YearsSinceLastPromotion``.

    Returns:
        ``df`` with the columns ``Tenure``, ``IncomePerYear``,
        ``JobSatisfactionRatio``, ``TotalTrainingTime``,
        ``AverageMonthlyRate``, ``JobRoleTenure`` and ``PromotionFrequency``
        added. The ratio columns are NaN wherever ``YearsAtCompany`` is 0.
    """
    years_at_company = df['YearsAtCompany']
    # YearsAtCompany can be 0. Dividing by it directly gives +inf (x / 0) for
    # some ratios and NaN (0 / 0) for others in the same row. Masking zero
    # years as missing makes every ratio uniformly NaN for those rows and
    # keeps inf out of the model inputs.
    years_divisor = years_at_company.where(years_at_company != 0)

    # Despite the column name, this is age minus years at the company.
    df['Tenure'] = df['Age'] - years_at_company
    df['IncomePerYear'] = df['MonthlyIncome'] / years_divisor
    df['JobSatisfactionRatio'] = df['JobSatisfaction'] / years_divisor
    # A product, not a ratio: zero years legitimately gives zero here.
    df['TotalTrainingTime'] = df['TrainingTimesLastYear'] * years_at_company
    df['AverageMonthlyRate'] = df['MonthlyRate'] / years_divisor
    df['JobRoleTenure'] = df['YearsInCurrentRole'] / years_divisor
    df['PromotionFrequency'] = df['YearsSinceLastPromotion'] / years_divisor

    return df

In [104]:
# Apply the function on each dataset.
# feature_extraction writes the new columns onto the frame it receives and
# returns that same frame, so each NEW_* name refers to the same object as
# its source frame (TRAINING / TESTING / AUGMENTED) at this point.
NEW_TRAINING = feature_extraction(TRAINING)
NEW_TESTING = feature_extraction(TESTING)
NEW_AUGMENTED = feature_extraction(AUGMENTED)

### ENCODE CATEGORICALS WITH ONE-HOT ENCODING

In [105]:
# Perform one-hot encoding on categorical features
NEW_TRAINING = pd.get_dummies(NEW_TRAINING, columns=CATEGORICALS)

# Encoding each frame independently only produces identical columns when
# every category occurs in every frame. Align the other two frames to the
# training layout: dummy columns a frame lacks are added as all-zero, and
# columns never seen in training are dropped.
FEATURE_COLUMNS = [col for col in NEW_TRAINING.columns if col != 'Attrition']
NEW_TESTING = (
    pd.get_dummies(NEW_TESTING, columns=CATEGORICALS)
    .reindex(columns=FEATURE_COLUMNS, fill_value=0)
)
NEW_AUGMENTED = (
    pd.get_dummies(NEW_AUGMENTED, columns=CATEGORICALS)
    .reindex(columns=list(NEW_TRAINING.columns), fill_value=0)
)

In [108]:
# Drop every row that contains a NaN in any column. The ratio features
# divide by YearsAtCompany, so rows where that division produced NaN are
# removed from the labelled frames here. NEW_TESTING is not filtered in this
# cell.
NEW_TRAINING = NEW_TRAINING.dropna()
NEW_AUGMENTED = NEW_AUGMENTED.dropna()


### DATA SAMPLING (STRATIFIED K FOLD) AND MODELLING

In [114]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, mean_squared_error
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

In [115]:
# Define the number of folds
n_splits = 5

# Initialize StratifiedKFold (shuffled, with a fixed seed so splits repeat).
# The same splitter is used for the outer loop and inside both grid searches.
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Define hyperparameters for grid search
# CatBoost grid: 2 * 3 * 3 * 3 = 54 combinations.
catboost_params = {
    'iterations': [100, 200],
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.1, 0.2],
    'l2_leaf_reg': [1, 3, 5]
}

# XGBoost grid: 2 * 3 * 3 * 2 = 36 combinations.
xgboost_params = {
    'n_estimators': [100, 200],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0]
}

# Initialize models
catboost_model = CatBoostClassifier(silent=True)
xgboost_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Initialize GridSearchCV: candidates are ranked by ROC AUC. With cv=skf each
# candidate is fitted n_splits times on every call to .fit().
catboost_grid = GridSearchCV(catboost_model, catboost_params, scoring='roc_auc', cv=skf)
xgboost_grid = GridSearchCV(xgboost_model, xgboost_params, scoring='roc_auc', cv=skf)

# Lists to store results (one dict per fold and model, appended in the loop below)
results = []

def _evaluate_grid(model_name, grid, fold_number, X_train, y_train, X_val, y_val):
    """Tune ``grid`` on the training split and score it on the validation split.

    Args:
        model_name: Label stored in the result row and printed ('CatBoost' / 'XGBoost').
        grid: A GridSearchCV instance; it is (re)fitted by this call.
        fold_number: 1-based fold number stored in the result row.
        X_train, y_train: Training features and labels passed to ``grid.fit``.
        X_val, y_val: Held-out features and labels used for the metrics.

    Returns:
        A dict with the keys 'Fold', 'Model', 'Best Hyperparameters', 'AUC',
        'F1', 'Accuracy' and 'MSE'.
    """
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_val)
    y_pred_proba = best_model.predict_proba(X_val)[:, 1]

    fold_result = {
        'Fold': fold_number,
        'Model': model_name,
        'Best Hyperparameters': grid.best_params_,
        # AUC is computed from probabilities; the other metrics use hard labels.
        'AUC': roc_auc_score(y_val, y_pred_proba),
        'F1': f1_score(y_val, y_pred),
        'Accuracy': accuracy_score(y_val, y_pred),
        'MSE': mean_squared_error(y_val, y_pred)
    }

    print(f"Fold {fold_number} - {model_name}")
    print(f"Best Hyperparameters: {grid.best_params_}")
    print(f"AUC: {fold_result['AUC']:.4f}, F1: {fold_result['F1']:.4f}, Accuracy: {fold_result['Accuracy']:.4f}")
    return fold_result


# Perform K-fold stratified sampling and augment the data
for fold, (train_index, test_index) in enumerate(skf.split(NEW_TRAINING, NEW_TRAINING['Attrition'])):
    # Split the training data into train and validation sets
    train_fold = NEW_TRAINING.iloc[train_index]
    val_fold = NEW_TRAINING.iloc[test_index]

    # Augment only the training part; the validation fold contains
    # competition rows exclusively.
    train_fold_augmented = pd.concat([train_fold, NEW_AUGMENTED], ignore_index=True)

    X_train = train_fold_augmented.drop('Attrition', axis=1)
    y_train = train_fold_augmented['Attrition']
    X_val = val_fold.drop('Attrition', axis=1)
    y_val = val_fold['Attrition']

    # Fit and evaluate both models with identical code; CatBoost runs first,
    # then XGBoost, so `results` keeps the same row order as before.
    for model_name, grid in (('CatBoost', catboost_grid), ('XGBoost', xgboost_grid)):
        results.append(
            _evaluate_grid(model_name, grid, fold + 1, X_train, y_train, X_val, y_val)
        )
    print("\n")

Fold 1 - CatBoost
Best Hyperparameters: {'depth': 4, 'iterations': 100, 'l2_leaf_reg': 5, 'learning_rate': 0.1}
AUC: 0.8926, F1: 0.3774, Accuracy: 0.8985
Fold 1 - XGBoost
Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'subsample': 0.8}
AUC: 0.8623, F1: 0.4068, Accuracy: 0.8923


Fold 2 - CatBoost
Best Hyperparameters: {'depth': 8, 'iterations': 100, 'l2_leaf_reg': 5, 'learning_rate': 0.1}
AUC: 0.7905, F1: 0.0000, Accuracy: 0.8800
Fold 2 - XGBoost
Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'subsample': 0.8}
AUC: 0.7971, F1: 0.1277, Accuracy: 0.8738


Fold 3 - CatBoost
Best Hyperparameters: {'depth': 4, 'iterations': 100, 'l2_leaf_reg': 5, 'learning_rate': 0.1}
AUC: 0.8286, F1: 0.2326, Accuracy: 0.8985
Fold 3 - XGBoost
Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'subsample': 0.8}
AUC: 0.8159, F1: 0.3404, Accuracy: 0.9046


Fold 4 - CatBoost
Best Hyperparameters: {'depth': 4, 'i

### PRINT TRAINING RESULTS

In [116]:
# Create a DataFrame to display the results: one row per (fold, model) with
# the selected hyperparameters and the validation metrics.
results_df = pd.DataFrame(results)
print(results_df)

   Fold     Model                               Best Hyperparameters  \
0     1  CatBoost  {'depth': 4, 'iterations': 100, 'l2_leaf_reg':...   
1     1   XGBoost  {'learning_rate': 0.1, 'max_depth': 4, 'n_esti...   
2     2  CatBoost  {'depth': 8, 'iterations': 100, 'l2_leaf_reg':...   
3     2   XGBoost  {'learning_rate': 0.1, 'max_depth': 4, 'n_esti...   
4     3  CatBoost  {'depth': 4, 'iterations': 100, 'l2_leaf_reg':...   
5     3   XGBoost  {'learning_rate': 0.1, 'max_depth': 4, 'n_esti...   
6     4  CatBoost  {'depth': 4, 'iterations': 100, 'l2_leaf_reg':...   
7     4   XGBoost  {'learning_rate': 0.1, 'max_depth': 4, 'n_esti...   
8     5  CatBoost  {'depth': 4, 'iterations': 100, 'l2_leaf_reg':...   
9     5   XGBoost  {'learning_rate': 0.1, 'max_depth': 4, 'n_esti...   

        AUC        F1  Accuracy       MSE  
0  0.892637  0.377358  0.898462  0.101538  
1  0.862265  0.406780  0.892308  0.107692  
2  0.790465  0.000000  0.880000  0.120000  
3  0.797097  0.127660  0.873846

### EXTRACT BEST HYPER-PARAMETERS FROM BOTH MODELS

In [117]:
def _best_params_by_auc(results_table, model_name):
    """Return the hyperparameters of the fold where ``model_name`` scored the highest AUC."""
    model_rows = results_table[results_table['Model'] == model_name]
    return model_rows.loc[model_rows['AUC'].idxmax(), 'Best Hyperparameters']


# Pick each model's hyperparameters from its best-scoring fold rather than a
# hard-coded fold number, so the selection still holds when the data, the
# seed or the grids change.
best_catboost_params = _best_params_by_auc(results_df, 'CatBoost')
best_xgboost_params = _best_params_by_auc(results_df, 'XGBoost')

print("Best CatBoost Hyperparameters:", best_catboost_params)
print("Best XGBoost Hyperparameters:", best_xgboost_params)

Best CatBoost Hyperparameters: {'depth': 4, 'iterations': 100, 'l2_leaf_reg': 5, 'learning_rate': 0.1}
Best XGBoost Hyperparameters: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'subsample': 0.8}


### TRAIN THE MODEL ON FULL + AUGMENTATION DATA

In [119]:
# Combine the training and augmentation data (all rows, no hold-out)
full_train_data = pd.concat([NEW_TRAINING, NEW_AUGMENTED], ignore_index=True)
X_full_train = full_train_data.drop('Attrition', axis=1)
y_full_train = full_train_data['Attrition']

# Retrain CatBoost model with the best hyperparameters.
# This rebinds catboost_best_model, replacing the per-fold estimator
# assigned in the cross-validation cell.
catboost_best_model = CatBoostClassifier(**best_catboost_params, silent=True)
catboost_best_model.fit(X_full_train, y_full_train)



<catboost.core.CatBoostClassifier at 0x273a286bdd0>

In [120]:
# Retrain XGBoost model with the best hyperparameters on the same combined
# data (X_full_train / y_full_train) used for CatBoost.
xgboost_best_model = XGBClassifier(**best_xgboost_params, use_label_encoder=False, eval_metric='logloss')
xgboost_best_model.fit(X_full_train, y_full_train)

### PREDICT ON THE TEST SET (NO METRICS CAN BE COMPUTED BECAUSE THE TEST SET HAS NO LABELS)

In [3]:
# NOTE: this cell needs NEW_TESTING and both fitted models from the cells
# above. The NameError recorded below comes from running it in a kernel where
# those cells had not been executed; run the notebook top to bottom.
X_test = NEW_TESTING
# Predict with the CatBoost model on the test set (hard labels and P(Attrition = 1))
y_pred_catboost_test = catboost_best_model.predict(X_test)
y_pred_proba_catboost_test = catboost_best_model.predict_proba(X_test)[:, 1]

# Predict with the XGBoost model on the test set (hard labels and P(Attrition = 1))
y_pred_xgboost_test = xgboost_best_model.predict(X_test)
y_pred_proba_xgboost_test = xgboost_best_model.predict_proba(X_test)[:, 1]


NameError: name 'NEW_TESTING' is not defined

### EXPORT SUBMISSION 

In [127]:
# Re-read the raw test file to recover the 'id' column, which TESTING no
# longer carries after it was restricted to the training feature columns.
test = pd.read_csv("test.csv")
id_column = test['id']

# Combine the id column with the predicted probabilities.
# The pairing is positional, so it relies on the predictions being in the
# same row order as test.csv (no cell above removes rows from TESTING).
predictions_df = pd.DataFrame({
    'id': id_column,
    'Attrition': y_pred_proba_catboost_test
})

# Only the CatBoost probabilities are written out by this cell.
predictions_df.to_csv('catboost_predictions.csv', index=False)

**XGBOOST WAS ABLE TO GET 0.88 PRIVATE AND 0.92 PUBLIC SCORE ON KAGGLE**  
**CATBOOST WAS ABLE TO GET 0.89 PRIVATE AND 0.93 PUBLIC SCORE ON KAGGLE**

### SAVE MODEL FOR LATER

In [128]:
import joblib

# Save the CatBoost model (the one refitted on training + augmentation data)
joblib.dump(catboost_best_model, 'catboost_best_model.pkl')

# Save the XGBoost model (the one refitted on training + augmentation data)
joblib.dump(xgboost_best_model, 'xgboost_best_model.pkl')

['xgboost_best_model.pkl']

In [2]:
import joblib
# Reload the saved CatBoost model to confirm the file can be read back. The
# result is only displayed, not assigned to a name.
# NOTE(review): joblib.load unpickles the file, so only load files you trust.
joblib.load('catboost_best_model.pkl')

<catboost.core.CatBoostClassifier at 0x2116a9dff90>