In [3]:
# Step 1: Import Necessary Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix
from matplotlib.colors import ListedColormap
import seaborn as sns
import warnings; warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import lightgbm as lgb
import xgboost as xgb
# from catboost import CatBoostRegressor

In [4]:
# Step 2: Load and Explore Data
# Load datasets
X_train = pd.read_csv("../04_modelling/dataset/X_train.csv")
y_train = pd.read_csv("../04_modelling/dataset/y_train.csv")
# X_val = pd.read_csv("../04_modelling/dataset/X_val.csv")
# y_val = pd.read_csv("../04_modelling/dataset/y_val.csv")
X_test = pd.read_csv("../04_modelling/dataset/X_test.csv")
y_test = pd.read_csv("../04_modelling/dataset/y_test.csv")

In [5]:
X_train.shape

(5459, 50)

In [6]:
target_names = y_train['yearly_compensation']

In [7]:
target_names = target_names.to_numpy()

In [70]:
# def run_classifier(clf, param_grid, title):
#     # -----------------------------------------------------
#     cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=123)
#     # Randomized grid search
#     n_iter_search = 10
#     gs = RandomizedSearchCV(
#         clf,
#         param_distributions=param_grid,
#         n_iter=n_iter_search,
#         cv=cv,
#         scoring='accuracy'
#     )
#     # -----------------------------------------------------
#     # Train model
#     gs.fit(X_train, y_train)  
#     print("The best parameters are %s" % (gs.best_params_)) 
#     # Predict on test set
#     y_pred = gs.best_estimator_.predict(X_test)
#     # Get Probability estimates
#     y_prob = gs.best_estimator_.predict_proba(X_test)[:, 1]
#     # -----------------------------------------------------
#     print('Accuracy score: %.2f%%' % (accuracy_score(y_test, y_pred)*100))  
#     print('Precision score: %.2f%%' % (precision_score(y_test, y_pred, average='weighted')*100))
#     print('Recall score: %.2f%%' % (recall_score(y_test, y_pred, average='weighted')*100))
#     # -----------------------------------------------------
    # Plot confusion matrix
    # fig, [ax1, ax2] = plt.subplots(1, 2, figsize=(10, 5))
    # cm = confusion_matrix(y_test, y_pred)
    
    # # Ensure target names match unique classes
    # target_names = sorted(set(y_test))  # Unique classes
    
    # sns.heatmap(cm, annot=True, cbar=False, fmt="d", linewidths=.5, cmap="Blues", ax=ax1)
    # ax1.set_title("Confusion Matrix")
    # ax1.set_xlabel("Predicted class")
    # ax1.set_ylabel("Actual class")
    # ax1.set_xticklabels(target_names, rotation=45)
    # ax1.set_yticklabels(target_names)
    # fig.tight_layout()

In [71]:
def run_classifier(clf, param_grid, title):
    # -----------------------------------------------------
    # Cross-Validation Setup
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)  # 5-fold CV for better generalization
    n_iter_search = 10  # Number of parameter combinations for RandomizedSearch
    
    gs = RandomizedSearchCV(
        clf,
        param_distributions=param_grid,
        n_iter=n_iter_search,
        cv=cv,
        scoring='accuracy',
        return_train_score=True
    )
    # -----------------------------------------------------
    # Perform Cross-Validation and Hyperparameter Tuning
    gs.fit(X_train, y_train)
    print(f"\n--- Cross-Validation Results ({title}) ---")
    print("The best parameters are:", gs.best_params_)
    print("Mean cross-validation accuracy: %.2f%%" % (gs.best_score_ * 100))
    
    # -----------------------------------------------------
    # Evaluate Model on Test and Validation Sets
    print("\n--- Test and Validation Results ---")
    
    # Predict on Test Set
    y_test_pred = gs.best_estimator_.predict(X_test)
    y_test_prob = gs.best_estimator_.predict_proba(X_test)[:, 1]
    
    # Predict on Validation Set
    # y_val_pred = gs.best_estimator_.predict(X_val)
    # y_val_prob = gs.best_estimator_.predict_proba(X_val)[:, 1]
    
    # Test Set Metrics
    print("\n--- Test Metrics ---")
    print('Accuracy: %.2f%%' % (accuracy_score(y_test, y_test_pred) * 100))
    print('Precision: %.2f%%' % (precision_score(y_test, y_test_pred, average='weighted') * 100))
    print('Recall: %.2f%%' % (recall_score(y_test, y_test_pred, average='weighted') * 100))
    
    # Validation Set Metrics
    # print("\n--- Validation Metrics ---")
    # print('Accuracy: %.2f%%' % (accuracy_score(y_val, y_val_pred) * 100))
    # print('Precision: %.2f%%' % (precision_score(y_val, y_val_pred, average='weighted') * 100))
    # print('Recall: %.2f%%' % (recall_score(y_val, y_val_pred, average='weighted') * 100))
    
    # -----------------------------------------------------
    # Confusion Matrices (optional for analysis)
    # print("\nConfusion Matrix (Test):")
    # print(confusion_matrix(y_test, y_test_pred))
    # print("\nConfusion Matrix (Validation):")
    # print(confusion_matrix(y_val, y_val_pred))

In [12]:
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

def run_regressor(regressor, param_grid, title):
    # -----------------------------------------------------
    # Cross-Validation Setup
    cv = KFold(n_splits=5, shuffle=True, random_state=123)  # K-Fold CV for regression
    n_iter_search = 10  # Number of parameter combinations for RandomizedSearch

    gs = RandomizedSearchCV(
        regressor,
        param_distributions=param_grid,
        n_iter=n_iter_search,
        cv=cv,
        scoring='neg_root_mean_squared_error',  # For regression, use RMSE
        return_train_score=True
    )
    # -----------------------------------------------------
    # Perform Cross-Validation and Hyperparameter Tuning
    gs.fit(X_train, y_train)
    print(f"\n--- Cross-Validation Results ({title}) ---")
    print("The best parameters are:", gs.best_params_)
    print("Mean cross-validation RMSE: %.4f" % (-gs.best_score_))
    
    # -----------------------------------------------------
    # Evaluate Model on Test Set
    print("\n--- Test Results ---")
    
    # Predict on Test Set
    y_test_pred = gs.best_estimator_.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    mae = mean_absolute_error(y_test, y_test_pred)
    r2 = r2_score(y_test, y_test_pred)
    # mape = np.mean(np.abs((y_test - y_test_pred) / y_test)) * 100  # Mean Absolute Percentage Error

    print('Test RMSE: %.4f' % rmse)
    print('Test MAE: %.4f' % mae)
    print('Test R² (Accuracy): %.2f%%' % (r2 * 100))
    # print('Test MAPE: %.2f%%' % mape)

In [13]:
# Define LightGBM parameters for tuning
param_grid_lgbm = {
    'num_leaves': [31, 50, 70],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 500, 1000],
    'bagging_fraction': [0.5, 0.6, 0.8],
    'feature_fraction': [0.5, 0.7, 0.9]
}

# Initialize LightGBM regressor
lgbm_regressor = lgb.LGBMRegressor(
    objective="regression",
    metric="rmse",
    bagging_seed=42,
    verbosity=-1,
    random_state=42
)

# Call the helper function
run_regressor(lgbm_regressor, param_grid_lgbm, "LightGBM Regressor")


--- Cross-Validation Results (LightGBM Regressor) ---
The best parameters are: {'num_leaves': 31, 'n_estimators': 1000, 'learning_rate': 0.1, 'feature_fraction': 0.7, 'bagging_fraction': 0.8}
Mean cross-validation RMSE: 2.0086

--- Test Results ---
Test RMSE: 1.9360
Test MAE: 1.4020
Test R² (Accuracy): 93.47%


In [72]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()

param_grid_lr = {'penalty': ['l2'],
              'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}

run_classifier(lr, param_grid_lr, 'Logistic Regression')


--- Cross-Validation Results (Logistic Regression) ---
The best parameters are: {'solver': 'lbfgs', 'penalty': 'l2'}
Mean cross-validation accuracy: 70.88%

--- Test and Validation Results ---

--- Test Metrics ---
Accuracy: 73.97%
Precision: 72.85%
Recall: 73.97%


In [73]:
from sklearn.tree import DecisionTreeClassifier
import numpy as np

dtree = DecisionTreeClassifier()

param_grid_dtree = {'criterion': ['gini', 'entropy'],
              'splitter': ['best', 'random'],
              'max_depth': np.arange(1, 20, 2),
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4, 10],
              'max_features': ['auto', 'sqrt', 'log2', None]}

run_classifier(dtree, param_grid_dtree, "Decision Tree")


--- Cross-Validation Results (Decision Tree) ---
The best parameters are: {'splitter': 'random', 'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': None, 'max_depth': 11, 'criterion': 'gini'}
Mean cross-validation accuracy: 42.09%

--- Test and Validation Results ---

--- Test Metrics ---
Accuracy: 41.86%
Precision: 40.10%
Recall: 41.86%


In [None]:
dtree_2 = DecisionTreeClassifier()

param_grid_dtree_2 = {
    'criterion': ['gini', 'entropy', 'log_loss'],  # Include 'log_loss' for classification
    'splitter': ['best', 'random'],
    'max_depth': np.arange(1, 50, 5),  # Increase depth range
    'min_samples_split': [2, 5, 10, 20],  # Larger values for more generalization
    'min_samples_leaf': [1, 2, 4, 10, 20],  # Larger leaves for pruning
    'max_features': ['auto', 'sqrt', 'log2', None],  # Adjust based on dataset size
    'class_weight': [None, 'balanced']  # Try balancing class weights
}

run_classifier(dtree, param_grid_dtree_2, "Decision Tree")


--- Cross-Validation Results (Decision Tree) ---
The best parameters are: {'splitter': 'random', 'min_samples_split': 20, 'min_samples_leaf': 2, 'max_features': None, 'max_depth': 21, 'criterion': 'gini', 'class_weight': 'balanced'}
Mean cross-validation accuracy: 37.44%

--- Test and Validation Results ---

--- Test Metrics ---
Accuracy: 36.35%
Precision: 42.44%
Recall: 36.35%


In [75]:
dtree_3 = DecisionTreeClassifier()

param_grid_dtree_3 = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random'],
    'max_depth': np.arange(1, 30, 2),  # Extend range
    'min_samples_split': [2, 5, 10, 20, 50],  # Include larger splits
    'min_samples_leaf': [1, 2, 4, 10, 20],  # Include larger leaf sizes
    'max_features': [None, 'sqrt', 'log2'],
    'class_weight': [None, 'balanced'],  # Try balancing classes
}

run_classifier(dtree_3, param_grid_dtree_3, "Decision Tree")


--- Cross-Validation Results (Decision Tree) ---
The best parameters are: {'splitter': 'best', 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': None, 'max_depth': 27, 'criterion': 'entropy', 'class_weight': 'balanced'}
Mean cross-validation accuracy: 42.25%

--- Test and Validation Results ---

--- Test Metrics ---
Accuracy: 44.87%
Precision: 46.58%
Recall: 44.87%


In [76]:
dtree_3

In [None]:
# from sklearn import tree
# target_names = sorted(set(y_test))
# fig = plt.figure(figsize=(25,20))
# _ = tree.plot_tree(dtree_3,
#                    feature_names=X_train.columns,
#                    class_names=target_names,
#                    filled=True)

In [None]:
dtree_4 = DecisionTreeClassifier()

param_grid_dtree_4 = {
    'criterion': ['gini', 'entropy', 'log_loss'],  # Different impurity measures
    'splitter': ['best', 'random'],  # Best or random split selection
    'max_depth': [2, 3, 5, 10, 20],  # Fix range definition
    'min_samples_split': [2, 5, 10, 20, 50, 100],  # Extended to larger values for robustness
    'min_samples_leaf': [1, 5, 10, 20, 50, 100],  # Include smaller leaf sizes for granular splits
    'max_features': [None, 'sqrt', 'log2'],  # Different feature selection strategies
    'class_weight': [None, 'balanced'],  # Account for imbalanced classes
}

run_classifier(dtree_4, param_grid_dtree_4, "Decision Tree")

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

param_grid_rf = {'n_estimators': [100, 200],
              'max_depth': [10, 20, 100, None],
              'max_features': ['auto', 'sqrt', None],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4, 10],
              'bootstrap': [True, False],
              'criterion': ['gini', 'entropy']}

run_classifier(rf, param_grid_rf, 'Random Forest')

In [None]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier()

param_grid = {'hidden_layer_sizes': [(10,), (50,), (10, 10), (50, 50)],
             'activation': ['identity', 'logistic', 'tanh', 'relu'],
             'solver': ['lbfgs', 'sgd', 'adam'],
             'alpha': np.logspace(-5, 3, 5),
             'learning_rate': ['constant', 'invscaling','adaptive'],
             'max_iter': [100, 500, 1000]}

run_classifier(mlp, param_grid, 'Neural Net')