In [None]:
import time
import warnings
import sys
import copy # For saving best model state
from typing import List, Tuple, Dict, Any, Optional

import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from interpret.glassbox import ExplainableBoostingClassifier
from sklearn.svm import SVC

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform, loguniform

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score
)

import matplotlib.pyplot as plt
import seaborn as sns


# Fixed Hyperparameters

In [None]:
# Global configuration shared by preprocessing and the hyperparameter search.

# Seed reused for every stochastic component so runs are repeatable.
RANDOM_STATE: int = 42
# Passed to PCA as n_components: a float in (0, 1) means "keep enough components
# to explain this fraction of the variance".
PCA_EXPLAINED_VARIANCE_THRESHOLD: float = 0.95
# Number of target classes handed to the XGBoost / LightGBM constructors.
NUM_CLASSES: int = 5
# -1 asks joblib-backed estimators and the search to use all available cores.
N_JOBS: int = -1
N_ITER_SEARCH: int = 50 # Number of parameter settings sampled per model (adjust based on time/resources)
CV_FOLDS: int = 3 # Number of cross-validation folds for search
SCORING_METRIC: str = 'f1_weighted' # Metric to optimize during search

In [None]:
# Pick the best available accelerator instead of hard-coding Apple's MPS backend,
# so the notebook also runs on CUDA machines and on CPU-only hosts.
# `torch.backends.mps` does not exist on older PyTorch builds, hence the getattr guard.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    DEVICE = torch.device("mps")
elif torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# Neural-network training configuration.
EPOCHS: int = 50
BATCH_SIZE: int = 128
LEARNING_RATE: float = 1e-3
EARLY_STOPPING_PATIENCE: int = 5
DEVICE

device(type='mps')

In [None]:
# Seed NumPy's global RNG and PyTorch's default generator with the shared seed,
# then seed every CUDA device as well when CUDA is present.
for seed_rng in (np.random.seed, torch.manual_seed):
    seed_rng(RANDOM_STATE)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_STATE)

# Load Dataset

In [None]:
# Read the six pre-split CSV files (features and targets for train / val / test)
# in one pass; the order of the paths matches the order of the names on the left.
X_train, y_train, X_val, y_val, X_test, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./Oversampled_X_train.csv",
        "./Oversampled_y_train.csv",
        "./X_val.csv",
        "./y_val.csv",
        "./X_test.csv",
        "./y_test.csv",
    )
)

In [None]:
# Print the (rows, columns) shape of every split, features before targets.
for split_frame in (X_train, y_train, X_val, y_val, X_test, y_test):
    print(split_frame.shape)

(269365, 100)
(269365, 1)
(25195, 100)
(25195, 1)
(22543, 100)
(22543, 1)


# Encode Target Variables

In [None]:
# The target CSVs load as single-column DataFrames of shape (n, 1). LabelEncoder
# expects 1-D input and otherwise emits a DataConversionWarning on every call,
# so flatten each target explicitly before encoding.
label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train.values.ravel())
# Validation and test reuse the mapping learned on the training labels.
y_val_enc = label_encoder.transform(y_val.values.ravel())
y_test_enc = label_encoder.transform(y_test.values.ravel())
num_classes = len(label_encoder.classes_)
# Original class labels, in encoded-index order, used to name rows in reports.
target_names = label_encoder.classes_.tolist()

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


# Dimensionality Reduction using PCA (retaining 95% of the variance)

In [None]:
# Work on the raw NumPy arrays behind each feature frame.
X_train_np = X_train.values
X_val_np = X_val.values
X_test_np = X_test.values

# A float n_components tells PCA to keep as many components as are needed to
# explain that fraction of the variance.
pca = PCA(
    n_components=PCA_EXPLAINED_VARIANCE_THRESHOLD,
    random_state=RANDOM_STATE,
)

# Fit the projection on the training split only, then apply it unchanged to the
# validation and test splits.
X_train_pca = pca.fit_transform(X_train_np)
X_val_pca = pca.transform(X_val_np)
X_test_pca = pca.transform(X_test_np)

In [None]:
# Show the shape of each split after the PCA projection.
print("After Dimension reduction")
for shape_label, reduced_split in (
    ("Train data shape:", X_train_pca),
    ("Validation data shape:", X_val_pca),
    ("Test data shape:", X_test_pca),
):
    print(shape_label, reduced_split.shape)

After Dimension reduction
Train data shape: (269365, 15)
Validation data shape: (25195, 15)
Test data shape: (22543, 15)


## Comment
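Before dimensionality reduction the input had 100 features; after PCA, 15 principal components are enough to retain 95% of the variance, so every model below is trained on 15 features.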

# Common Functions

In [None]:
def evaluate_classification_metrics(y_true: np.ndarray, y_pred: np.ndarray, target_names: List[str]) -> Dict[str, Any]:
    """Summarise classifier predictions as a dictionary of metrics.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        target_names: Display names for the classes, forwarded to
            ``classification_report``.

    Returns:
        A dict with the keys ``accuracy``, ``precision_weighted``,
        ``recall_weighted``, ``f1_weighted`` and ``classification_report``
        (the full per-class report in dict form).
    """
    summary: Dict[str, Any] = {'accuracy': accuracy_score(y_true, y_pred)}

    # Weighted averaging is used because the classes may still be imbalanced
    # even after resampling; zero_division=0 scores undefined cases as 0.
    weighted_scorers = (
        ('precision_weighted', precision_score),
        ('recall_weighted', recall_score),
        ('f1_weighted', f1_score),
    )
    for metric_key, scorer in weighted_scorers:
        summary[metric_key] = scorer(y_true, y_pred, average='weighted', zero_division=0)

    # Keep the complete per-class breakdown alongside the headline numbers.
    summary['classification_report'] = classification_report(
        y_true,
        y_pred,
        target_names=target_names,
        zero_division=0,
        output_dict=True,
    )
    return summary

In [None]:
def train_evaluate_model_with_search(
    model_name: str, base_estimator: Any, param_distributions: Dict,
    X_train: np.ndarray, y_train: np.ndarray,
    X_val: np.ndarray, y_val: np.ndarray,
    X_test: np.ndarray, y_test: np.ndarray,
    target_names: List[str],
    n_iter: int = N_ITER_SEARCH,
    cv: int = CV_FOLDS,
    scoring_metric: str = SCORING_METRIC
) -> Dict[str, Any]:
    """Tune an estimator with RandomizedSearchCV and score the best model.

    The search is fitted on the training split; the refitted best estimator is
    then evaluated on the validation and test splits with
    ``evaluate_classification_metrics``.

    Args:
        model_name: Label stored under ``'model_name'`` in the result.
        base_estimator: Estimator exposing ``get_params`` / ``set_params``.
            Note that its ``random_state`` and ``n_jobs`` parameters (when it
            has them) are overwritten in place with the notebook-wide values.
        param_distributions: Search space passed to ``RandomizedSearchCV``.
        X_train, y_train: Training features and targets used for the search.
        X_val, y_val: Validation features and targets.
        X_test, y_test: Test features and targets.
        target_names: Class display names for the classification report.
        n_iter: Number of parameter settings sampled.
        cv: Number of cross-validation folds.
        scoring_metric: Scoring name optimised by the search.

    Returns:
        A dict that always contains ``'model_name'`` and ``'search_time_s'``.
        If the search fails, ``'search_time_s'`` is ``-1`` and ``'error'``
        holds the message. Otherwise it also contains ``'best_params'``,
        ``'best_cv_score'``, ``'final_fit_time_s'``, ``'val_metrics'`` and
        ``'test_metrics'``. A split whose evaluation fails gets ``None`` for
        its metrics plus a ``'val_error'`` / ``'test_error'`` message.
    """
    results: Dict[str, Any] = {'model_name': model_name}

    # Targets loaded from single-column CSVs arrive as (n, 1) column vectors;
    # flatten them so estimators do not emit a DataConversionWarning per fit.
    y_train = np.ravel(y_train)
    y_val = np.ravel(y_val)
    y_test = np.ravel(y_test)

    # Best-effort: align the estimator with the notebook-wide seed and
    # parallelism. A failure here is reported but does not abort the search.
    estimator_params = base_estimator.get_params()
    for param_name, param_value in (('random_state', RANDOM_STATE), ('n_jobs', N_JOBS)):
        if param_name not in estimator_params:
            continue
        try:
            base_estimator.set_params(**{param_name: param_value})
        except Exception as e:
            warnings.warn(f"{model_name}: could not set {param_name}={param_value!r}: {e}")

    random_search = RandomizedSearchCV(
        estimator=base_estimator,
        param_distributions=param_distributions,
        n_iter=n_iter,
        cv=cv,
        scoring=scoring_metric,
        n_jobs=N_JOBS,
        random_state=RANDOM_STATE,
        verbose=1,
        # Surface fit errors immediately; they are caught just below.
        error_score='raise'
    )

    start_time = time.perf_counter()
    try:
        random_search.fit(X_train, y_train)
    except Exception as e:
        results['search_time_s'] = -1
        results['error'] = f"Search Failed: {e}"
        return results
    results['search_time_s'] = round(time.perf_counter() - start_time, 3)

    best_model = random_search.best_estimator_
    results['best_params'] = random_search.best_params_
    results['best_cv_score'] = random_search.best_score_
    # The refit on the full training split happens inside random_search.fit.
    results['final_fit_time_s'] = 'N/A (Included in Search Time)'

    # Evaluate both held-out splits the same way; keep the failure reason
    # instead of silently returning None.
    for split_name, X_split, y_split in (('val', X_val, y_val), ('test', X_test, y_test)):
        try:
            y_split_pred = best_model.predict(X_split)
            results[f'{split_name}_metrics'] = evaluate_classification_metrics(
                y_split, y_split_pred, target_names
            )
        except Exception as e:
            results[f'{split_name}_metrics'] = None
            results[f'{split_name}_error'] = f"Evaluation Failed: {e}"

    return results


# Hyperparameter Candidates to Search for Each Model

In [None]:
# Search spaces for RandomizedSearchCV. scipy distributions are sampled per
# candidate; plain lists are sampled uniformly.
# Reminder: scipy's uniform(loc, scale) covers [loc, loc + scale], so
# uniform(0.6, 0.4) samples from [0.6, 1.0]; randint(low, high) excludes `high`.

# Random Forest Model
param_dist_rf = {
    'n_estimators': randint(100, 500),
    'max_depth': [10, 20, 30, None],
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 11),
    'max_features': ['sqrt', 'log2', None]
}

#XGBoost Classifier
param_dist_xgb = {
    'n_estimators': randint(100, 500),
    'learning_rate': loguniform(0.01, 0.3),
    'max_depth': randint(3, 10),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'gamma': [0, 1, 5]
}

#Light GBM
param_dist_lgbm = {
    'n_estimators': randint(100, 500),
    'learning_rate': loguniform(0.01, 0.3),
    'num_leaves': randint(20, 60),
    'max_depth': [-1, 10, 20, 30],
    'subsample': uniform(0.6, 0.4),
    # LightGBM only applies row subsampling when subsample_freq > 0 (its default
    # is 0), so without this entry the sampled `subsample` values have no effect.
    'subsample_freq': [1],
    'colsample_bytree': uniform(0.6, 0.4),
    'reg_alpha': loguniform(1e-3, 1.0),
    'reg_lambda': loguniform(1e-3, 1.0)
}

#Explainable Boosting Machine
param_dist_ebm = {
    'learning_rate': loguniform(0.01, 0.2),
    'max_leaves': randint(2, 10),
}

#SVM with RBF Kernel
param_dist_svm = {
    'C': loguniform(0.1, 100),
    'gamma': loguniform(1e-4, 1e-1)
}

# ML Models

In [None]:
# Accumulates one result dict per tuned model.
all_results = []

# Each entry is (display name, untuned base estimator, search space).
models_to_search = [
    (
        "RandomForest",
        RandomForestClassifier(
            random_state=RANDOM_STATE,
            class_weight='balanced',
        ),
        param_dist_rf,
    ),
    (
        "XGBoost",
        XGBClassifier(
            objective='multi:softmax',
            num_class=NUM_CLASSES,
            eval_metric='mlogloss',
            use_label_encoder=False,
            random_state=RANDOM_STATE,
        ),
        param_dist_xgb,
    ),
    (
        "LightGBM",
        LGBMClassifier(
            objective='multiclass',
            num_class=NUM_CLASSES,
            metric='multi_logloss',
            class_weight='balanced',
            random_state=RANDOM_STATE,
        ),
        param_dist_lgbm,
    ),
    (
        "EBM",
        ExplainableBoostingClassifier(random_state=RANDOM_STATE),
        param_dist_ebm,
    ),
    (
        "SVM_RBF",
        SVC(
            kernel='rbf',
            probability=True,
            random_state=RANDOM_STATE,
            class_weight='balanced',
        ),
        param_dist_svm,
    ),
]

In [None]:
from tqdm.notebook import tqdm

# Tune every candidate model on the PCA-reduced features.
# All three splits use the label-encoded 1-D targets so that training,
# validation and test labels live in the same label space (previously the raw
# train/val target columns were mixed with the encoded test targets).
for name, base_model, params in tqdm(models_to_search, desc = "Tuning all models.."):
    model_results = train_evaluate_model_with_search(
        name, base_model, params,
        X_train_pca, y_train_enc,
        X_val_pca, y_val_enc,
        X_test_pca, y_test_enc,
        target_names
    )
    all_results.append(model_results)

Tuning all models..:   0%|          | 0/5 [00:00<?, ?it/s]

Fitting 3 folds for each of 50 candidates, totalling 150 fits


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **