<a href="https://colab.research.google.com/github/FrancoPalavicinoG/cellia/blob/main/notebooks/09_baseline_training_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Load Train, Val & Test Datatsets from Google Drive

Mount Drive

In [31]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Import libraries

In [32]:
# Core
import pandas as pd
import numpy as np

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier

# Metrics
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, classification_report

# Explainability
import shap
import matplotlib.pyplot as plt
import seaborn as sns

Load Datasets

In [46]:
input_path = "/content/drive/MyDrive/cellia_drive/Datasets/"

Load features

In [47]:
X_train = pd.read_csv(input_path + "X_train_cat.csv")
X_val   = pd.read_csv(input_path + "X_val_cat.csv")
X_test  = pd.read_csv(input_path + "X_test_cat.csv")

Load labels

In [48]:
y_train = pd.read_csv(input_path + "y_train_cat.csv").squeeze()
y_val   = pd.read_csv(input_path + "y_val_cat.csv").squeeze()
y_test  = pd.read_csv(input_path + "y_test_cat.csv").squeeze()

In [49]:
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
print(X_test.shape, y_test.shape)

(3649, 16) (3649,)
(782, 16) (782,)
(783, 16) (783,)


### Evaluation function

In [50]:
def evaluate_model(model, X, y_true, dataset_name="Validation"):
    y_pred = model.predict(X)
    y_proba = model.predict_proba(X)[:, 1]

    roc_auc = roc_auc_score(y_true, y_proba)
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)

    print(f"{dataset_name} Results:")
    print(f"ROC-AUC: {roc_auc:.3f} | F1: {f1:.3f} | Precision: {precision:.3f} | Recall: {recall:.3f}")
    print("-" * 60)

    return {"roc_auc": roc_auc, "f1": f1, "precision": precision, "recall": recall}

## Model Training

### Logistic Regression (Baseline)

In [51]:
log_reg = LogisticRegression(
    max_iter=500,          # iteration numbers
    random_state=42,
    class_weight='balanced',  # class balancing
    C=1.0,                 # regularization
    penalty='l2',          # penalty type: 'l1', 'l2', 'elasticnet'
    solver='liblinear'     # solver compatible with the penalty
)


### Random Forest

In [52]:
rf = RandomForestClassifier(
    n_estimators=300,      # number of trees
    max_depth=10,          # maximum depth of each tree
    min_samples_split=2,   # minimum samples to split a node
    min_samples_leaf=1,    # minimum samples per leaf
    max_features='sqrt',   # number of features to consider per split
    random_state=42,
    class_weight='balanced'
)


### XGBoost

In [53]:
xgb = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='logloss'
)

### LightGBM

In [54]:
lgbm = LGBMClassifier(
    n_estimators=300,      # number of iterations
    max_depth=6,           # maximum tree depth
    num_leaves=31,         # leaves per tree
    learning_rate=0.05,    # learning rate
    min_child_samples=20,  # minimum samples per leaf node
    subsample=0.8,         # fraction of data per tree
    colsample_bytree=0.8,  # fraction of features per tree
    random_state=42,
    scale_pos_weight=1     # class balancing if not using SMOTE
)

### MLP Tabular

In [55]:
mlp = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=300, random_state=42)

### Train models

In [56]:
models = {"Logistic Regression": log_reg,
          "Random Forest": rf,
          "LightGBM": lgbm,
          "MLP": mlp,
          "XGBoost": xgb}

results = {}

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    results[name] = evaluate_model(model, X_val, y_val, f"Validation ({name})")

Training Logistic Regression...
Validation (Logistic Regression) Results:
ROC-AUC: 0.842 | F1: 0.384 | Precision: 0.258 | Recall: 0.753
------------------------------------------------------------
Training Random Forest...
Validation (Random Forest) Results:
ROC-AUC: 0.824 | F1: 0.374 | Precision: 0.372 | Recall: 0.377
------------------------------------------------------------
Training LightGBM...
[LightGBM] [Info] Number of positive: 359, number of negative: 3290
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000402 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 610
[LightGBM] [Info] Number of data points in the train set: 3649, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.098383 -> initscore=-2.215320
[LightGBM] [Info] Start training from score -2.215320
Validation (LightGBM) Results:
ROC-AUC: 0.80



Validation (MLP) Results:
ROC-AUC: 0.807 | F1: 0.266 | Precision: 0.333 | Recall: 0.221
------------------------------------------------------------
Training XGBoost...
Validation (XGBoost) Results:
ROC-AUC: 0.816 | F1: 0.242 | Precision: 0.545 | Recall: 0.156
------------------------------------------------------------


### Compare models

In [57]:
models = {"Logistic Regression": log_reg,
          "Random Forest": rf,
          "LightGBM": lgbm,
          "MLP": mlp,
          "XGBoost": xgb}

results = {}
for name, model in models.items():
    results[name] = evaluate_model(model, X_val, y_val, f"Validation ({name})")

results_df = pd.DataFrame(results).T
results_df.sort_values("roc_auc", ascending=False)

Validation (Logistic Regression) Results:
ROC-AUC: 0.842 | F1: 0.384 | Precision: 0.258 | Recall: 0.753
------------------------------------------------------------
Validation (Random Forest) Results:
ROC-AUC: 0.824 | F1: 0.374 | Precision: 0.372 | Recall: 0.377
------------------------------------------------------------
Validation (LightGBM) Results:
ROC-AUC: 0.807 | F1: 0.280 | Precision: 0.500 | Recall: 0.195
------------------------------------------------------------
Validation (MLP) Results:
ROC-AUC: 0.807 | F1: 0.266 | Precision: 0.333 | Recall: 0.221
------------------------------------------------------------
Validation (XGBoost) Results:
ROC-AUC: 0.816 | F1: 0.242 | Precision: 0.545 | Recall: 0.156
------------------------------------------------------------


Unnamed: 0,roc_auc,f1,precision,recall
Logistic Regression,0.841632,0.384106,0.257778,0.753247
Random Forest,0.823671,0.374194,0.371795,0.376623
XGBoost,0.816469,0.242424,0.545455,0.155844
LightGBM,0.807332,0.280374,0.5,0.194805
MLP,0.806871,0.265625,0.333333,0.220779


## Hyperparameter Tuning using RandomizedSearchCV

In [58]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from scipy.stats import uniform, randint
import numpy as np

# ---------------------------
# Logistic Regression
# ---------------------------
logreg = LogisticRegression(max_iter=1000, random_state=42)

logreg_param_dist = {
    'C': uniform(0.01, 10),
    'penalty': ['l2'],
    'solver': ['liblinear', 'saga'],
    'class_weight': [None, 'balanced']
}

logreg_search = RandomizedSearchCV(
    logreg,
    param_distributions=logreg_param_dist,
    n_iter=30,
    scoring='f1',
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

logreg_search.fit(X_train, y_train)
print("Best Logistic Regression params:", logreg_search.best_params_)
print("Best F1:", logreg_search.best_score_)

# ---------------------------
# Random Forest
# ---------------------------
rf = RandomForestClassifier(random_state=42)

rf_param_dist = {
    'n_estimators': randint(200, 1000),
    'max_depth': randint(5, 20),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5),
    'max_features': ['sqrt', 'log2', None],
    'class_weight': [None, 'balanced', 'balanced_subsample']
}

rf_search = RandomizedSearchCV(
    rf,
    param_distributions=rf_param_dist,
    n_iter=10,
    scoring='f1',
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

rf_search.fit(X_train, y_train)
print("Best Random Forest params:", rf_search.best_params_)
print("Best F1:", rf_search.best_score_)

# ---------------------------
# LightGBM
# ---------------------------
lgbm = LGBMClassifier(random_state=42)

lgbm_param_dist = {
    'num_leaves': randint(20, 50),
    'max_depth': randint(5, 15),
    'learning_rate': uniform(0.01, 0.1),
    'n_estimators': randint(200, 1000),
    'min_child_samples': randint(10, 50),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'class_weight': [None, 'balanced']
}

lgbm_search = RandomizedSearchCV(
    lgbm,
    param_distributions=lgbm_param_dist,
    n_iter=10,
    scoring='f1',
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

lgbm_search.fit(X_train, y_train)
print("Best LightGBM params:", lgbm_search.best_params_)
print("Best F1:", lgbm_search.best_score_)


# ---------------------------
# XGBoost
# ---------------------------
xgb = XGBClassifier(
    random_state=42,
    eval_metric='logloss',
    use_label_encoder=False
)

xgb_param_dist = {
    'n_estimators': randint(200, 800),
    'learning_rate': uniform(0.01, 0.2),
    'max_depth': randint(3, 10),
    'min_child_weight': randint(1, 10),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'gamma': uniform(0, 5),
    'reg_alpha': uniform(0, 1),
    'reg_lambda': uniform(0, 2),
    'scale_pos_weight': [1, (y_train.value_counts()[0] / y_train.value_counts()[1])]
}

xgb_search = RandomizedSearchCV(
    xgb,
    param_distributions=xgb_param_dist,
    n_iter=20,
    scoring='f1',
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

xgb_search.fit(X_train, y_train)
print("Best XGBoost params:", xgb_search.best_params_)
print("Best F1:", xgb_search.best_score_)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best Logistic Regression params: {'C': np.float64(0.21584494295802448), 'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'saga'}
Best F1: 0.3866140093602422
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Random Forest params: {'class_weight': 'balanced', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 476}
Best F1: 0.4360216550136854
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[LightGBM] [Info] Number of positive: 359, number of negative: 3290
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001010 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 610
[LightGBM] [Info] Number of data points in the train set: 3649, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start t

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best XGBoost params: {'colsample_bytree': np.float64(0.6571467271687763), 'gamma': np.float64(3.254442364744264), 'learning_rate': np.float64(0.021282315805420053), 'max_depth': 6, 'min_child_weight': 6, 'n_estimators': 585, 'reg_alpha': np.float64(0.18182496720710062), 'reg_lambda': np.float64(0.36680901970686763), 'scale_pos_weight': np.float64(9.164345403899722), 'subsample': np.float64(0.8446612641953124)}
Best F1: 0.4235846221960191


## Optimize Hyperparameters Models

### Get the best pre-trained models directly from RandomizedSearchCV.

In [59]:
best_logreg = logreg_search.best_estimator_
best_rf = rf_search.best_estimator_
best_lgbm = lgbm_search.best_estimator_
best_xgb = xgb_search.best_estimator_

from sklearn.metrics import classification_report, f1_score

models = {
    "Logistic Regression": best_logreg,
    "Random Forest": best_rf,
    "LightGBM": best_lgbm,
    "XGBoost": best_xgb
}

results = {}
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"\n=== {name} ===")
    print("F1 Score:", f1_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    results[name] = evaluate_model(model, X_val, y_val, f"Validation ({name})")

results_df = pd.DataFrame(results).T
results_df.sort_values("roc_auc", ascending=False)


=== Logistic Regression ===
F1 Score: 0.379746835443038
              precision    recall  f1-score   support

           0       0.97      0.75      0.84       706
           1       0.25      0.78      0.38        77

    accuracy                           0.75       783
   macro avg       0.61      0.76      0.61       783
weighted avg       0.90      0.75      0.80       783

Validation (Logistic Regression) Results:
ROC-AUC: 0.843 | F1: 0.385 | Precision: 0.259 | Recall: 0.753
------------------------------------------------------------

=== Random Forest ===
F1 Score: 0.3695652173913043
              precision    recall  f1-score   support

           0       0.94      0.90      0.92       706
           1       0.32      0.44      0.37        77

    accuracy                           0.85       783
   macro avg       0.63      0.67      0.64       783
weighted avg       0.88      0.85      0.86       783

Validation (Random Forest) Results:
ROC-AUC: 0.832 | F1: 0.379 | Precisi

Unnamed: 0,roc_auc,f1,precision,recall
Logistic Regression,0.842516,0.385382,0.258929,0.753247
Random Forest,0.831721,0.37931,0.340206,0.428571
XGBoost,0.82614,0.410959,0.316901,0.584416
LightGBM,0.8238,0.394089,0.31746,0.519481


### Export trained models

In [None]:
import joblib

output_path = "/content/drive/MyDrive/cellia_drive/Models/"

import os
os.makedirs(output_path, exist_ok=True)

# Save each model
for name, model in models.items():
    file_path = os.path.join(output_path, f"{name}.joblib")
    joblib.dump(model, file_path)
    print(f"✅ {name} saved to {file_path}")

✅ Logistic Regression saved to /content/drive/MyDrive/cellia_drive/Models/Logistic Regression.joblib
✅ Random Forest saved to /content/drive/MyDrive/cellia_drive/Models/Random Forest.joblib
✅ LightGBM saved to /content/drive/MyDrive/cellia_drive/Models/LightGBM.joblib
✅ XGBoost saved to /content/drive/MyDrive/cellia_drive/Models/XGBoost.joblib
