<a href="https://colab.research.google.com/github/FrancoPalavicinoG/cellia/blob/main/notebooks/04_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Load Train, Val & Test Datatsets from Google Drive

Mount Drive

In [121]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Import libraries

In [122]:
# Core
import pandas as pd
import numpy as np

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier

# Metrics
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, classification_report

# Explainability
import shap
import matplotlib.pyplot as plt
import seaborn as sns

Load Datasets

In [123]:
input_path = "/content/drive/MyDrive/cellia_drive/Datasets/"

Load features

In [124]:
X_train = pd.read_csv(input_path + "X_train_res.csv")
X_val   = pd.read_csv(input_path + "X_val.csv")
X_test  = pd.read_csv(input_path + "X_test.csv")

Load labels

In [125]:
y_train = pd.read_csv(input_path + "y_train_res.csv").squeeze()
y_val   = pd.read_csv(input_path + "y_val.csv").squeeze()
y_test  = pd.read_csv(input_path + "y_test.csv").squeeze()

In [126]:
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
print(X_test.shape, y_test.shape)

(6580, 15) (6580,)
(782, 15) (782,)
(783, 15) (783,)


### Evaluation function

In [127]:
def evaluate_model(model, X, y_true, dataset_name="Validation"):
    y_pred = model.predict(X)
    y_proba = model.predict_proba(X)[:, 1]

    roc_auc = roc_auc_score(y_true, y_proba)
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)

    print(f"{dataset_name} Results:")
    print(f"ROC-AUC: {roc_auc:.3f} | F1: {f1:.3f} | Precision: {precision:.3f} | Recall: {recall:.3f}")
    print("-" * 60)

    return {"roc_auc": roc_auc, "f1": f1, "precision": precision, "recall": recall}

## Model Training

### Logistic Regression (Baseline)

In [128]:
log_reg = LogisticRegression(max_iter=500, random_state=42)
log_reg.fit(X_train, y_train)

evaluate_model(log_reg, X_val, y_val, "Validation (LogReg)")

Validation (LogReg) Results:
ROC-AUC: 0.851 | F1: 0.405 | Precision: 0.276 | Recall: 0.766
------------------------------------------------------------


{'roc_auc': np.float64(0.8510085659021829),
 'f1': 0.4054982817869416,
 'precision': 0.2757009345794392,
 'recall': 0.7662337662337663}

### Random Forest

In [129]:
rf = RandomForestClassifier(n_estimators=300, max_depth=8, random_state=42)
rf.fit(X_train, y_train)

evaluate_model(rf, X_val, y_val, "Validation (Random Forest)")

Validation (Random Forest) Results:
ROC-AUC: 0.831 | F1: 0.396 | Precision: 0.303 | Recall: 0.571
------------------------------------------------------------


{'roc_auc': np.float64(0.8310767246937459),
 'f1': 0.3963963963963964,
 'precision': 0.30344827586206896,
 'recall': 0.5714285714285714}

### XGBoost

In [130]:
xgb = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='logloss'
)
xgb.fit(X_train, y_train)

evaluate_model(xgb, X_val, y_val, "Validation (XGBoost)")

Validation (XGBoost) Results:
ROC-AUC: 0.798 | F1: 0.400 | Precision: 0.397 | Recall: 0.403
------------------------------------------------------------


{'roc_auc': np.float64(0.7978815510730405),
 'f1': 0.4,
 'precision': 0.3974358974358974,
 'recall': 0.4025974025974026}

### LightGBM

In [131]:
lgbm = LGBMClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    random_state=42
)
lgbm.fit(X_train, y_train)

evaluate_model(lgbm, X_val, y_val, "Validation (LightGBM)")

[LightGBM] [Info] Number of positive: 3290, number of negative: 3290
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000374 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 6580, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Validation (LightGBM) Results:
ROC-AUC: 0.815 | F1: 0.403 | Precision: 0.390 | Recall: 0.416
------------------------------------------------------------


{'roc_auc': np.float64(0.81530809615916),
 'f1': 0.4025157232704403,
 'precision': 0.3902439024390244,
 'recall': 0.4155844155844156}

### MLP Tabular

In [132]:
mlp = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=300, random_state=42)
mlp.fit(X_train, y_train)

evaluate_model(mlp, X_val, y_val, "Validation (MLP)")

Validation (MLP) Results:
ROC-AUC: 0.732 | F1: 0.349 | Precision: 0.316 | Recall: 0.390
------------------------------------------------------------




{'roc_auc': np.float64(0.7323570046974303),
 'f1': 0.3488372093023256,
 'precision': 0.3157894736842105,
 'recall': 0.38961038961038963}

## Compare models

In [133]:
models = {
    "Logistic Regression": log_reg,
    "Random Forest": rf,
    "XGBoost": xgb,
    "LightGBM": lgbm,
    "MLP": mlp
}

results = {}
for name, model in models.items():
    results[name] = evaluate_model(model, X_val, y_val, f"Validation ({name})")

results_df = pd.DataFrame(results).T
results_df.sort_values("roc_auc", ascending=False)

Validation (Logistic Regression) Results:
ROC-AUC: 0.851 | F1: 0.405 | Precision: 0.276 | Recall: 0.766
------------------------------------------------------------
Validation (Random Forest) Results:
ROC-AUC: 0.831 | F1: 0.396 | Precision: 0.303 | Recall: 0.571
------------------------------------------------------------
Validation (XGBoost) Results:
ROC-AUC: 0.798 | F1: 0.400 | Precision: 0.397 | Recall: 0.403
------------------------------------------------------------
Validation (LightGBM) Results:
ROC-AUC: 0.815 | F1: 0.403 | Precision: 0.390 | Recall: 0.416
------------------------------------------------------------
Validation (MLP) Results:
ROC-AUC: 0.732 | F1: 0.349 | Precision: 0.316 | Recall: 0.390
------------------------------------------------------------


Unnamed: 0,roc_auc,f1,precision,recall
Logistic Regression,0.851009,0.405498,0.275701,0.766234
Random Forest,0.831077,0.396396,0.303448,0.571429
LightGBM,0.815308,0.402516,0.390244,0.415584
XGBoost,0.797882,0.4,0.397436,0.402597
MLP,0.732357,0.348837,0.315789,0.38961
