<a href="https://colab.research.google.com/github/FrancoPalavicinoG/cellia/blob/main/notebooks/07_ensemble_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Load Train, Val & Test Datasets from Google Drive

Mount Drive

In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# The dataset folder used by this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Import libraries

In [2]:
# Core
import pandas as pd
import numpy as np

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier

# Metrics
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, classification_report

# Explainability
import shap
import matplotlib.pyplot as plt
import seaborn as sns

Load Datasets

In [3]:
# Folder on the mounted Drive that holds the dataset CSVs. Keep the trailing
# slash: file names are appended to this string with plain `+` concatenation.
input_path = "/content/drive/MyDrive/cellia_drive/Datasets/"

Load features

In [4]:
# Read the feature matrices for the train / validation / test splits,
# in that order, from the Drive dataset folder.
X_train, X_val, X_test = (
    pd.read_csv(input_path + csv_name)
    for csv_name in ("X_train_v2.csv", "X_val_v2.csv", "X_test_v2.csv")
)

Load labels

In [5]:
# Read the label files and collapse each DataFrame to a Series.
# Squeezing only the column axis keeps the result a Series even if a split
# ever holds a single row; a bare .squeeze() would return a scalar in that case.
# Each label CSV is expected to contain exactly one column.
y_train = pd.read_csv(input_path + "y_train_v2.csv").squeeze("columns")
y_val   = pd.read_csv(input_path + "y_val_v2.csv").squeeze("columns")
y_test  = pd.read_csv(input_path + "y_test_v2.csv").squeeze("columns")

In [6]:
# Sanity check: show (features shape, labels shape) for each split, one line per split.
for _X_split, _y_split in ((X_train, y_train), (X_val, y_val), (X_test, y_test)):
    print(_X_split.shape, _y_split.shape)

(49000, 36) (49000,)
(10500, 36) (10500,)
(10500, 36) (10500,)


## Ensemble approach

Import libraries

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, f1_score, recall_score, precision_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import joblib

Models

In [8]:
# Base learners used as members of the voting ensembles.
# The hyperparameter values are fixed constants (presumably carried over from
# an earlier tuning run — confirm against the tuning notebook).

# Random forest
rf = RandomForestClassifier(
    n_estimators=271,                   # trees in the forest
    max_depth=8,                        # depth cap for every tree
    max_features='sqrt',                # features sampled at each split
    min_samples_split=4,                # fewest samples required to split a node
    min_samples_leaf=3,                 # fewest samples allowed in a leaf
    class_weight='balanced_subsample',
    random_state=42,
)

# Gradient-boosted trees
xgb = XGBClassifier(
    n_estimators=240,
    max_depth=7,
    learning_rate=0.16425406933718917,
    subsample=0.7323592099410596,
    colsample_bytree=0.8827429375390468,
    gamma=3.6450358402049368,
    reg_alpha=0.9149596755437808,
    reg_lambda=1.7000771555795986,
    scale_pos_weight=1.498607242339833,
    eval_metric='logloss',
    random_state=42,
)

# L2-regularised logistic regression
# NOTE(review): the saga solver is sensitive to feature scale — confirm the
# X_*_v2 features are already standardised before this model is fitted.
logreg = LogisticRegression(
    penalty='l2',
    C=0.21584494295802448,
    solver='saga',
    class_weight='balanced',
    max_iter=500,
    random_state=42,
)

Ensembles

In [9]:
# Three soft-voting combinations of the base learners rf, xgb and logreg.

# A: the two tree-based models.
ensemble_A = VotingClassifier(
    voting='soft',
    estimators=[
        ('rf', rf),
        ('xgb', xgb),
    ],
)

# B: the linear model paired with gradient boosting.
ensemble_B = VotingClassifier(
    voting='soft',
    estimators=[
        ('logreg', logreg),
        ('xgb', xgb),
    ],
)

# C: all three base learners.
ensemble_C = VotingClassifier(
    voting='soft',
    estimators=[
        ('logreg', logreg),
        ('xgb', xgb),
        ('rf', rf),
    ],
)

Training

In [10]:
# Fit each ensemble on the training split; the three fits are independent calls.
# NOTE(review): the rf / xgb / logreg objects are shared between the ensembles.
# Per the scikit-learn docs, VotingClassifier fits clones of its estimators, so
# the ensembles should not interfere with each other — confirm for the installed version.
ensemble_A.fit(X_train, y_train)
ensemble_B.fit(X_train, y_train)
# Last expression of the cell: the return value of this call is the cell's displayed output.
ensemble_C.fit(X_train, y_train)



In [11]:
def evaluate_model(model, X, y_true, dataset_name="Validation"):
    """Score a fitted classifier on one dataset, print a summary and return it.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
    X : feature matrix passed unchanged to the model.
    y_true : ground-truth labels aligned with the rows of ``X``.
    dataset_name : label used in the printed header line.

    Returns
    -------
    dict
        Keys ``"roc_auc"``, ``"f1"``, ``"precision"`` and ``"recall"``.

    Notes
    -----
    ROC-AUC is computed from column 1 of ``predict_proba``; the other three
    metrics are computed from ``predict`` with the scorers' default settings.
    """
    hard_labels = model.predict(X)
    # Column 1 is used as the score of the positive class.
    positive_scores = model.predict_proba(X)[:, 1]

    metrics = {
        "roc_auc": roc_auc_score(y_true, positive_scores),
        "f1": f1_score(y_true, hard_labels),
        "precision": precision_score(y_true, hard_labels),
        "recall": recall_score(y_true, hard_labels),
    }

    print(f"{dataset_name} Results:")
    print(
        "ROC-AUC: {roc_auc:.3f} | F1: {f1:.3f} | Precision: {precision:.3f} | Recall: {recall:.3f}".format(**metrics)
    )
    print("-" * 60)

    return metrics

In [12]:
# Score every fitted ensemble on the validation split and collect the metric
# dicts returned by evaluate_model, keyed by ensemble name.
models = {
    "Ensemble A": ensemble_A,
    "Ensemble B": ensemble_B,
    "Ensemble C": ensemble_C,
}

results = {}
for name, model in models.items():
    results[name] = evaluate_model(
        model, X_val, y_val, dataset_name=f"Validation ({name})"
    )

# One row per ensemble, one column per metric; highest ROC-AUC first.
results_df = pd.DataFrame(results).transpose()
results_df.sort_values(by="roc_auc", ascending=False)

Validation (Ensemble A) Results:
ROC-AUC: 0.800 | F1: 0.733 | Precision: 0.730 | Recall: 0.736
------------------------------------------------------------
Validation (Ensemble B) Results:
ROC-AUC: 0.791 | F1: 0.734 | Precision: 0.701 | Recall: 0.771
------------------------------------------------------------
Validation (Ensemble C) Results:
ROC-AUC: 0.797 | F1: 0.731 | Precision: 0.723 | Recall: 0.739
------------------------------------------------------------


Unnamed: 0,roc_auc,f1,precision,recall
Ensemble A,0.799983,0.733175,0.730333,0.73604
Ensemble C,0.796714,0.730972,0.722667,0.73947
Ensemble B,0.791208,0.734349,0.700779,0.771298
