<a href="https://colab.research.google.com/github/FrancoPalavicinoG/cellia/blob/main/notebooks/04_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Load Train, Val & Test Datatsets from Google Drive

Mount Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Import libraries

In [6]:
# Core
import pandas as pd
import numpy as np

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier

# Metrics
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, classification_report

# Explainability
import shap
import matplotlib.pyplot as plt
import seaborn as sns

Load Datasets

In [3]:
input_path = "/content/drive/MyDrive/cellia_drive/Datasets/"

In [4]:
train_df = pd.read_csv(input_path + "train_preprocessed.csv")
val_df   = pd.read_csv(input_path + "val_preprocessed.csv")
test_df  = pd.read_csv(input_path + "test_preprocessed.csv")

In [5]:
print(train_df.shape, val_df.shape, test_df.shape)
train_df.head()

(6580, 20) (782, 20) (783, 20)


Unnamed: 0,GENDER,AGE_YEARS,BODY_MASS_INDEX,HEIGHT,WEIGHT,TOTAL_CHOL,CHOLESTEROL,CREATININE,TRIGLYCERIDES_R,LDL,TRIGLYCERIDE,HDL,HYPERTENSION,HIGH_CHOLESTEROL,HEART_ATTACK_RELATIVES,LDL_to_HDL_ratio,Chol_HDL_ratio,Triglyceride_HDL_ratio,Metabolic_risk_score,HEART_CONDITION
0,1,-0.84477,0.034021,1.202104,0.591362,0.519477,0.59506,0.432348,0.126008,1.013026,0.16008,-0.740773,0,0,0,1.237364,0.934236,0.171611,-1.112044,0
1,0,-0.560542,-0.904199,-1.139525,-1.216905,0.395702,0.5709,-0.30991,-0.851787,0.235844,-0.702777,1.645664,0,0,0,-0.846081,-0.818801,-0.656841,-1.112044,0
2,1,0.974289,0.2899,2.048239,1.296358,-1.089605,-0.975335,0.311516,-0.449166,-1.353857,-0.344177,0.201242,1,1,1,-1.080757,-0.839747,-0.382656,1.941266,0
3,1,1.315362,-0.491951,-0.627909,-0.699603,-0.743033,-0.733736,-0.292648,-0.744969,-0.427042,-1.11341,0.578048,1,0,0,-0.742762,-0.907465,-0.82036,-0.094274,0
4,0,-0.219468,-0.164995,0.090322,-0.118211,0.668008,0.71586,-0.413481,2.114467,-0.105116,1.934367,-1.054778,0,1,0,0.771897,1.53393,1.877839,-0.094274,0


In [7]:
target = "HEART_CONDITION"
features = [col for col in train_df.columns if col != target]

X_train, y_train = train_df[features], train_df[target]
X_val,   y_val   = val_df[features],   val_df[target]
X_test,  y_test  = test_df[features],  test_df[target]

### Evaluation function

In [8]:
def evaluate_model(model, X, y_true, dataset_name="Validation"):
    y_pred = model.predict(X)
    y_proba = model.predict_proba(X)[:, 1]

    roc_auc = roc_auc_score(y_true, y_proba)
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)

    print(f"{dataset_name} Results:")
    print(f"ROC-AUC: {roc_auc:.3f} | F1: {f1:.3f} | Precision: {precision:.3f} | Recall: {recall:.3f}")
    print("-" * 60)

    return {"roc_auc": roc_auc, "f1": f1, "precision": precision, "recall": recall}

## Model Training

### Logistic Regression (Baseline)

In [9]:
log_reg = LogisticRegression(max_iter=500, random_state=42)
log_reg.fit(X_train, y_train)

evaluate_model(log_reg, X_val, y_val, "Validation (LogReg)")

Validation (LogReg) Results:
ROC-AUC: 0.824 | F1: 0.369 | Precision: 0.249 | Recall: 0.714
------------------------------------------------------------


{'roc_auc': np.float64(0.8242055816523901),
 'f1': 0.3691275167785235,
 'precision': 0.248868778280543,
 'recall': 0.7142857142857143}

### Random Forest

In [10]:
rf = RandomForestClassifier(n_estimators=300, max_depth=8, random_state=42)
rf.fit(X_train, y_train)

evaluate_model(rf, X_val, y_val, "Validation (Random Forest)")

Validation (Random Forest) Results:
ROC-AUC: 0.827 | F1: 0.383 | Precision: 0.282 | Recall: 0.597
------------------------------------------------------------


{'roc_auc': np.float64(0.8266740351846734),
 'f1': 0.38333333333333336,
 'precision': 0.2822085889570552,
 'recall': 0.5974025974025974}

### XGBoost

In [11]:
xgb = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='logloss'
)
xgb.fit(X_train, y_train)

evaluate_model(xgb, X_val, y_val, "Validation (XGBoost)")

Validation (XGBoost) Results:
ROC-AUC: 0.816 | F1: 0.420 | Precision: 0.412 | Recall: 0.429
------------------------------------------------------------


{'roc_auc': np.float64(0.8163212673850971),
 'f1': 0.42038216560509556,
 'precision': 0.4125,
 'recall': 0.42857142857142855}

### LightGBM

In [12]:
lgbm = LGBMClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    random_state=42
)
lgbm.fit(X_train, y_train)

evaluate_model(lgbm, X_val, y_val, "Validation (LightGBM)")

[LightGBM] [Info] Number of positive: 3290, number of negative: 3290
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000901 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3833
[LightGBM] [Info] Number of data points in the train set: 6580, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Validation (LightGBM) Results:
ROC-AUC: 0.808 | F1: 0.380 | Precision: 0.360 | Recall: 0.403
------------------------------------------------------------


{'roc_auc': np.float64(0.8082895827576678),
 'f1': 0.3803680981595092,
 'precision': 0.36046511627906974,
 'recall': 0.4025974025974026}

### MLP Tabular

In [13]:
mlp = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=300, random_state=42)
mlp.fit(X_train, y_train)

evaluate_model(mlp, X_val, y_val, "Validation (MLP)")

Validation (MLP) Results:
ROC-AUC: 0.734 | F1: 0.347 | Precision: 0.286 | Recall: 0.442
------------------------------------------------------------




{'roc_auc': np.float64(0.7339780786589297),
 'f1': 0.3469387755102041,
 'precision': 0.2857142857142857,
 'recall': 0.44155844155844154}

## Compare models

In [14]:
models = {
    "Logistic Regression": log_reg,
    "Random Forest": rf,
    "XGBoost": xgb,
    "LightGBM": lgbm,
    "MLP": mlp
}

results = {}
for name, model in models.items():
    results[name] = evaluate_model(model, X_val, y_val, f"Validation ({name})")

results_df = pd.DataFrame(results).T
results_df.sort_values("roc_auc", ascending=False)

Validation (Logistic Regression) Results:
ROC-AUC: 0.824 | F1: 0.369 | Precision: 0.249 | Recall: 0.714
------------------------------------------------------------
Validation (Random Forest) Results:
ROC-AUC: 0.827 | F1: 0.383 | Precision: 0.282 | Recall: 0.597
------------------------------------------------------------
Validation (XGBoost) Results:
ROC-AUC: 0.816 | F1: 0.420 | Precision: 0.412 | Recall: 0.429
------------------------------------------------------------
Validation (LightGBM) Results:
ROC-AUC: 0.808 | F1: 0.380 | Precision: 0.360 | Recall: 0.403
------------------------------------------------------------
Validation (MLP) Results:
ROC-AUC: 0.734 | F1: 0.347 | Precision: 0.286 | Recall: 0.442
------------------------------------------------------------


Unnamed: 0,roc_auc,f1,precision,recall
Random Forest,0.826674,0.383333,0.282209,0.597403
Logistic Regression,0.824206,0.369128,0.248869,0.714286
XGBoost,0.816321,0.420382,0.4125,0.428571
LightGBM,0.80829,0.380368,0.360465,0.402597
MLP,0.733978,0.346939,0.285714,0.441558
