<a href="https://colab.research.google.com/github/FrancoPalavicinoG/cellia/blob/main/notebooks/05_preprocessing_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Load Train, Val & Test Datatsets from Google Drive

Mount Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Import libraries

In [2]:
# Core
import pandas as pd
import numpy as np

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier

# Metrics
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, classification_report

# Explainability
import shap
import matplotlib.pyplot as plt
import seaborn as sns

Load Datasets

In [3]:
input_path = "/content/drive/MyDrive/cellia_drive/Datasets/"

Load features

In [4]:
X_train = pd.read_csv(input_path + "X_train.csv")
X_val   = pd.read_csv(input_path + "X_val.csv")
X_test  = pd.read_csv(input_path + "X_test.csv")

Load labels

In [5]:
y_train = pd.read_csv(input_path + "y_train.csv").squeeze()
y_val   = pd.read_csv(input_path + "y_val.csv").squeeze()
y_test  = pd.read_csv(input_path + "y_test.csv").squeeze()

In [6]:
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
print(X_test.shape, y_test.shape)

(3649, 25) (3649,)
(782, 25) (782,)
(783, 25) (783,)


### Preprocessing Experiments

Imports

In [10]:
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import make_scorer, f1_score, recall_score, roc_auc_score

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN

Models & Results

In [None]:
models = {
    "LogReg": LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=300, random_state=42),
    "LightGBM": LGBMClassifier(n_estimators=300, random_state=42)
}

# Loop through models and store results
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    y_proba = model.predict_proba(X_val)[:, 1]

    results[name] = {
        "ROC-AUC": roc_auc_score(y_val, y_proba),
        "F1": f1_score(y_val, y_pred),
        "Recall": recall_score(y_val, y_pred),
        "Precision": precision_score(y_val, y_pred)
    }

pd.DataFrame(results).T.sort_values(by="F1", ascending=False)

numeric_cols = X_train.columns.tolist()  # or the list of features you want to try

 Try several k values to see how performance changes

In [13]:
numeric_cols = X_train.columns.tolist()

k_values = [5, 10, 15, 20, len(numeric_cols)]

results_all = {}

scoring = {
    'roc_auc': 'roc_auc',
    'f1': 'f1',
    'recall_pos': make_scorer(recall_score, pos_label=1)
}

for model_name, model in models.items():
    print(f"\n===== Evaluating {model_name} =====")
    model_results = {}

    for k in k_values:
        selector = SelectKBest(score_func=f_classif, k=k)
        pipe = Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
            ("select", selector),
            ("clf", model)
        ])

        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        cv_res = cross_validate(pipe, X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1, return_train_score=False)

        # Store mean of each metric
        model_results[k] = {metric: np.mean(cv_res[f"test_{metric}"]) for metric in scoring.keys()}
        print(f"k={k} -> {model_results[k]}")

    results_all[model_name] = model_results


===== Evaluating LogReg =====
k=5 -> {'roc_auc': np.float64(0.8261169261431472), 'f1': np.float64(0.3660077748099011), 'recall_pos': np.float64(0.7884194053208139)}
k=10 -> {'roc_auc': np.float64(0.8454366863117239), 'f1': np.float64(0.39077391718826154), 'recall_pos': np.float64(0.7800078247261346)}
k=15 -> {'roc_auc': np.float64(0.8459639634497291), 'f1': np.float64(0.39196660161069274), 'recall_pos': np.float64(0.7827856025039124)}
k=20 -> {'roc_auc': np.float64(0.8449116091347136), 'f1': np.float64(0.3895683120027583), 'recall_pos': np.float64(0.7716744913928013)}
k=25 -> {'roc_auc': np.float64(0.8434964990890972), 'f1': np.float64(0.39042916101512953), 'recall_pos': np.float64(0.7744913928012519)}

===== Evaluating RandomForest =====
k=5 -> {'roc_auc': np.float64(0.778327381546965), 'f1': np.float64(0.158418003910413), 'recall_pos': np.float64(0.11146322378716744)}
k=10 -> {'roc_auc': np.float64(0.8278797192136269), 'f1': np.float64(0.2686846719804249), 'recall_pos': np.float64(0

Display results

In [14]:
flat_results = []
for model_name, res_k in results_all.items():
    for k, metrics in res_k.items():
        row = {'Model': model_name, 'k': k}
        row.update(metrics)
        flat_results.append(row)

results_df = pd.DataFrame(flat_results)
results_df = results_df.sort_values(by=["roc_auc"], ascending=False).reset_index(drop=True)

print("\n=== Summary (sorted by ROC-AUC) ===")
display(results_df)


=== Summary (sorted by ROC-AUC) ===


Unnamed: 0,Model,k,roc_auc,f1,recall_pos
0,LogReg,15,0.845964,0.391967,0.782786
1,LogReg,10,0.845437,0.390774,0.780008
2,LogReg,20,0.844912,0.389568,0.771674
3,LogReg,25,0.843496,0.390429,0.774491
4,RandomForest,10,0.82788,0.268685,0.181221
5,RandomForest,20,0.827256,0.236794,0.153404
6,RandomForest,15,0.826661,0.21285,0.136502
7,LogReg,5,0.826117,0.366008,0.788419
8,RandomForest,25,0.82235,0.228653,0.145031
9,LightGBM,15,0.807998,0.298799,0.225665


Use L1-penalized logistic to select

In [15]:
from sklearn.feature_selection import SelectFromModel

l1_selector = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("l1", SelectFromModel(LogisticRegression(penalty="l1", solver="saga", C=1.0, max_iter=2000, random_state=42)))
])

l1_selector.fit(X_train, y_train)
mask = l1_selector.named_steps['l1'].get_support()
selected_features_l1 = X_train.columns[mask]
print("Selected by L1:", selected_features_l1.tolist())

Selected by L1: ['GENDER', 'AGE_YEARS', 'BODY_MASS_INDEX', 'HEIGHT', 'WEIGHT', 'CHOLESTEROL', 'CREATININE', 'TRIGLYCERIDES_R', 'LDL', 'TRIGLYCERIDE', 'HDL', 'HYPERTENSION', 'HIGH_CHOLESTEROL', 'HEART_ATTACK_RELATIVES', 'LDL_to_HDL_ratio', 'Triglyceride_HDL_ratio', 'Metabolic_risk_score', 'log_TRIGLYCERIDE', 'log_LDL_to_HDL_ratio', 'BMI_squared', 'Age_BMI_interaction', 'LDL_trig_interaction', 'Creatinine_age_ratio']


Train a RF on full features first (with imputation/scaling pipeline)

In [16]:
from sklearn.inspection import permutation_importance

pipe_full = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("clf", RandomForestClassifier(n_estimators=300, random_state=42))
])
pipe_full.fit(X_train, y_train)

perm = permutation_importance(pipe_full, X_train, y_train, n_repeats=10, random_state=42, n_jobs=-1)
imp_df = pd.DataFrame({"feature": X_train.columns, "importance": perm.importances_mean})
imp_df = imp_df.sort_values("importance", ascending=False)
print(imp_df.head(30))

                   feature  importance
1                AGE_YEARS    0.042806
22     Age_BMI_interaction    0.025377
12            HYPERTENSION    0.016662
7               CREATININE    0.015347
18    Metabolic_risk_score    0.014250
13        HIGH_CHOLESTEROL    0.013182
24    Creatinine_age_ratio    0.008112
14  HEART_ATTACK_RELATIVES    0.006961
6              CHOLESTEROL    0.005536
9                      LDL    0.004330
3                   HEIGHT    0.002686
21             BMI_squared    0.002247
4                   WEIGHT    0.002220
5               TOTAL_CHOL    0.001973
2          BODY_MASS_INDEX    0.001836
8          TRIGLYCERIDES_R    0.001836
11                     HDL    0.001507
19        log_TRIGLYCERIDE    0.000822
15        LDL_to_HDL_ratio    0.000795
17  Triglyceride_HDL_ratio    0.000493
16          Chol_HDL_ratio    0.000493
20    log_LDL_to_HDL_ratio    0.000438
23    LDL_trig_interaction    0.000137
0                   GENDER    0.000000
10            TRIGLYCERID

### Feature Selection k=15

In [18]:
top15_features = [
    'AGE_YEARS',
    'Age_BMI_interaction',
    'HYPERTENSION',
    'CREATININE',
    'Metabolic_risk_score',
    'HIGH_CHOLESTEROL',
    'Creatinine_age_ratio',
    'HEART_ATTACK_RELATIVES',
    'CHOLESTEROL',
    'LDL',
    'HEIGHT',
    'BMI_squared',
    'WEIGHT',
    'TOTAL_CHOL',
    'BODY_MASS_INDEX'
]

In [19]:
X_train_k15 = X_train[top15_features].copy()
X_val_k15 = X_val[top15_features].copy()
X_test_k15 = X_test[top15_features].copy()

### Undersampling of class 0  

Original distribution & actual ratio

In [25]:
import numpy as np

unique, counts = np.unique(y_train, return_counts=True)
print(dict(zip(unique, counts)))

minority_count = counts[1]
majority_target = int(minority_count * (70 / 30))  # 70/30 ratio
print(f"Goal: {majority_target} in class 0.")

{np.int64(0): np.int64(3290), np.int64(1): np.int64(359)}
Goal: 837 in class 0.


Maintain 30% of min class (class 1)

In [26]:
from imblearn.under_sampling import RandomUnderSampler

sampling_strategy = {0: majority_target, 1: minority_count}

rus = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=42)

X_train_res, y_train_res = rus.fit_resample(X_train_k15, y_train)

print("Original distribution:", dict(zip(unique, counts)))
print("Distribution after undersampling:", dict(zip(*np.unique(y_train_res, return_counts=True))))

Original distribution: {np.int64(0): np.int64(3290), np.int64(1): np.int64(359)}
Distribution after undersampling: {np.int64(0): np.int64(837), np.int64(1): np.int64(359)}


#### Save preprocessed datasets

Save datasets

In [27]:
output_path = "/content/drive/MyDrive/cellia_drive/Datasets/"

X_train_res.to_csv(output_path + "X_train.csv", index=False)
y_train_res.to_csv(output_path + "y_train.csv", index=False)

X_val_k15.to_csv(output_path + "X_val.csv", index=False)
y_val.to_csv(output_path + "y_val.csv", index=False)

X_test_k15.to_csv(output_path + "X_test.csv", index=False)
y_test.to_csv(output_path + "y_test.csv", index=False)

In [28]:
print("✅ Files saved:")
print(f"   - {output_path}X_train_res.csv")
print(f"   - {output_path}y_train_res.csv")
print(f"   - {output_path}X_val.csv")
print(f"   - {output_path}y_val.csv")
print(f"   - {output_path}X_test.csv")
print(f"   - {output_path}y_test.csv")

✅ Files saved:
   - /content/drive/MyDrive/cellia_drive/Datasets/X_train_res.csv
   - /content/drive/MyDrive/cellia_drive/Datasets/y_train_res.csv
   - /content/drive/MyDrive/cellia_drive/Datasets/X_val.csv
   - /content/drive/MyDrive/cellia_drive/Datasets/y_val.csv
   - /content/drive/MyDrive/cellia_drive/Datasets/X_test.csv
   - /content/drive/MyDrive/cellia_drive/Datasets/y_test.csv
