In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from catboost import CatBoostClassifier
import warnings
import os

warnings.filterwarnings('ignore')

print("Libraries loaded.")

Libraries loaded.


In [2]:
# Load Real Data (MICE)
df_real = pd.read_csv('../data/processed/uc_diagnostic_tests_mice.csv')

target_col = 'mayo'

X = df_real.drop(columns=[target_col])
y = df_real[target_col]

X_train_real, X_test_real, y_train_real, y_test_real = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Total Real Patients: {len(df_real)}")
print(f"Test Set (Hidden): {len(X_test_real)} patients")

Total Real Patients: 252
Test Set (Hidden): 51 patients


In [3]:
def evaluate_models(train_df, dataset_name):
    X_train = train_df.drop(columns=[target_col])
    y_train = train_df[target_col]

    # 1. Random Forest
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)
    rf_acc = accuracy_score(y_test_real, rf.predict(X_test_real))

    # 2. CatBoost
    cb = CatBoostClassifier(verbose=0, random_state=42, allow_writing_files=False)
    cb.fit(X_train, y_train)
    cb_acc = accuracy_score(y_test_real, cb.predict(X_test_real))

    # 3. Stacking (RF + CatBoost)
    estimators = [('rf', rf), ('cb', cb)]
    stack = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
    stack.fit(X_train, y_train)
    stack_acc = accuracy_score(y_test_real, stack.predict(X_test_real))

    return {
        'Dataset': dataset_name,
        'RF Accuracy': round(rf_acc, 4),
        'CatBoost Accuracy': round(cb_acc, 4),
        'Stacking Accuracy': round(stack_acc, 4)
    }

In [4]:
results = []

# 1. Baseline (Real Data)
df_train_real = pd.concat([X_train_real, y_train_real], axis=1)
results.append(evaluate_models(df_train_real, "Baseline (Real Only)"))

# 2. Synthetic Datasets
file_map = {
    "CTGAN": "../data/processed/synthetic_ctgan.csv",
    "TVAE": "../data/processed/synthetic_tvae.csv",
    "ADASYN": "../data/processed/synthetic_adasyn.csv"
}

for name, path in file_map.items():
    if os.path.exists(path):
        df_synth = pd.read_csv(path)
        results.append(evaluate_models(df_synth, f"Synthetic ({name})"))

        # 3. Augmented (Real + Synthetic) - The most important test
        df_aug = pd.concat([df_train_real, df_synth], axis=0)
        results.append(evaluate_models(df_aug, f"Augmented (Real + {name})"))

# Show the Scoreboard
scoreboard = pd.DataFrame(results)
print("\n=== FINAL RESULTS ===")
display(scoreboard)


=== FINAL RESULTS ===


Unnamed: 0,Dataset,RF Accuracy,CatBoost Accuracy,Stacking Accuracy
0,Baseline (Real Only),0.4314,0.4314,0.3725
1,Synthetic (CTGAN),0.3333,0.2941,0.2941
2,Augmented (Real + CTGAN),0.3922,0.4118,0.3922
3,Synthetic (TVAE),0.3333,0.3137,0.3333
4,Augmented (Real + TVAE),0.3725,0.3725,0.3922
5,Synthetic (ADASYN),1.0,1.0,1.0
6,Augmented (Real + ADASYN),1.0,1.0,1.0
