# Real vs Synthetic Performance Comparison Across Models

In [None]:

# Load trained models and data
%run SupervisedModels/OptimizedRandomForest.ipynb
RF_Model_Metrics = {
    'F1': final_f1,
    'Roc_Auc': final_roc_auc,
    'accuracy': accuracy
}
%run SupervisedModels/OptimizedXGBoost.ipynb
XGB_Model_Metrics = {
    'F1': final_f1,
    'Roc_Auc': final_roc_auc,
    'accuracy': accuracy
}
%run SupervisedModels/OptimizedCatBoost.ipynb
CB_Model_Metrics = {
    'F1': final_f1,
    'Roc_Auc': final_roc_auc,
    'accuracy': accuracy
}
%run SupervisedModels/OptimizedLogisticRegression.ipynb
LR_Model_Metrics = {
    'F1': final_f1,
    'Roc_Auc': final_roc_auc,
    'accuracy': accuracy
}

  from .autonotebook import tqdm as notebook_tqdm


In [None]:

import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.base import clone
from sklearn.datasets import make_classification
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split

# Synthetic data generation: an artificial binary-classification set with the
# same row and column counts as the real feature matrix `X`.
# NOTE(review): `X` is not defined in this cell -- presumably it comes from the
# notebooks executed earlier; confirm.
synthetic_spec = dict(
    n_samples=len(X),
    n_features=X.shape[1],
    n_informative=10,
    n_redundant=5,
    n_clusters_per_class=2,
    weights=[0.75, 0.25],  # requested class proportions (majority / minority)
    flip_y=0.01,
    random_state=42,
)
X_syn, y_syn = make_classification(**synthetic_spec)

# Stratified 80/20 split so the validation part keeps the class ratio.
synthetic_split = train_test_split(
    X_syn, y_syn, test_size=0.2, stratify=y_syn, random_state=42
)
X_train_syn, X_val_syn, y_train_syn, y_val_syn = synthetic_split

# Oversample the training part only; the validation part is left untouched.
oversampler = SMOTE(random_state=42)
X_train_syn, y_train_syn = oversampler.fit_resample(X_train_syn, y_train_syn)


In [None]:

# Registry of the models being compared: display name -> (model object, metrics
# dict captured for that model). Insertion order drives the report order.
# CatBoost is intentionally not registered (its entry was disabled); the reason
# is not visible here. To re-enable it, add: CatBoost=(cb_model, CB_Model_Metrics)
models = dict(
    RandomForest=(rf_model, RF_Model_Metrics),
    XGBoost=(xgb_model, XGB_Model_Metrics),
    LogisticRegression=(lr_model, LR_Model_Metrics),
)

def evaluate_model(model, X_val, y_val, metrics=None):
    """Return ROC AUC, F1 and accuracy for a model as a dict.

    Parameters
    ----------
    model : object exposing ``predict`` and ``predict_proba``
        Only used when ``metrics`` is None.
    X_val, y_val : validation features and labels
        Only used when ``metrics`` is None.
    metrics : dict, optional
        Precomputed scores with keys ``'Roc_Auc'``, ``'F1'`` and
        ``'accuracy'``. When given, these values are returned as-is
        (re-keyed) and the model is not called at all.

    Returns
    -------
    dict
        Keys ``'ROC AUC'``, ``'F1'`` and ``'Accuracy'``.
    """
    if metrics is None:
        predicted_labels = model.predict(X_val)
        # Column 1 of predict_proba is used as the positive-class score.
        positive_scores = model.predict_proba(X_val)[:, 1]
        roc_auc = roc_auc_score(y_val, positive_scores)
        f1 = f1_score(y_val, predicted_labels)
        acc = accuracy_score(y_val, predicted_labels)
    else:
        # Cached path: just translate the key names.
        roc_auc = metrics['Roc_Auc']
        f1 = metrics['F1']
        acc = metrics['accuracy']
    return {'ROC AUC': roc_auc, 'F1': f1, 'Accuracy': acc}

# Per-model score dicts, keyed by the display names used in `models`.
real_scores = {}
synthetic_scores = {}

# Real-data scores: `metrics=` is supplied, so evaluate_model returns the cached
# values and does not actually predict on X_holdout / y_holdout.
for name, (estimator, cached_metrics) in models.items():
    real_scores[name] = evaluate_model(
        estimator, X_holdout, y_holdout, metrics=cached_metrics
    )

# Synthetic-data scores: fit an unfitted copy with the same hyper-parameters
# rather than the registered estimator itself. Calling .fit() on the original
# would silently replace the real-data model, leaving rf_model / xgb_model /
# lr_model trained on synthetic data for every later cell.
for name, (estimator, _cached_metrics) in models.items():
    synthetic_estimator = clone(estimator)
    synthetic_estimator.fit(X_train_syn, y_train_syn)
    synthetic_scores[name] = evaluate_model(synthetic_estimator, X_val_syn, y_val_syn)


In [None]:

# Text report: one section per registered model, one line per metric.
# "Drop" is real minus synthetic, so a negative value means the model scored
# higher on the synthetic data than on the real data.
REPORTED_METRICS = ('ROC AUC', 'F1', 'Accuracy')

print("🔍 Model Performance: Real vs Synthetic Comparison")
for name in models:
    print(f"\nModel: {name}")
    for metric in REPORTED_METRICS:
        real_val, syn_val = real_scores[name][metric], synthetic_scores[name][metric]
        drop = real_val - syn_val
        print(f"{metric}: Real = {real_val:.4f}, Synthetic = {syn_val:.4f}, Drop = {drop:.4f}")


🔍 Model Performance: Real vs Synthetic Comparison

Model: RandomForest
ROC AUC: Real = 0.9808, Synthetic = 0.9423, Drop = 0.0385
F1: Real = 0.8519, Synthetic = 0.7500, Drop = 0.1019
Accuracy: Real = 0.9375, Synthetic = 0.8875, Drop = 0.0500

Model: XGBoost
ROC AUC: Real = 0.8706, Synthetic = 0.9827, Drop = -0.1121
F1: Real = 0.6269, Synthetic = 0.8889, Drop = -0.2620
Accuracy: Real = 0.8438, Synthetic = 0.9437, Drop = -0.1000

Model: LogisticRegression
ROC AUC: Real = 0.8945, Synthetic = 0.8577, Drop = 0.0368
F1: Real = 0.6216, Synthetic = 0.6214, Drop = 0.0003
Accuracy: Real = 0.8250, Synthetic = 0.7562, Drop = 0.0687
