# Final Clean Version: MedGAN Training with 5-Fold CV, 7 Runs, and Averaged Classifier Evaluation

In [None]:

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from data_loader import load_shuttle_data
from medgan_model import Medgan


In [None]:

# --- Load and preprocess real shuttle data ---
# The CSV is read without a header row; the last column is taken as the class
# label and every other column as a feature.
df = pd.read_csv("datasets/shuttle.csv", header=None)  # <-- Change to your correct path
X_real = df.iloc[:, :-1]
y_real = df.iloc[:, -1]

# Encode target once (LabelEncoder maps the raw class values to 0..n_classes-1)
le = LabelEncoder()
y_encoded = le.fit_transform(y_real)

# Train-test split on real data once, BEFORE any scaling, so that the held-out
# rows cannot influence the preprocessing statistics.
X_train_raw, X_test_raw, y_train_real, y_test_real = train_test_split(
    X_real, y_encoded, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply the same transform to
# the test rows. Fitting on the full dataset would leak the test set's mean and
# variance into the features used for evaluation.
scaler = StandardScaler()
X_train_real = scaler.fit_transform(X_train_raw)
X_test_real = scaler.transform(X_test_raw)

# Every real row expressed in the same (train-fitted) scaled space.
X_scaled = scaler.transform(X_real)


In [None]:

# --- Load data without labels for GAN training ---
# load_shuttle_data is a project helper; it is asked for a normalised 80/20
# split of the same CSV file.
# NOTE(review): the exact normalisation and the meaning of n_shuffle are
# defined in data_loader, not here - confirm they match the evaluation cell.
X_train, X_test = load_shuttle_data(
    "datasets/shuttle.csv",
    test_size=0.2,
    normalize=True,
    n_shuffle=10,
)

# Width of one sample; used as the MedGAN input dimension.
input_dim = X_train.shape[1]

print("Data loaded successfully.")


In [None]:

# --- Set Parameters ---
# Experiment layout: every run performs a full k-fold pass.
k_folds, n_runs = 5, 7

# GAN training schedule.
n_epochs, batch_size = 100, 64

# Optimizer learning rate.
learning_rate = 2e-4

print(
    "Parameters set: "
    f"Folds={k_folds}, Runs={n_runs}, Epochs={n_epochs}, Batch Size={batch_size}"
)


In [None]:

# --- Train MedGAN with k-fold cross-validation, repeated n_runs times ---
#
# For every (run, fold) pair a fresh MedGAN is trained on the training part of
# the fold, synthetic samples are drawn from it, four classifiers are fitted on
# the synthetic samples and then scored on the real held-out test split
# (X_test_real / y_test_real).
#
# Result layout:
#     all_run_results[run][fold][classifier_name][metric_name] -> float


def build_classifiers():
    """Return a fresh, unfitted set of classifiers keyed by display name.

    A new set is needed for every fold because fitting mutates the estimators.
    """
    return {
        "Random Forest": RandomForestClassifier(random_state=42),
        "MLP Classifier": MLPClassifier(max_iter=300, random_state=42),
        "XGB Classifier": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
        "Logistic Regression": LogisticRegression(max_iter=300, random_state=42),
    }


def score_predictions(y_true, y_pred):
    """Return accuracy plus macro-averaged precision/recall/F1 as a dict.

    zero_division=0 makes classes that were never predicted count as 0
    instead of raising a warning.
    """
    return {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, average='macro', zero_division=0),
        "Recall": recall_score(y_true, y_pred, average='macro', zero_division=0),
        "F1 Score": f1_score(y_true, y_pred, average='macro', zero_division=0),
    }


# To collect all results (one {fold: {classifier: {metric: value}}} dict per run)
all_run_results = []

for run in range(n_runs):
    print(f"\n===== Starting Run {run+1}/{n_runs} =====")

    # A different shuffle per run, but reproducible for a given run index.
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=run)

    fold_results = {}

    # NOTE(review): the validation indices of each fold are never used, so the
    # folds only act as different 80% subsamples of X_train.
    for fold, (train_idx, _) in enumerate(kf.split(X_train)):
        print(f"  --- Fold {fold+1}/{k_folds} ---")

        # Drop graph/layer state left over from the previous fold's model so
        # memory does not grow across the n_runs * k_folds trainings.
        tf.keras.backend.clear_session()

        # Seed NumPy (batch shuffling, noise) and TensorFlow (weight init) with
        # a value unique to this (run, fold) so the experiment is repeatable.
        fold_seed = run * k_folds + fold
        np.random.seed(fold_seed)
        tf.random.set_seed(fold_seed)

        # Integer-array indexing returns a copy, so the in-place shuffle below
        # does not disturb X_train.
        X_tr = X_train[train_idx]

        # Initialize MedGAN model
        # NOTE(review): an Adam optimizer built from `learning_rate` used to be
        # created here but was never handed to Medgan or used, so
        # `learning_rate` has no effect on training unless Medgan reads it some
        # other way - confirm how Medgan configures its optimizers.
        medgan = Medgan(input_dim=input_dim, ae_loss_type='bce')

        # Train MedGAN
        for epoch in range(n_epochs):
            np.random.shuffle(X_tr)
            for start in range(0, len(X_tr), batch_size):
                batch = X_tr[start:start + batch_size]
                noise = np.random.normal(size=(batch.shape[0], medgan.random_dim))
                # train_step returns (ae_loss, d_loss, g_loss); they are not
                # logged or used here.
                medgan.train_step(batch, noise)

            print(f"Epoch {epoch+1}/{n_epochs} completed", end="\r")
        # Terminate the carriage-return progress line so the next message does
        # not overwrite it and leave stray characters behind.
        print()

        # Generate synthetic data (one synthetic row per real row)
        synthetic_data = medgan.generate_data(num_samples=len(X_real))
        # NOTE(review): X_train was loaded with normalize=True, so the GAN
        # presumably already emits values in that normalised space; applying
        # `scaler` on top may scale the data twice - confirm what the loader's
        # normalisation does.
        synthetic_scaled = scaler.transform(synthetic_data)

        # Split synthetic data
        # NOTE(review): the synthetic rows are paired by position with
        # y_encoded, the labels of the *real* rows. scaler.transform accepted
        # them, so they carry feature columns only; unless Medgan is
        # label-conditioned somewhere else, these labels are unrelated to the
        # synthetic rows and the scores below would not measure synthetic-data
        # utility - confirm before relying on the results.
        X_train_syn, X_test_syn, y_train_syn, y_test_syn = train_test_split(
            synthetic_scaled, y_encoded, test_size=0.2, random_state=42
        )

        # Evaluate classifiers: train on synthetic, test on real
        results = {}
        for name, model in build_classifiers().items():
            model.fit(X_train_syn, y_train_syn)
            results[name] = score_predictions(y_test_real, model.predict(X_test_real))

        fold_results[fold] = results

    all_run_results.append(fold_results)


In [None]:

# --- Final Averaging across all runs ---
#
# all_run_results is a list with one entry per run; each entry maps
# fold -> classifier name -> metric name -> score. Every (run, fold) score is
# pooled per classifier and metric, then averaged with equal weight.

# classifier name -> metric name -> list of scores from every run and fold
aggregate = {}

for run_results in all_run_results:
    for model_scores in run_results.values():
        for model_name, scores in model_scores.items():
            per_metric = aggregate.setdefault(model_name, {})
            for metric, value in scores.items():
                per_metric.setdefault(metric, []).append(value)

# Average
final_scores = {
    model_name: {metric: np.mean(values) for metric, values in per_metric.items()}
    for model_name, per_metric in aggregate.items()
}

# Display (rows = classifiers, columns = metrics)
final_df = pd.DataFrame(final_scores).T
# Report the number of runs actually collected instead of a hard-coded count,
# so the banner stays correct when the run count is changed.
print(f"\n===== Final Averaged Classifier Results Over {len(all_run_results)} Runs =====\n")
print(final_df)
