# Improved MedGAN Training with Better Synthetic Data and Classifier Evaluation

In [1]:

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from data_loader import load_shuttle_data
from medgan_model import Medgan


In [2]:

# --- Load and preprocess real shuttle data ---
df = pd.read_csv("datasets/shuttle.csv", header=None)  # Update path if needed
X_real = df.iloc[:, :-1]
y_real = df.iloc[:, -1]

# Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(y_real)

# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_real)

# Split for classifier testing
X_train_real, X_test_real, y_train_real, y_test_real = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42)


In [3]:

# --- Load GAN Training Data ---
X_train, X_test = load_shuttle_data("datasets/shuttle.csv", test_size=0.2, normalize=True, n_shuffle=10)
input_dim = X_train.shape[1]
print("Data loaded for GAN training.")


Data loaded for GAN training.


In [4]:

# --- Set Parameters ---
n_epochs = 100
batch_size = 64
learning_rate = 0.0002

print(f"Parameters set: Epochs={n_epochs}, Batch Size={batch_size}")


Parameters set: Epochs=100, Batch Size=64


In [5]:

# --- Train MedGAN ---

medgan = Medgan(input_dim=input_dim, ae_loss_type='bce')
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

for epoch in range(n_epochs):
    np.random.shuffle(X_train)
    for i in range(0, len(X_train), batch_size):
        batch = X_train[i:i+batch_size]
        noise = np.random.normal(size=(batch.shape[0], medgan.random_dim))
        ae_loss, d_loss, g_loss = medgan.train_step(batch, noise)
    
    print(f"Epoch {epoch+1}/{n_epochs} completed", end="\r")


Epoch 100/100 completed

In [None]:

# --- Generate Synthetic Data (More samples) ---

synthetic_data = medgan.generate_data(num_samples=len(X_real))  # Generate 5000 samples
synthetic_scaled = scaler.transform(synthetic_data)  # Scale similarly
print(f"Generated Synthetic Data Shape: {synthetic_scaled.shape}")


In [7]:

# --- Classifier Evaluation on Improved Synthetic Data ---

# Split synthetic data
X_train_syn, X_test_syn, y_train_syn, y_test_syn = train_test_split(synthetic_scaled, y_encoded, test_size=0.2, random_state=42)

models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "MLP Classifier": MLPClassifier(max_iter=300, random_state=42),
    "XGB Classifier": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=300, random_state=42)
}

results = {}

for name, model in models.items():
    model.fit(X_train_syn, y_train_syn)
    y_pred = model.predict(X_test_real)
    results[name] = {
        "Accuracy": accuracy_score(y_test_real, y_pred),
        "Precision": precision_score(y_test_real, y_pred, average='macro', zero_division=0),
        "Recall": recall_score(y_test_real, y_pred, average='macro', zero_division=0),
        "F1 Score": f1_score(y_test_real, y_pred, average='macro', zero_division=0)
    }

# Display results
results_df = pd.DataFrame(results).T
print("\nClassifier Evaluation on Real Data (Using Improved Synthetic Data):\n")
print(results_df)


ValueError: Found input variables with inconsistent numbers of samples: [5000, 14500]