## Imports

In [1]:
from medgan_model import Medgan
from data_loader import load_letter_data
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import pandas as pd

In [2]:
from medgan_model import Medgan
from data_loader import load_letter_data
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler


## Load and Preprocess Data

In [4]:
# --- Load and preprocess real shuttle data ---
df = pd.read_csv("datasets/letter-recognition-2.csv", header=None)
X_real = df.iloc[:, :-1]
y_real = df.iloc[:, -1]

In [5]:
# --- Load data without labels for GAN training ---
X_train, X_test = load_letter_data("datasets/letter-recognition-2.csv")
input_dim = X_train.shape[1]

In [6]:
# --- Load Data ---
X_train, X_test = load_letter_data("datasets/letter-recognition-2.csv")
input_dim = X_train.shape[1]

## Initialize and Train MedGAN

In [7]:
# --- Initialize and Train MedGAN ---
medgan = Medgan(input_dim=input_dim, ae_loss_type='bce')
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0002)
batch_size = 64
n_epochs = 300

for epoch in range(n_epochs):
    np.random.shuffle(X_train)
    for i in range(0, len(X_train), batch_size):
        batch = X_train[i:i + batch_size]
        noise = np.random.normal(size=(batch.shape[0], medgan.random_dim))
        medgan.train_step(batch, noise)
    print(f"Epoch {epoch+1}/{n_epochs} completed")

Epoch 1/300 completed
Epoch 2/300 completed
Epoch 3/300 completed
Epoch 4/300 completed
Epoch 5/300 completed
Epoch 6/300 completed
Epoch 7/300 completed
Epoch 8/300 completed
Epoch 9/300 completed
Epoch 10/300 completed
Epoch 11/300 completed
Epoch 12/300 completed
Epoch 13/300 completed
Epoch 14/300 completed
Epoch 15/300 completed
Epoch 16/300 completed
Epoch 17/300 completed
Epoch 18/300 completed
Epoch 19/300 completed
Epoch 20/300 completed
Epoch 21/300 completed
Epoch 22/300 completed
Epoch 23/300 completed
Epoch 24/300 completed
Epoch 25/300 completed
Epoch 26/300 completed
Epoch 27/300 completed
Epoch 28/300 completed
Epoch 29/300 completed
Epoch 30/300 completed
Epoch 31/300 completed
Epoch 32/300 completed
Epoch 33/300 completed
Epoch 34/300 completed
Epoch 35/300 completed
Epoch 36/300 completed
Epoch 37/300 completed
Epoch 38/300 completed
Epoch 39/300 completed
Epoch 40/300 completed
Epoch 41/300 completed
Epoch 42/300 completed
Epoch 43/300 completed
Epoch 44/300 complet

In [8]:
medgan = Medgan(input_dim=input_dim, ae_loss_type='bce')


In [9]:
# --- Initialize and Compile Model ---
medgan = Medgan(input_dim=input_dim, ae_loss_type='bce')  # or 'mse' for real-valued
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0002)

In [10]:
# --- Training Loop ---
ae_losses, d_losses, g_losses = [], [], []

for epoch in range(n_epochs):
    np.random.shuffle(X_train)
    for i in range(0, len(X_train), batch_size):
        batch = X_train[i:i + batch_size]
        noise = np.random.normal(size=(batch.shape[0], medgan.random_dim))
        ae_loss, d_loss, g_loss = medgan.train_step(batch, noise)

    ae_losses.append(ae_loss.numpy())
    d_losses.append(d_loss.numpy())
    g_losses.append(g_loss.numpy())

    print(f"Epoch {epoch + 1}/{n_epochs} | AE: {ae_loss:.4f}, D: {d_loss:.4f}, G: {g_loss:.4f}")

Epoch 1/300 | AE: 0.6904, D: 1.3977, G: 0.6259
Epoch 2/300 | AE: 0.6900, D: 1.4051, G: 0.6257
Epoch 3/300 | AE: 0.6905, D: 1.3996, G: 0.6273
Epoch 4/300 | AE: 0.6902, D: 1.4009, G: 0.6261
Epoch 5/300 | AE: 0.6903, D: 1.4031, G: 0.6267
Epoch 6/300 | AE: 0.6902, D: 1.3968, G: 0.6283
Epoch 7/300 | AE: 0.6899, D: 1.4002, G: 0.6268
Epoch 8/300 | AE: 0.6905, D: 1.4050, G: 0.6265
Epoch 9/300 | AE: 0.6898, D: 1.3981, G: 0.6260
Epoch 10/300 | AE: 0.6897, D: 1.4005, G: 0.6252
Epoch 11/300 | AE: 0.6893, D: 1.3996, G: 0.6286
Epoch 12/300 | AE: 0.6909, D: 1.4004, G: 0.6279
Epoch 13/300 | AE: 0.6903, D: 1.3998, G: 0.6239
Epoch 14/300 | AE: 0.6909, D: 1.4035, G: 0.6271
Epoch 15/300 | AE: 0.6907, D: 1.3920, G: 0.6247
Epoch 16/300 | AE: 0.6900, D: 1.3971, G: 0.6278
Epoch 17/300 | AE: 0.6904, D: 1.4013, G: 0.6263
Epoch 18/300 | AE: 0.6889, D: 1.4050, G: 0.6251
Epoch 19/300 | AE: 0.6902, D: 1.3942, G: 0.6282
Epoch 20/300 | AE: 0.6902, D: 1.4018, G: 0.6265
Epoch 21/300 | AE: 0.6905, D: 1.3945, G: 0.6283
E

In [12]:
# --- Load Data ---
X_train, X_test = load_letter_data("datasets/letter-recognition-2.csv")
input_dim = X_train.shape[1]

## Generate Synthetic Data

In [15]:

# --- Define X_train_real and related preprocessing variables ---
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Don't use header=None here
df = pd.read_csv("datasets/letter-recognition-2.csv")

X_real = df.iloc[:, 1:]   # All columns except 'letter'
y_real = df.iloc[:, 0]    # 'letter' column

le = LabelEncoder()
y_encoded = le.fit_transform(y_real)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_real)

X_train_real, X_test_real, y_train_real, y_test_real = train_test_split(
    X_scaled, y_encoded, test_size=0.2, random_state=42
)


In [16]:
# --- Generate Synthetic Data ---
synthetic_data = medgan.generate_data(num_samples=len(X_train_real))
synthetic_data_scaled = scaler.transform(synthetic_data)  # Scale similarly



In [17]:
# --- Generate Synthetic Data ---
synthetic_data = medgan.generate_data(num_samples=1000)
print("Generated Data Shape:", synthetic_data.shape)


Generated Data Shape: (1000, 16)


In [18]:
# --- Generate Synthetic Data ---
synthetic_data = medgan.generate_data(num_samples=1000)
print("Generated Data Shape:", synthetic_data.shape)


Generated Data Shape: (1000, 16)


## Classifier Evaluation

In [19]:
# Encode target for XGBClassifier
le = LabelEncoder()
y_encoded = le.fit_transform(y_real)

# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_real)

# Train-test split
X_train_real, X_test_real, y_train_real, y_test_real = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42)

In [20]:
# --- Classifier Evaluation ---
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "MLP Classifier": MLPClassifier(max_iter=300, random_state=42),
    "XGB Classifier": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=300, random_state=42)
}

results = {}
for name, model in models.items():
    model.fit(X_train_real, y_train_real)
    y_pred = model.predict(X_test_real)
    results[name] = {
        "Accuracy": accuracy_score(y_test_real, y_pred),
        "Precision": precision_score(y_test_real, y_pred, average='macro', zero_division=0),
        "Recall": recall_score(y_test_real, y_pred, average='macro', zero_division=0),
        "F1 Score": f1_score(y_test_real, y_pred, average='macro', zero_division=0)
    }

results_df = pd.DataFrame(results).T
print("\nClassifier Evaluation on Real Data:")
print(results_df)

Parameters: { "use_label_encoder" } are not used.




Classifier Evaluation on Real Data:
                     Accuracy  Precision    Recall  F1 Score
Random Forest         0.96125   0.961784  0.960185  0.960461
MLP Classifier        0.95700   0.956456  0.956381  0.956184
XGB Classifier        0.96225   0.962060  0.961441  0.961459
Logistic Regression   0.78200   0.779501  0.780292  0.778645
