In [1]:
import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
from torch.autograd import grad as torch_grad
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
from scipy.stats import entropy
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import time
from tqdm import tqdm
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive/')
%cd /content/drive/MyDrive/Colab Notebooks/Katabatic/CrGAN/car/cross/

# Suppress warnings
warnings.filterwarnings("ignore")

# Setting random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Check if CUDA is available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

def load_car_data(data_path, shuffle=False, random_state=42):
    """
    Load and preprocess the car dataset with option to shuffle
    """
    # Column names for the car dataset
    column_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']

    # Load data
    df = pd.read_csv(data_path, header=None, names=column_names)

    # Shuffle data if requested
    if shuffle:
        df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    print(f"Dataset shape: {df.shape}")

    # Identify categorical columns
    categorical_cols = df.drop('class', axis=1).columns.tolist()  # All columns except class are categorical

    # Create preprocessing pipeline for categorical data using one-hot encoding
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', categorical_transformer, categorical_cols)
        ])

    # Get unique classes and create encoders
    unique_classes = sorted(df['class'].unique())
    label_encoder = {cls: i for i, cls in enumerate(unique_classes)}
    inverse_label_encoder = {i: cls for cls, i in label_encoder.items()}

    # Store original categorical values
    cat_values = {}
    for col in categorical_cols:
        cat_values[col] = sorted(df[col].unique())

    # Split features and target
    X = df.drop('class', axis=1)
    y = df['class']

    # Encode targets
    y_encoded = y.map(label_encoder)

    # Fit and transform the features
    X_transformed = preprocessor.fit_transform(X)

    # Get one-hot encoding feature names
    cat_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
    feature_names = cat_encoder.get_feature_names_out(categorical_cols)

    print(f"Processed data shape: {X_transformed.shape}")

    return (X_transformed, y_encoded, X, y, preprocessor, feature_names,
            label_encoder, inverse_label_encoder, cat_values, df)

# Custom dataset class
class CarDataset(Dataset):
    def __init__(self, features, labels=None):
        self.features = torch.tensor(features, dtype=torch.float32)
        if labels is not None:
            self.labels = torch.tensor(labels.values if hasattr(labels, 'values') else labels, dtype=torch.long)
        else:
            self.labels = None

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        if self.labels is not None:
            return self.features[idx], self.labels[idx]
        else:
            return self.features[idx]

# Generator Network
class Generator(nn.Module):
    def __init__(self, latent_dim, output_dim):
        super(Generator, self).__init__()
        self.latent_dim = latent_dim
        self.output_dim = output_dim

        self.model = nn.Sequential(
            nn.Linear(latent_dim, 128),
            nn.BatchNorm1d(128),
            nn.LeakyReLU(0.2),

            nn.Linear(128, 256),
            nn.BatchNorm1d(256),
            nn.LeakyReLU(0.2),

            nn.Linear(256, 512),
            nn.BatchNorm1d(512),
            nn.LeakyReLU(0.2),

            nn.Linear(512, output_dim),
            nn.Sigmoid()  # Use Sigmoid for one-hot encoded data
        )

    def forward(self, z):
        return self.model(z)

# Critic Network (Discriminator)
class Critic(nn.Module):
    def __init__(self, input_dim):
        super(Critic, self).__init__()
        self.input_dim = input_dim

        self.model = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),

            nn.Linear(256, 128),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),

            nn.Linear(128, 64),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),

            nn.Linear(64, 1)
        )

    def forward(self, x):
        return self.model(x)

# Cramer GAN Implementation
class CramerGAN:
    def __init__(self, data_dim, latent_dim=100, critic_iterations=5, lambda_gp=10):
        self.data_dim = data_dim
        self.latent_dim = latent_dim
        self.critic_iterations = critic_iterations
        self.lambda_gp = lambda_gp

        # Initialize networks
        self.generator = Generator(latent_dim, data_dim).to(device)
        self.critic = Critic(data_dim).to(device)

        # Setup optimizers
        self.g_optimizer = optim.Adam(self.generator.parameters(), lr=0.0002, betas=(0.5, 0.9))
        self.c_optimizer = optim.Adam(self.critic.parameters(), lr=0.0002, betas=(0.5, 0.9))

        # Initialize loss tracking
        self.g_losses = []
        self.c_losses = []

    def _critic_train_iteration(self, real_data, batch_size):
        # Generate random noise
        noise = torch.randn(batch_size, self.latent_dim).to(device)

        # Generate fake data
        fake_data = self.generator(noise)

        # Get critic outputs
        critic_real = self.critic(real_data)
        critic_fake = self.critic(fake_data)

        # Calculate Cramer distance
        critic_real2 = self.critic(torch.roll(real_data, shifts=1, dims=0))
        critic_fake2 = self.critic(torch.roll(fake_data, shifts=1, dims=0))

        # Cramer GAN loss function
        c_loss = torch.mean(critic_real - critic_fake) - 0.5 * torch.mean(torch.pow(critic_real - critic_real2, 2)) + 0.5 * torch.mean(torch.pow(critic_fake - critic_fake2, 2))

        # Calculate gradient penalty
        alpha = torch.rand(batch_size, 1).to(device)
        interpolates = alpha * real_data + ((1 - alpha) * fake_data)
        interpolates.requires_grad_(True)

        critic_interpolates = self.critic(interpolates)
        gradients = torch_grad(outputs=critic_interpolates, inputs=interpolates,
                              grad_outputs=torch.ones_like(critic_interpolates).to(device),
                              create_graph=True, retain_graph=True)[0]

        gradients = gradients.view(batch_size, -1)
        gradient_penalty = self.lambda_gp * ((gradients.norm(2, dim=1) - 1) ** 2).mean()

        # Update critic
        self.c_optimizer.zero_grad()
        c_loss_total = c_loss + gradient_penalty
        c_loss_total.backward()
        self.c_optimizer.step()

        return c_loss_total.item()

    def _generator_train_iteration(self, batch_size):
        # Generate random noise
        noise = torch.randn(batch_size, self.latent_dim).to(device)

        # Generate fake data
        fake_data = self.generator(noise)

        # Calculate critic outputs
        critic_fake = self.critic(fake_data)
        critic_fake2 = self.critic(torch.roll(fake_data, shifts=1, dims=0))

        # Generator loss is negative of critic loss
        g_loss = -torch.mean(critic_fake) + 0.5 * torch.mean(torch.pow(critic_fake - critic_fake2, 2))

        # Update generator
        self.g_optimizer.zero_grad()
        g_loss.backward()
        self.g_optimizer.step()

        return g_loss.item()

    def train(self, data_loader, epochs, save_interval=10, verbose=True):
        for epoch in range(epochs):
            epoch_start_time = time.time()
            c_loss_total = 0
            g_loss_total = 0
            num_batches = 0

            for i, (real_data, _) in enumerate(data_loader):
                batch_size = real_data.size(0)
                real_data = real_data.to(device)

                # Train critic
                for _ in range(self.critic_iterations):
                    c_loss = self._critic_train_iteration(real_data, batch_size)
                c_loss_total += c_loss

                # Train generator
                g_loss = self._generator_train_iteration(batch_size)
                g_loss_total += g_loss

                num_batches += 1

            # Calculate average loss for the epoch
            c_loss_avg = c_loss_total / num_batches
            g_loss_avg = g_loss_total / num_batches

            self.c_losses.append(c_loss_avg)
            self.g_losses.append(g_loss_avg)

            epoch_time = time.time() - epoch_start_time

            if verbose and (epoch % save_interval == 0 or epoch == epochs - 1):
                print(f"Epoch [{epoch+1}/{epochs}] | Critic Loss: {c_loss_avg:.4f} | Generator Loss: {g_loss_avg:.4f} | Time: {epoch_time:.2f}s")

    def generate_samples(self, num_samples):
        self.generator.eval()
        noise = torch.randn(num_samples, self.latent_dim).to(device)
        with torch.no_grad():
            generated_data = self.generator(noise).cpu().numpy()
        self.generator.train()
        return generated_data

    def save_model(self, path):
        torch.save({
            'generator_state_dict': self.generator.state_dict(),
            'critic_state_dict': self.critic.state_dict(),
            'g_optimizer_state_dict': self.g_optimizer.state_dict(),
            'c_optimizer_state_dict': self.c_optimizer.state_dict(),
        }, path)

    def load_model(self, path):
        checkpoint = torch.load(path)
        self.generator.load_state_dict(checkpoint['generator_state_dict'])
        self.critic.load_state_dict(checkpoint['critic_state_dict'])
        self.g_optimizer.load_state_dict(checkpoint['g_optimizer_state_dict'])
        self.c_optimizer.load_state_dict(checkpoint['c_optimizer_state_dict'])

# Post-process generated data for categorical features
def post_process_car_data(synthetic_data, preprocessor, cat_values, feature_names):
    """
    Post-process the synthetic data to convert one-hot encoded features back to categorical values
    """
    # Create a DataFrame with one-hot encoded columns
    synthetic_df = pd.DataFrame(synthetic_data, columns=feature_names)

    # Extract categorical feature groups
    result_df = pd.DataFrame()

    # Process each categorical column
    for col_name, values in cat_values.items():
        # Get one-hot columns for this feature
        col_pattern = f"{col_name}_"
        category_cols = [c for c in feature_names if c.startswith(col_pattern)]

        # Get the most likely category for each sample
        category_probs = synthetic_df[category_cols].values
        category_indices = np.argmax(category_probs, axis=1)

        # Map indices back to original categories
        # Extract the original category from the one-hot column name
        categories = [c.split('_', 1)[1] for c in category_cols]
        result_df[col_name] = [categories[idx] for idx in category_indices]

    return result_df

# Evaluation Metrics

# 1. Machine Learning Utility (TSTR)
def evaluate_tstr(real_data, synthetic_data, real_labels, random_state=42):
    """
    Train classifiers on synthetic data and test on real data (TSTR)
    Returns accuracy for each classifier
    """
    # Train-test split for real data
    X_train, X_test, y_train, y_test = train_test_split(
        real_data, real_labels, test_size=0.2, random_state=random_state
    )

    # Synthetic data (all used for training)
    X_synth = synthetic_data

    # Ensure proper dimensions for labels
    if isinstance(y_train, pd.Series):
        y_train = y_train.values

    # Create synthetic labels based on real distribution
    num_classes = len(np.unique(y_train))
    class_distribution = np.bincount(y_train.astype(int), minlength=num_classes) / len(y_train)
    np.random.seed(random_state)
    y_synth = np.random.choice(range(num_classes), size=len(X_synth), p=class_distribution)

    # Define classifiers
    classifiers = {
        'Logistic Regression': LogisticRegression(max_iter=1000, random_state=random_state),
        'MLP': MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=random_state),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=random_state),
        'XGBoost': xgb.XGBClassifier(n_estimators=100, random_state=random_state)
    }

    results = {}

    for name, clf in classifiers.items():
        # Train on synthetic data
        clf.fit(X_synth, y_synth)

        # Test on real data
        y_pred = clf.predict(X_test)

        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)

        # We use weighted f1 since we have multiple classes
        f1 = f1_score(y_test, y_pred, average='weighted')

        results[name] = {
            'accuracy': accuracy,
            'f1_score': f1
        }

    return results

# 2. Statistical Similarity for categorical data
def evaluate_categorical_similarity(real_df, synthetic_df):
    """
    Calculate statistical similarity for categorical features
    """
    results = {'JSD': {}}

    # For each categorical column, calculate JSD
    for col in real_df.columns:
        # Get value counts
        real_counts = real_df[col].value_counts(normalize=True).sort_index()
        synth_counts = synthetic_df[col].value_counts(normalize=True).sort_index()

        # Align the distributions
        all_categories = sorted(set(real_counts.index) | set(synth_counts.index))
        real_dist = np.array([real_counts.get(cat, 0) for cat in all_categories])
        synth_dist = np.array([synth_counts.get(cat, 0) for cat in all_categories])

        # Add small epsilon to avoid zeros
        epsilon = 1e-10
        real_dist = real_dist + epsilon
        synth_dist = synth_dist + epsilon

        # Normalize
        real_dist = real_dist / real_dist.sum()
        synth_dist = synth_dist / synth_dist.sum()

        # Calculate JSD
        jsd = jensen_shannon_divergence(real_dist, synth_dist)
        results['JSD'][col] = jsd

    # Average JSD across all features
    results['JSD_avg'] = np.mean(list(results['JSD'].values()))

    return results

def jensen_shannon_divergence(p, q):
    """
    Calculate Jensen-Shannon Divergence between distributions p and q
    """
    # Ensure p and q are normalized
    p = p / np.sum(p)
    q = q / np.sum(q)

    m = 0.5 * (p + q)

    # Calculate JSD
    jsd = 0.5 * (entropy(p, m) + entropy(q, m))

    return jsd

def plot_loss_curves(model, fold=None, shuffle=False):
    """
    Plot the loss curves for the generator and critic
    """
    title_suffix = f" (Fold {fold+1})" if fold is not None else ""
    shuffle_suffix = " (Shuffled)" if shuffle else ""

    plt.figure(figsize=(10, 5))
    plt.plot(model.g_losses, label='Generator Loss')
    plt.plot(model.c_losses, label='Critic Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title(f'CramerGAN Training Loss for Car Dataset{title_suffix}{shuffle_suffix}')
    plt.legend()
    plt.grid(True)

    # Save figure with appropriate naming
    if fold is not None:
        shuffle_text = "shuffled_" if shuffle else ""
        plt.savefig(f'car_crgan_loss_curves_{shuffle_text}fold_{fold+1}.png')
    else:
        plt.savefig('car_crgan_loss_curves.png')

    plt.close()

def plot_categorical_distributions(real_df, synthetic_df, fold=None, shuffle=False):
    """
    Plot distributions of real vs synthetic data for categorical features
    """
    n_features = len(real_df.columns)
    title_suffix = f" (Fold {fold+1})" if fold is not None else ""
    shuffle_suffix = " (Shuffled)" if shuffle else ""

    plt.figure(figsize=(15, n_features * 4))

    for i, col in enumerate(real_df.columns):
        plt.subplot(n_features, 1, i+1)

        # Calculate proportions
        real_props = real_df[col].value_counts(normalize=True).sort_index()
        synth_props = synthetic_df[col].value_counts(normalize=True).sort_index()

        # Get all categories
        all_categories = sorted(set(real_props.index) | set(synth_props.index))

        # Create a DataFrame for plotting
        plot_df = pd.DataFrame({
            'Category': all_categories * 2,
            'Proportion': [real_props.get(cat, 0) for cat in all_categories] +
                         [synth_props.get(cat, 0) for cat in all_categories],
            'Type': ['Real'] * len(all_categories) + ['Synthetic'] * len(all_categories)
        })

        # Plot
        sns.barplot(x='Category', y='Proportion', hue='Type', data=plot_df)
        plt.title(f'Distribution for {col}{title_suffix}{shuffle_suffix}')
        plt.xticks(rotation=45)
        plt.ylabel('Proportion')
        plt.legend()

    plt.tight_layout()

    # Save figure with appropriate naming
    if fold is not None:
        shuffle_text = "shuffled_" if shuffle else ""
        plt.savefig(f'car_categorical_distributions_{shuffle_text}fold_{fold+1}.png')
    else:
        plt.savefig('car_categorical_distributions.png')

    plt.close()

def plot_class_distribution(real_labels, synthetic_labels, label_encoder, fold=None, shuffle=False):
    """
    Plot the class distribution of real vs synthetic data
    """
    title_suffix = f" (Fold {fold+1})" if fold is not None else ""
    shuffle_suffix = " (Shuffled)" if shuffle else ""

    plt.figure(figsize=(12, 6))

    # Get class counts
    real_class_counts = pd.Series(real_labels).value_counts(normalize=True)
    synth_class_counts = pd.Series(synthetic_labels).value_counts(normalize=True)

    # Create inverse label encoder
    inverse_label_encoder = {v: k for k, v in label_encoder.items()}

    # Get all classes
    all_classes = sorted(set(real_class_counts.index) | set(synth_class_counts.index))

    # Create plot data
    plot_df = pd.DataFrame({
        'Class': [inverse_label_encoder.get(c, c) for c in all_classes] * 2,
        'Proportion': [real_class_counts.get(c, 0) for c in all_classes] +
                     [synth_class_counts.get(c, 0) for c in all_classes],
        'Type': ['Real'] * len(all_classes) + ['Synthetic'] * len(all_classes)
    })

    # Plot
    sns.barplot(x='Class', y='Proportion', hue='Type', data=plot_df)
    plt.title(f'Class Distribution: Real vs Synthetic Car Evaluations{title_suffix}{shuffle_suffix}')
    plt.xticks(rotation=45)
    plt.tight_layout()

    # Save figure with appropriate naming
    if fold is not None:
        shuffle_text = "shuffled_" if shuffle else ""
        plt.savefig(f'car_class_distribution_{shuffle_text}fold_{fold+1}.png')
    else:
        plt.savefig('car_class_distribution.png')

    plt.close()

def run_cross_validation(data_path, shuffle=False, random_state=123, n_folds=5, epochs=100):
    """
    Run k-fold cross-validation for CrGAN on the car dataset
    """
    # Load and preprocess data
    (X_transformed, y_encoded, X_original, y_original,
     preprocessor, feature_names, label_encoder, inverse_label_encoder,
     cat_values, full_df) = load_car_data(data_path, shuffle=shuffle, random_state=random_state)

    # Initialize KFold cross-validator
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)

    # Store results for each fold
    tstr_results_all = []
    jsd_results_all = []

    # Define latent dimension
    latent_dim = 100

    fold_results = {}

    # Run cross-validation
    for fold, (train_idx, test_idx) in enumerate(kf.split(X_transformed)):
        print(f"\n{'='*50}")
        print(f"Running {'Shuffled ' if shuffle else ''}Fold {fold+1}/{n_folds}")
        print(f"{'='*50}")

        # Split data for this fold
        X_train_fold, X_test_fold = X_transformed[train_idx], X_transformed[test_idx]
        y_train_fold, y_test_fold = y_encoded.iloc[train_idx], y_encoded.iloc[test_idx]
        X_original_train, X_original_test = X_original.iloc[train_idx], X_original.iloc[test_idx]

        # Create dataset and dataloader
        train_dataset = CarDataset(X_train_fold, y_train_fold)
        train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

        # Initialize CramerGAN model
        data_dim = X_train_fold.shape[1]
        print(f"Data dimension: {data_dim}")
        print(f"Latent dimension: {latent_dim}")

        crgan = CramerGAN(data_dim, latent_dim)

        # Train the model
        print(f"Training CramerGAN for {epochs} epochs...")
        crgan.train(train_loader, epochs, save_interval=20)

        # Save the model
        model_name = f"car_crgan_model_{'shuffled_' if shuffle else ''}fold_{fold+1}.pt"
        # crgan.save_model(model_name)

        # Plot loss curves
        # plot_loss_curves(crgan, fold=fold, shuffle=shuffle)

        # Generate synthetic data
        num_samples = 1000
        print(f"Generating {num_samples} synthetic samples...")
        synthetic_data_raw = crgan.generate_samples(num_samples)

        # Post-process the synthetic data
        synthetic_df = post_process_car_data(synthetic_data_raw, preprocessor, cat_values, feature_names)

        # Generate synthetic labels using a classifier trained on real data
        clf = RandomForestClassifier(n_estimators=100, random_state=42)
        clf.fit(X_train_fold, y_train_fold)
        synthetic_labels_raw = clf.predict(synthetic_data_raw)

        # Convert numeric labels to original class values
        synthetic_labels = [inverse_label_encoder[label] for label in synthetic_labels_raw]

        # Add class labels to the synthetic dataframe
        synthetic_df['class'] = synthetic_labels

        # Save the synthetic data
        # synthetic_df.to_csv(f'synthetic_car_data_{"shuffled_" if shuffle else ""}fold_{fold+1}.csv', index=False)

        # Plot distributions
        # plot_categorical_distributions(X_original_train, synthetic_df.drop('class', axis=1), fold=fold, shuffle=shuffle)

        # Plot class distribution
        # plot_class_distribution(y_original.iloc[train_idx], synthetic_df['class'], label_encoder, fold=fold, shuffle=shuffle)

        # Statistical similarity evaluation
        print("Evaluating statistical similarity...")
        stat_results = evaluate_categorical_similarity(X_original_train, synthetic_df.drop('class', axis=1))

        print("\nJensen-Shannon Divergence (average):", stat_results['JSD_avg'])
        print("\nJSD per feature:")
        for feature, jsd in stat_results['JSD'].items():
            print(f"  {feature}: {jsd:.4f}")

        # Machine Learning Utility (TSTR) evaluation
        print("\nEvaluating Machine Learning Utility (TSTR)...")
        tstr_results = evaluate_tstr(X_transformed, synthetic_data_raw, y_encoded)

        print("\nTSTR Results:")
        for clf_name, metrics in tstr_results.items():
            print(f"{clf_name}: Accuracy = {metrics['accuracy']:.4f}, F1 Score = {metrics['f1_score']:.4f}")

        # Store results for this fold
        fold_results[f"{'shuffled_' if shuffle else ''}fold_{fold+1}"] = {
            'tstr': tstr_results,
            'jsd': stat_results
        }

        # Collect results for averaging later
        tstr_results_all.append(tstr_results)
        jsd_results_all.append(stat_results)

    return fold_results, tstr_results_all, jsd_results_all

def aggregate_results(results_list, result_type='tstr'):
    """
    Aggregate results across all folds and runs
    """
    if result_type == 'tstr':
        # Initialize aggregate dictionary
        aggregate = {}

        # Get all classifier names from the first result
        classifiers = list(results_list[0].keys())

        # Initialize aggregate dictionary with metrics
        for clf in classifiers:
            aggregate[clf] = {'accuracy': [], 'f1_score': []}

        # Collect all results
        for result in results_list:
            for clf, metrics in result.items():
                aggregate[clf]['accuracy'].append(metrics['accuracy'])
                aggregate[clf]['f1_score'].append(metrics['f1_score'])

        # Calculate averages and standard deviations
        final_results = {}
        for clf, metrics in aggregate.items():
            final_results[clf] = {
                'accuracy_mean': np.mean(metrics['accuracy']),
                'accuracy_std': np.std(metrics['accuracy']),
                'f1_score_mean': np.mean(metrics['f1_score']),
                'f1_score_std': np.std(metrics['f1_score'])
            }

        return final_results

    elif result_type == 'jsd':
        # Initialize aggregate dictionary
        aggregate = {'JSD': {}, 'JSD_avg': []}

        # Get all feature names from the first result
        features = list(results_list[0]['JSD'].keys())

        # Initialize feature-specific JSD lists
        for feature in features:
            aggregate['JSD'][feature] = []

        # Collect all results
        for result in results_list:
            for feature, jsd in result['JSD'].items():
                aggregate['JSD'][feature].append(jsd)
            aggregate['JSD_avg'].append(result['JSD_avg'])

        # Calculate averages and standard deviations
        final_results = {'JSD': {}, 'JSD_avg_mean': np.mean(aggregate['JSD_avg']), 'JSD_avg_std': np.std(aggregate['JSD_avg'])}

        for feature, jsd_values in aggregate['JSD'].items():
            final_results['JSD'][feature] = {
                'mean': np.mean(jsd_values),
                'std': np.std(jsd_values)
            }

        return final_results

def main():
    # File path
    data_path = 'car.csv'
    n_folds = 5
    epochs = 300

    # Run cross-validation with original data order
    print("\n" + "="*80)
    print("RUNNING CROSS-VALIDATION WITH ORIGINAL DATA ORDER")
    print("="*80 + "\n")
    original_fold_results, original_tstr_results, original_jsd_results = run_cross_validation(
        data_path, shuffle=False, n_folds=n_folds, epochs=epochs
    )

    # Run cross-validation with shuffled data
    print("\n" + "="*80)
    print("RUNNING CROSS-VALIDATION WITH SHUFFLED DATA (SEED=123)")
    print("="*80 + "\n")
    shuffled_fold_results, shuffled_tstr_results, shuffled_jsd_results = run_cross_validation(
        data_path, shuffle=True, random_state=123, n_folds=n_folds, epochs=epochs
    )

    # Combine results from both runs
    all_tstr_results = original_tstr_results + shuffled_tstr_results
    all_jsd_results = original_jsd_results + shuffled_jsd_results

    # Aggregate results
    print("\n" + "="*80)
    print("AGGREGATING RESULTS ACROSS ALL 10 RUNS")
    print("="*80 + "\n")

    # Aggregate TSTR results
    agg_tstr_results = aggregate_results(all_tstr_results, result_type='tstr')

    print("\nAverage Machine Learning Utility (TSTR) Results:")
    print("-" * 80)
    print(f"{'Classifier':<20} | {'Accuracy Mean':<15} | {'Accuracy Std':<15} | {'F1 Score Mean':<15} | {'F1 Score Std':<15}")
    print("-" * 80)
    for clf_name, metrics in agg_tstr_results.items():
        print(f"{clf_name:<20} | {metrics['accuracy_mean']:.4f} ± {metrics['accuracy_std']:.4f} | " +
              f"{metrics['accuracy_std']:.4f} | {metrics['f1_score_mean']:.4f} | {metrics['f1_score_std']:.4f}")

    # Aggregate JSD results
    agg_jsd_results = aggregate_results(all_jsd_results, result_type='jsd')

    print("\nAverage Statistical Similarity Results (JSD):")
    print("-" * 60)
    print(f"Overall JSD: {agg_jsd_results['JSD_avg_mean']:.4f} ± {agg_jsd_results['JSD_avg_std']:.4f}")
    print("\nFeature-specific JSDs:")
    print(f"{'Feature':<15} | {'JSD Mean':<15} | {'JSD Std':<15}")
    print("-" * 60)
    for feature, metrics in agg_jsd_results['JSD'].items():
        print(f"{feature:<15} | {metrics['mean']:.4f} | {metrics['std']:.4f}")

    # Create and save summary results
    summary_results = {
        "Machine Learning Utility": agg_tstr_results,
        "Statistical Similarity": agg_jsd_results
    }

    # Save all results to disk
    import pickle
    with open('car_crgan_cross_validation_results.pkl', 'wb') as f:
        pickle.dump({
            'original_fold_results': original_fold_results,
            'shuffled_fold_results': shuffled_fold_results,
            'aggregated_results': summary_results
        }, f)

    # Create summary plots
    create_summary_plots(agg_tstr_results, agg_jsd_results)

    print("\nEvaluation complete! Check the output directory for plots and saved models.")

def create_summary_plots(tstr_results, jsd_results):
    """
    Create summary plots for the cross-validation results
    """
    # 1. Create accuracy bar plot
    plt.figure(figsize=(12, 6))
    classifiers = list(tstr_results.keys())
    accuracies = [tstr_results[clf]['accuracy_mean'] for clf in classifiers]
    errors = [tstr_results[clf]['accuracy_std'] for clf in classifiers]

    bar_positions = np.arange(len(classifiers))
    plt.bar(bar_positions, accuracies, yerr=errors, alpha=0.7, capsize=10)
    plt.xticks(bar_positions, classifiers, rotation=45)
    plt.title('Average Classification Accuracy Across 10 Runs')
    plt.xlabel('Classifier')
    plt.ylabel('Accuracy')
    plt.tight_layout()
    plt.savefig('car_crgan_avg_accuracy.png')
    plt.close()

    # 2. Create F1 score bar plot
    plt.figure(figsize=(12, 6))
    f1_scores = [tstr_results[clf]['f1_score_mean'] for clf in classifiers]
    f1_errors = [tstr_results[clf]['f1_score_std'] for clf in classifiers]

    plt.bar(bar_positions, f1_scores, yerr=f1_errors, alpha=0.7, capsize=10)
    plt.xticks(bar_positions, classifiers, rotation=45)
    plt.title('Average F1 Score Across 10 Runs')
    plt.xlabel('Classifier')
    plt.ylabel('F1 Score')
    plt.tight_layout()
    plt.savefig('car_crgan_avg_f1_score.png')
    plt.close()

    # 3. Create JSD bar plot
    plt.figure(figsize=(12, 6))
    features = list(jsd_results['JSD'].keys())
    jsds = [jsd_results['JSD'][feature]['mean'] for feature in features]
    jsd_errors = [jsd_results['JSD'][feature]['std'] for feature in features]

    bar_positions = np.arange(len(features))
    plt.bar(bar_positions, jsds, yerr=jsd_errors, alpha=0.7, capsize=10)
    plt.xticks(bar_positions, features, rotation=45)
    plt.title('Average Jensen-Shannon Divergence Across 10 Runs')
    plt.xlabel('Feature')
    plt.ylabel('JSD')
    plt.tight_layout()
    plt.savefig('car_crgan_avg_jsd.png')
    plt.close()

    # 4. Create boxplot of JSD values per feature
    plt.figure(figsize=(14, 8))

    # Prepare data for boxplot
    feature_jsd_values = []
    for feature in features:
        values = []
        for fold in range(10):  # 5 original + 5 shuffled
            fold_idx = fold // 5
            within_fold_idx = fold % 5
            if fold_idx == 0:
                # Original data
                fold_name = f"fold_{within_fold_idx+1}"
            else:
                # Shuffled data
                fold_name = f"shuffled_fold_{within_fold_idx+1}"
        feature_jsd_values.append(values)

    plt.boxplot(feature_jsd_values, labels=features)
    plt.title('Distribution of JSD Values Across All Runs')
    plt.xlabel('Feature')
    plt.ylabel('JSD')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('car_crgan_jsd_distribution.png')
    plt.close()


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/Katabatic/CrGAN/car/cross
Using device: cpu


In [2]:
if __name__ == "__main__":
    main()


RUNNING CROSS-VALIDATION WITH ORIGINAL DATA ORDER

Dataset shape: (1729, 7)
Processed data shape: (1729, 27)

Running Fold 1/5
Data dimension: 27
Latent dimension: 100
Training CramerGAN for 300 epochs...
Epoch [1/300] | Critic Loss: 3.8569 | Generator Loss: -0.5922 | Time: 5.71s
Epoch [21/300] | Critic Loss: -373.8811 | Generator Loss: 77.6456 | Time: 1.06s
Epoch [41/300] | Critic Loss: -227967.2472 | Generator Loss: 71002.2269 | Time: 1.04s
Epoch [61/300] | Critic Loss: -9044872.2273 | Generator Loss: 3013995.6136 | Time: 1.31s
Epoch [81/300] | Critic Loss: -108174624.7273 | Generator Loss: 40998244.9091 | Time: 1.13s
Epoch [101/300] | Critic Loss: -787329879.2727 | Generator Loss: 261220960.0000 | Time: 1.15s
Epoch [121/300] | Critic Loss: -3069637236.3636 | Generator Loss: 1215560768.0000 | Time: 1.14s
Epoch [141/300] | Critic Loss: -10387100997.8182 | Generator Loss: 4504132072.7273 | Time: 1.35s
Epoch [161/300] | Critic Loss: -36595777349.8182 | Generator Loss: 13410617157.8182 