In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

# Adjust encoding if necessary, e.g., 'latin1'
file_path = "/content/drive/MyDrive/Katabatic/Data/Magic/magic_gamma.csv"
df = pd.read_csv(file_path, encoding="ISO-8859-1")

# 3. Check the loaded data
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
df.head()

Shape: (19020, 11)
Columns: ['fLength', 'fWidth', 'fSize', 'fConc', 'fConc1', 'fAsym', 'fM3Long', 'fM3Trans', 'fAlpha', 'fDist', 'class']


Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,g
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,g
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,g
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,g
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,g


In [None]:

import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# --------------------- 1. Data Preparation for Magic Dataset ---------------------
class MagicDataset(Dataset):
    def __init__(self, file_path):
        # Load the dataset (assuming the file has a header)
        self.data = pd.read_csv(file_path, encoding="ISO-8859-1")
        # Define the numeric feature columns and target column; adjust names if needed.
        numeric_columns = ["fLength", "fWidth", "fSize", "fConc", "fConc1",
                           "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist"]
        target_column = "class"

        # Drop rows with missing values in selected columns
        self.data = self.data[numeric_columns + [target_column]].dropna()

        # Process numeric features: scale them using StandardScaler
        self.features = self.data[numeric_columns].values
        self.scaler = StandardScaler()
        self.features = self.scaler.fit_transform(self.features)

        # Process target: map 'h' to 0 and 'g' to 1 (or vice versa)
        self.data[target_column] = self.data[target_column].map({"h": 0, "g": 1})
        self.labels = self.data[target_column].values

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return torch.FloatTensor(self.features[idx]), torch.LongTensor([self.labels[idx]])

# Set the Magic dataset file path (adjust if needed)
file_path_magic = "/content/drive/MyDrive/Katabatic/Data/Magic/magic_gamma.csv"
dataset_magic = MagicDataset(file_path_magic)

# --------------------- 2. Split the Dataset ---------------------
# Split the dataset into train (80%) and test (20%) subsets using random_split
train_size = int(0.8 * len(dataset_magic))
test_size = len(dataset_magic) - train_size
train_dataset, test_dataset = random_split(dataset_magic, [train_size, test_size])

# Create DataLoader for training
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# --------------------- 3. Define CRGAN Network Components ---------------------
# Generator: maps a 100-dim latent vector to 10-dimensional synthetic features
class Generator(nn.Module):
    def __init__(self, latent_dim, output_dim):
        super(Generator, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(latent_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, output_dim),
            nn.Tanh()  # Tanh squashes output to [-1,1]
        )
    def forward(self, z):
        return self.fc(z)

# Discriminator: distinguishes real from synthetic data
class Discriminator(nn.Module):
    def __init__(self, input_dim):
        super(Discriminator, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid()  # Outputs probability between 0 and 1
        )
    def forward(self, x):
        return self.fc(x)

# For Magic, output dimension equals the number of numeric features (10)
output_dim_magic = dataset_magic.features.shape[1]

# --------------------- 4. Initialize Models ---------------------
latent_dim = 100
generator_magic = Generator(latent_dim, output_dim_magic)
discriminator_magic = Discriminator(output_dim_magic)

# --------------------- 5. Train CRAMERGAN ---------------------
def train_cramer_gan(generator, discriminator, dataloader, epochs=100):
    device = torch.device('cpu')  # Using CPU for stability in Colab
    generator.to(device)
    discriminator.to(device)

    optimizer_g = optim.Adam(generator.parameters(), lr=0.0002)
    optimizer_d = optim.Adam(discriminator.parameters(), lr=0.0002)
    # Learning rate schedulers: reduce LR every 20 epochs by half
    scheduler_g = optim.lr_scheduler.StepLR(optimizer_g, step_size=20, gamma=0.5)
    scheduler_d = optim.lr_scheduler.StepLR(optimizer_d, step_size=20, gamma=0.5)
    criterion = nn.BCELoss()

    for epoch in range(epochs):
        for real_data, _ in dataloader:
            real_data = real_data.to(device)
            batch_size = real_data.size(0)

            # Train Discriminator
            optimizer_d.zero_grad()
            z = torch.randn(batch_size, latent_dim).to(device)
            fake_data = generator(z).detach()
            real_labels = torch.ones(batch_size, 1).to(device)
            fake_labels = torch.zeros(batch_size, 1).to(device)
            loss_real = criterion(discriminator(real_data), real_labels)
            loss_fake = criterion(discriminator(fake_data), fake_labels)
            loss_d = loss_real + loss_fake
            loss_d.backward()
            optimizer_d.step()

            # Train Generator
            optimizer_g.zero_grad()
            z = torch.randn(batch_size, latent_dim).to(device)
            fake_data = generator(z)
            loss_g = criterion(discriminator(fake_data), real_labels)
            loss_g.backward()
            optimizer_g.step()

        scheduler_g.step()
        scheduler_d.step()
        print(f"Epoch [{epoch+1}/100] - Loss D: {loss_d.item():.4f}, Loss G: {loss_g.item():.4f}")

# Train CRAMERGAN on the training set of the Magic dataset
train_cramer_gan(generator_magic, discriminator_magic, train_dataloader, epochs=100)

# --------------------- 6. Generate Synthetic Data ---------------------
def generate_synthetic_data(generator, num_samples, latent_dim):
    device = torch.device('cpu')
    generator.eval()
    z = torch.randn(num_samples, latent_dim).to(device)
    synthetic_data = generator(z).detach().cpu().numpy()
    return synthetic_data

# Generate synthetic samples equal to the number of training samples
num_synthetic_samples = len(train_dataset)
synthetic_data = generate_synthetic_data(generator_magic, num_synthetic_samples, latent_dim)

# Inverse transform synthetic data back to original scale
scaler_magic = dataset_magic.scaler
synthetic_data = scaler_magic.inverse_transform(synthetic_data)

# Benchmark: Print the number of synthetic samples generated
print("Number of synthetic samples generated:", synthetic_data.shape[0])

# --------------------- 7. Evaluate Models ---------------------
# Extract training data from train_dataset indices
train_indices = train_dataset.indices
X_train_real = dataset_magic.features[train_indices]
y_train_real = np.array(dataset_magic.labels)[train_indices]

# Extract test data from test_dataset indices
test_indices = test_dataset.indices
X_test = dataset_magic.features[test_indices]
y_test = np.array(dataset_magic.labels)[test_indices]

# Train classifiers on synthetic data only and evaluate on real test data
models = {
    "Logistic Regression": LogisticRegression(max_iter=5000, solver='lbfgs', C=1.0),
    "MLP": MLPClassifier(hidden_layer_sizes=(128,64), max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=200),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss',
                             n_estimators=200, max_depth=6, learning_rate=0.1)
}

results = []
print("\nClassifier Evaluation Results (trained on synthetic data, tested on real test set):")
for name, model in models.items():
    model.fit(synthetic_data, y_train_real)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    results.append((name, acc))
    print(f"{name}: {acc:.4f}")

results_df = pd.DataFrame(results, columns=["Model", "Accuracy"])
print("\n", results_df)

# --------------------- 8. Visualize & Save Synthetic Data ---------------------
# Use the original Magic dataset feature names for synthetic data (numeric columns)
magic_feature_names = ["fLength", "fWidth", "fSize", "fConc", "fConc1",
                       "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist"]

synthetic_df = pd.DataFrame(synthetic_data, columns=magic_feature_names)
print("\nSynthetic Data Sample:")
print(synthetic_df.head())

# Save synthetic data to a CSV file in your Magic folder on Google Drive
csv_save_path = "/content/drive/MyDrive/Katabatic/Data/Magic/synthetic_magic_data.csv"
synthetic_df.to_csv(csv_save_path, index=False)
print("\nSynthetic data saved to:", csv_save_path)


Epoch [1/100] - Loss D: 0.2493, Loss G: 2.2478
Epoch [2/100] - Loss D: 0.0608, Loss G: 4.3491
Epoch [3/100] - Loss D: 0.4379, Loss G: 1.2295
Epoch [4/100] - Loss D: 0.0128, Loss G: 4.5997
Epoch [5/100] - Loss D: 0.1202, Loss G: 5.5813
Epoch [6/100] - Loss D: 0.2027, Loss G: 4.0505
Epoch [7/100] - Loss D: 0.0515, Loss G: 4.2021
Epoch [8/100] - Loss D: 0.0274, Loss G: 5.4164
Epoch [9/100] - Loss D: 0.0025, Loss G: 6.0861
Epoch [10/100] - Loss D: 0.0196, Loss G: 6.2147
Epoch [11/100] - Loss D: 1.2564, Loss G: 0.6883
Epoch [12/100] - Loss D: 0.5558, Loss G: 1.6003
Epoch [13/100] - Loss D: 0.1673, Loss G: 3.8829
Epoch [14/100] - Loss D: 0.1650, Loss G: 3.0473
Epoch [15/100] - Loss D: 0.5108, Loss G: 1.6684
Epoch [16/100] - Loss D: 0.6595, Loss G: 1.4582
Epoch [17/100] - Loss D: 0.3557, Loss G: 2.1695
Epoch [18/100] - Loss D: 0.3358, Loss G: 2.7670
Epoch [19/100] - Loss D: 0.2566, Loss G: 3.5754
Epoch [20/100] - Loss D: 0.4606, Loss G: 1.7125
Epoch [21/100] - Loss D: 0.1973, Loss G: 2.4012
E

Parameters: { "use_label_encoder" } are not used.



XGBoost: 0.6506

                  Model  Accuracy
0  Logistic Regression  0.556782
1                  MLP  0.411935
2        Random Forest  0.653523
3              XGBoost  0.650631

Synthetic Data Sample:
     fLength     fWidth     fSize     fConc    fConc1      fAsym    fM3Long  \
0  39.807240  20.592281  2.960844  0.256464  0.131630  22.458548  22.026249   
1  28.809511  14.938609  2.694967  0.373041  0.215241 -21.046764  -7.962193   
2  33.969578  11.372438  2.497298  0.508130  0.290009  14.294617   5.889857   
3  19.964911  12.558800  2.388766  0.555166  0.319020 -15.303226  10.633015   
4  50.464302  15.097687  2.715757  0.334264  0.191940  36.230103  38.131626   

    fM3Trans     fAlpha       fDist  
0 -19.546249  29.520426  119.089584  
1  16.503553   3.290772  144.310562  
2  -8.085105  18.832630  206.143921  
3  16.218618   5.489205  127.356644  
4   4.624037  20.313755  224.030151  

Synthetic data saved to: /content/drive/MyDrive/Katabatic/Data/Magic/synthetic_magic_data