In [None]:
# Attach Google Drive to this Colab runtime; the data files read below live under this mount.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive


In [None]:
import pandas as pd

# Quick look at the raw training file: whitespace-separated values, no header row.
file_path = '/content/drive/MyDrive/Katabatic/Data/shuttle/shuttle.trn'
# Raw string: `\s` must reach the regex engine untouched instead of being parsed
# as an (invalid) Python string escape, which newer interpreters warn about.
df = pd.read_csv(file_path, sep=r'\s+', header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,50,21,77,0,28,0,27,48,22,2
1,55,0,92,0,0,26,36,92,56,4
2,53,0,82,0,52,-5,29,30,2,1
3,37,0,76,0,28,18,40,48,8,1
4,37,0,79,0,34,-26,43,46,2,1


In [None]:

import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Data Preparation
class ShuttleDataset(Dataset):
    """Torch dataset over a whitespace-delimited, header-less table.

    Every column except the last is treated as a feature; the last column is
    the class label and is shifted down by one (1-7 becomes 0-6).

    Attributes:
        data: the raw table as read from ``file_path``.
        scaler: ``StandardScaler`` fitted on all feature rows of the file.
        features: standardized feature matrix (NumPy array).
        labels: zero-based labels (NumPy array).
    """

    def __init__(self, file_path):
        # sep=r'\s+' is the supported replacement for the deprecated
        # delim_whitespace=True (pandas emits a warning for the latter).
        self.data = pd.read_csv(file_path, sep=r'\s+', header=None)
        self.scaler = StandardScaler()

        # NOTE(review): the scaler is fitted on every row of the file, i.e. before
        # the train/test split performed later in this notebook - confirm that
        # letting test rows influence the scaling statistics is acceptable.
        raw_features = self.data.iloc[:, :-1].values
        self.features = self.scaler.fit_transform(raw_features)
        self.labels = self.data.iloc[:, -1].values - 1

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for row ``idx``.

        The features come back as a float tensor and the label as a
        one-element long tensor.
        """
        return torch.FloatTensor(self.features[idx]), torch.LongTensor([self.labels[idx]])

# Set the file path (adjust to your Google Drive folder structure)
file_path = "/content/drive/MyDrive/Katabatic/Data/shuttle/shuttle.trn"
dataset = ShuttleDataset(file_path)

# Seed the global RNGs so that a fresh "Restart & Run All" reproduces the same
# split, batch order, weight initialisation and noise samples.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Split the Dataset
# Use random_split to divide the dataset into train (80%) and test (20%);
# the dedicated generator pins the split independently of other RNG use.
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SEED),
)

# Alternatively, you can use train_test_split on the entire dataset features/labels
# X_train, X_test, y_train, y_test = train_test_split(dataset.features, dataset.labels, test_size=0.2, random_state=42)

# Create DataLoader for training
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Define Network Component
# Deeper Generator network
class Generator(nn.Module):
    """Fully connected network mapping latent noise vectors to synthetic feature rows.

    Args:
        input_dim: size of the latent (noise) vector.
        output_dim: number of features to generate per row.
        hidden_dims: widths of the hidden ReLU layers, in order. The default
            ``(256, 512, 256)`` builds the original architecture.
        output_activation: module applied after the last linear layer. ``None``
            (default) uses ``nn.Tanh()``, which bounds every output to [-1, 1].
            Pass e.g. ``nn.Identity()`` when the training data is not confined
            to that range (standardized features are not).
    """

    def __init__(self, input_dim, output_dim, hidden_dims=(256, 512, 256), output_activation=None):
        super().__init__()
        layers = []
        in_width = input_dim
        # Layers are appended in the same order as the original hand-written
        # Sequential, so parameter names (fc.0, fc.2, ...) are unchanged.
        for width in hidden_dims:
            layers.append(nn.Linear(in_width, width))
            layers.append(nn.ReLU())
            in_width = width
        layers.append(nn.Linear(in_width, output_dim))
        layers.append(nn.Tanh() if output_activation is None else output_activation)
        self.fc = nn.Sequential(*layers)

    def forward(self, z):
        """Map a batch of latent vectors ``z`` to generated rows."""
        return self.fc(z)

# Deeper Discriminator network
class Discriminator(nn.Module):
    """Fully connected binary classifier scoring rows as real (near 1) or fake (near 0).

    Args:
        input_dim: number of features per input row.
        hidden_dims: widths of the hidden ReLU layers, in order. The default
            ``(512, 256, 128)`` builds the original architecture.

    The final layer is a single unit followed by a sigmoid, so the output is a
    probability of shape ``(batch, 1)``.
    """

    def __init__(self, input_dim, hidden_dims=(512, 256, 128)):
        super().__init__()
        layers = []
        in_width = input_dim
        # Same layer order as the original hand-written Sequential, so
        # parameter names (fc.0, fc.2, ...) are unchanged.
        for width in hidden_dims:
            layers.append(nn.Linear(in_width, width))
            layers.append(nn.ReLU())
            in_width = width
        layers.append(nn.Linear(in_width, 1))
        layers.append(nn.Sigmoid())  # Output probability
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return the probability that each row of ``x`` is real."""
        return self.fc(x)

#Initialize Models
latent_dim = 100  # length of the noise vector fed to the generator
feature_dim = dataset.features.shape[1]  # one generator output per feature column
generator = Generator(input_dim=latent_dim, output_dim=feature_dim)
discriminator = Discriminator(input_dim=feature_dim)

#Train CRAMERGAN with Learning Rate Schedulers
def train_cramer_gan(generator, discriminator, dataloader, epochs=100, latent_dim=None):
    """Adversarially train ``generator`` against ``discriminator`` on the CPU.

    Despite the name, the objective implemented here is the standard GAN
    binary cross-entropy loss (the generator is trained to make the
    discriminator output "real"), not a Cramer distance. Both networks use
    Adam (lr=0.0002) with a StepLR schedule that halves the learning rate
    every 20 epochs.

    Args:
        generator: module mapping ``(batch, latent_dim)`` noise to fake rows.
        discriminator: module mapping rows to ``(batch, 1)`` probabilities.
        dataloader: iterable yielding ``(features, labels)`` batches; the
            labels are ignored.
        epochs: number of passes over ``dataloader``.
        latent_dim: size of the noise vector. When ``None`` it is taken from
            ``in_features`` of the first ``nn.Linear`` found in ``generator``,
            so the function does not rely on a module-level global.

    Returns:
        A list with one ``(loss_d, loss_g)`` tuple of floats per epoch, taken
        from the last batch of that epoch (the same values that are printed).

    Raises:
        ValueError: if ``latent_dim`` is not given and cannot be inferred, or
            if ``dataloader`` yields no batches during an epoch.
    """
    if latent_dim is None:
        first_linear = next(
            (m for m in generator.modules() if isinstance(m, nn.Linear)), None
        )
        if first_linear is None:
            raise ValueError("latent_dim was not given and could not be inferred from the generator")
        latent_dim = first_linear.in_features

    device = torch.device('cpu')  # Force CPU for stability
    generator.to(device)
    discriminator.to(device)
    # Make sure both networks are in training mode even if eval() was called earlier.
    generator.train()
    discriminator.train()

    optimizer_g = optim.Adam(generator.parameters(), lr=0.0002)
    optimizer_d = optim.Adam(discriminator.parameters(), lr=0.0002)
    # Learning rate schedulers: reduce LR every 20 epochs by half
    scheduler_g = optim.lr_scheduler.StepLR(optimizer_g, step_size=20, gamma=0.5)
    scheduler_d = optim.lr_scheduler.StepLR(optimizer_d, step_size=20, gamma=0.5)
    criterion = nn.BCELoss()

    history = []
    for epoch in range(epochs):
        loss_d = loss_g = None
        for real_data, _ in dataloader:
            real_data = real_data.to(device)
            batch_size = real_data.size(0)
            real_labels = torch.ones(batch_size, 1, device=device)
            fake_labels = torch.zeros(batch_size, 1, device=device)

            # Train Discriminator: real rows should score 1, detached fakes 0.
            optimizer_d.zero_grad()
            z = torch.randn(batch_size, latent_dim, device=device)
            fake_data = generator(z).detach()
            loss_real = criterion(discriminator(real_data), real_labels)
            loss_fake = criterion(discriminator(fake_data), fake_labels)
            loss_d = loss_real + loss_fake
            loss_d.backward()
            optimizer_d.step()

            # Train Generator: fresh fakes should be scored as real.
            optimizer_g.zero_grad()
            z = torch.randn(batch_size, latent_dim, device=device)
            fake_data = generator(z)
            loss_g = criterion(discriminator(fake_data), real_labels)
            loss_g.backward()
            optimizer_g.step()

        if loss_d is None:
            # Without this guard the print below would fail on unbound losses.
            raise ValueError("dataloader yielded no batches; nothing to train on")

        scheduler_g.step()
        scheduler_d.step()
        history.append((loss_d.item(), loss_g.item()))
        print(f"Epoch [{epoch+1}/{epochs}] - Loss D: {loss_d.item():.4f}, Loss G: {loss_g.item():.4f}")

    return history

# Fit the GAN using batches drawn from the training split only
train_cramer_gan(
    generator=generator,
    discriminator=discriminator,
    dataloader=train_dataloader,
    epochs=100,
)

# Generate Synthetic Data
def generate_synthetic_data(generator, num_samples, latent_dim):
    """Sample ``num_samples`` rows from ``generator`` on the CPU.

    The generator is switched to eval mode (and left there). Noise is drawn
    from a standard normal of width ``latent_dim`` and the generator output is
    returned unchanged, i.e. in whatever space the generator produces.

    Args:
        generator: module mapping ``(num_samples, latent_dim)`` noise to rows.
        num_samples: number of rows to generate.
        latent_dim: size of each noise vector.

    Returns:
        NumPy array holding the generator output for the sampled noise.
    """
    device = torch.device('cpu')
    generator.eval()
    z = torch.randn(num_samples, latent_dim, device=device)
    # Pure inference: skip autograd bookkeeping instead of building a graph
    # for every sample and detaching it afterwards.
    with torch.no_grad():
        synthetic_data = generator(z).cpu().numpy()
    return synthetic_data

# Draw one synthetic row per real training row, then map the generator output
# back to the original feature units with the scaler fitted by the dataset.
scaler = dataset.scaler
num_synthetic_samples = len(train_dataset)
synthetic_data = scaler.inverse_transform(
    generate_synthetic_data(generator, num_synthetic_samples, latent_dim)
)

# Benchmark: report how many synthetic rows were produced
print("Number of synthetic samples generated:", len(synthetic_data))

# Evaluate Models
# For classifier training, we will mix real and synthetic training data 50/50.
# Extract real training data (features and labels) from train_dataset
train_indices = train_dataset.indices  # These are the indices from the original dataset
X_train_real = dataset.features[train_indices]
y_train_real = np.array(dataset.labels)[train_indices]

# dataset.features (and therefore X_train_real and X_test) are standardized,
# whereas synthetic_data was inverse-transformed to original units above.
# Re-standardize it so both halves of the training matrix share one scale.
synthetic_scaled = dataset.scaler.transform(synthetic_data)

# The labels below are reused row-for-row, so the counts must match; this also
# catches a stale `synthetic_data` left over from running cells out of order.
assert len(synthetic_scaled) == len(y_train_real), "synthetic/real row counts differ"

# Combine synthetic data with real training data
# NOTE(review): the generator is unconditional, so giving synthetic row i the
# label of real row i pairs them by position only - confirm this is intended.
combined_data = np.concatenate([X_train_real, synthetic_scaled], axis=0)
combined_labels = np.concatenate([y_train_real, y_train_real], axis=0)

# Use test set from random_split for evaluation
test_indices = test_dataset.indices
X_test = dataset.features[test_indices]
y_test = np.array(dataset.labels)[test_indices]

# Initialize classifiers with tuned hyperparameters.
# The stochastic models get a fixed random_state so reruns give the same scores.
RANDOM_STATE = 42
models = {
    "Logistic Regression": LogisticRegression(max_iter=5000, solver='lbfgs', C=1.0),
    "MLP": MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=1000, random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE),
    # `use_label_encoder` is dropped: XGBoost reports it as an unused parameter.
    "XGBoost": XGBClassifier(
        eval_metric='mlogloss',
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        random_state=RANDOM_STATE,
    ),
}

# Fit each classifier on the combined real + synthetic rows and score it on
# the held-out real test rows.
results = []
print("\nClassifier Evaluation Results (trained on 50/50 combined data, tested on real test set):")
for name, model in models.items():
    model.fit(combined_data, combined_labels)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    results.append((name, acc))
    print(f"{name}: {acc:.4f}")

results_df = pd.DataFrame(results, columns=["Model", "Accuracy"])
print("\n", results_df)


  self.data = pd.read_csv(file_path, delim_whitespace=True, header=None)


Epoch [1/100] - Loss D: 0.4429, Loss G: 2.2246
Epoch [2/100] - Loss D: 0.4622, Loss G: 2.1171
Epoch [3/100] - Loss D: 0.6123, Loss G: 1.9669
Epoch [4/100] - Loss D: 0.8521, Loss G: 1.5285
Epoch [5/100] - Loss D: 1.2542, Loss G: 0.9948
Epoch [6/100] - Loss D: 0.9271, Loss G: 1.0488
Epoch [7/100] - Loss D: 1.1312, Loss G: 0.9343
Epoch [8/100] - Loss D: 0.9351, Loss G: 1.0368
Epoch [9/100] - Loss D: 1.0200, Loss G: 1.0565
Epoch [10/100] - Loss D: 1.0402, Loss G: 1.0515
Epoch [11/100] - Loss D: 1.0065, Loss G: 0.9572
Epoch [12/100] - Loss D: 1.0739, Loss G: 0.9508
Epoch [13/100] - Loss D: 1.0857, Loss G: 0.9819
Epoch [14/100] - Loss D: 0.8548, Loss G: 1.0965
Epoch [15/100] - Loss D: 1.0152, Loss G: 1.1284
Epoch [16/100] - Loss D: 1.0332, Loss G: 1.0676
Epoch [17/100] - Loss D: 0.8859, Loss G: 1.0870
Epoch [18/100] - Loss D: 1.0951, Loss G: 1.1138
Epoch [19/100] - Loss D: 0.9584, Loss G: 1.1423
Epoch [20/100] - Loss D: 0.8521, Loss G: 1.1208
Epoch [21/100] - Loss D: 0.9601, Loss G: 1.0708
E

Parameters: { "use_label_encoder" } are not used.



XGBoost: 0.9994

                  Model  Accuracy
0  Logistic Regression  0.840920
1                  MLP  0.998966
2        Random Forest  0.999425
3              XGBoost  0.999425


In [None]:
# --- Generate Synthetic Data and Save to CSV ---

# (Re)define the synthetic data generation function; this unconditionally replaces any earlier definition
def generate_synthetic_data(generator, num_samples, latent_dim):
    """Sample ``num_samples`` rows from ``generator`` on the CPU.

    The generator is switched to eval mode (and left there). Noise is drawn
    from a standard normal of width ``latent_dim`` and the generator output is
    returned unchanged, i.e. in whatever space the generator produces.

    Args:
        generator: module mapping ``(num_samples, latent_dim)`` noise to rows.
        num_samples: number of rows to generate.
        latent_dim: size of each noise vector.

    Returns:
        NumPy array holding the generator output for the sampled noise.
    """
    import torch
    device = torch.device('cpu')  # using CPU
    generator.eval()
    # Generate synthetic samples using random noise
    z = torch.randn(num_samples, latent_dim, device=device)
    # Pure inference: skip autograd bookkeeping instead of building a graph
    # for every sample and detaching it afterwards.
    with torch.no_grad():
        synthetic_data = generator(z).cpu().numpy()
    return synthetic_data

# Sample as many synthetic rows as the full real dataset has (change the count if desired),
# then map the generator output back to the original scale with the dataset's scaler.
num_synthetic_samples = len(dataset)
scaler = dataset.scaler
synthetic_data = scaler.inverse_transform(
    generate_synthetic_data(generator, num_synthetic_samples, latent_dim)
)

# Wrap the array in a DataFrame with one "Feature i" column per generated feature
import pandas as pd
feature_names = [f"Feature {i}" for i in range(synthetic_data.shape[1])]
synthetic_df = pd.DataFrame(synthetic_data, columns=feature_names)

# Show a preview and the row count
print("Synthetic Data Sample:")
print(synthetic_df.head())
print("Number of synthetic samples generated:", synthetic_df.shape[0])

# Persist the synthetic rows to Drive without the index column
csv_save_path = "/content/drive/MyDrive/Katabatic/Data/shuttle/synthetic_shuttle_data.csv"
synthetic_df.to_csv(csv_save_path, index=False)
print("Synthetic data saved to:", csv_save_path)


Synthetic Data Sample:
   Feature 0  Feature 1  Feature 2  Feature 3  Feature 4  Feature 5  \
0  43.970901  -2.194612  76.998077  -1.191187  44.705463   8.539552   
1  37.018997  -4.740286  76.980888   1.130539  36.164509  12.483049   
2  46.616024  -2.046196  78.349693  -1.913286  46.613731  11.427321   
3  36.798141  -5.676992  76.970856   1.376222  34.304890   0.064882   
4  46.290962  -1.709328  79.283478  -1.689941  45.675858   1.922915   

   Feature 6  Feature 7  Feature 8  
0  33.353672  32.578854   0.402071  
1  40.154476  40.740238   0.924166  
2  31.749687  31.007275   0.517446  
3  40.178055  42.646454   2.346477  
4  33.028225  32.559162   0.246216  
Number of synthetic samples generated: 43500
Synthetic data saved to: /content/drive/MyDrive/Katabatic/Data/shuttle/synthetic_shuttle_data.csv
