In [1]:
# === Core Python / System ===
import os

# === Data Handling ===
import pandas as pd
import numpy as np

# === Scikit-learn: Preprocessing, Splitting, Evaluation ===
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

# === PyTorch: Core, Models, Optim, Utils ===
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset


In [2]:
# === PyTorch: Set device ===
# Load dataset
df = pd.read_csv("Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv")

# Strip whitespace from column names
df.columns = df.columns.str.strip()

# Drop columns with all NaNs or unnamed indices
df = df.dropna(axis=1, how='all')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# Replace inf/-inf with NaN, then fill NaNs with 0
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.fillna(0, inplace=True)

# Encode labels: 1 for attack, 0 for normal
df['Label'] = df['Label'].apply(lambda x: 0 if 'BENIGN' in x else 1)

# Drop non-numeric/categorical columns if any
non_numerics = df.select_dtypes(include=['object']).columns
df = df.drop(non_numerics.difference(['Label']), axis=1)

# Separate features and labels
X = df.drop('Label', axis=1).values
y = df['Label'].values

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

print("Training samples:", X_train.shape[0])
print("Feature dimension:", X_train.shape[1])


Training samples: 180596
Feature dimension: 78


In [3]:
# Create PyTorch datasets
# Extract attack samples (label = 1)
X_attack = X_train[y_train == 1]

print(f"Attack sample count: {X_attack.shape[0]}")
print(f"Feature dimension: {X_attack.shape[1]}")


Attack sample count: 102283
Feature dimension: 78


In [4]:
# === MobileGAN: A PyTorch Implementation for Generating Synthetic Attack Data ===
# Config
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
latent_dim = 64
feature_dim = 78
batch_size = 128
lr = 0.0001
epochs = 100

# ===== Generator =====
class Generator(nn.Module):
    def __init__(self):
        super(Generator, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(latent_dim, 128),
            nn.ReLU(True),
            nn.Linear(128, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(True),
            nn.Linear(256, feature_dim),
        )

    def forward(self, z):
        return self.model(z)

# ===== Discriminator =====
class Discriminator(nn.Module):
    def __init__(self):
        super(Discriminator, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(feature_dim, 256),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)

# ===== Load and prepare attack data =====
X_attack_tensor = torch.tensor(X_attack, dtype=torch.float32)
train_loader = DataLoader(TensorDataset(X_attack_tensor), batch_size=batch_size, shuffle=True)

# ===== Initialize models and training utils =====
generator = Generator().to(device)
discriminator = Discriminator().to(device)
criterion = nn.BCELoss()
optimizer_G = optim.Adam(generator.parameters(), lr=lr)
optimizer_D = optim.Adam(discriminator.parameters(), lr=lr)

# ===== Training Loop =====
for epoch in range(epochs):
    for real_batch, in train_loader:
        real_batch = real_batch.to(device)
        batch_size_curr = real_batch.size(0)

        # Real and fake label smoothing
        real_labels = torch.ones(batch_size_curr, 1).uniform_(0.9, 1.0).to(device)
        fake_labels = torch.zeros(batch_size_curr, 1).uniform_(0.0, 0.1).to(device)

        # === Train Discriminator ===
        z = torch.randn(batch_size_curr, latent_dim).to(device)
        fake_data = generator(z)

        d_real = discriminator(real_batch)
        d_fake = discriminator(fake_data.detach())

        loss_real = criterion(d_real, real_labels)
        loss_fake = criterion(d_fake, fake_labels)
        d_loss = loss_real + loss_fake

        optimizer_D.zero_grad()
        d_loss.backward()
        nn.utils.clip_grad_norm_(discriminator.parameters(), 1.0)
        optimizer_D.step()

        # === Train Generator ===
        z = torch.randn(batch_size_curr, latent_dim).to(device)
        fake_data = generator(z)
        g_loss = criterion(discriminator(fake_data), real_labels)

        optimizer_G.zero_grad()
        g_loss.backward()
        nn.utils.clip_grad_norm_(generator.parameters(), 1.0)
        optimizer_G.step()

    print(f"Epoch [{epoch+1}/{epochs}] | D Loss: {d_loss.item():.4f} | G Loss: {g_loss.item():.4f}")

# ===== Save the trained generator =====
torch.save(generator.state_dict(), "mobilegan_generator_stable.pth")

# ===== Function to generate synthetic samples =====
def generate_synthetic_samples(generator, num_samples=1000):
    generator.eval()
    with torch.no_grad():
        z = torch.randn(num_samples, latent_dim).to(device)
        synthetic_data = generator(z)
    return synthetic_data.cpu().numpy()

# ===== Generate and Save Synthetic Attack Data =====
synthetic_attacks = generate_synthetic_samples(generator, num_samples=1000)
df_synthetic = pd.DataFrame(synthetic_attacks)
df_synthetic.to_csv("synthetic_attacks.csv", index=False)
print(" Synthetic attack data saved as 'synthetic_attacks.csv'")


Epoch [1/100] | D Loss: 1.2838 | G Loss: 0.9839
Epoch [2/100] | D Loss: 1.0026 | G Loss: 0.9653
Epoch [3/100] | D Loss: 1.0443 | G Loss: 0.9967
Epoch [4/100] | D Loss: 1.0955 | G Loss: 1.3717
Epoch [5/100] | D Loss: 1.2105 | G Loss: 0.8957
Epoch [6/100] | D Loss: 1.0916 | G Loss: 1.1129
Epoch [7/100] | D Loss: 1.1721 | G Loss: 1.1737
Epoch [8/100] | D Loss: 1.2574 | G Loss: 1.2661
Epoch [9/100] | D Loss: 1.0849 | G Loss: 1.2260
Epoch [10/100] | D Loss: 1.1224 | G Loss: 1.2387
Epoch [11/100] | D Loss: 0.8911 | G Loss: 1.4563
Epoch [12/100] | D Loss: 0.8432 | G Loss: 1.9138
Epoch [13/100] | D Loss: 0.9891 | G Loss: 1.1659
Epoch [14/100] | D Loss: 1.0835 | G Loss: 1.1176
Epoch [15/100] | D Loss: 0.8273 | G Loss: 1.3246
Epoch [16/100] | D Loss: 0.8162 | G Loss: 1.6889
Epoch [17/100] | D Loss: 0.9587 | G Loss: 2.5905
Epoch [18/100] | D Loss: 0.8299 | G Loss: 0.9611
Epoch [19/100] | D Loss: 0.8508 | G Loss: 1.4989
Epoch [20/100] | D Loss: 1.1027 | G Loss: 1.4324
Epoch [21/100] | D Loss: 1.07

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

# Load data
real_attacks = X_attack
synthetic_attacks = pd.read_csv("synthetic_attacks.csv").values

# Balance real and synthetic samples
min_len = min(len(real_attacks), len(synthetic_attacks))
real_balanced = real_attacks[:min_len]
synthetic_balanced = synthetic_attacks[:min_len]

# Assign labels
real_labels = np.ones((min_len, 1))
synthetic_labels = np.zeros((min_len, 1))

# Combine for evaluation
X_eval = np.vstack([real_balanced, synthetic_balanced])
y_eval = np.vstack([real_labels, synthetic_labels]).flatten()

# === PCA + t-SNE Visualization ===
def plot_visualizations(X_data, y_data, title_suffix=""):
    # PCA
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_data)
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=y_data, palette=["blue", "red"], alpha=0.6)
    plt.title(f"PCA Visualization: Real (1) vs Synthetic (0) {title_suffix}")
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.legend(title="Label")
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # t-SNE
    tsne = TSNE(n_components=2, perplexity=30, learning_rate=200, random_state=42)
    X_tsne = tsne.fit_transform(X_data)
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=X_tsne[:, 0], y=X_tsne[:, 1], hue=y_data, palette=["blue", "red"], alpha=0.6)
    plt.title(f"t-SNE Visualization: Real (1) vs Synthetic (0) {title_suffix}")
    plt.xlabel("t-SNE Dim 1")
    plt.ylabel("t-SNE Dim 2")
    plt.legend(title="Label")
    plt.grid(True)
    plt.tight_layout()
    plt.show()

plot_visualizations(X_eval, y_eval, "(Attack Vectors)")

# Stratified train-test split
X_train_eval, X_test_eval, y_train_eval, y_test_eval = train_test_split(
    X_eval, y_eval, test_size=0.2, random_state=42, stratify=y_eval
)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train_eval, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_eval, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test_eval, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_eval, dtype=torch.float32).view(-1, 1)

# DataLoader
eval_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size=128, shuffle=True)

# Discriminator Model
class Discriminator(nn.Module):
    def __init__(self, input_dim=78):
        super(Discriminator, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)

# Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
disc_eval = Discriminator().to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(disc_eval.parameters(), lr=0.0001)

# Train discriminator
epochs = 10
for epoch in range(epochs):
    disc_eval.train()
    epoch_loss = 0
    for xb, yb in eval_loader:
        xb, yb = xb.to(device), yb.to(device)
        pred = disc_eval(xb)
        loss = criterion(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}")

# Evaluate
disc_eval.eval()
with torch.no_grad():
    preds = disc_eval(X_test_tensor.to(device)).cpu().numpy().flatten()
    preds_binary = (preds >= 0.5).astype(int)

# Print metrics
print("\n📊 Discriminator Evaluation Report:")
print(classification_report(y_test_eval, preds_binary, digits=4))
print(f"ROC-AUC Score: {roc_auc_score(y_test_eval, preds):.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_eval, preds_binary))


Epoch 1/10, Loss: 8.9143
Epoch 2/10, Loss: 8.7138
Epoch 3/10, Loss: 8.5955
Epoch 4/10, Loss: 8.5333
Epoch 5/10, Loss: 8.4487
Epoch 6/10, Loss: 8.4785
Epoch 7/10, Loss: 8.4330
Epoch 8/10, Loss: 8.4304
Epoch 9/10, Loss: 8.3908
Epoch 10/10, Loss: 8.3658

📊 Discriminator Evaluation Report:
              precision    recall  f1-score   support

         0.0     0.6126    0.7750    0.6843       200
         1.0     0.6939    0.5100    0.5879       200

    accuracy                         0.6425       400
   macro avg     0.6533    0.6425    0.6361       400
weighted avg     0.6533    0.6425    0.6361       400

ROC-AUC Score: 0.7461

Confusion Matrix:
[[155  45]
 [ 98 102]]
