In [4]:
import os
import json
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import seaborn as sns
import joblib

# === Config ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
latent_dim = 64
batch_size = 128
lr = 1e-4
gan_epochs = 100
ids_epochs = 10
feature_file = "features_used.json"
gan_model_path = "mobilegan_generator_stable.pth"
synthetic_csv = "synthetic_attacks.csv"
torchscript_path = "ids_cnn_edgegensec.pt"

# === Load and clean dataset ===
df = pd.read_csv(r"CICIDS2017_Multiclass_Balanced_5k.csv")
df.columns = df.columns.str.strip()
df = df.dropna(axis=1, how='all')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.fillna(0, inplace=True)

# Encode labels
label_encoder = LabelEncoder()
df['Label'] = label_encoder.fit_transform(df['Label'].astype(str))
benign_label = label_encoder.transform(["BENIGN"])[0]

# Drop non-numeric columns except Label
non_numerics = df.select_dtypes(include=['object']).columns
df = df.drop(non_numerics.difference(['Label']), axis=1)

# === Feature and label split ===
X = df.drop('Label', axis=1).values
y = df['Label'].values
features = df.drop('Label', axis=1).columns.tolist()

# Save features
with open(feature_file, "w") as f:
    json.dump(features, f)

# === Normalize and split ===
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
joblib.dump(scaler, "scaler.pkl")

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, stratify=y, random_state=42)
X_attack = X_train[y_train != benign_label]  # Only attack flows

# === MobileGAN Models ===
class Generator(nn.Module):
    def __init__(self, feature_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(latent_dim, 128),
            nn.ReLU(True),
            nn.Linear(128, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(True),
            nn.Linear(256, feature_dim),
        )

    def forward(self, z):
        return self.model(z)

class Discriminator(nn.Module):
    def __init__(self, feature_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(feature_dim, 256),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)

# === Train GAN ===
feature_dim = X_attack.shape[1]
X_attack_tensor = torch.tensor(X_attack, dtype=torch.float32)
train_loader = DataLoader(TensorDataset(X_attack_tensor), batch_size=batch_size, shuffle=True)

generator = Generator(feature_dim).to(device)
discriminator = Discriminator(feature_dim).to(device)
criterion = nn.BCELoss()
optimizer_G = optim.Adam(generator.parameters(), lr=lr)
optimizer_D = optim.Adam(discriminator.parameters(), lr=lr)

print("🚀 Training GAN on attack data...")
for epoch in range(gan_epochs):
    for real_batch, in train_loader:
        real_batch = real_batch.to(device)
        b = real_batch.size(0)

        real_labels = torch.ones(b, 1).uniform_(0.9, 1.0).to(device)
        fake_labels = torch.zeros(b, 1).uniform_(0.0, 0.1).to(device)

        z = torch.randn(b, latent_dim).to(device)
        fake_data = generator(z)

        d_loss = criterion(discriminator(real_batch), real_labels) + \
                 criterion(discriminator(fake_data.detach()), fake_labels)
        optimizer_D.zero_grad()
        d_loss.backward()
        optimizer_D.step()

        z = torch.randn(b, latent_dim).to(device)
        g_loss = criterion(discriminator(generator(z)), real_labels)
        optimizer_G.zero_grad()
        g_loss.backward()
        optimizer_G.step()

    print(f"[Epoch {epoch+1}/{gan_epochs}] D Loss: {d_loss.item():.4f}, G Loss: {g_loss.item():.4f}")

torch.save(generator.state_dict(), gan_model_path)
print(f"✅ GAN Generator saved to {gan_model_path}")

# === Generate synthetic attacks ===
def generate_synthetic_samples(generator, n=1000):
    generator.eval()
    with torch.no_grad():
        z = torch.randn(n, latent_dim).to(device)
        return generator(z).cpu().numpy()




🚀 Training GAN on attack data...
[Epoch 1/100] D Loss: 1.2886, G Loss: 0.9266
[Epoch 2/100] D Loss: 1.2572, G Loss: 1.0055
[Epoch 3/100] D Loss: 1.2176, G Loss: 0.9975
[Epoch 4/100] D Loss: 1.3333, G Loss: 0.8921
[Epoch 5/100] D Loss: 1.2326, G Loss: 1.1107
[Epoch 6/100] D Loss: 1.1786, G Loss: 0.9331
[Epoch 7/100] D Loss: 1.2217, G Loss: 0.9130
[Epoch 8/100] D Loss: 1.2704, G Loss: 0.9149
[Epoch 9/100] D Loss: 1.0895, G Loss: 1.0676
[Epoch 10/100] D Loss: 1.1582, G Loss: 1.0705
[Epoch 11/100] D Loss: 1.1698, G Loss: 1.0283
[Epoch 12/100] D Loss: 1.1342, G Loss: 0.9903
[Epoch 13/100] D Loss: 1.0597, G Loss: 1.1789
[Epoch 14/100] D Loss: 1.0322, G Loss: 1.2528
[Epoch 15/100] D Loss: 1.0615, G Loss: 1.1298
[Epoch 16/100] D Loss: 1.0078, G Loss: 1.1588
[Epoch 17/100] D Loss: 1.0040, G Loss: 1.3260
[Epoch 18/100] D Loss: 1.0890, G Loss: 1.2601
[Epoch 19/100] D Loss: 0.9194, G Loss: 1.3971
[Epoch 20/100] D Loss: 0.9379, G Loss: 1.3283
[Epoch 21/100] D Loss: 0.9752, G Loss: 1.2962
[Epoch 22/

In [5]:
synthetic_attacks = generate_synthetic_samples(generator, n=1000)
pd.DataFrame(synthetic_attacks).to_csv(synthetic_csv, index=False)
print(f"✅ Synthetic attacks saved to {synthetic_csv}")

✅ Synthetic attacks saved to synthetic_attacks.csv


In [None]:
# === ONLY NEW IMPORTS ===
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import joblib

# === Load and Preprocess CSV ===
df = pd.read_csv("CICIDS2017_Multiclass_Balanced_5k.csv")
df.columns = df.columns.str.strip()
df = df.dropna(axis=1, how='all')
df = df.loc[:, ~df.columns.str.contains("^Unnamed")]
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.fillna(0, inplace=True)

# === Label Encoding ===
label_encoder = joblib.load("label_encoder.pkl")
df['Label'] = label_encoder.transform(df['Label'].astype(str))

# === Drop non-numeric columns ===
non_numerics = df.select_dtypes(include=['object']).columns
df = df.drop(non_numerics.difference(['Label']), axis=1)

# === Synthetic Data Generation Using Gaussian Noise ===
def generate_synthetic_data(real_df, label, target_size):
    real_samples = real_df[real_df['Label'] == label].drop("Label", axis=1).values
    if len(real_samples) == 0:
        raise ValueError("No real samples found for the label {}".format(label))
    
    mu = real_samples.mean(axis=0)
    sigma = real_samples.std(axis=0)
    synthetic_data = np.random.normal(loc=mu, scale=sigma, size=(target_size, real_samples.shape[1]))
    df_synth = pd.DataFrame(synthetic_data, columns=real_df.columns[:-1])
    df_synth['Label'] = label_encoder.transform(["SYNTHETIC"])[0] if "SYNTHETIC" in label_encoder.classes_ else df['Label'].max() + 1
    return df_synth

# === Generate and Combine Synthetic Data ===
synth_df = generate_synthetic_data(df, label=df['Label'].mode()[0], target_size=5000)
df_combined = pd.concat([df, synth_df], ignore_index=True)

# === Feature/Label separation and standardization ===
feature_cols = df_combined.drop("Label", axis=1).columns.tolist()
X = df_combined[feature_cols].values
y = df_combined["Label"].values
input_len = len(feature_cols)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
joblib.dump(scaler, "scaler.pkl")
with open("features_list.pkl", "wb") as f:
    joblib.dump(feature_cols, f)

# === Train/Test Split ===
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, stratify=y, random_state=42)

# === Torch Dataloaders ===
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).unsqueeze(1)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

train_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size=128, shuffle=True)
test_loader = DataLoader(TensorDataset(X_test_tensor, y_test_tensor), batch_size=128)

# === Multiclass CNN Architecture ===
class CNNMulticlassIDS(nn.Module):
    def __init__(self, input_len, num_classes):
        super().__init__()
        self.network = nn.Sequential(
            nn.Conv1d(1, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Conv1d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Flatten(),
            nn.Linear((input_len // 4) * 64, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        return self.network(x)

# === Training ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_classes = len(np.unique(y))
model = CNNMulticlassIDS(input_len=input_len, num_classes=num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005)
epochs = 15

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        out = model(xb)
        loss = criterion(out, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")

# === Evaluation ===
model.eval()
all_preds = []
all_probs = []

with torch.no_grad():
    for xb, yb in test_loader:
        xb = xb.to(device)
        logits = model(xb)
        probs = torch.softmax(logits, dim=1)
        preds = torch.argmax(probs, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_probs.extend(probs.cpu().numpy())

print("\n\U0001F4CA Multiclass IDS Evaluation:")
print(classification_report(y_test, all_preds))
print("Confusion Matrix:")
print(confusion_matrix(y_test, all_preds))

# === Save TorchScript model ===
example_input = torch.rand(1, 1, input_len).to(device)
traced_model = torch.jit.trace(model, example_input)
traced_model.save("ids_cnn_multiclass.pt")
print("✅ Saved: ids_cnn_multiclass.pt")


Epoch 1/15, Loss: 63.0885
Epoch 2/15, Loss: 35.4427
Epoch 3/15, Loss: 29.6421
Epoch 4/15, Loss: 24.0962
Epoch 5/15, Loss: 21.2017
Epoch 6/15, Loss: 19.7634
Epoch 7/15, Loss: 18.9510
Epoch 8/15, Loss: 18.6097
Epoch 9/15, Loss: 17.4554
Epoch 10/15, Loss: 17.4702
Epoch 11/15, Loss: 16.9774
Epoch 12/15, Loss: 16.5780
Epoch 13/15, Loss: 15.7725
Epoch 14/15, Loss: 15.5160
Epoch 15/15, Loss: 14.8289

📊 IDS Classifier Evaluation Report:
              precision    recall  f1-score   support

         0.0     0.9833    0.8850    0.9316       800
         1.0     0.9869    0.9983    0.9926      6957

    accuracy                         0.9866      7757
   macro avg     0.9851    0.9416    0.9621      7757
weighted avg     0.9866    0.9866    0.9863      7757

ROC-AUC Score: 0.9927
Confusion Matrix:
[[ 708   92]
 [  12 6945]]
✅ TorchScript model saved as 'ids_cnn_edgegensec.pt'
