In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import os
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset


In [2]:

# === Step 1: Load and Sample the Dataset ===
# Tunable inputs for the preprocessing cell.
INPUT_FILE = "Dataset.csv"            # raw CSV read below
OUTPUT_FILE = "processed_data.csv"    # destination of the processed frame
SAMPLE_SIZE = 100_000                 # upper bound on the number of rows kept
RANDOM_SEED = 42                      # fixes which rows the sample picks

print("[INFO] Loading dataset...")
df = pd.read_csv(INPUT_FILE)

# DataFrame.sample draws without replacement by default and raises ValueError
# when asked for more rows than exist, so cap the request at what is available.
n_rows = min(SAMPLE_SIZE, len(df))
if n_rows < SAMPLE_SIZE:
    print(f"[WARN] Dataset has only {len(df)} rows; using all of them.")
df = df.sample(n=n_rows, random_state=RANDOM_SEED).reset_index(drop=True)

# === Step 2: Split IP Addresses into Octets ===
def split_ip_column(ip_series, prefix):
    """Split a Series of dotted-quad IPv4 strings into four float columns.

    Args:
        ip_series: pandas Series of address strings such as ``"192.168.1.10"``.
            Missing entries (NaN/None) are allowed and produce a row of NaN.
        prefix: Text used to name the result columns
            ``{prefix}_octet1`` ... ``{prefix}_octet4``.

    Returns:
        DataFrame sharing ``ip_series``'s index, with exactly four float
        columns holding the octets.

    Raises:
        ValueError: If any non-missing value is not made of exactly four
            dot-separated groups of digits (for example an IPv6 address or a
            truncated address). The message lists a few offending values.
    """
    column_names = [f"{prefix}_octet{i+1}" for i in range(4)]

    # Validate up front: a short address would otherwise be padded with NaN
    # silently, and other malformed values would only fail later in the cast
    # or the column rename with a far less helpful message.
    present = ip_series.notna()
    well_formed = ip_series.str.fullmatch(r"\d+(?:\.\d+){3}", na=False)
    malformed = ip_series[present & ~well_formed]
    if not malformed.empty:
        examples = malformed.unique()[:5].tolist()
        raise ValueError(
            f"Cannot split {len(malformed)} value(s) into four IPv4 octets, "
            f"e.g. {examples}"
        )

    # reindex pins the frame to four columns even when the split produced
    # fewer, so the rename below always lines up.
    octets = (
        ip_series.str.split('.', expand=True)
        .reindex(columns=range(4))
        .astype(float)
    )
    octets.columns = column_names
    return octets

print("[INFO] Splitting IP addresses...")
# Raw address column -> prefix used for the octet columns derived from it.
ip_sources = {"id.orig_h": "orig_ip", "id.resp_h": "resp_ip"}
df_orig_ip, df_resp_ip = [
    split_ip_column(df[raw_col], prefix) for raw_col, prefix in ip_sources.items()
]

# Append the octet columns, then discard the original address strings.
df = pd.concat([df, df_orig_ip, df_resp_ip], axis=1).drop(columns=list(ip_sources))

# === Step 3: One-Hot Encode Categorical Features ===
print("[INFO] One-hot encoding categorical columns...")
categorical_cols = ["proto", "conn_state", "history", "label"]
df = pd.get_dummies(df, columns=categorical_cols)

# === Step 4: Scale Numerical Features ===
print("[INFO] Scaling numerical features...")
scaler = MinMaxScaler()

# Only float64/int64 columns are rescaled; columns of any other dtype
# (such as the indicator columns from get_dummies, when they are not
# float64/int64) pass through untouched.
numeric_cols = df.select_dtypes(include=["float64", "int64"]).columns
scaled_values = scaler.fit_transform(df[numeric_cols])
df[numeric_cols] = scaled_values

# === Step 5: Save the Processed Dataset ===
print(f"[INFO] Saving processed dataset to {OUTPUT_FILE}...")
df.to_csv(OUTPUT_FILE, index=False)
print("[SUCCESS] Preprocessing complete. File saved.")


[INFO] Loading dataset...
[INFO] Splitting IP addresses...
[INFO] One-hot encoding categorical columns...
[INFO] Scaling numerical features...
[INFO] Saving processed dataset to processed_data.csv...
[SUCCESS] Preprocessing complete. File saved.


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
import os
from tqdm import tqdm

# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Seed the random number generators so weight initialisation, batch shuffling
# and noise sampling are repeatable from one run to the next.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Hyperparameters
latent_dim = 100        # length of the noise vector fed to the generator
hidden_dim = 256        # base layer width; the networks use 1x/2x/4x of it
batch_size = 64
epochs = 300
learning_rate = 0.0002

# Model paths
MODEL_DIR = "saved_models"
os.makedirs(MODEL_DIR, exist_ok=True)
G_PATH = os.path.join(MODEL_DIR, "generator.pth")
D_PATH = os.path.join(MODEL_DIR, "discriminator.pth")

# Load and prepare your preprocessed data
print("Loading preprocessed data...")
data = pd.read_csv("processed_data.csv")
data_values = data.values.astype(np.float32)

# Fail early with a clear message: NaN/inf features would otherwise be fed
# straight into the discriminator and its loss.
if not np.isfinite(data_values).all():
    raise ValueError(
        "processed_data.csv contains NaN or infinite values; "
        "clean or impute them before training."
    )

tensor_data = torch.tensor(data_values, device=device)

# Create dataset and dataloader
dataset = TensorDataset(tensor_data)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Generator Network
class Generator(nn.Module):
    """Fully connected generator mapping a noise vector to one feature row.

    The final activation is a Sigmoid so every generated feature lies in
    [0, 1], the same range as the training data (min-max scaled numeric
    columns plus 0/1 indicator columns). Tanh and Sigmoid carry no
    parameters, so the state_dict layout is the same as with a Tanh output
    layer; weights trained against a Tanh output will still load but will
    produce differently scaled samples.

    Args:
        output_dim: Number of features in one generated row.
        noise_dim: Size of the input noise vector. Defaults to the
            notebook-level ``latent_dim``.
        width: Base hidden width; the three hidden layers use ``width``,
            ``2 * width`` and ``4 * width`` units. Defaults to the
            notebook-level ``hidden_dim``.
    """

    def __init__(self, output_dim, noise_dim=None, width=None):
        super().__init__()
        # The globals are only consulted when no explicit size is given, so
        # the class can also be built outside the notebook.
        noise_dim = latent_dim if noise_dim is None else noise_dim
        width = hidden_dim if width is None else width
        self.model = nn.Sequential(
            nn.Linear(noise_dim, width),
            nn.LeakyReLU(0.2),
            nn.Linear(width, width * 2),
            nn.LeakyReLU(0.2),
            nn.Linear(width * 2, width * 4),
            nn.LeakyReLU(0.2),
            nn.Linear(width * 4, output_dim),
            nn.Sigmoid()
        )

    def forward(self, z):
        """Return generated rows for a batch of noise vectors ``z``."""
        return self.model(z)

# Discriminator Network
class Discriminator(nn.Module):
    """Fully connected classifier scoring a feature row as real or generated.

    Three hidden stages (Linear -> LeakyReLU(0.2) -> Dropout(0.3)) narrow
    from ``4 * hidden_dim`` to ``hidden_dim`` units, followed by a single
    sigmoid output unit.

    Args:
        input_dim: Number of features in one input row.
    """

    def __init__(self, input_dim):
        super().__init__()
        widths = [input_dim, hidden_dim * 4, hidden_dim * 2, hidden_dim]

        stages = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stages.extend([
                nn.Linear(fan_in, fan_out),
                nn.LeakyReLU(0.2),
                nn.Dropout(0.3),
            ])
        # Final projection to one probability-like score.
        stages.extend([nn.Linear(widths[-1], 1), nn.Sigmoid()])

        self.model = nn.Sequential(*stages)

    def forward(self, x):
        """Return the discriminator score for each row of ``x``."""
        scores = self.model(x)
        return scores

# Initialize networks
output_dim = tensor_data.shape[1]
generator = Generator(output_dim).to(device)
discriminator = Discriminator(output_dim).to(device)


# Defined before the training loop because the loop calls it for its
# periodic sample preview.
def generate_samples(num_samples):
    """Draw ``num_samples`` synthetic rows from the current generator.

    Args:
        num_samples: Number of rows to generate.

    Returns:
        DataFrame with one row per sample and the same column names as the
        training frame ``data``.
    """
    was_training = generator.training
    generator.eval()
    try:
        with torch.no_grad():
            noise = torch.randn(num_samples, latent_dim, device=device)
            synthetic_data = generator(noise).cpu().numpy()
    finally:
        # Put the generator back in whichever mode the caller had it in, so a
        # mid-training preview does not leave it in eval mode.
        generator.train(was_training)
    return pd.DataFrame(synthetic_data, columns=data.columns)


# Try to load existing models
if os.path.exists(G_PATH) and os.path.exists(D_PATH):
    try:
        # map_location lets a checkpoint saved on another device be loaded here.
        generator_state = torch.load(G_PATH, map_location=device)
        discriminator_state = torch.load(D_PATH, map_location=device)
        generator.load_state_dict(generator_state)
        discriminator.load_state_dict(discriminator_state)
        print("Loaded existing models")
    except Exception as err:
        # Report why the checkpoint was rejected and rebuild both networks so
        # training never continues from a half-loaded pair.
        generator = Generator(output_dim).to(device)
        discriminator = Discriminator(output_dim).to(device)
        print(f"Could not load saved models ({err}); starting fresh")
else:
    print("No saved models found, starting fresh")

# Loss function and optimizers
criterion = nn.BCELoss()
optimizer_G = optim.Adam(generator.parameters(), lr=learning_rate)
optimizer_D = optim.Adam(discriminator.parameters(), lr=learning_rate)

# Training setup
losses_g = []
losses_d = []
best_loss = float('inf')
early_stop_patience = 20
patience_counter = 0

# Training loop with progress bar
try:
    print("Starting training... Press Ctrl+C to save and exit early")
    for epoch in tqdm(range(epochs), desc="Training GAN"):
        epoch_loss_g = 0.0
        epoch_loss_d = 0.0

        for (real_data,) in dataloader:
            real_data = real_data.to(device)
            # The last batch can be smaller than the configured batch_size,
            # so size labels and noise from the batch itself. A separate name
            # keeps the batch_size hyperparameter intact.
            n_real = real_data.size(0)

            # Train Discriminator
            discriminator.zero_grad()
            real_labels = torch.ones(n_real, 1, device=device)
            fake_labels = torch.zeros(n_real, 1, device=device)

            # Real data
            outputs_real = discriminator(real_data)
            loss_real = criterion(outputs_real, real_labels)

            # Fake data (detached so this step does not backpropagate into
            # the generator)
            noise = torch.randn(n_real, latent_dim, device=device)
            fake_data = generator(noise)
            outputs_fake = discriminator(fake_data.detach())
            loss_fake = criterion(outputs_fake, fake_labels)

            loss_D = loss_real + loss_fake
            loss_D.backward()
            optimizer_D.step()

            # Train Generator: it is scored against the "real" labels, i.e.
            # rewarded when the discriminator accepts its output.
            generator.zero_grad()
            noise = torch.randn(n_real, latent_dim, device=device)
            fake_data = generator(noise)
            outputs = discriminator(fake_data)
            loss_G = criterion(outputs, real_labels)
            loss_G.backward()
            optimizer_G.step()

            epoch_loss_g += loss_G.item()
            epoch_loss_d += loss_D.item()

        # Store average epoch losses
        avg_loss_g = epoch_loss_g / len(dataloader)
        avg_loss_d = epoch_loss_d / len(dataloader)
        losses_g.append(avg_loss_g)
        losses_d.append(avg_loss_d)

        # Early stopping check on the combined generator + discriminator loss
        current_loss = avg_loss_g + avg_loss_d
        if current_loss < best_loss:
            best_loss = current_loss
            patience_counter = 0
            # Save best model
            torch.save(generator.state_dict(), G_PATH)
            torch.save(discriminator.state_dict(), D_PATH)
        else:
            patience_counter += 1
            if patience_counter >= early_stop_patience:
                print(f"\nEarly stopping at epoch {epoch+1}")
                break

        # Generate samples periodically
        if (epoch+1) % 50 == 0:
            torch.save(generator.state_dict(), G_PATH)
            torch.save(discriminator.state_dict(), D_PATH)
            print(f"\nEpoch {epoch+1} - Saved models")

            # Generate and show sample
            sample = generate_samples(1)
            print("Sample features:", sample.iloc[0, :5].to_dict())  # Show first 5 features

except KeyboardInterrupt:
    print("\nTraining interrupted by user")

# Final save. This writes the latest weights to the same paths used for the
# best-so-far checkpoint above, so the files end up holding the state at the
# moment training stopped.
torch.save(generator.state_dict(), G_PATH)
torch.save(discriminator.state_dict(), D_PATH)
print("Models saved")

# Plot training progress
plt.figure(figsize=(10, 5))
plt.plot(losses_g, label='Generator Loss')
plt.plot(losses_d, label='Discriminator Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.savefig('training_loss.png')
print("Training plot saved to training_loss.png")

# Generate final samples
synthetic_traffic = generate_samples(1000)
synthetic_traffic.to_csv("synthetic_traffic.csv", index=False)
print("Generated samples saved to synthetic_traffic.csv")


Using device: cpu
Loading preprocessed data...
Starting training...
Epoch [50/500] Loss D: 1.1071, Loss G: 1.0009


KeyboardInterrupt: 