In [2]:
import pandas as pd
import numpy as np
import joblib
import json


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import random

# === Prototypical Network Backbone ===
class ProtoNet(nn.Module):
    """Two-layer MLP that embeds flat feature vectors for prototype matching.

    Args:
        input_dim: Width of each input feature vector (default 78).
        embedding_dim: Width of the produced embedding (default 64).
    """

    def __init__(self, input_dim=78, embedding_dim=64):
        super().__init__()
        hidden_width = 128
        layers = [
            nn.Linear(input_dim, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, embedding_dim),
        ]
        # Stored under `encoder` so checkpoint (state_dict) keys stay the same.
        self.encoder = nn.Sequential(*layers)

    def forward(self, x):
        """Return the embedding the encoder stack produces for ``x``."""
        embedding = self.encoder(x)
        return embedding

# === Episode Sampling Function ===
def sample_episode(data_dict, n_way, k_shot, q_query):
    """Draw one N-way K-shot episode from ``data_dict``.

    Args:
        data_dict: Mapping of class name -> list of samples, where each sample
            is a tuple whose first element is the feature vector.
        n_way: Number of classes drawn (without replacement) for the episode.
        k_shot: Support samples per class.
        q_query: Query samples per class.

    Returns:
        ``(support_x, support_y, query_x, query_y)``. The feature tensors are
        float32; the label tensors hold episode-local indices ``0..n_way-1``
        assigned in the order the classes were drawn.

    Raises:
        ValueError: If fewer than ``n_way`` classes are available, or a drawn
            class holds fewer than ``k_shot + q_query`` samples.
    """
    class_names = list(data_dict.keys())
    if n_way > len(class_names):
        raise ValueError(
            f"n_way={n_way} exceeds the {len(class_names)} classes available"
        )

    per_class = k_shot + q_query
    selected_classes = random.sample(class_names, n_way)
    support_x, support_y = [], []
    query_x, query_y = [], []

    for i, cls in enumerate(selected_classes):
        samples = data_dict[cls]
        if len(samples) < per_class:
            # Fail with the class name instead of random.sample's generic message.
            raise ValueError(
                f"class {cls!r} has {len(samples)} samples but "
                f"k_shot + q_query = {per_class} are required"
            )
        # A single draw without replacement keeps support and query disjoint.
        selected = random.sample(samples, per_class)
        support = selected[:k_shot]
        query = selected[k_shot:]

        support_x.extend(x[0] for x in support)
        support_y.extend([i] * k_shot)
        query_x.extend(x[0] for x in query)
        query_y.extend([i] * q_query)

    # Stack into one ndarray first: torch.tensor() on a Python list of arrays
    # converts element by element and is very slow.
    return (
        torch.tensor(np.asarray(support_x), dtype=torch.float32),
        torch.tensor(support_y),
        torch.tensor(np.asarray(query_x), dtype=torch.float32),
        torch.tensor(query_y),
    )

# === Prototypical Loss ===
def prototypical_loss(model, support_x, support_y, query_x, query_y):
    """Compute the prototypical-network loss and accuracy for one episode.

    Args:
        model: Callable that embeds a batch of feature vectors.
        support_x: Support features.
        support_y: Integer class label of every support row.
        query_x: Query features.
        query_y: Integer class label of every query row. Every label must also
            occur in ``support_y``.

    Returns:
        ``(loss, acc)``: the negative log-likelihood (a tensor that can be
        back-propagated) and the query accuracy as a Python float.

    Raises:
        ValueError: If ``query_y`` contains a label that has no support sample.
    """
    embeddings_support = model(support_x)
    embeddings_query = model(query_x)

    # torch.unique returns the labels sorted, so prototype row j belongs to
    # classes[j] regardless of the actual label values.
    classes = torch.unique(support_y)
    prototypes = torch.stack(
        [embeddings_support[support_y == cls].mean(0) for cls in classes]
    )

    # Translate raw query labels into prototype row indices. Using query_y
    # directly is only correct when the labels happen to be exactly 0..n-1.
    matches = query_y.unsqueeze(1) == classes.unsqueeze(0)
    if not bool(matches.any(dim=1).all()):
        raise ValueError("query_y contains labels that are absent from support_y")
    # Each row holds exactly one True, so the column indices are the targets.
    targets = torch.nonzero(matches, as_tuple=True)[1]

    # Softmax over negative Euclidean distances to the prototypes.
    dists = torch.cdist(embeddings_query, prototypes)
    log_p_y = F.log_softmax(-dists, dim=1)
    loss = F.nll_loss(log_p_y, targets)
    acc = (log_p_y.argmax(1) == targets).float().mean().item()
    return loss, acc

# === Training Loop ===
def train_fewshot(model, data_dict, episodes=200, n_way=5, k_shot=5, q_query=10):
    """Train ``model`` episodically with the prototypical loss.

    Each iteration samples one episode from ``data_dict``, computes the loss
    and takes a single Adam step (lr=0.001). Loss and accuracy are printed
    every 10th episode. Nothing is returned; ``model`` is updated in place.
    """
    log_every = 10
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    for ep in range(episodes):
        episode = sample_episode(data_dict, n_way, k_shot, q_query)
        loss, acc = prototypical_loss(model, *episode)

        # Standard step: clear stale gradients, back-propagate, update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if ep % log_every:
            continue
        print(f"[Ep {ep}] Loss: {loss.item():.4f} | Acc: {acc:.4f}")

# === 🔁 Example Usage (after loading your preprocessed data_dict) ===
# Format: data_dict = {'SQLi': [(feature_vec1,), (feature_vec2,), ...], 'XSS': [...], ...}
# Include both real and synthetic samples in each class list.
# train_fewshot(ProtoNet(), data_dict)


In [4]:
import pandas as pd
import os

# CSV that is read below as the master dataset.
input_path = "DATA/CICIDS2017_Full_With_Synthetic.csv"
# Folder that receives one "<attack>_real.csv" / "<attack>_synthetic.csv" pair per attack.
output_dir = "DATA/RARE_CLASSES"
os.makedirs(output_dir, exist_ok=True)

# Short names that are matched against the "Label" column further down.
# NOTE(review): the saved output of this cell reports 0 rows for every name,
# so these presumably do not match the label spellings in the CSV — verify.
rare_attacks = ["SQLi", "XSS", "Heartbleed", "Infiltration"]

# Load master dataset
df = pd.read_csv(input_path)

# Optional: if there's no 'Synthetic' column, infer from Label (e.g., "SQLi_SYN")
def is_synthetic(label):
    """Tell whether ``label`` carries a synthetic-sample marker.

    A marker is either the case-sensitive substring "SYN" or the
    case-insensitive substring "synthetic". Anything that is not a string
    is reported as not synthetic.
    """
    if not isinstance(label, str):
        return False
    has_upper_tag = "SYN" in label
    return has_upper_tag or "synthetic" in label.lower()


# Label-derived values do not depend on the attack being processed, so compute
# them once instead of re-running a Python-level apply over the whole frame on
# every loop iteration.
label_str = df["Label"].astype(str)
# astype(bool) keeps `~` well-defined even when the frame is empty.
synthetic_mask = label_str.apply(is_synthetic).astype(bool)

for attack in rare_attacks:
    # Real rows: label equals the attack name exactly and has no synthetic marker
    real_rows = df[(label_str == attack) & ~synthetic_mask]
    # Synthetic rows: label mentions the attack name (literal substring, not a
    # regular expression) and carries a synthetic marker
    synthetic_rows = df[label_str.str.contains(attack, regex=False) & synthetic_mask]

    # Save (header-only files are still written so later cells can read them)
    real_path = os.path.join(output_dir, f"{attack}_real.csv")
    syn_path = os.path.join(output_dir, f"{attack}_synthetic.csv")

    real_rows.drop(columns=["Label"], errors='ignore').to_csv(real_path, index=False)
    synthetic_rows.drop(columns=["Label"], errors='ignore').to_csv(syn_path, index=False)

    if real_rows.empty and synthetic_rows.empty:
        # Make a total mismatch visible instead of reporting it as a success.
        print(f"⚠️ {attack}: no rows matched – check the label spellings in df['Label'].unique()")
    else:
        print(f"✅ {attack}: Saved {len(real_rows)} real and {len(synthetic_rows)} synthetic samples")


✅ SQLi: Saved 0 real and 0 synthetic samples
✅ XSS: Saved 0 real and 0 synthetic samples
✅ Heartbleed: Saved 0 real and 0 synthetic samples
✅ Infiltration: Saved 0 real and 0 synthetic samples


In [5]:
import pandas as pd
import numpy as np
import os

# Class short names; each is used as the file-name prefix "<name>_real.csv" /
# "<name>_synthetic.csv" by the loader defined below.
attack_classes = ["SQLi", "XSS", "Heartbleed", "Infiltration"]
base_dir = "DATA/RARE_CLASSES"  # change this to your actual folder

def load_attack_class_samples(cls, data_dir=None):
    """Load, merge and shuffle the real and synthetic samples of one class.

    Args:
        cls: Class short name; ``{cls}_real.csv`` and ``{cls}_synthetic.csv``
            are read.
        data_dir: Folder holding the two CSVs. Defaults to the notebook-level
            ``base_dir``.

    Returns:
        A list of 1-tuples ``(feature_vector,)`` in random order, where each
        feature vector is a 1-D numpy array. The list is empty when both files
        contain only a header.
    """
    root = base_dir if data_dir is None else data_dir

    real_df = pd.read_csv(os.path.join(root, f"{cls}_real.csv"))
    syn_df = pd.read_csv(os.path.join(root, f"{cls}_synthetic.csv"))

    combined_df = pd.concat([real_df, syn_df], ignore_index=True)
    # The label is not a feature. If it is present in only one of the files it
    # would otherwise inject strings/NaN into the vectors and break the later
    # float32 tensor conversion.
    combined_df = combined_df.drop(columns=["Label"], errors="ignore")
    combined_df = combined_df.sample(frac=1).reset_index(drop=True)  # shuffle

    # One bulk conversion instead of the much slower DataFrame.iterrows().
    feature_matrix = combined_df.to_numpy()
    return [(row,) for row in feature_matrix]

# Build the dictionary
data_dict = dict()
for attack in attack_classes:
    # One entry per class: whatever the loader returns for that class name.
    data_dict.update({attack: load_attack_class_samples(attack)})

# Report how many samples each class ended up with
for cls in data_dict:
    samples = data_dict[cls]
    print(f"{cls}: {len(samples)} samples")


SQLi: 0 samples
XSS: 0 samples
Heartbleed: 0 samples
Infiltration: 0 samples


In [7]:
# Load the merged multiclass dataset; this rebinds the notebook-level `df`.
df = pd.read_csv("DATA/CICIDS2017_Multiclass_Merged.csv")
df.columns = df.columns.str.strip()  # Remove leading/trailing spaces from column names
# Show the per-label row counts to reveal the exact label spellings.
print(df["Label"].value_counts())


Label
BENIGN                        2084030
DoS Hulk                       231073
PortScan                       158930
DDoS                           128027
DoS GoldenEye                   10293
FTP-Patator                      7938
SSH-Patator                      5897
DoS slowloris                    5796
DoS Slowhttptest                 5499
Web Attack � Brute Force         1507
Web Attack � XSS                  652
Infiltration                       36
Web Attack � Sql Injection         21
Heartbleed                         11
Name: count, dtype: int64


In [10]:
# Use the already loaded and cleaned df from previous cells

output_dir = "DATA/RARE_CLASSES"
os.makedirs(output_dir, exist_ok=True)

# Ensure column names are stripped of spaces
df.columns = df.columns.str.strip()

# === Normalize label column (some have special characters) ===
# The separator inside the "Web Attack" labels shows up as U+FFFD in this
# notebook's output. The same character presumably appears as an en dash
# (U+2013) or as the C1 control U+0096 when the CSV is decoded differently, so
# all three are mapped to "-" to keep the lookup below independent of encoding.
df["Label"] = (
    df["Label"]
    .astype(str)
    .str.replace("[\ufffd\u2013\u0096]", "-", regex=True)
    .str.strip()
)

# === Rare classes to extract ===
# short name used in file names -> exact (normalized) value of the Label column
rare_classes = {
    "SQLi": "Web Attack - Sql Injection",
    "XSS": "Web Attack - XSS",
    "Heartbleed": "Heartbleed",
    "Infiltration": "Infiltration"
}

# === Extract and save each class ===
for short_name, full_label in rare_classes.items():
    class_df = df[df["Label"] == full_label]
    out_path = os.path.join(output_dir, f"{short_name}_real.csv")
    # The file is written even when empty so that later cells can still read it.
    class_df.to_csv(out_path, index=False)
    if class_df.empty:
        # An empty export means the label spelling did not match; say so loudly.
        print(f"⚠️ No rows with Label == {full_label!r}; wrote header-only file {out_path}")
    else:
        print(f"✅ Saved {len(class_df)} samples to {out_path}")


✅ Saved 21 samples to DATA/RARE_CLASSES\SQLi_real.csv
✅ Saved 652 samples to DATA/RARE_CLASSES\XSS_real.csv
✅ Saved 11 samples to DATA/RARE_CLASSES\Heartbleed_real.csv
✅ Saved 36 samples to DATA/RARE_CLASSES\Infiltration_real.csv


In [13]:
import pandas as pd
import numpy as np
import random
import torch
import os

# 📂 Base directory where real + synthetic CSVs live
# Folder that the loader below reads "<class>_real.csv" / "<class>_synthetic.csv" from.
base_dir = "DATA/RARE_CLASSES"
# The position of a name in this list becomes its integer label in sample_episode.
attack_classes = ["SQLi", "XSS", "Heartbleed", "Infiltration"]

# 🔁 Load support + query samples for few-shot learning
def load_attack_class_samples(cls):
    """Read one class's real and synthetic CSVs from ``base_dir`` and shuffle each.

    Returns:
        ``(real_df, syn_df)``: both frames row-shuffled independently and
        re-indexed from 0.
    """
    # Read both files first, then shuffle, real before synthetic.
    frames = [
        pd.read_csv(os.path.join(base_dir, f"{cls}_{kind}.csv"))
        for kind in ("real", "synthetic")
    ]
    real_df, syn_df = (
        frame.sample(frac=1).reset_index(drop=True) for frame in frames
    )
    return real_df, syn_df

# 🎯 Sample few-shot episode
def sample_episode(n_way=4, k_shot=5, q_query=10):
    """Build one few-shot episode from the per-class real/synthetic CSVs.

    For each of ``n_way`` randomly chosen classes the support set takes
    ``k_shot // 2`` real rows plus ``k_shot - k_shot // 2`` synthetic rows, and
    the query set takes up to ``q_query`` further real rows.

    Classes with few real rows yield fewer query (or support) rows than
    requested. Labels are generated from the number of rows actually obtained,
    so features and labels always have the same length.

    Returns:
        ``(X_support, y_support, X_query, y_query)``: float32 feature tensors
        and integer label tensors. Labels are positions in ``attack_classes``.
    """
    support_set = []
    query_set = []
    support_labels = []
    query_labels = []

    label_map = {cls: i for i, cls in enumerate(attack_classes)}

    for cls in random.sample(attack_classes, n_way):
        real_df, syn_df = load_attack_class_samples(cls)

        k_real = k_shot // 2
        k_syn = k_shot - k_real

        # Support set: half real + half synthetic
        support_combined = pd.concat([real_df[:k_real], syn_df[:k_syn]])

        # Query set: real rows that were not used for support
        query_real = real_df[k_real:k_real + q_query]

        support_set.append(support_combined)
        query_set.append(query_real)
        # Slicing silently returns fewer rows when a class is small, so count
        # what was really obtained instead of assuming k_shot / q_query.
        support_labels.extend([label_map[cls]] * len(support_combined))
        query_labels.extend([label_map[cls]] * len(query_real))

    # Final tensors. "Label" may be absent from some source files, so its
    # removal must not fail.
    support_df = pd.concat(support_set).drop(columns=["Label"], errors="ignore")
    query_df = pd.concat(query_set).drop(columns=["Label"], errors="ignore")

    X_support = torch.tensor(support_df.values, dtype=torch.float32)
    y_support = torch.tensor(support_labels)
    X_query = torch.tensor(query_df.values, dtype=torch.float32)
    y_query = torch.tensor(query_labels)

    return X_support, y_support, X_query, y_query


In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# 🧠 Lightweight Encoder for 78D CICIDS-style features
class ProtoNetEncoder(nn.Module):
    """MLP encoder: Linear -> ReLU -> BatchNorm1d -> Linear.

    Args:
        input_dim: Width of the input feature vectors (default 78).
        hidden_dim: Width of the hidden layer (default 128).
        output_dim: Width of the produced embedding (default 64).
    """

    def __init__(self, input_dim=78, hidden_dim=128, output_dim=64):
        super().__init__()
        stack = (
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Linear(hidden_dim, output_dim),
        )
        # Stored under `encoder` so state_dict keys keep their names.
        self.encoder = nn.Sequential(*stack)

    def forward(self, x):
        """Embed the batch ``x``."""
        embedded = self.encoder(x)
        return embedded

# 🧪 Prototypical Loss Function
def prototypical_loss(X_support, y_support, X_query, y_query, encoder):
    """Classify queries by their nearest class prototype and report accuracy.

    Despite the name, no loss value is produced; this is an evaluation helper.

    Args:
        X_support: Support features.
        y_support: Integer class label of every support row.
        X_query: Query features.
        y_query: Integer class label of every query row.
        encoder: Callable that embeds a batch of feature vectors.

    Returns:
        ``(acc, y_pred)``: the fraction of queries classified correctly (a
        Python float) and the predicted class label of every query.
    """
    # Encode
    z_support = encoder(X_support)
    z_query = encoder(X_query)

    # torch.unique returns sorted labels; prototype row j belongs to classes[j]
    classes = torch.unique(y_support)

    # Compute prototypes (mean of support embeddings per class)
    prototypes = torch.stack([z_support[y_support == cls].mean(0) for cls in classes])

    # Euclidean (not squared) distance from each query to each prototype;
    # the nearest prototype is the same under either metric.
    dists = torch.cdist(z_query, prototypes, p=2)

    # argmin yields a row index into `prototypes`. Translate it back to the
    # class label, otherwise the comparison with y_query is only meaningful
    # when the labels happen to be exactly 0..n-1.
    y_pred = classes[dists.argmin(dim=1)]

    # Compute accuracy
    acc = (y_pred == y_query).float().mean().item()

    return acc, y_pred


In [30]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import StandardScaler

# === 1. Load Real + Synthetic for Rare Classes ===
rare_classes = ["SQLi", "XSS", "Heartbleed", "Infiltration"]
dfs = []

for cls in rare_classes:
    for kind in ("real", "synthetic"):
        path = f"DATA/RARE_CLASSES/{cls}_{kind}.csv"
        if not os.path.exists(path):
            print(f"❌ Missing file: {path}")
            continue

        df_part = pd.read_csv(path)
        df_part.columns = [name.strip() for name in df_part.columns]
        # Every row of a file belongs to the class named in the file name; any
        # label column already stored in the file is overwritten.
        df_part["Label"] = cls
        if len(df_part) == 0:
            print(f"⚠️ Skipped empty file: {path}")
            continue
        print(f"✅ Loaded {path}: shape={df_part.shape}")
        dfs.append(df_part)

if len(dfs) == 0:
    raise ValueError("❌ No data loaded. Ensure real and synthetic CSVs exist.")

df = pd.concat(dfs, ignore_index=True)
feature_cols = list(df.columns[df.columns != "Label"])

# === 2. Data Cleaning ===
# Anything that cannot be parsed as a number becomes NaN and is dropped below.
for col in feature_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

print(f"🔍 Shape before dropna: {df.shape}")
df.dropna(subset=feature_cols, inplace=True)
print(f"✅ Shape after dropna: {df.shape}")

# === 3. Encode Labels ===
# Integer ids follow the alphabetical order of the class names.
class_names = sorted(df["Label"].unique())
label_map = dict(zip(class_names, range(len(class_names))))
df["Label"] = df["Label"].map(label_map)

# === 4. Scale Features ===
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[feature_cols])
df[feature_cols] = scaled_features

# === 5. Encoder with He + Orthogonal Init ===
class Encoder(nn.Module):
    """Two fully connected layers, each followed by ReLU.

    The first layer's weights use Kaiming-normal initialisation, the second
    layer's weights use orthogonal initialisation, and both biases start at
    zero. Because the output passes through ReLU, embeddings are non-negative.

    Args:
        input_dim: Width of the input feature vectors (default 78).
        hidden_dim: Width of both the hidden layer and the output (default 64).
    """

    def __init__(self, input_dim=78, hidden_dim=64):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.relu = nn.ReLU()
        self._init_weights()

    def _init_weights(self):
        """Apply the weight and bias initialisation described on the class."""
        nn.init.kaiming_normal_(self.fc1.weight)
        nn.init.orthogonal_(self.fc2.weight)
        for layer in (self.fc1, self.fc2):
            nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Return the embedding of ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.relu(self.fc2(hidden))

# === 6. Few-Shot Episode Sampler ===
def sample_episode(df, n_way=4, k_shot=5, q_query=10):
    """Sample one episode from a labelled DataFrame.

    The first ``n_way`` labels in sorted order are used (the class choice is
    not random). For each class ``k_shot + q_query`` rows are drawn; rows are
    drawn with replacement only when the class holds fewer rows than that, in
    which case support and query rows may coincide.

    Returns:
        ``(support_x, support_y, query_x, query_y)`` with float32 features
        taken from the notebook-level ``feature_cols`` and int64 labels.
    """
    per_class = k_shot + q_query
    chosen_labels = sorted(df["Label"].unique())[:n_way]

    support_feats, support_lbls = [], []
    query_feats, query_lbls = [], []

    for cls in chosen_labels:
        pool = df[df["Label"] == cls]
        drawn = pool.sample(n=per_class, replace=len(pool) < per_class)

        support_feats.append(drawn.iloc[:k_shot][feature_cols].values)
        support_lbls.append([cls] * k_shot)
        query_feats.append(drawn.iloc[k_shot:][feature_cols].values)
        query_lbls.append([cls] * q_query)

    def _features(chunks):
        return torch.tensor(np.vstack(chunks), dtype=torch.float32)

    def _labels(chunks):
        return torch.tensor(np.concatenate(chunks), dtype=torch.long)

    return (
        _features(support_feats),
        _labels(support_lbls),
        _features(query_feats),
        _labels(query_lbls),
    )

# === 7. ProtoNet Loss ===
def prototypical_loss(encoder, support_x, support_y, query_x, query_y):
    """Compute the prototypical-network cross-entropy loss and accuracy.

    Args:
        encoder: Callable that embeds a batch of feature vectors.
        support_x: Support features.
        support_y: Integer class label of every support row.
        query_x: Query features.
        query_y: Integer class label of every query row. Every label must also
            occur in ``support_y``.

    Returns:
        ``(loss, acc)``: a tensor that can be back-propagated and the query
        accuracy as a Python float.

    Raises:
        ValueError: If ``query_y`` contains a label that has no support sample.
    """
    emb_support = encoder(support_x)
    emb_query = encoder(query_x)

    # torch.unique returns sorted labels; prototype row j belongs to classes[j].
    classes = torch.unique(support_y)
    prototypes = torch.stack([emb_support[support_y == c].mean(0) for c in classes])

    # Map raw labels to prototype row indices so the loss is also correct when
    # an episode's labels are not exactly 0..n-1 (e.g. a subset of classes).
    matches = query_y.unsqueeze(1) == classes.unsqueeze(0)
    if not bool(matches.any(dim=1).all()):
        raise ValueError("query_y contains labels that are absent from support_y")
    # Each row holds exactly one True, so the column indices are the targets.
    targets = torch.nonzero(matches, as_tuple=True)[1]

    # Negative Euclidean distance to every prototype acts as the class logit.
    logits = -torch.cdist(emb_query, prototypes)
    loss = F.cross_entropy(logits, targets)
    acc = (logits.argmax(dim=1) == targets).float().mean().item()
    return loss, acc

# === 8. Train ProtoNet ===
# The encoder's input width follows the number of feature columns in `df`.
encoder = Encoder(input_dim=len(feature_cols))
optimizer = torch.optim.Adam(encoder.parameters(), lr=1e-3)

# 100 episodes, one optimiser step per episode.
for episode in range(100):
    support_x, support_y, query_x, query_y = sample_episode(df)
    optimizer.zero_grad()
    loss, acc = prototypical_loss(encoder, support_x, support_y, query_x, query_y)
    # A NaN loss would poison the weights; skip the update for that episode.
    if torch.isnan(loss):
        print(f"[Episode {episode}] ⚠️ NaN Loss – skipping.")
        continue
    loss.backward()
    optimizer.step()
    # Progress report every 10th episode.
    if episode % 10 == 0:
        print(f"[Episode {episode}] Accuracy: {acc:.4f} | Loss: {loss.item():.4f}")

# === 9. Save Encoder ===
# Only the encoder weights are written.
# NOTE(review): the StandardScaler fitted earlier in this cell is not saved;
# presumably it is needed to feed this encoder later — confirm.
torch.save(encoder.state_dict(), "protonet_encoder.pt")
print("✅ Saved encoder as protonet_encoder.pt")


✅ Loaded DATA/RARE_CLASSES/SQLi_real.csv: shape=(21, 79)
✅ Loaded DATA/RARE_CLASSES/SQLi_synthetic.csv: shape=(1000, 79)
✅ Loaded DATA/RARE_CLASSES/XSS_real.csv: shape=(652, 79)
✅ Loaded DATA/RARE_CLASSES/XSS_synthetic.csv: shape=(1000, 79)
✅ Loaded DATA/RARE_CLASSES/Heartbleed_real.csv: shape=(11, 79)
✅ Loaded DATA/RARE_CLASSES/Heartbleed_synthetic.csv: shape=(1000, 79)
✅ Loaded DATA/RARE_CLASSES/Infiltration_real.csv: shape=(36, 79)
✅ Loaded DATA/RARE_CLASSES/Infiltration_synthetic.csv: shape=(1000, 79)
🔍 Shape before dropna: (4720, 79)
✅ Shape after dropna: (4720, 79)
[Episode 0] Accuracy: 0.7250 | Loss: 2.2058
[Episode 10] Accuracy: 1.0000 | Loss: 0.0789
[Episode 20] Accuracy: 1.0000 | Loss: 0.0144
[Episode 30] Accuracy: 0.9750 | Loss: 0.0839
[Episode 40] Accuracy: 1.0000 | Loss: 0.0023
[Episode 50] Accuracy: 1.0000 | Loss: 0.0029
[Episode 60] Accuracy: 1.0000 | Loss: 0.0020
[Episode 70] Accuracy: 1.0000 | Loss: 0.0013
[Episode 80] Accuracy: 1.0000 | Loss: 0.0015
[Episode 90] Accur

In [29]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# === CONFIG ===
# The cell handles one class per run; both file paths derive from CLASS_NAME.
CLASS_NAME = "SQLi"  # Change to "XSS", "Heartbleed", "Infiltration" as needed
REAL_CSV = f"DATA/RARE_CLASSES/{CLASS_NAME}_real.csv"
SYN_CSV = f"DATA/RARE_CLASSES/{CLASS_NAME}_synthetic.csv"
EPOCHS = 5000      # number of adversarial training iterations
LATENT_DIM = 32    # width of the generator's noise input
BATCH_SIZE = 16    # rows per discriminator / generator step

# === 1. Load and Normalize Real Data ===
df = pd.read_csv(REAL_CSV)
# Every column except "Label" is treated as a feature.
feature_cols = [c for c in df.columns if c != "Label"]
# The scaler is fitted on this one class's real rows only; it is used again
# after training to map generated samples back to the original feature scale.
scaler = StandardScaler()
X = scaler.fit_transform(df[feature_cols])
X = torch.tensor(X, dtype=torch.float32)

# === 2. Define Generator & Discriminator ===
class Generator(nn.Module):
    """Maps latent noise vectors to feature vectors.

    Architecture: Linear(latent_dim, 64) -> ReLU -> Linear(64, output_dim),
    with no activation on the output.
    """

    def __init__(self, latent_dim, output_dim):
        super().__init__()
        hidden_width = 64
        layers = [
            nn.Linear(latent_dim, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, output_dim),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, z):
        """Generate one sample per row of the noise batch ``z``."""
        generated = self.model(z)
        return generated

class Discriminator(nn.Module):
    """Scores feature vectors with a value in (0, 1).

    Architecture: Linear(input_dim, 64) -> ReLU -> Linear(64, 1) -> Sigmoid.
    """

    def __init__(self, input_dim):
        super().__init__()
        hidden_width = 64
        layers = [
            nn.Linear(input_dim, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, 1),
            nn.Sigmoid(),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return one score per row of ``x`` with shape ``(batch, 1)``."""
        score = self.model(x)
        return score

# === 3. Init Models ===
# Both networks are sized to the number of feature columns in X.
G = Generator(LATENT_DIM, X.shape[1])
D = Discriminator(X.shape[1])
g_opt = optim.Adam(G.parameters(), lr=0.001)
d_opt = optim.Adam(D.parameters(), lr=0.001)
# Binary cross-entropy on the discriminator's sigmoid output.
loss_fn = nn.BCELoss()

# === 4. Training Loop ===
# Each "epoch" is a single discriminator step followed by a single generator step.
for epoch in range(EPOCHS):
    # === Train Discriminator ===
    # Real rows are drawn with replacement, so BATCH_SIZE may exceed the row count.
    real_idx = torch.randint(0, X.shape[0], (BATCH_SIZE,))
    real_samples = X[real_idx]
    z = torch.randn(BATCH_SIZE, LATENT_DIM)
    # detach(): the discriminator step must not back-propagate into G.
    fake_samples = G(z).detach()
    
    d_real = D(real_samples)
    d_fake = D(fake_samples)

    # Target 1 for real rows, target 0 for generated rows.
    d_loss = loss_fn(d_real, torch.ones_like(d_real)) + loss_fn(d_fake, torch.zeros_like(d_fake))
    d_opt.zero_grad()
    d_loss.backward()
    d_opt.step()

    # === Train Generator ===
    z = torch.randn(BATCH_SIZE, LATENT_DIM)
    gen_samples = G(z)
    d_gen = D(gen_samples)
    # The generator is rewarded when D labels its samples as real (target 1).
    g_loss = loss_fn(d_gen, torch.ones_like(d_gen))
    g_opt.zero_grad()
    g_loss.backward()
    g_opt.step()

    if epoch % 500 == 0:
        print(f"[{epoch}] D_loss: {d_loss.item():.4f} | G_loss: {g_loss.item():.4f}")

# === 5. Generate and Save Synthetic Samples ===
num_samples = 1000  # Or match the real count if very small
with torch.no_grad():
    z = torch.randn(num_samples, LATENT_DIM)
    syn_samples = G(z).numpy()

# Inverse scale before saving
# The file holds feature columns only (no "Label" column).
syn_df = pd.DataFrame(scaler.inverse_transform(syn_samples), columns=feature_cols)
syn_df.to_csv(SYN_CSV, index=False)
print(f"✅ Saved {num_samples} synthetic samples to {SYN_CSV}")


[0] D_loss: 1.4073 | G_loss: 0.7243
[500] D_loss: 0.7796 | G_loss: 1.3943
[1000] D_loss: 0.6397 | G_loss: 1.8202
[1500] D_loss: 0.7552 | G_loss: 1.3358
[2000] D_loss: 0.8220 | G_loss: 1.2485
[2500] D_loss: 0.7672 | G_loss: 1.2269
[3000] D_loss: 0.8158 | G_loss: 1.6233
[3500] D_loss: 0.4855 | G_loss: 1.3848
[4000] D_loss: 0.5399 | G_loss: 1.4846
[4500] D_loss: 0.6570 | G_loss: 1.7283
✅ Saved 1000 synthetic samples to DATA/RARE_CLASSES/SQLi_synthetic.csv


In [1]:
import os
import pandas as pd
import numpy as np
import torch

# === CONFIG ===
rare_classes = ["SQLi", "XSS", "Heartbleed", "Infiltration"]
support_real_per_class = 5
support_synthetic_per_class = 5

real_dir = "DATA/RARE_CLASSES"
synthetic_dir = "DATA/RARE_CLASSES"

support_x = []
support_y = []
# Integer label of a class = its position in `rare_classes`.
label_map = {cls: i for i, cls in enumerate(rare_classes)}

# (folder, file-name suffix, rows to draw) for each sample source. Real and
# synthetic files are handled identically, so one loop serves both.
sources = [
    (real_dir, "real", support_real_per_class),
    (synthetic_dir, "synthetic", support_synthetic_per_class),
]

for cls in rare_classes:
    label_id = label_map[cls]

    for folder, kind, n_wanted in sources:
        path = os.path.join(folder, f"{cls}_{kind}.csv")
        if not os.path.exists(path):
            # Report the gap: a silently skipped file leaves the class
            # under-represented (or absent) in the saved support set.
            print(f"⚠️ Missing file, skipped: {path}")
            continue

        frame = pd.read_csv(path).dropna()
        # The label is stored separately in support_y and is not a feature.
        frame = frame.drop(columns=["Label"], errors="ignore")
        # Fixed seed: the same rows are selected on every run.
        frame = frame.sample(n=min(n_wanted, len(frame)), random_state=42)

        support_x.append(frame.values)
        support_y.append([label_id] * len(frame))

if not support_x:
    raise ValueError(
        f"No support samples loaded: no *_real.csv / *_synthetic.csv found in "
        f"{real_dir!r} or {synthetic_dir!r}"
    )

# Stack and save
# NOTE(review): the features are saved as raw, unscaled values, while the
# encoder-training cell standardises its inputs with StandardScaler — confirm
# that scaling is applied before these tensors are fed to that encoder.
support_x = torch.tensor(np.vstack(support_x), dtype=torch.float32)
support_y = torch.tensor(np.concatenate(support_y), dtype=torch.long)

torch.save(support_x, "DATA/RARE_CLASSES/support_x.pt")
torch.save(support_y, "DATA/RARE_CLASSES/support_y.pt")
print("✅ Support set regenerated and saved.")


✅ Support set regenerated and saved.
