In [None]:
import pandas as pd
import numpy as np
import joblib
import json


In [None]:


# Load real and synthetic datasets
real_df = pd.read_csv("CICIDS2017_Multiclass_Merged.csv")
synthetic_df = pd.read_csv("synthetic_attacks.csv")

# Strip stray whitespace from the headers so 'Label' and the feature names
# are spelled identically in both frames before they are concatenated.
real_df = real_df.rename(columns=lambda name: name.strip())
synthetic_df = synthetic_df.rename(columns=lambda name: name.strip())

# ✅ If synthetic_df doesn't have 'Label', inject it manually
if 'Label' not in synthetic_df.columns:
    synthetic_df['Label'] = "SYNTHETIC_ATTACK"  # <— update this to correct label like "Heartbleed"

# Use only numeric columns from real_df
feature_cols = [col for col in real_df.columns if col != 'Label' and pd.api.types.is_numeric_dtype(real_df[col])]

# Merge datasets, turn +/-inf into NaN and drop rows with a missing feature.
# The final reset_index matters: dropna leaves gaps in the index, and the
# label columns below are assigned by index alignment, so without renumbering
# the labels would land on the wrong rows (and trailing rows would get NaN).
combined_df = (
    pd.concat([real_df, synthetic_df], ignore_index=True)
    .replace([np.inf, -np.inf], np.nan)
    .dropna(subset=feature_cols)
    .reset_index(drop=True)
)

# Apply scaler
# NOTE(review): joblib.load unpickles arbitrary objects — only load scaler.pkl from a trusted source.
scaler = joblib.load("scaler.pkl")
combined_df_scaled = scaler.transform(combined_df[feature_cols])
# Share combined_df's index so every scaled row stays paired with its own label.
scaled_df = pd.DataFrame(combined_df_scaled, columns=feature_cols, index=combined_df.index)

# Load label encoder
with open("label_map.json", "r") as f:
    label_map = json.load(f)

# Encode labels
scaled_df["EncodedLabel"] = combined_df["Label"].map(label_map)
scaled_df["Label"] = combined_df["Label"]

# Labels absent from label_map are encoded as NaN by .map(); surface them
# instead of letting them slip silently into the saved file.
unmapped_labels = scaled_df.loc[scaled_df["EncodedLabel"].isna(), "Label"].unique().tolist()
if unmapped_labels:
    print(f"⚠️ Labels missing from label_map.json (EncodedLabel is NaN): {unmapped_labels}")

# Save final dataset
scaled_df.to_csv("CICIDS2017_Full_With_Synthetic.csv", index=False)
print("✅ Final merged, scaled dataset is ready.")




✅ Final merged, scaled dataset is ready.


In [None]:
# 📘 meta_learning_fewshot_ids.ipynb

# 🧩 STEP 1: Load and Prepare the CICIDS Dataset
import pandas as pd
import numpy as np
import joblib
import json
import torch
from sklearn.preprocessing import LabelEncoder
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# === Load Data ===
data = pd.read_csv("CICIDS2017_Multiclass_Merged.csv")

# ✅ Strip whitespace from column names
data.rename(columns=lambda x: x.strip(), inplace=True)
print("📋 Cleaned columns:", data.columns.tolist())

# === Load Scaler and Prepare Feature Columns ===
# NOTE(review): joblib.load unpickles arbitrary objects — only load scaler.pkl from a trusted source.
scaler = joblib.load("scaler.pkl")
feature_cols = [col for col in data.columns if col != 'Label' and pd.api.types.is_numeric_dtype(data[col])]

# ✅ Clean invalid values
data[feature_cols] = data[feature_cols].replace([np.inf, -np.inf], np.nan)
# Renumber the surviving rows 0..n-1. dropna leaves gaps in the index, and the
# label column is copied into data_final by index alignment further down; with
# a gappy index the labels would be attached to the wrong rows or become NaN.
data = data.dropna(subset=feature_cols).reset_index(drop=True)

# === Apply Scaler ===
data_scaled = scaler.transform(data[feature_cols])

# === Load Label Map ===
with open("label_map.json", "r") as f:
    label_map = json.load(f)

inv_label_map = {v: k for k, v in label_map.items()}
data['EncodedLabel'] = data['Label'].map(label_map)

# .map() yields NaN for any label that is not a key of label_map; report them
# so that unexpected NaN classes are visible rather than silently ignored.
unmapped_labels = data.loc[data['EncodedLabel'].isna(), 'Label'].unique().tolist()
if unmapped_labels:
    print(f"⚠️ Labels missing from label_map.json (EncodedLabel is NaN): {unmapped_labels}")

# === Final Dataset ===
# Reuse data's index so each scaled feature row is paired with its own label.
data_final = pd.DataFrame(data_scaled, columns=feature_cols, index=data.index)
data_final['EncodedLabel'] = data['EncodedLabel']

# 🧩 STEP 2: Sample an N-way K-shot Episode
from collections import defaultdict

def sample_episode(df, n_way=5, k_shot=1, q_query=5, feature_columns=None, random_state=None):
    """Draw one N-way K-shot episode (support set + query set) from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``'EncodedLabel'`` column and the feature columns.
        Rows whose ``EncodedLabel`` is NaN are never selected.
    n_way : int
        Number of distinct classes in the episode.
    k_shot : int
        Support rows drawn per class.
    q_query : int
        Query rows drawn per class (disjoint from that class's support rows).
    feature_columns : list of str, optional
        Columns returned as features. Defaults to the notebook-level
        ``feature_cols`` list.
    random_state : int or numpy.random.RandomState, optional
        Seed or generator for reproducible episodes. ``None`` (default) uses
        NumPy's global random state, as before.

    Returns
    -------
    support_x, support_y, query_x, query_y : numpy.ndarray
        Feature arrays of shape ``(n_way * k_shot, n_features)`` and
        ``(n_way * q_query, n_features)`` with their 1-D label arrays.

    Raises
    ------
    ValueError
        If fewer than ``n_way`` classes have ``k_shot + q_query`` rows.
    """
    if feature_columns is None:
        feature_columns = feature_cols  # notebook-level default
    required_count = k_shot + q_query

    # Count every class in a single pass instead of materialising one
    # boolean-mask copy of the whole frame per class.
    class_sizes = df['EncodedLabel'].value_counts().sort_index()
    eligible_classes = class_sizes[class_sizes >= required_count].index.to_numpy()

    if len(eligible_classes) < n_way:
        raise ValueError(f"Only {len(eligible_classes)} classes have at least {required_count} samples. "
                         f"Reduce n_way or k_shot/q_query.")

    if random_state is None:
        rng = None
        chooser = np.random  # global state, matching the historical behaviour
    else:
        rng = (random_state if isinstance(random_state, np.random.RandomState)
               else np.random.RandomState(random_state))
        chooser = rng

    selected_classes = chooser.choice(eligible_classes, size=n_way, replace=False)

    support_parts, query_parts = [], []
    for cls in selected_classes:
        # One draw of k_shot + q_query rows, then split, so the support and
        # query rows of a class can never overlap.
        episode_samples = df[df['EncodedLabel'] == cls].sample(n=required_count, random_state=rng)
        support_parts.append(episode_samples.iloc[:k_shot])
        query_parts.append(episode_samples.iloc[k_shot:])

    support_x = np.concatenate([part[feature_columns].values for part in support_parts])
    support_y = np.concatenate([part['EncodedLabel'].values for part in support_parts])
    query_x = np.concatenate([part[feature_columns].values for part in query_parts])
    query_y = np.concatenate([part['EncodedLabel'].values for part in query_parts])

    return support_x, support_y, query_x, query_y

# 🧪 Test Episode Sampling
# Smoke test: draw one 5-way, 5-shot episode with 10 query rows per class and
# print the array shapes plus the distinct labels present in each split.
s_x, s_y, q_x, q_y = sample_episode(data_final, n_way=5, k_shot=5, q_query=10)
print("✅ Support set shape:", s_x.shape, "Labels:", np.unique(s_y))
print("✅ Query set shape:", q_x.shape, "Labels:", np.unique(q_y))

# 🧠 STEP 3: Few-Shot Classifier with TabNet Only

def get_tabnet_embeddings(tabnet_model, data):
    """Return the fitted network's per-row output for ``data`` as a NumPy array.

    The network's forward pass returns a pair whose first element is the
    per-row output and whose last element is a scalar auxiliary loss. The
    previous code took the last element, which produced a 0-dimensional array
    and caused the downstream "array is 0-dimensional" IndexError. The first
    element (one row per input row) is used as the embedding instead.

    Parameters
    ----------
    tabnet_model : object exposing a ``network`` attribute (a ``torch.nn.Module``)
    data : array-like of shape (n_samples, n_features)

    Returns
    -------
    numpy.ndarray of shape (n_samples, output_dim), dtype float32.
    """
    network = tabnet_model.network
    data_tensor = torch.from_numpy(np.ascontiguousarray(data, dtype=np.float32))

    # Put the input on the same device as the weights (the model may be on CUDA).
    first_param = next(network.parameters(), None)
    if first_param is not None:
        data_tensor = data_tensor.to(first_param.device)

    # Inference only: evaluation-mode normalisation and no autograd graph.
    was_training = network.training
    network.eval()
    try:
        with torch.no_grad():
            outputs = network(data_tensor)
    finally:
        network.train(was_training)  # leave the model's mode as we found it

    if isinstance(outputs, (tuple, list)):
        outputs = outputs[0]
    return outputs.detach().cpu().numpy()

def plot_confusion(true, pred, label_map):
    """Draw a confusion-matrix heatmap for ``true`` versus ``pred``.

    ``label_map`` supplies both axes: its values are handed to
    ``confusion_matrix`` as the ordered class list (so they must be in the
    same label space as ``true``/``pred``), and its keys are used as the tick
    captions. The figure is shown immediately; nothing is returned.
    """
    class_codes = list(label_map.values())
    tick_names = label_map.keys()
    matrix = confusion_matrix(true, pred, labels=class_codes)

    plt.figure(figsize=(10, 6))
    sns.heatmap(
        matrix,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=tick_names,
        yticklabels=tick_names,
    )

    # Titles and axis captions, then render.
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.tight_layout()
    plt.show()

def prototypical_loss_tabnet(support_x, support_y, query_x, query_y):
    """Fit a TabNet on the support set and score the query set by nearest centroid.

    A fresh ``TabNetClassifier`` is trained on the support rows. Support and
    query rows are then embedded with ``get_tabnet_embeddings``, one centroid
    is averaged per support class, and each query row is assigned to the
    class whose centroid has the smallest cosine distance.

    Despite the name, no loss is computed: the function returns the query
    accuracy. When accuracy falls below 0.8 a confusion matrix is plotted.

    Parameters
    ----------
    support_x, query_x : numpy.ndarray of shape (n_rows, n_features)
    support_y, query_y : array-like of encoded class labels, one per row

    Returns
    -------
    Fraction of query rows whose nearest centroid is their true class. Query
    labels that do not occur in the support set always count as errors.

    Raises
    ------
    ValueError
        If the support embeddings are not a 2-D array with one row per
        support label.
    """
    support_y = np.asarray(support_y).reshape(-1)
    query_y = np.asarray(query_y).reshape(-1)

    clf = TabNetClassifier(device_name='cuda' if torch.cuda.is_available() else 'cpu')
    clf.fit(
        X_train=support_x, y_train=support_y,
        max_epochs=100,
        batch_size=1024,
        virtual_batch_size=128,
        drop_last=False
    )

    support_embeddings = np.asarray(get_tabnet_embeddings(clf, support_x))
    query_embeddings = np.asarray(get_tabnet_embeddings(clf, query_x))

    # Fail with an actionable message instead of an opaque IndexError when the
    # embedding helper hands back something that is not one vector per row.
    if support_embeddings.ndim != 2 or len(support_embeddings) != len(support_y):
        raise ValueError(
            f"Expected support embeddings of shape ({len(support_y)}, d), "
            f"got {support_embeddings.shape}."
        )

    # One centroid (mean embedding) per class present in the support set.
    unique_classes = np.unique(support_y)
    centroids = np.stack([
        support_embeddings[support_y == cls].mean(axis=0)
        for cls in unique_classes
    ])

    dists = cosine_distances(query_embeddings, centroids)
    pred = np.argmin(dists, axis=1)
    pred_labels = unique_classes[pred]  # centroid index -> encoded class label
    acc = np.mean(pred_labels == query_y)

    if acc < 0.8:  # visualize only the weaker episodes
        # plot_confusion expects a name -> code mapping (codes as values), so
        # pass label_map rather than the inverted code -> name mapping.
        plot_confusion(query_y, pred_labels, label_map)

    return acc

def run_fewshot_tabnet():
    """Evaluate one freshly sampled episode and print its accuracy.

    Draws a 5-way, 5-shot episode with 10 query rows per class from
    ``data_final``, scores it with ``prototypical_loss_tabnet`` and prints the
    result. Returns nothing.
    """
    episode = sample_episode(data_final, n_way=5, k_shot=5, q_query=10)
    episode_accuracy = prototypical_loss_tabnet(*episode)
    print("🧠 TabNet ProtoNet Accuracy:", episode_accuracy)

# ▶️ Run inference
# Samples a single episode, evaluates it end to end and prints its accuracy.
run_fewshot_tabnet()

# 🏋️ STEP 4: Meta-Training with TabNet

def meta_train_tabnet(episodes=30):
    """Evaluate ``episodes`` independently sampled few-shot episodes.

    Each iteration samples a 5-way, 5-shot episode (10 query rows per class)
    from ``data_final`` and scores it with ``prototypical_loss_tabnet``. The
    running mean accuracy is printed after every 10th episode and after the
    final one, so a run whose length is not a multiple of 10 still reports
    its result.

    Parameters
    ----------
    episodes : int
        Number of episodes to run.

    Returns
    -------
    list
        Per-episode accuracies in run order (previously these were discarded).
    """
    accs = []
    for i in range(episodes):
        support_x, support_y, query_x, query_y = sample_episode(data_final, n_way=5, k_shot=5, q_query=10)
        acc = prototypical_loss_tabnet(support_x, support_y, query_x, query_y)
        accs.append(acc)
        done = i + 1
        if done % 10 == 0 or done == episodes:
            print(f"📈 Episode {done} - Avg Accuracy: {np.mean(accs):.4f}")
    return accs

# ▶️ Train with TabNet
# Runs 30 sampled episodes; the running mean accuracy is printed every 10th episode.
meta_train_tabnet(episodes=30)


📋 Cleaned columns: ['Destination Port', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Min Packet Length', 'Max Packet Length', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count', 'CWE Flag Coun



✅ Support set shape: (25, 78) Labels: [ 0.  1.  4.  6. 10.]
✅ Query set shape: (50, 78) Labels: [ 0.  1.  4.  6. 10.]




epoch 0  | loss: 2.3663  |  0:00:00s
epoch 1  | loss: 2.06767 |  0:00:00s
epoch 2  | loss: 2.02956 |  0:00:00s
epoch 3  | loss: 1.73288 |  0:00:00s
epoch 4  | loss: 1.89751 |  0:00:00s
epoch 5  | loss: 1.76893 |  0:00:00s
epoch 6  | loss: 1.67846 |  0:00:00s
epoch 7  | loss: 1.63049 |  0:00:00s
epoch 8  | loss: 1.4161  |  0:00:00s
epoch 9  | loss: 1.25041 |  0:00:00s
epoch 10 | loss: 1.09086 |  0:00:00s
epoch 11 | loss: 1.03169 |  0:00:00s
epoch 12 | loss: 0.97242 |  0:00:00s
epoch 13 | loss: 0.9943  |  0:00:00s
epoch 14 | loss: 1.03721 |  0:00:00s
epoch 15 | loss: 0.98518 |  0:00:00s
epoch 16 | loss: 0.90107 |  0:00:00s
epoch 17 | loss: 0.85526 |  0:00:00s
epoch 18 | loss: 0.841   |  0:00:00s
epoch 19 | loss: 0.70317 |  0:00:00s
epoch 20 | loss: 0.63941 |  0:00:00s
epoch 21 | loss: 0.60581 |  0:00:00s
epoch 22 | loss: 0.52029 |  0:00:00s
epoch 23 | loss: 0.54488 |  0:00:00s
epoch 24 | loss: 0.59602 |  0:00:00s
epoch 25 | loss: 0.4893  |  0:00:00s
epoch 26 | loss: 0.44362 |  0:00:00s
e

IndexError: too many indices for array: array is 0-dimensional, but 1 were indexed