In [6]:
import pandas as pd
from scapy.all import rdpcap, TCP, IP
import os

# === Function to extract features from pcap ===
def extract_flows_from_pcap(pcap_path, sublabel):
    packets = rdpcap(pcap_path)
    flows = {}
    timestamps = {}
    flag_counts = {}

    for pkt in packets:
        if IP in pkt and TCP in pkt:
            fid = (pkt[IP].src, pkt[IP].dst, pkt[TCP].sport, pkt[TCP].dport)
            if fid not in flows:
                flows[fid] = []
                timestamps[fid] = []
                flag_counts[fid] = {"SYN": 0, "ACK": 0, "RST": 0, "FIN": 0}
            flows[fid].append(len(pkt))
            timestamps[fid].append(pkt.time)

            flags = pkt[TCP].flags
            flag_counts[fid]["SYN"] += int(flags & 0x02 != 0)
            flag_counts[fid]["ACK"] += int(flags & 0x10 != 0)
            flag_counts[fid]["RST"] += int(flags & 0x04 != 0)
            flag_counts[fid]["FIN"] += int(flags & 0x01 != 0)

    records = []
    for fid, sizes in flows.items():
        times = timestamps[fid]
        if len(times) < 2:
            continue
        duration = max(times) - min(times)
        total_pkts = len(sizes)
        total_bytes = sum(sizes)
        min_size = min(sizes)
        max_size = max(sizes)
        mean_size = sum(sizes) / total_pkts

        fwd_pkts = bwd_pkts = total_pkts // 2  # approximation
        flags = flag_counts[fid]

        records.append([
            duration, total_pkts, total_bytes,
            min_size, max_size, mean_size,
            fwd_pkts, bwd_pkts,
            flags["SYN"], flags["ACK"], flags["RST"], flags["FIN"],
            sublabel
        ])

    return pd.DataFrame(records, columns=[
        "Duration", "TotalPackets", "TotalBytes",
        "MinPktSize", "MaxPktSize", "MeanPktSize",
        "FwdPkts", "BwdPkts",
        "SYN_Count", "ACK_Count", "RST_Count", "FIN_Count",
        "SubLabel"
    ])


# === File paths (update here as needed) ===
base_path = "/home/aadip/EdgeGenSec"  # or use absolute Windows path if local
pcaps = {
    "SYN": "portscan_syn.pcap",
    "FIN": "portscan_fin.pcap",
    "XMAS": "portscan_xmas.pcap"
}



# === Extract all new flows ===
df_all = []
for mode, pcap_file in pcaps.items():
    df = extract_flows_from_pcap(pcap_file, sublabel=mode)
    df_all.append(df)
    print(f"✅ Extracted: {mode} ({len(df)} flows)")
df_portscan_all = pd.concat(df_all, ignore_index=True)
df_portscan_all["Label"] = "PortScan"

# === Load existing base dataset (update path if needed) ===
df_base = pd.read_csv("CICIDS2017_Multiclass_Balanced_5k.csv")

# === Append and Save ===
# === Append and Save ===
df_final = pd.concat([df_base, df_portscan_all], ignore_index=True)

# ✅ Explicit path for Windows user
output_path = r"C:\Users\aadip\Desktop\internship\EdgeGenSec\CICIDS2017_Multiclass_Balanced_5k_with_portscan_variants.csv"
df_final.to_csv(output_path, index=False)

print(f"✅ Final dataset saved at: {output_path}")


✅ Extracted: SYN (11269 flows)
✅ Extracted: FIN (2 flows)
✅ Extracted: XMAS (11 flows)
✅ Final dataset saved at: C:\Users\aadip\Desktop\internship\EdgeGenSec\CICIDS2017_Multiclass_Balanced_5k_with_portscan_variants.csv


In [2]:
import numpy as np
import pandas as pd
df = pd.read_csv("CICIDS2017_Multiclass_Balanced_5k_with_portscan_variants.csv")
print(df.head())
print(df.info())
print(df.isna().sum())
print(np.isinf(df.select_dtypes(include=[np.number])).sum())
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna(thresh=int(0.9 * df.shape[1]))  # Keep rows with >=90% non-NaN




   Destination Port  Flow Duration  Total Fwd Packets  Total Backward Packets  \
0            1083.0           13.0                1.0                     1.0   
1              80.0      4306913.0                6.0                     0.0   
2              80.0      5137816.0                3.0                     1.0   
3              80.0      3980908.0                5.0                     0.0   
4              22.0     12851642.0               21.0                    32.0   

   Total Length of Fwd Packets  Total Length of Bwd Packets  \
0                          0.0                          6.0   
1                         36.0                          0.0   
2                          0.0                          0.0   
3                         30.0                          0.0   
4                       2008.0                       2745.0   

   Fwd Packet Length Max  Fwd Packet Length Min  Fwd Packet Length Mean  \
0                    0.0                    0.0            

  df = pd.read_csv("CICIDS2017_Multiclass_Balanced_5k_with_portscan_variants.csv")


In [3]:
import pandas as pd
import numpy as np

# === Load dataset ===
df = pd.read_csv("CICIDS2017_Multiclass_Balanced_5k_with_portscan_variants.csv", low_memory=False)

# === Keep only valid CICIDS-style rows (at least 70 non-NaN feature columns)
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna(thresh=70)  # Require at least 70 non-NaN values to retain the row

# === Drop any custom extra columns (like 'Duration', 'SubLabel', etc.)
extra_cols = [col for col in df.columns if col not in df.columns[:78] and col != "Label"]
df = df.drop(columns=extra_cols)

# Final safety check
if df.empty:
    raise ValueError("❌ Dataframe is still empty after filtering. Please inspect source file.")
else:
    print(f"✅ Cleaned dataset shape: {df.shape}")
    print(f"🧪 Label distribution:\n{df['Label'].value_counts()}")


✅ Cleaned dataset shape: (47227, 79)
🧪 Label distribution:
Label
PortScan                      5000
DDoS                          5000
SSH-Patator                   5000
DoS Hulk                      5000
BENIGN                        5000
FTP-Patator                   5000
DoS GoldenEye                 5000
DoS slowloris                 5000
DoS Slowhttptest              5000
Web Attack � Brute Force      1507
Web Attack � XSS               652
Infiltration                    36
Web Attack � Sql Injection      21
Heartbleed                      11
Name: count, dtype: int64


In [5]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import joblib
from sklearn.metrics import classification_report

# === Load dataset ===
# === Clean the dataset properly ===
df = pd.read_csv("CICIDS2017_Multiclass_Balanced_5k_with_portscan_variants.csv", low_memory=False)
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna(thresh=70)

# Keep CICIDS-style 78 features + Label
columns_to_keep = list(df.columns[:78]) + ["Label"]
df = df[columns_to_keep]

# Drop columns with constant values or all zeros
df = df.loc[:, (df != df.iloc[0]).any()]  # removes constant columns
df = df.loc[:, (df != 0).any()]           # removes all-zero columns

# Drop any rows with remaining NaNs
df = df.dropna()

# Final check
if df.empty:
    raise ValueError("🚨 Cleaned DataFrame is empty. Check for over-filtering.")

# Separate features and labels
X = df.drop(columns=["Label"])
y = df["Label"]

# Clip outlier values (optional, stabilizes learning)
X = X.clip(lower=X.quantile(0.01), upper=X.quantile(0.99), axis=1)

# Encode labels
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

# Standardize
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save preprocessors
import joblib
joblib.dump(scaler, "scaler.pkl")
joblib.dump(label_encoder, "label_encoder.pkl")
joblib.dump(list(X.columns), "features_list.pkl")

# Convert to tensors
import torch
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# CNN with dynamic flatten
import torch.nn as nn

class IDS_CNN(nn.Module):
    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv1d(1, 32, 3, padding=1),
            nn.ReLU(),
            nn.Conv1d(32, 64, 3, padding=1),
            nn.ReLU()
        )
        with torch.no_grad():
            dummy = torch.zeros(1, 1, input_dim)
            self.flat_dim = self.conv(dummy).view(1, -1).shape[1]

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(self.flat_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        return self.classifier(self.conv(x))

# Train
model = IDS_CNN(X_train_tensor.shape[1], len(label_encoder.classes_))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)

for epoch in range(1, 21):
    model.train()
    optimizer.zero_grad()
    output = model(X_train_tensor.unsqueeze(1))
    loss = criterion(output, y_train_tensor)
    if torch.isnan(loss):
        print(f"❌ NaN loss at epoch {epoch} — aborting")
        break
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch}/20 - Loss: {loss.item():.4f}")

# Evaluate
model.eval()
with torch.no_grad():
    preds = model(X_test_tensor.unsqueeze(1))
    y_pred = torch.argmax(preds, dim=1)

from sklearn.metrics import classification_report
print("\n📊 Evaluation Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# TorchScript export
torch.jit.trace(model, X_test_tensor[:1].unsqueeze(1)).save("ids_cnn_multiclass.pt")
print("✅ Saved model as ids_cnn_multiclass.pt")


Epoch 1/20 - Loss: 2.6392
Epoch 2/20 - Loss: 2.4975
Epoch 3/20 - Loss: 2.3708
Epoch 4/20 - Loss: 2.2381
Epoch 5/20 - Loss: 2.1116
Epoch 6/20 - Loss: 1.9883
Epoch 7/20 - Loss: 1.8725
Epoch 8/20 - Loss: 1.7604
Epoch 9/20 - Loss: 1.6547
Epoch 10/20 - Loss: 1.5545
Epoch 11/20 - Loss: 1.4627
Epoch 12/20 - Loss: 1.3774
Epoch 13/20 - Loss: 1.2988
Epoch 14/20 - Loss: 1.2236
Epoch 15/20 - Loss: 1.1499
Epoch 16/20 - Loss: 1.0852
Epoch 17/20 - Loss: 1.0165
Epoch 18/20 - Loss: 0.9602
Epoch 19/20 - Loss: 0.9039
Epoch 20/20 - Loss: 0.8522

📊 Evaluation Report:
                            precision    recall  f1-score   support

                    BENIGN       0.99      0.55      0.71       999
                      DDoS       0.83      0.95      0.88      1000
             DoS GoldenEye       0.83      0.96      0.89      1000
                  DoS Hulk       0.89      0.81      0.85       997
          DoS Slowhttptest       0.91      0.82      0.86      1000
             DoS slowloris       0.88 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
import pandas as pd

# Load dataset with encoding fallback
df = pd.read_csv("CICIDS2017_Multiclass_Balanced_5k_with_portscan_variants.csv", encoding='utf-8', low_memory=False)

# Clean and sanitize
df.columns = df.columns.str.strip()
df['Label'] = df['Label'].astype(str).str.strip()

# Define label fragments to match using substring
rare_fragments = {
    "Heartbleed": "Heartbleed",
    "SQLi": "Sql Injection",
    "Infiltration": "Infiltration",
    "XSS": "XSS"
}

# Extract and save
for key, fragment in rare_fragments.items():
    subset = df[df["Label"].str.contains(fragment, case=False, na=False)]
    print(f"✅ {key}: {len(subset)} rows")
    subset.to_csv(f"RARE_{key}.csv", index=False)


✅ Heartbleed: 11 rows
✅ SQLi: 21 rows
✅ Infiltration: 36 rows
✅ XSS: 652 rows


In [17]:
# === Load and prepare Heartbleed data safely ===
df = pd.read_csv("RARE_Heartbleed.csv")
df = df.replace([np.inf, -np.inf], np.nan)

# Drop columns that are entirely NaN
df = df.dropna(axis=1, how='all')

# Fill remaining NaNs with median values (per column)
df = df.fillna(df.median(numeric_only=True))

# Drop non-numeric columns before scaling
X = df.select_dtypes(include=[np.number])

if X.empty:
	print("❌ No numeric columns available for scaling. Please check the input data.")
	X_scaled = np.array([])
	X_tensor = torch.tensor([])
else:
	scaler = StandardScaler()
	X_scaled = scaler.fit_transform(X)
	X_tensor = torch.tensor(X_scaled, dtype=torch.float32)


❌ No numeric columns available for scaling. Please check the input data.


In [15]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
import os

# === Load and prepare Heartbleed data ===
df = pd.read_csv("RARE_Heartbleed.csv")
df = df.replace([np.inf, -np.inf], np.nan).dropna()
X = df.drop(columns=["Label", "SubLabel"]) if "SubLabel" in df.columns else df.drop(columns=["Label"])
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_tensor = torch.tensor(X_scaled, dtype=torch.float32)

# === GAN model definition ===
latent_dim = 32
data_dim = X_tensor.shape[1]

class Generator(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(latent_dim, 64),
            nn.ReLU(),
            nn.Linear(64, data_dim)
        )

    def forward(self, z):
        return self.model(z)

class Discriminator(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(data_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)

G = Generator()
D = Discriminator()
criterion = nn.BCELoss()
g_opt = torch.optim.Adam(G.parameters(), lr=0.0005)
d_opt = torch.optim.Adam(D.parameters(), lr=0.0005)

# === Training loop ===
epochs = 3000
batch_size = min(32, len(X_tensor))
for epoch in range(epochs):
    idx = np.random.randint(0, len(X_tensor), batch_size)
    real_samples = X_tensor[idx]
    real_labels = torch.ones(batch_size, 1)
    fake_labels = torch.zeros(batch_size, 1)

    # Train Discriminator
    z = torch.randn(batch_size, latent_dim)
    fake_samples = G(z)
    d_real = D(real_samples)
    d_fake = D(fake_samples.detach())
    d_loss = criterion(d_real, real_labels) + criterion(d_fake, fake_labels)
    D.zero_grad()
    d_loss.backward()
    d_opt.step()

    # Train Generator
    z = torch.randn(batch_size, latent_dim)
    fake_samples = G(z)
    d_fake = D(fake_samples)
    g_loss = criterion(d_fake, real_labels)
    G.zero_grad()
    g_loss.backward()
    g_opt.step()

    if (epoch + 1) % 500 == 0:
        print(f"[{epoch+1}/{epochs}] D_loss: {d_loss.item():.4f}, G_loss: {g_loss.item():.4f}")

# === Generate synthetic samples ===
G.eval()
with torch.no_grad():
    z = torch.randn(100, latent_dim)
    synthetic = G(z).numpy()
    synthetic_unscaled = scaler.inverse_transform(synthetic)
    df_synth = pd.DataFrame(synthetic_unscaled, columns=X.columns)
    df_synth["Label"] = "Heartbleed"
    df_synth["SubLabel"] = "Synthetic"

df_synth.to_csv("SYN_Heartbleed.csv", index=False)
print("✅ Saved: SYN_Heartbleed.csv")


ValueError: Found array with 0 sample(s) (shape=(0, 90)) while a minimum of 1 is required by StandardScaler.