In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from torch.utils.data import Dataset, DataLoader
from scipy.sparse import csr_matrix

# Report the PyTorch build and what CUDA hardware it can see before training.
print("--- Environment Check ---")
cuda_ready = torch.cuda.is_available()
env_report = [
    ("PyTorch version:", torch.__version__),
    ("CUDA available:", cuda_ready),
    ("CUDA device count:", torch.cuda.device_count()),
]
if cuda_ready:
    # The active device index is only queried when CUDA is usable.
    env_report.append(("Current CUDA device:", torch.cuda.current_device()))
for label, value in env_report:
    print(label, value)
print()


--- Environment Check ---
PyTorch version: 2.5.1+cu124
CUDA available: True
CUDA device count: 1
Current CUDA device: 0



In [16]:
# Load the labelled corpus and keep only the rows whose "Sentence" value is an
# actual Python string; every other row is dropped before feature extraction.
df = pd.read_csv("../Training Dataset/final_dataset.csv")
is_text = df["Sentence"].apply(lambda x: isinstance(x, str))
df = df[is_text]

X = df["Sentence"].values
y = df["Label"].values

print("--- XSS Detection Model Demonstration ---")
print(f"Loaded {len(X)} samples successfully")
# Summing the labels counts the positives — assumes "Label" is 0/1 (TODO confirm).
pos_samples = y.sum()
neg_samples = len(y) - pos_samples
print(f"Number of positive samples: {pos_samples}")
print(f"Number of negative samples: {neg_samples}")

# First split: 70% train / 30% held out. Second split: the held-out part is
# divided again with test_size=0.33 into validation and test.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.33, random_state=42
)

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device.type, "\n")

print("Dataset splits:")
print(f"Training: {len(X_train)} samples")
print(f"Validation: {len(X_val)} samples")
print(f"Test: {len(X_test)} samples\n")


--- XSS Detection Model Demonstration ---
Loaded 88309 samples successfully
Number of positive samples: 50589
Number of negative samples: 37720
Using device: cuda 

Dataset splits:
Training: 61816 samples
Validation: 17750 samples
Test: 8743 samples



In [17]:
# The TF-IDF vocabulary is fitted on the training split only; the validation
# and test splits are transformed with that same fitted vectorizer.
# NOTE(review): TfidfVectorizer() runs with its default settings — confirm the
# default tokenisation keeps the characters that matter for this task.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec, X_test_vec = (
    vectorizer.transform(split) for split in (X_val, X_test)
)

# Densify at most three rows, one at a time, to eyeball the feature vectors.
print("Showing first 3 training samples after TF-IDF:")
n_preview = min(3, X_train_vec.shape[0])
for i in range(n_preview):
    arr = X_train_vec[i].toarray().squeeze()
    print(f"Sample {i} shape: {arr.shape}, first 10 dims: {arr[:10]}")


Showing first 3 training samples after TF-IDF:
Sample 0 shape: (63340,), first 10 dims: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Sample 1 shape: (63340,), first 10 dims: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Sample 2 shape: (63340,), first 10 dims: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [18]:
class SparseDataset(Dataset):
    """Map-style dataset that densifies one sparse row per lookup.

    The feature matrix stays sparse in memory; only the requested row is
    converted to a dense array when an item is fetched.

    Args:
        X_sparse: 2-D sparse matrix supporting ``X_sparse[idx].toarray()``,
            one sample per row.
        y: Indexable sequence of integer class labels, aligned with the rows
            of ``X_sparse``.
    """

    def __init__(self, X_sparse, y):
        self.X = X_sparse
        self.y = y

    def __len__(self):
        """Return the number of samples (rows of the sparse matrix)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(features, label)`` as a float32 and an int64 tensor."""
        dense = self.X[idx].toarray()
        # Drop only the leading row axis. A blanket squeeze() would also remove
        # the feature axis when the matrix has a single column, producing a
        # 0-d tensor and batches without a feature dimension.
        if dense.shape[0] == 1:
            dense = dense[0]
        return (
            torch.tensor(dense, dtype=torch.float32),
            torch.tensor(self.y[idx], dtype=torch.long),
        )

from torch.utils.data import DataLoader

# Pair every vectorised split with its labels.
train_dataset, val_dataset, test_dataset = (
    SparseDataset(features, labels)
    for features, labels in (
        (X_train_vec, y_train),
        (X_val_vec, y_val),
        (X_test_vec, y_test),
    )
)

# Only the training loader shuffles; validation and test are read in order
# with a larger batch size.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=256)
val_loader, test_loader = (
    DataLoader(split_dataset, batch_size=512)
    for split_dataset in (val_dataset, test_dataset)
)


In [19]:
class CNNModel(nn.Module):
    """Two-layer 1-D CNN over a flat feature vector with a 2-way linear head.

    Each input row is treated as a single-channel sequence of length
    ``input_dim``. Both convolutions use kernel size 3 with padding 1, so the
    sequence length is preserved and the head sees ``32 * input_dim`` values.

    Args:
        input_dim: Number of features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=64, out_channels=32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm1d(64)
        self.bn2 = nn.BatchNorm1d(32)
        self.dropout = nn.Dropout(p=0.3)
        self.fc = nn.Linear(in_features=32 * input_dim, out_features=2)

    def forward(self, x):
        """Map a ``(batch, input_dim)`` tensor to ``(batch, 2)`` logits."""
        feats = x.unsqueeze(1)                        # (batch, 1, input_dim)
        feats = F.relu(self.bn1(self.conv1(feats)))   # (batch, 64, input_dim)
        feats = F.relu(self.bn2(self.conv2(feats)))   # (batch, 32, input_dim)
        feats = self.dropout(feats)
        # Collapse channels and positions into one axis for the linear head.
        return self.fc(feats.flatten(start_dim=1))


In [20]:
# Sweep configuration: each learning rate in `lrs` is trained for EPOCHS epochs.
EPOCHS = 50
lrs = [0.001, 0.002, 0.01, 0.02, 0.05]

# Per-run bookkeeping, keyed by learning rate: loss curves and final metrics.
all_train_losses, all_val_losses, final_results = {}, {}, {}

def evaluate_model(model, loader, criterion):
    """Return the mean per-batch loss of ``model`` over ``loader``.

    The model is switched to eval mode (and left in it) and gradients are
    disabled while iterating. Batches are moved to the device the model's
    own parameters live on, so the function does not depend on a notebook
    global. The result is the mean of the per-batch loss values, not a
    sample-weighted mean.

    Args:
        model: ``nn.Module`` to evaluate.
        loader: Iterable yielding ``(inputs, targets)`` tensor pairs. It does
            not need to implement ``__len__``.
        criterion: Callable ``criterion(outputs, targets)`` returning a scalar
            tensor.

    Returns:
        float: Sum of the per-batch losses divided by the number of batches.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    model.eval()

    # Use a parameter (or, failing that, a buffer) to find out where the model
    # lives. A model with neither gets its batches passed through unmoved.
    anchor = next(model.parameters(), None)
    if anchor is None:
        anchor = next(model.buffers(), None)
    target_device = anchor.device if anchor is not None else None

    total_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for xb, yb in loader:
            if target_device is not None:
                xb, yb = xb.to(target_device), yb.to(target_device)
            total_loss += criterion(model(xb), yb).item()
            # Counting here instead of calling len(loader) keeps loaders
            # without a length usable.
            num_batches += 1

    if num_batches == 0:
        raise ValueError("evaluate_model() received a loader that yielded no batches")
    return total_loss / num_batches

# The feature count fixes the width of the model's final linear layer.
input_dim = X_train_vec.shape[1]

# The seed is re-applied before every run so each learning rate starts from
# the same initial weights and data order; differences between runs then come
# from the learning rate rather than from random initialisation.
SEED = 42

for lr in lrs:
    print(f"--- Learning Rate: {lr} ---")
    torch.manual_seed(SEED)
    model = CNNModel(input_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    train_losses = []
    val_losses = []

    for epoch in range(1, EPOCHS + 1):
        # evaluate_model() leaves the model in eval mode, so training mode is
        # switched back on at the top of every epoch.
        model.train()
        total_train_loss = 0
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            out = model(xb)
            loss = criterion(out, yb)
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()
        # Mean of the per-batch losses for this epoch.
        avg_train = total_train_loss / len(train_loader)

        avg_val = evaluate_model(model, val_loader, criterion)
        train_losses.append(avg_train)
        val_losses.append(avg_val)

        print(f"Epoch {epoch}/{EPOCHS}: Train Loss = {avg_train:.4f}, Val Loss = {avg_val:.4f}")

    # Keep the curves for the combined plots drawn after the sweep.
    all_train_losses[lr] = train_losses
    all_val_losses[lr] = val_losses

    # Per-run loss curve, written to disk and closed so figures do not pile up.
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(train_losses, label="Train Loss")
    ax.plot(val_losses, label="Validation Loss")
    ax.set_title(f"CNN Loss (lr={lr})")
    ax.set_xlabel("Epoch")
    ax.set_ylabel("Loss")
    ax.legend()
    ax.grid(True)
    fig.savefig(f"CNN_loss_plot_lr_{lr}.png")
    plt.close(fig)

    # The weights saved are those after the final epoch.
    torch.save(model.state_dict(), f"CNN_model_lr_{lr}.pth")

    # Score the final-epoch model on the test split.
    # NOTE(review): every learning rate is scored on the test set; if these
    # numbers are later used to choose a learning rate, select on the
    # validation split instead — confirm how the results are used.
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for xb, yb in test_loader:
            xb = xb.to(device)
            out = model(xb)
            preds = torch.argmax(out, dim=1).cpu().numpy()
            y_true.extend(yb.numpy())
            y_pred.extend(preds)

    f1 = f1_score(y_true, y_pred)
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    final_results[lr] = (f1, acc, prec, rec)

print()

# Overlay the training-loss curve of every learning rate on a single figure.
fig, ax = plt.subplots(figsize=(10, 6))
for lr in lrs:
    ax.plot(all_train_losses[lr], label=f"Train lr={lr}")
ax.set_title("CNN Combined Training Losses")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.legend()
ax.grid(True)
fig.savefig("CNN_combined_training_losses.png")
plt.close(fig)

# Same overlay for the validation-loss curves.
fig, ax = plt.subplots(figsize=(10, 6))
for lr in lrs:
    ax.plot(all_val_losses[lr], label=f"Val lr={lr}")
ax.set_title("CNN Combined Validation Losses")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.legend()
ax.grid(True)
fig.savefig("CNN_combined_validation_losses.png")
plt.close(fig)

# Final report: one block of test-set metrics per learning rate, in sweep order.
print("--- Comprehensive Results ---")
for lr in lrs:
    f1, acc, prec, rec = final_results[lr]
    report_lines = [
        f"\nLearning Rate: {lr}",
        f"F1 Score: {f1}",
        f"Accuracy: {acc}",
        f"Precision: {prec}",
        f"Recall: {rec}",
    ]
    print("\n".join(report_lines))


--- Learning Rate: 0.001 ---
Epoch 1/50: Train Loss = 0.3379, Val Loss = 38.6169
Epoch 2/50: Train Loss = 0.0273, Val Loss = 24.8954
Epoch 3/50: Train Loss = 0.0128, Val Loss = 2.8826
Epoch 4/50: Train Loss = 0.0081, Val Loss = 60.0738
Epoch 5/50: Train Loss = 0.0068, Val Loss = 7.5743
Epoch 6/50: Train Loss = 0.0055, Val Loss = 60.3752
Epoch 7/50: Train Loss = 0.0058, Val Loss = 124.2072
Epoch 8/50: Train Loss = 0.0052, Val Loss = 32.1584
Epoch 9/50: Train Loss = 0.0060, Val Loss = 569.2309
Epoch 10/50: Train Loss = 0.0056, Val Loss = 45.5930
Epoch 11/50: Train Loss = 0.0044, Val Loss = 55.6179
Epoch 12/50: Train Loss = 0.0043, Val Loss = 62.3528
Epoch 13/50: Train Loss = 0.0044, Val Loss = 20.0623
Epoch 14/50: Train Loss = 0.0047, Val Loss = 15.8237
Epoch 15/50: Train Loss = 0.0046, Val Loss = 0.8801
Epoch 16/50: Train Loss = 0.0042, Val Loss = 718.8770
Epoch 17/50: Train Loss = 0.0046, Val Loss = 0.1450
Epoch 18/50: Train Loss = 0.0049, Val Loss = 146.8399
Epoch 19/50: Train Loss = 