In [6]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
class XSSDataset(Dataset):
    def __init__(self, features, labels):
        self.features = torch.FloatTensor(features)
        self.labels = torch.LongTensor(labels)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

In [8]:
class MLPModel(nn.Module):
    def __init__(self, input_dim):
        super(MLPModel, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(32, 2)
        )
    
    def forward(self, x):
        return self.layers(x)

def train_and_evaluate(model, train_loader, val_loader, optimizer, criterion, total_epochs):
    train_losses, val_losses = [], []
    best_val_loss = float('inf')
    patience = 5
    counter = 0
    
    for epoch in range(total_epochs):
        model.train()
        train_epoch_loss = 0
        for features, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_epoch_loss += loss.item()
        
        train_epoch_loss /= len(train_loader)
        train_losses.append(train_epoch_loss)
        
        model.eval()
        val_epoch_loss = 0
        with torch.no_grad():
            for features, labels in val_loader:
                outputs = model(features)
                loss = criterion(outputs, labels)
                val_epoch_loss += loss.item()
            
            val_epoch_loss /= len(val_loader)
            val_losses.append(val_epoch_loss)
        
        print(f'Epoch {epoch+1}/{total_epochs}: Train Loss = {train_epoch_loss:.4f}, Val Loss = {val_epoch_loss:.4f}')
        
        # Early stopping logic, but only after min 10 epochs
        if epoch >= 50:
            if val_epoch_loss < best_val_loss:
                best_val_loss = val_epoch_loss
                counter = 0
            else:
                counter += 1
            
            if counter >= patience:
                print(f"Early stopping triggered after {epoch+1} epochs")
                break
    
    # Ensure we always have total_epochs data points for plotting
    while len(train_losses) < total_epochs:
        train_losses.append(train_losses[-1])
        val_losses.append(val_losses[-1])
    
    return train_losses[:total_epochs], val_losses[:total_epochs]

def evaluate_metrics(model, test_loader):
    model.eval()
    all_preds, all_labels = [], []
    
    with torch.no_grad():
        for features, labels in test_loader:
            outputs = model(features)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.numpy())
            all_labels.extend(labels.numpy())
    
    f1 = f1_score(all_labels, all_preds, average='weighted')
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    
    return {
        'F1 Score': f1,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall
    }

def plot_loss_curves(learning_rates, all_train_losses, all_val_losses, total_epochs):
    # 1. Individual plots with x-axis ticks every 5 epochs
    for lr, train_losses, val_losses in zip(learning_rates, all_train_losses, all_val_losses):
        plt.figure(figsize=(12, 6))
        epochs = range(1, len(train_losses) + 1)
        plt.plot(epochs, train_losses, label='Train Loss', marker='o')
        plt.plot(epochs, val_losses, label='Validation Loss', marker='o')
        plt.title(f'Loss Curves - Learning Rate: {lr}')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()
        plt.grid(True)
        
        # Set x-axis ticks every 5 epochs
        plt.xticks(range(0, total_epochs + 1, 5))
        plt.ylim(bottom=0)
        
        # Save individual plot
        plt.savefig(f'MLP_loss_plot_lr_{lr}.png')
        plt.close()

    # 2. Combined training losses plot
    plt.figure(figsize=(12, 6))
    for lr, train_losses in zip(learning_rates, all_train_losses):
        plt.plot(range(1, len(train_losses) + 1), train_losses, 
                label=f'LR = {lr}', marker='o', markersize=4)
    
    plt.title('Training Loss Comparison Across Learning Rates')
    plt.xlabel('Epoch')
    plt.ylabel('Training Loss')
    plt.legend()
    plt.grid(True)
    plt.xticks(range(0, total_epochs + 1, 5))
    plt.ylim(bottom=0)
    plt.savefig('MLP_combined_training_losses.png')
    plt.close()

    # 3. Combined validation losses plot
    plt.figure(figsize=(12, 6))
    for lr, val_losses in zip(learning_rates, all_val_losses):
        plt.plot(range(1, len(val_losses) + 1), val_losses, 
                label=f'LR = {lr}', marker='o', markersize=4)
    
    plt.title('Validation Loss Comparison Across Learning Rates')
    plt.xlabel('Epoch')
    plt.ylabel('Validation Loss')
    plt.legend()
    plt.grid(True)
    plt.xticks(range(0, total_epochs + 1, 5))
    plt.ylim(bottom=0)
    plt.savefig('MLP_combined_validation_losses.png')
    plt.close()


In [9]:
def main():
    
    # Load dataset
    dataset_path = '../Training Dataset/final_dataset.csv'
    
    # Read CSV and handle NaN values
    df = pd.read_csv(dataset_path)
    
    # Remove rows with NaN values in 'Sentence' or 'Label' columns
    df = df.dropna(subset=['Sentence', 'Label'])
    
    # Convert 'Sentence' to string type and replace any remaining NaNs
    df['Sentence'] = df['Sentence'].astype(str).fillna('')
    
    # Print dataset info
    print("Dataset shape after cleaning:", df.shape)
    print("\nSample of cleaned dataset:")
    print(df.head())
    
    # Text Vectorization
    vectorizer = TfidfVectorizer(max_features=1000)
    X = vectorizer.fit_transform(df['Sentence']).toarray()
    y = df['Label'].values
    
    # Split data
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.33, random_state=42)
    
    # Create data loaders
    train_dataset = XSSDataset(X_train, y_train)
    val_dataset = XSSDataset(X_val, y_val)
    test_dataset = XSSDataset(X_test, y_test)
    
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32)
    test_loader = DataLoader(test_dataset, batch_size=32)
    
    # Print first 3 samples
    print("\nFirst 3 Training Samples:")
    for i, (features, label) in enumerate(train_loader):
        if i < 1:
            print("Features shape:", features[:3].shape)
            print("Labels:", label[:3])
        break
    
    # Learning rates to experiment
    learning_rates = [0.001, 0.002, 0.01, 0.02, 0.05]
    total_epochs = 50
    results = {}
    
    all_train_losses = []
    all_val_losses = []
    
    for lr in learning_rates:
        print(f"\n--- Learning Rate: {lr} ---")
        
        model = MLPModel(X_train.shape[1])
        optimizer = optim.Adam(model.parameters(), lr=lr)
        criterion = nn.CrossEntropyLoss()
        
        # Train and evaluate
        train_losses, val_losses = train_and_evaluate(
            model, train_loader, val_loader, optimizer, criterion, total_epochs
        )
        
        # Store losses for later plotting
        all_train_losses.append(train_losses)
        all_val_losses.append(val_losses)
        
        # Save model weights
        model_path = f'MLP_model_lr_{lr}.pth'
        torch.save(model.state_dict(), model_path)
        
        # Evaluate metrics
        metrics = evaluate_metrics(model, test_loader)
        results[lr] = metrics
        print("Metrics:", metrics)
    
    # Plot all loss curves
    plot_loss_curves(learning_rates, all_train_losses, all_val_losses, total_epochs)
    
    # Print comprehensive results
    print("\n--- Comprehensive Results ---")
    for lr, metrics in results.items():
        print(f"\nLearning Rate: {lr}")
        for metric, value in metrics.items():
            print(f"{metric}: {value}")

In [10]:
if __name__ == "__main__":
    main()

Dataset shape after cleaning: (88309, 3)

Sample of cleaned dataset:
   Unnamed: 0                                           Sentence  Label
0           0  form.search_text=Dell%22%3E%3Cscript%3Ealert(/...      1
1           1         site=message&msg=<script>alert(1)</script>      1
2           2  Itemid=%22onmouseover=alert%28document.cookie%...      1
3           3  uilang=en%22%3E%3Cscript%3Ealert%28document.co...      1
4           4  msg=<ScRiPt>alert('LastRider-CyberBellona')</S...      1

First 3 Training Samples:
Features shape: torch.Size([3, 1000])
Labels: tensor([1, 1, 0])

--- Learning Rate: 0.001 ---
Epoch 1/50: Train Loss = 0.0631, Val Loss = 0.0260
Epoch 2/50: Train Loss = 0.0283, Val Loss = 0.0257
Epoch 3/50: Train Loss = 0.0268, Val Loss = 0.0260
Epoch 4/50: Train Loss = 0.0249, Val Loss = 0.0251
Epoch 5/50: Train Loss = 0.0237, Val Loss = 0.0254
Epoch 6/50: Train Loss = 0.0235, Val Loss = 0.0256
Epoch 7/50: Train Loss = 0.0228, Val Loss = 0.0275
Epoch 8/50: Train Los