In [1]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import re
from typing import List, Tuple
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import os
import numpy as np

In [2]:
class TextPreprocessor:
    """Tokenizer and frequency-filtered vocabulary for raw text samples.

    Index 0 is reserved for ``<PAD>`` and index 1 for ``<UNK>``; every other
    token is given the next free index when ``build_vocab`` admits it.
    """

    def __init__(self, max_len: int = 100):
        self.max_len = max_len
        self.vocab = {'<PAD>': 0, '<UNK>': 1}
        self.vocab_size = 2

    def tokenize(self, text: str) -> List[str]:
        """Return the lower-cased whitespace tokens of ``text``.

        The characters ``< > / = "`` are padded with spaces first so each one
        becomes a token of its own.
        """
        spaced = re.sub(r'([<>/="])', r' \1 ', str(text))
        return spaced.lower().split()

    def build_vocab(self, texts: List[str], min_freq: int = 2):
        """Add every token seen at least ``min_freq`` times to the vocabulary.

        Tokens are indexed in order of first appearance; tokens already in the
        vocabulary keep their existing index.
        """
        frequencies = Counter(
            token
            for sample in texts
            for token in self.tokenize(str(sample))
        )

        for token, count in frequencies.items():
            if count < min_freq or token in self.vocab:
                continue
            self.vocab[token] = self.vocab_size
            self.vocab_size += 1

    def encode_text(self, text: str) -> List[int]:
        """Encode ``text`` as exactly ``max_len`` vocabulary indices.

        Longer inputs are truncated, shorter ones are right-padded with
        ``<PAD>``; tokens missing from the vocabulary map to ``<UNK>``.
        """
        tokens = self.tokenize(str(text))[:self.max_len]
        # A negative repeat count yields an empty list, so already-full
        # sequences are left untouched.
        tokens += ['<PAD>'] * (self.max_len - len(tokens))
        return [self.vocab.get(tok, self.vocab['<UNK>']) for tok in tokens]


In [3]:
class XSSDataset(Dataset):
    """Torch dataset pairing pre-encoded text samples with binary labels.

    All samples are encoded once, at construction time, with the supplied
    preprocessor; ``__getitem__`` only wraps the stored values in tensors.
    """

    def __init__(self, texts: List[str], labels: List[int], preprocessor: TextPreprocessor):
        self.texts = list(map(str, texts))
        self.preprocessor = preprocessor
        self.encodings = [preprocessor.encode_text(sample) for sample in self.texts]
        self.labels = list(map(int, labels))

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return ``(token_ids, label)`` as a long tensor and a float scalar tensor."""
        token_ids = torch.tensor(self.encodings[idx], dtype=torch.long)
        target = torch.tensor(self.labels[idx], dtype=torch.float)
        return token_ids, target

def load_and_clean_data(file_path: str) -> Tuple[List[str], List[int]]:
    """Load the labelled CSV and return ``(texts, labels)`` as Python lists.

    The file is read with ``pd.read_csv`` and must provide a ``Sentence``
    column (converted to stripped strings) and a ``Label`` column (converted
    with ``int``).

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A tuple ``(texts, labels)`` of equal length.

    Raises:
        ValueError: If any converted label is not 0 or 1.
        Exception: Any error raised while reading or converting the data is
            printed and then re-raised unchanged.
    """
    try:
        data = pd.read_csv(file_path)

        texts = [str(text).strip() for text in data['Sentence']]
        labels = [int(label) for label in data['Label']]

        # Validate with a real exception rather than ``assert``: assertions
        # are stripped when Python runs with -O, which would let bad labels
        # through silently.
        if any(label not in (0, 1) for label in labels):
            raise ValueError("Labels must be binary (0 or 1)")

        positives = sum(labels)
        print(f"Loaded {len(texts)} samples successfully")
        print(f"Number of positive samples: {positives}")
        print(f"Number of negative samples: {len(labels) - positives}")

        return texts, labels

    except Exception as e:
        print(f"Error loading data: {str(e)}")
        raise

In [4]:
class XSSDetectorLSTM(nn.Module):
    """Bidirectional LSTM that scores a token-id sequence with a probability.

    Pipeline: embedding -> stacked bidirectional LSTM -> dropout ->
    linear(2 * hidden_dim, 32) + ReLU -> linear(32, 1) + sigmoid.
    """

    def __init__(self, vocab_size: int, embedding_dim: int = 50,
                 hidden_dim: int = 64, num_layers: int = 2, dropout: float = 0.3):
        super().__init__()

        # nn.LSTM applies dropout between stacked layers only, so it is
        # switched off when there is a single layer.
        recurrent_dropout = dropout if num_layers > 1 else 0

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=recurrent_dropout,
            bidirectional=True,
        )
        self.fc1 = nn.Linear(2 * hidden_dim, 32)
        self.fc2 = nn.Linear(32, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map a batch of token-id sequences to sigmoid scores, one per row."""
        _, (final_hidden, _) = self.lstm(self.embedding(x))
        # The last two entries of the final hidden state belong to the top
        # layer's two directions; join them into one vector per sample.
        summary = torch.cat([final_hidden[-2], final_hidden[-1]], dim=1)
        features = torch.relu(self.fc1(self.dropout(summary)))
        return torch.sigmoid(self.fc2(features))


In [5]:
class XSSDetector:
    """Train the LSTM classifier at several learning rates and compare them.

    Attributes:
        device: Torch device used for the model and batches.
        max_len: Fixed sequence length handed to the preprocessor.
        preprocessor: ``TextPreprocessor`` that owns the vocabulary.
        model: The most recently trained ``XSSDetectorLSTM`` (``None`` before
            training).
        results: Maps each learning rate to its loss histories and
            validation metrics.
        test_dataset: Held-out split created by the latest ``train`` call
            (``None`` before training). ``train`` never evaluates on it.
    """

    def __init__(self, max_len: int = 100, device: str = None):
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(device)
        print(f"Using device: {self.device}")

        self.max_len = max_len
        self.preprocessor = TextPreprocessor(max_len)
        self.model = None
        self.results = {}
        self.test_dataset = None

    def plot_loss_curves(self):
        """Save loss-curve figures for every learning rate in ``self.results``.

        Writes one ``LSTM_loss_plot_lr_<lr>.png`` per learning rate plus two
        comparison figures (training and validation). Does nothing except
        print a notice when no results have been recorded yet.
        """
        if not self.results:
            # Without this guard the combined plots would call max() on an
            # empty sequence and raise.
            print("No training results to plot")
            return

        # Individual plots for each learning rate
        for lr, history in self.results.items():
            train_losses = history['train_losses']
            val_losses = history['val_losses']
            epochs = range(1, len(train_losses) + 1)

            plt.figure(figsize=(12, 6))
            plt.plot(epochs, train_losses, label='Training Loss', marker='o', markersize=4)
            plt.plot(epochs, val_losses, label='Validation Loss', marker='o', markersize=4)

            plt.xlabel('Epoch')
            plt.ylabel('Loss')
            plt.title(f'Training and Validation Loss (Learning Rate: {lr})')
            plt.xticks(np.arange(0, len(train_losses) + 1, 5))
            plt.grid(True)
            plt.legend()
            plt.tight_layout()
            plt.savefig(f'LSTM_loss_plot_lr_{lr}.png')
            plt.close()

        self._plot_combined_losses(
            'train_losses',
            'Training Loss',
            'Training Loss Comparison Across Learning Rates',
            'LSTM_combined_training_losses.png',
        )
        self._plot_combined_losses(
            'val_losses',
            'Validation Loss',
            'Validation Loss Comparison Across Learning Rates',
            'LSTM_combined_validation_losses.png',
        )

    def _plot_combined_losses(self, key: str, ylabel: str, title: str, filename: str):
        """Overlay the ``key`` loss history of every learning rate and save it."""
        plt.figure(figsize=(12, 6))
        longest = 0
        for lr, history in self.results.items():
            losses = history[key]
            longest = max(longest, len(losses))
            plt.plot(range(1, len(losses) + 1), losses,
                     label=f'LR = {lr}', marker='o', markersize=4)

        plt.xlabel('Epoch')
        plt.ylabel(ylabel)
        plt.title(title)
        plt.xticks(np.arange(0, longest + 1, 5))
        plt.grid(True)
        plt.legend()
        plt.ylim(bottom=0)
        plt.tight_layout()
        plt.savefig(filename)
        plt.close()

    def _train_one_epoch(self, loader, optimizer, criterion) -> float:
        """Run one optimisation pass over ``loader``; return the mean batch loss."""
        self.model.train()
        running_loss = 0.0
        for batch_sequences, batch_labels in loader:
            batch_sequences = batch_sequences.to(self.device)
            batch_labels = batch_labels.to(self.device)

            optimizer.zero_grad()
            # squeeze(-1), not squeeze(): a final batch holding one sample
            # must stay 1-D so its shape still matches the labels.
            outputs = self.model(batch_sequences).squeeze(-1)
            loss = criterion(outputs, batch_labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
        return running_loss / len(loader)

    def _evaluate(self, loader, criterion) -> Tuple[float, list, list]:
        """Score ``loader`` without gradients.

        Returns the mean batch loss, the 0/1 predictions (threshold 0.5) and
        the true labels, the latter two as flat Python lists.
        """
        self.model.eval()
        running_loss = 0.0
        predictions = []
        true_labels = []
        with torch.no_grad():
            for batch_sequences, batch_labels in loader:
                batch_sequences = batch_sequences.to(self.device)
                batch_labels = batch_labels.to(self.device)

                outputs = self.model(batch_sequences).squeeze(-1)
                running_loss += criterion(outputs, batch_labels).item()

                predictions.extend((outputs >= 0.5).float().cpu().numpy())
                true_labels.extend(batch_labels.cpu().numpy())
        return running_loss / len(loader), predictions, true_labels

    def train(self, texts: List[str], labels: List[int],
              epochs: int = 50, batch_size: int = 16,
              learning_rates: List[float] = [0.001, 0.002, 0.01, 0.02, 0.05],
              early_stopping_patience: int = 50):
        """Train one fresh model per learning rate and record its results.

        The vocabulary is built from all ``texts``, then the data is split
        randomly 70/20/10 into train/validation/test. Each learning rate gets
        a new ``XSSDetectorLSTM`` trained with Adam, ``BCELoss`` and a
        ``ReduceLROnPlateau`` scheduler. Whenever the validation loss
        improves, the model weights and the preprocessor are saved to
        ``LSTM_model_lr_<lr>.pth``. Afterwards the metrics are printed and the
        loss curves are plotted.

        Args:
            texts: Raw input samples.
            labels: Binary labels, one per sample.
            epochs: Maximum number of epochs per learning rate.
            batch_size: Mini-batch size for both loaders.
            learning_rates: Learning rates to compare (read only, never
                mutated).
            early_stopping_patience: Training stops once the validation loss
                has failed to improve for more than this many consecutive
                epochs. With the default of 50 and 50 epochs it never fires.

        Errors are caught, printed with a traceback and not re-raised, so a
        failed run leaves ``self.results`` partially filled.
        """
        try:
            texts = [str(text) for text in texts]
            if len(texts) != len(labels):
                raise ValueError(
                    f"Got {len(texts)} texts but {len(labels)} labels"
                )
            if not texts:
                raise ValueError("Cannot train on an empty dataset")

            # NOTE: the vocabulary is built before the split, so it also
            # includes tokens from the validation and test samples.
            self.preprocessor.build_vocab(texts)
            dataset = XSSDataset(texts, labels, self.preprocessor)

            # Split dataset
            train_size = int(0.7 * len(dataset))
            val_size = int(0.2 * len(dataset))
            test_size = len(dataset) - train_size - val_size

            train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
                dataset, [train_size, val_size, test_size]
            )
            # Keep the held-out split reachable; it is not used below.
            self.test_dataset = test_dataset

            print(f"\nDataset splits:")
            print(f"Training: {train_size} samples")
            print(f"Validation: {val_size} samples")
            print(f"Test: {test_size} samples")

            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
            val_loader = DataLoader(val_dataset, batch_size=batch_size)

            # Import metrics calculation functions
            from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

            # Train with different learning rates
            for lr in learning_rates:
                print(f"\n--- Learning Rate: {lr} ---")

                self.model = XSSDetectorLSTM(
                    vocab_size=self.preprocessor.vocab_size,
                    embedding_dim=50,
                    dropout=0.3
                ).to(self.device)

                optimizer = torch.optim.Adam(self.model.parameters(), lr=lr, weight_decay=1e-5)
                criterion = nn.BCELoss()

                # Halve the learning rate when the validation loss plateaus.
                # The deprecated ``verbose`` argument is no longer passed;
                # False was already its default.
                scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                    optimizer, mode='min', factor=0.5, patience=3
                )

                train_losses = []
                val_losses = []

                best_val_loss = float('inf')
                patience_counter = 0

                for epoch in range(epochs):
                    avg_train_loss = self._train_one_epoch(train_loader, optimizer, criterion)
                    train_losses.append(avg_train_loss)

                    avg_val_loss, val_predictions, val_true_labels = self._evaluate(
                        val_loader, criterion
                    )
                    val_losses.append(avg_val_loss)

                    print(f"Epoch {epoch+1}/{epochs}: Train Loss = {avg_train_loss:.4f}, Val Loss = {avg_val_loss:.4f}")

                    # Learning rate scheduler step
                    scheduler.step(avg_val_loss)

                    # Early stopping
                    if avg_val_loss < best_val_loss:
                        best_val_loss = avg_val_loss
                        patience_counter = 0
                        # Save best model. The preprocessor object is pickled
                        # as-is, so its class must be importable on load.
                        torch.save({
                            'model_state_dict': self.model.state_dict(),
                            'preprocessor': self.preprocessor
                        }, f'LSTM_model_lr_{lr}.pth')
                    else:
                        patience_counter += 1

                    if patience_counter > early_stopping_patience:
                        print("Early stopping triggered")
                        break

                # Calculate final metrics. These come from the last epoch that
                # ran, which may differ from the best checkpoint saved above.
                val_predictions = np.array(val_predictions)
                val_true_labels = np.array(val_true_labels)

                f1 = f1_score(val_true_labels, val_predictions)
                accuracy = accuracy_score(val_true_labels, val_predictions)
                precision = precision_score(val_true_labels, val_predictions)
                recall = recall_score(val_true_labels, val_predictions)

                # Store results
                self.results[lr] = {
                    'train_losses': train_losses,
                    'val_losses': val_losses,
                    'f1_score': f1,
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall
                }

            # Print comprehensive results
            print("\n--- Comprehensive Results ---")
            for lr in learning_rates:
                r = self.results[lr]
                print(f"\nLearning Rate: {lr}")
                print(f"F1 Score: {r['f1_score']:.16f}")
                print(f"Accuracy: {r['accuracy']:.16f}")
                print(f"Precision: {r['precision']:.16f}")
                print(f"Recall: {r['recall']:.16f}")

            # Plot all loss curves
            self.plot_loss_curves()

        except Exception as e:
            import traceback
            print(f"Training error: {e}")
            traceback.print_exc()

In [6]:
# Pre-execution environment check
def check_environment():
    """Print the PyTorch version and the CUDA availability details."""
    print("\n--- Environment Check ---")
    print(f"PyTorch version: {torch.__version__}")
    print(f"CUDA available: {torch.cuda.is_available()}")
    print(f"CUDA device count: {torch.cuda.device_count()}")
    try:
        print(f"Current CUDA device: {torch.cuda.current_device()}")
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate.
        print("No CUDA device currently selected")

def demo_detector(dataset_path='../Training Dataset/final_dataset.csv'):
    """Load the dataset at ``dataset_path`` and run the learning-rate sweep.

    Any exception raised along the way is reported on stdout together with a
    short troubleshooting checklist; nothing is re-raised.
    """
    print("\n--- XSS Detection Model Demonstration ---")

    training_options = {
        'epochs': 50,
        'batch_size': 16,
        'learning_rates': [0.001, 0.002, 0.01, 0.02, 0.05],
    }
    troubleshooting_hints = (
        "1. Ensure correct dataset path",
        "2. Check dataset format",
        "3. Verify required libraries are installed",
    )

    try:
        # Load and clean data
        texts, labels = load_and_clean_data(dataset_path)

        # Initialize and train detector
        detector = XSSDetector(max_len=100)
        detector.train(texts=texts, labels=labels, **training_options)

    except Exception as e:
        print(f"Demonstration failed: {e}")
        print("Possible issues:")
        for hint in troubleshooting_hints:
            print(hint)

In [7]:
# Entry point: report the environment first, then run the demonstration.
if __name__ == "__main__":
    for step in (check_environment, demo_detector):
        step()


--- Environment Check ---
PyTorch version: 2.5.1+cu124
CUDA available: True
CUDA device count: 1
Current CUDA device: 0

--- XSS Detection Model Demonstration ---
Loaded 88310 samples successfully
Number of positive samples: 50590
Number of negative samples: 37720
Using device: cuda

Dataset splits:
Training: 61816 samples
Validation: 17662 samples
Test: 8832 samples

--- Learning Rate: 0.001 ---




Epoch 1/50: Train Loss = 0.0811, Val Loss = 0.0529
Epoch 2/50: Train Loss = 0.0436, Val Loss = 0.0460
Epoch 3/50: Train Loss = 0.0332, Val Loss = 0.0404
Epoch 4/50: Train Loss = 0.0290, Val Loss = 0.0418
Epoch 5/50: Train Loss = 0.0268, Val Loss = 0.0411
Epoch 6/50: Train Loss = 0.0275, Val Loss = 0.0397
Epoch 7/50: Train Loss = 0.0269, Val Loss = 0.0406
Epoch 8/50: Train Loss = 0.0255, Val Loss = 0.0386
Epoch 9/50: Train Loss = 0.0240, Val Loss = 0.0446
Epoch 10/50: Train Loss = 0.0250, Val Loss = 0.0451
Epoch 11/50: Train Loss = 0.0264, Val Loss = 0.0654
Epoch 12/50: Train Loss = 0.0240, Val Loss = 0.0493
Epoch 13/50: Train Loss = 0.0202, Val Loss = 0.0451
Epoch 14/50: Train Loss = 0.0206, Val Loss = 0.0405
Epoch 15/50: Train Loss = 0.0216, Val Loss = 0.0424
Epoch 16/50: Train Loss = 0.0214, Val Loss = 0.0433
Epoch 17/50: Train Loss = 0.0188, Val Loss = 0.0407
Epoch 18/50: Train Loss = 0.0187, Val Loss = 0.0423
Epoch 19/50: Train Loss = 0.0185, Val Loss = 0.0454
Epoch 20/50: Train Lo



Epoch 1/50: Train Loss = 0.0709, Val Loss = 0.0478
Epoch 2/50: Train Loss = 0.0390, Val Loss = 0.0406
Epoch 3/50: Train Loss = 0.0325, Val Loss = 0.0393
Epoch 4/50: Train Loss = 0.0307, Val Loss = 0.0426
Epoch 5/50: Train Loss = 0.0288, Val Loss = 0.0470
Epoch 6/50: Train Loss = 0.0387, Val Loss = 0.0446
Epoch 7/50: Train Loss = 0.0282, Val Loss = 0.0383
Epoch 8/50: Train Loss = 0.0269, Val Loss = 0.0418
Epoch 9/50: Train Loss = 0.0279, Val Loss = 0.0396
Epoch 10/50: Train Loss = 0.0255, Val Loss = 0.0451
Epoch 11/50: Train Loss = 0.0270, Val Loss = 0.0399
Epoch 12/50: Train Loss = 0.0228, Val Loss = 0.0452
Epoch 13/50: Train Loss = 0.0227, Val Loss = 0.0438
Epoch 14/50: Train Loss = 0.0230, Val Loss = 0.0413
Epoch 15/50: Train Loss = 0.0230, Val Loss = 0.0410
Epoch 16/50: Train Loss = 0.0198, Val Loss = 0.0432
Epoch 17/50: Train Loss = 0.0200, Val Loss = 0.0431
Epoch 18/50: Train Loss = 0.0205, Val Loss = 0.0440
Epoch 19/50: Train Loss = 0.0206, Val Loss = 0.0434
Epoch 20/50: Train Lo



Epoch 1/50: Train Loss = 0.0884, Val Loss = 0.0574
Epoch 2/50: Train Loss = 0.0554, Val Loss = 0.0539
Epoch 3/50: Train Loss = 0.0530, Val Loss = 0.0465
Epoch 4/50: Train Loss = 0.0508, Val Loss = 0.1356
Epoch 5/50: Train Loss = 0.0543, Val Loss = 0.0513
Epoch 6/50: Train Loss = 0.0511, Val Loss = 0.0482
Epoch 7/50: Train Loss = 0.0495, Val Loss = 0.0468
Epoch 8/50: Train Loss = 0.0367, Val Loss = 0.0426
Epoch 9/50: Train Loss = 0.0347, Val Loss = 0.0434
Epoch 10/50: Train Loss = 0.0358, Val Loss = 0.0427
Epoch 11/50: Train Loss = 0.0345, Val Loss = 0.0456
Epoch 12/50: Train Loss = 0.0354, Val Loss = 0.0474
Epoch 13/50: Train Loss = 0.0285, Val Loss = 0.0440
Epoch 14/50: Train Loss = 0.0286, Val Loss = 0.0457
Epoch 15/50: Train Loss = 0.0276, Val Loss = 0.0499
Epoch 16/50: Train Loss = 0.0275, Val Loss = 0.0408
Epoch 17/50: Train Loss = 0.0270, Val Loss = 0.0401
Epoch 18/50: Train Loss = 0.0271, Val Loss = 0.0428
Epoch 19/50: Train Loss = 0.0267, Val Loss = 0.0414
Epoch 20/50: Train Lo



Epoch 1/50: Train Loss = 0.1097, Val Loss = 0.0751
Epoch 2/50: Train Loss = 0.0923, Val Loss = 0.1212
Epoch 3/50: Train Loss = 0.0796, Val Loss = 0.0849
Epoch 4/50: Train Loss = 0.0864, Val Loss = 0.0671
Epoch 5/50: Train Loss = 0.0809, Val Loss = 0.0830
Epoch 6/50: Train Loss = 0.0845, Val Loss = 0.0900
Epoch 7/50: Train Loss = 0.0896, Val Loss = 0.0907
Epoch 8/50: Train Loss = 0.0769, Val Loss = 0.0740
Epoch 9/50: Train Loss = 0.0613, Val Loss = 0.0560
Epoch 10/50: Train Loss = 0.0574, Val Loss = 0.0516
Epoch 11/50: Train Loss = 0.0545, Val Loss = 0.0485
Epoch 12/50: Train Loss = 0.0536, Val Loss = 0.0525
Epoch 13/50: Train Loss = 0.0521, Val Loss = 0.0514
Epoch 14/50: Train Loss = 0.0530, Val Loss = 0.0564
Epoch 15/50: Train Loss = 0.0533, Val Loss = 0.0547
Epoch 16/50: Train Loss = 0.0419, Val Loss = 0.0478
Epoch 17/50: Train Loss = 0.0409, Val Loss = 0.0471
Epoch 18/50: Train Loss = 0.0395, Val Loss = 0.0442
Epoch 19/50: Train Loss = 0.0381, Val Loss = 0.0441
Epoch 20/50: Train Lo



Epoch 1/50: Train Loss = 0.2198, Val Loss = 0.1610
Epoch 2/50: Train Loss = 0.2011, Val Loss = 0.1378
Epoch 3/50: Train Loss = 0.1815, Val Loss = 0.1214
Epoch 4/50: Train Loss = 0.1822, Val Loss = 0.1144
Epoch 5/50: Train Loss = 0.1955, Val Loss = 0.1628
Epoch 6/50: Train Loss = 0.1852, Val Loss = 0.1376
Epoch 7/50: Train Loss = 0.1918, Val Loss = 0.1584
Epoch 8/50: Train Loss = 0.1995, Val Loss = 0.1385
Epoch 9/50: Train Loss = 0.1507, Val Loss = 0.1100
Epoch 10/50: Train Loss = 0.1475, Val Loss = 0.1118
Epoch 11/50: Train Loss = 0.1336, Val Loss = 0.1048
Epoch 12/50: Train Loss = 0.1282, Val Loss = 0.1115
Epoch 13/50: Train Loss = 0.1211, Val Loss = 0.0958
Epoch 14/50: Train Loss = 0.1244, Val Loss = 0.1066
Epoch 15/50: Train Loss = 0.1398, Val Loss = 0.1291
Epoch 16/50: Train Loss = 0.1705, Val Loss = 0.1148
Epoch 17/50: Train Loss = 0.1260, Val Loss = 0.1007
Epoch 18/50: Train Loss = 0.1067, Val Loss = 0.0850
Epoch 19/50: Train Loss = 0.0946, Val Loss = 0.0769
Epoch 20/50: Train Lo