In [1]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import re
from typing import List, Tuple
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import os
import numpy as np

In [2]:
class TextPreprocessor:
    """Tokenizer and vocabulary for turning raw (markup-like) strings into id sequences.

    The vocabulary starts with two reserved entries, ``'<PAD>'`` (id 0) and
    ``'<UNK>'`` (id 1); further tokens are added by :meth:`build_vocab`.
    """

    def __init__(self, max_len: int = 100):
        """Create a preprocessor that encodes every text to exactly ``max_len`` ids."""
        self.max_len = max_len
        self.vocab = {'<PAD>': 0, '<UNK>': 1}
        self.vocab_size = 2

    def tokenize(self, text: str) -> List[str]:
        """Split ``text`` into lowercase tokens.

        The characters ``< > / = "`` are surrounded with spaces first so that
        each of them becomes a token of its own; everything else is split on
        whitespace. Non-string input is converted with ``str()``.
        """
        spaced = re.sub(r'([<>/="])', r' \1 ', str(text))
        return spaced.lower().split()

    def build_vocab(self, texts: List[str], min_freq: int = 2):
        """Add every token seen at least ``min_freq`` times in ``texts`` to the vocabulary.

        New tokens receive consecutive ids in order of first appearance;
        tokens that are already in the vocabulary keep their id.
        """
        token_counts = Counter()
        for sample in texts:
            token_counts.update(self.tokenize(str(sample)))

        for token, count in token_counts.items():
            if count < min_freq or token in self.vocab:
                continue
            self.vocab[token] = self.vocab_size
            self.vocab_size += 1

    def encode_text(self, text: str) -> List[int]:
        """Return the ids of ``text``, truncated or right-padded to ``max_len``.

        Tokens missing from the vocabulary map to the ``'<UNK>'`` id; padding
        positions use the ``'<PAD>'`` id.
        """
        unk_id = self.vocab['<UNK>']
        kept_tokens = self.tokenize(str(text))[:self.max_len]
        ids = [self.vocab.get(token, unk_id) for token in kept_tokens]
        # A non-positive repeat count yields an empty list, so full-length
        # sequences are left untouched.
        ids += [self.vocab.get('<PAD>', unk_id)] * (self.max_len - len(ids))
        return ids


In [3]:
class XSSDataset(Dataset):
    """Torch dataset pairing pre-encoded text sequences with integer labels.

    All texts are encoded once, at construction time, using the given
    preprocessor's ``encode_text`` method.
    """

    def __init__(self, texts: List[str], labels: List[int], preprocessor: TextPreprocessor):
        """Coerce inputs to ``str``/``int`` and encode every text up front."""
        self.preprocessor = preprocessor
        self.texts = list(map(str, texts))
        self.encodings = list(map(preprocessor.encode_text, self.texts))
        self.labels = list(map(int, labels))

    def __len__(self):
        """Number of samples (taken from the label list)."""
        return len(self.labels)

    def __getitem__(self, idx) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return ``(token_ids, label)`` as a long tensor and a float scalar tensor."""
        token_ids = torch.tensor(self.encodings[idx], dtype=torch.long)
        target = torch.tensor(self.labels[idx], dtype=torch.float)
        return token_ids, target

def load_and_clean_data(file_path: str) -> Tuple[List[str], List[int]]:
    """Read a CSV with ``Sentence`` and ``Label`` columns and return clean Python lists.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        ``(texts, labels)`` where each text is a stripped string and each
        label is a Python ``int`` equal to 0 or 1.

    Raises:
        AssertionError: If a validation check on the loaded values fails.
        Exception: Any error from reading or converting the data is printed
            and then re-raised unchanged.
    """
    try:
        frame = pd.read_csv(file_path)

        # Normalise both columns to plain Python types.
        texts = list(map(lambda raw: str(raw).strip(), frame['Sentence']))
        labels = list(map(int, frame['Label']))

        # Sanity checks on the converted values.
        assert len(texts) == len(labels), "Number of texts and labels must match"
        for text in texts:
            assert isinstance(text, str), "All texts must be strings"
        for label in labels:
            assert isinstance(label, int) and label in (0, 1), "Labels must be binary (0 or 1)"

        print(f"Loaded {len(texts)} samples successfully")

        # Class balance summary.
        positives = sum(labels)
        print(f"Number of positive samples: {positives}")
        print(f"Number of negative samples: {len(labels) - positives}")

        return texts, labels

    except Exception as err:
        print(f"Error loading data: {err!s}")
        raise


In [4]:
class XSSDetectorLSTM(nn.Module):
    """Bidirectional LSTM binary classifier over token-id sequences.

    Pipeline: embedding -> stacked bidirectional LSTM -> dropout ->
    linear(2 * hidden_dim, 32) + ReLU -> linear(32, 1) + sigmoid.
    """

    def __init__(self, vocab_size: int, embedding_dim: int = 50, 
                 hidden_dim: int = 64, num_layers: int = 2, dropout: float = 0.3):
        """Build the layers.

        Args:
            vocab_size: Number of rows in the embedding table.
            embedding_dim: Size of each token embedding.
            hidden_dim: Hidden size of the LSTM, per direction.
            num_layers: Number of stacked LSTM layers.
            dropout: Dropout probability, used before the first linear layer
                and between LSTM layers when there is more than one layer.
        """
        super().__init__()

        # Row 0 of the embedding table is the padding index.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # Inter-layer dropout is only passed when the LSTM is actually stacked.
        inter_layer_dropout = dropout if num_layers > 1 else 0
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
            bidirectional=True,
        )
        self.fc1 = nn.Linear(2 * hidden_dim, 32)
        self.fc2 = nn.Linear(32, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map a batch of id sequences to sigmoid outputs of shape ``(batch, 1)``."""
        _, (h_n, _) = self.lstm(self.embedding(x))
        # The last two entries of h_n are the final hidden states that get
        # concatenated into the sequence representation.
        features = torch.cat([h_n[-2], h_n[-1]], dim=1)
        features = self.dropout(features)
        features = torch.relu(self.fc1(features))
        return torch.sigmoid(self.fc2(features))

In [5]:
class XSSDetector:
    """Trains bidirectional-LSTM XSS classifiers and compares several learning rates.

    After :meth:`train` finishes, the instance exposes:

    * ``model`` - the network carrying the weights with the lowest validation
      loss observed across all learning rates,
    * ``best_lr`` - the learning rate that produced those weights,
    * ``results`` - per-learning-rate loss curves and metrics,
    * ``test_dataset`` - the held-out split, which training never touches.
    """

    def __init__(self, max_len: int = 100, device: str = None):
        """Set up the preprocessor and choose the compute device.

        Args:
            max_len: Sequence length every text is truncated/padded to.
            device: Torch device string; when ``None``, CUDA is used if
                available, otherwise the CPU.
        """
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(device)
        print(f"Using device: {self.device}")

        self.max_len = max_len
        self.preprocessor = TextPreprocessor(max_len)
        self.model = None
        self.results = {}
        self.best_lr = None
        self.test_dataset = None

    def _build_model(self):
        """Create a fresh, untrained network on the selected device."""
        return XSSDetectorLSTM(
            vocab_size=self.preprocessor.vocab_size,
            embedding_dim=50,
            dropout=0.3
        ).to(self.device)

    def _run_epoch(self, loader, criterion, optimizer=None):
        """Run one pass over ``loader``; train if ``optimizer`` is given, else evaluate.

        Returns:
            ``(average_batch_loss, accuracy_percent, predictions, targets)``.
            The two lists are only filled in evaluation mode.
        """
        training = optimizer is not None
        self.model.train(training)

        total_loss = 0.0
        correct = 0
        seen = 0
        all_predictions = []
        all_targets = []

        with torch.set_grad_enabled(training):
            for batch_sequences, batch_labels in loader:
                batch_sequences = batch_sequences.to(self.device)
                batch_labels = batch_labels.to(self.device)

                if training:
                    optimizer.zero_grad()

                # squeeze(-1) removes only the trailing feature dimension. A bare
                # squeeze() would also drop the batch dimension for a final batch
                # of size one and break the shape match BCELoss requires.
                outputs = self.model(batch_sequences).squeeze(-1)
                loss = criterion(outputs, batch_labels)

                if training:
                    loss.backward()
                    optimizer.step()

                total_loss += loss.item()
                predictions = (outputs >= 0.5).float()
                correct += (predictions == batch_labels).sum().item()
                seen += len(batch_labels)

                if not training:
                    all_predictions.extend(predictions.cpu().numpy())
                    all_targets.extend(batch_labels.cpu().numpy())

        return total_loss / len(loader), 100 * correct / seen, all_predictions, all_targets

    def _plot_losses(self, lr, train_losses, val_losses):
        """Save the loss curves, normalised to their first-epoch value, as a PNG."""
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.plot(range(1, len(train_losses) + 1),
                [loss / train_losses[0] for loss in train_losses],
                label='Normalized Training Loss')
        ax.plot(range(1, len(val_losses) + 1),
                [loss / val_losses[0] for loss in val_losses],
                label='Normalized Validation Loss')
        ax.set_xlabel('Epoch')
        ax.set_ylabel('Loss (Normalized to First Epoch)')
        ax.set_title(f'Normalized Training and Validation Loss (Learning Rate: {lr})')
        ax.set_ylim(0, 8)  # Allow variation between 0 and 8 times the initial loss
        ax.legend()
        ax.grid(True)
        fig.tight_layout()
        fig.savefig(f'normalized_loss_plot_lr_{lr}.png')
        plt.close(fig)

    def _print_summary(self, learning_rates):
        """Print one table row of final metrics per learning rate."""
        print("\nFinal Results for All Learning Rates:")
        print("\nLR      Train Loss  Val Loss    Train Acc   Val Acc    F1 Score")
        print("-" * 65)
        for lr in learning_rates:
            r = self.results[lr]
            print(f"{lr:.3f}  {r['final_train_loss']:.4f}     {r['final_val_loss']:.4f}     "
                  f"{r['train_accuracy']:.2f}%     {r['val_accuracy']:.2f}%     {r['f1_score']:.4f}")

    def train(self, texts: List[str], labels: List[int], 
              epochs: int = 20, batch_size: int = 16,  
              learning_rates: List[float] = [0.001, 0.002, 0.01, 0.02, 0.05]):
        """Train one model per learning rate and keep the best one.

        The data is split 70/20/10 into train/validation/test. For each
        learning rate a fresh model is trained with Adam, a
        ``ReduceLROnPlateau`` scheduler and early stopping on validation loss.
        Every improvement is checkpointed to ``best_model_lr_<lr>.pth`` and a
        loss plot is written to ``normalized_loss_plot_lr_<lr>.png``.

        When training ends, ``self.model`` holds the weights with the lowest
        validation loss seen over all learning rates and ``self.best_lr`` the
        corresponding rate. The ``final_*`` / accuracy / F1 entries in
        ``self.results`` describe the last epoch that ran for each rate;
        ``best_val_loss`` is the minimum validation loss of that run.

        Args:
            texts: Raw input strings.
            labels: Binary labels aligned with ``texts``.
            epochs: Maximum number of epochs per learning rate (at least 1).
            batch_size: Mini-batch size for both loaders.
            learning_rates: Learning rates to compare (the list is only read).

        Returns:
            None. Any exception is caught, printed with its traceback, and not
            re-raised.
        """
        try:
            if epochs < 1:
                raise ValueError("epochs must be at least 1")

            texts = [str(text) for text in texts]
            labels = torch.tensor(labels, dtype=torch.float)

            self.preprocessor.build_vocab(texts)
            dataset = XSSDataset(texts, labels.numpy(), self.preprocessor)

            # Split dataset
            train_size = int(0.7 * len(dataset))
            val_size = int(0.2 * len(dataset))
            test_size = len(dataset) - train_size - val_size
            if train_size == 0 or val_size == 0:
                raise ValueError(
                    f"Dataset of {len(dataset)} samples is too small for a 70/20/10 split"
                )

            train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
                dataset, [train_size, val_size, test_size]
            )
            # Keep the held-out split reachable for a later, unbiased evaluation.
            self.test_dataset = test_dataset

            print(f"\nDataset splits:")
            print(f"Training: {train_size} samples")
            print(f"Validation: {val_size} samples")
            print(f"Test: {test_size} samples")

            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
            val_loader = DataLoader(val_dataset, batch_size=batch_size)

            best_overall_loss = float('inf')
            best_overall_state = None
            best_overall_lr = None

            # Train with different learning rates
            for lr in learning_rates:
                print(f"\nTraining with learning rate: {lr}")
                print(f"Current learning rate: {lr}")

                self.model = self._build_model()

                optimizer = torch.optim.Adam(self.model.parameters(), lr=lr, weight_decay=1e-5)
                criterion = nn.BCELoss()

                # The deprecated `verbose` argument is intentionally omitted;
                # the scheduler is silent by default.
                scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                    optimizer, mode='min', factor=0.5, patience=3
                )

                train_losses = []
                val_losses = []

                best_val_loss = float('inf')
                patience_counter = 0

                for epoch in range(epochs):
                    avg_train_loss, train_accuracy, _, _ = self._run_epoch(
                        train_loader, criterion, optimizer
                    )
                    train_losses.append(avg_train_loss)

                    avg_val_loss, val_accuracy, val_predictions, val_true_labels = self._run_epoch(
                        val_loader, criterion
                    )
                    val_losses.append(avg_val_loss)

                    # Learning rate scheduler step with current validation loss
                    scheduler.step(avg_val_loss)
                    current_lr = optimizer.param_groups[0]['lr']
                    print(f"Epoch {epoch+1}, Current LR: {current_lr:.6f}")

                    # Early stopping
                    if avg_val_loss < best_val_loss:
                        best_val_loss = avg_val_loss
                        patience_counter = 0
                        # NOTE(review): this checkpoint pickles the whole
                        # preprocessor object, so loading it requires the
                        # TextPreprocessor class to be importable and, on recent
                        # PyTorch versions, presumably torch.load(...,
                        # weights_only=False) - confirm before relying on it.
                        torch.save({
                            'model_state_dict': self.model.state_dict(),
                            'preprocessor': self.preprocessor
                        }, f'best_model_lr_{lr}.pth')

                        # Remember the best weights across all learning rates in
                        # memory so they can be restored once the sweep is over.
                        if avg_val_loss < best_overall_loss:
                            best_overall_loss = avg_val_loss
                            best_overall_lr = lr
                            best_overall_state = {
                                name: tensor.detach().cpu().clone()
                                for name, tensor in self.model.state_dict().items()
                            }
                    else:
                        patience_counter += 1

                    if patience_counter > 5:
                        print("Early stopping triggered")
                        break

                    if (epoch + 1) % 5 == 0:
                        print(f'Epoch {epoch+1}/{epochs}:')
                        print(f'Training Loss: {avg_train_loss:.4f}')
                        print(f'Training Accuracy: {train_accuracy:.2f}%')
                        print(f'Validation Loss: {avg_val_loss:.4f}')
                        print(f'Validation Accuracy: {val_accuracy:.2f}%')

                # Final evaluation (metrics of the last epoch that ran)
                val_f1 = f1_score(val_true_labels, val_predictions)

                # Store results
                self.results[lr] = {
                    'train_losses': train_losses,
                    'val_losses': val_losses,
                    'final_train_loss': avg_train_loss,
                    'final_val_loss': avg_val_loss,
                    'train_accuracy': train_accuracy,
                    'val_accuracy': val_accuracy,
                    'f1_score': val_f1,
                    'best_val_loss': best_val_loss
                }

                self._plot_losses(lr, train_losses, val_losses)

            self._print_summary(learning_rates)

            # Without this step self.model would keep the last-epoch weights of
            # whichever learning rate happened to be tried last.
            if best_overall_state is not None:
                self.model.load_state_dict(best_overall_state)
                self.model.eval()
                self.best_lr = best_overall_lr
                print(f"\nRestored best weights (learning rate: {best_overall_lr}, "
                      f"validation loss: {best_overall_loss:.4f})")

        except Exception as e:
            import traceback
            print(f"Training error: {e}")
            traceback.print_exc()

In [6]:
# Pre-execution environment check
def check_environment():
    """Print the PyTorch version and basic CUDA availability information.

    Querying the current CUDA device can raise when no usable device exists;
    in that case a fallback message is printed instead. Only ``Exception``
    subclasses are handled, so ``KeyboardInterrupt`` and ``SystemExit`` still
    propagate.
    """
    print("\n--- Environment Check ---")
    print(f"PyTorch version: {torch.__version__}")
    print(f"CUDA available: {torch.cuda.is_available()}")
    print(f"CUDA device count: {torch.cuda.device_count()}")
    try:
        print(f"Current CUDA device: {torch.cuda.current_device()}")
    except Exception:
        print("No CUDA device currently selected")

def demo_detector(dataset_path='Training Dataset/final_dataset.csv'):
    """Load the dataset at ``dataset_path`` and run the learning-rate sweep on it.

    Any exception raised along the way is reported together with a short list
    of likely causes; nothing is re-raised and nothing is returned.
    """
    print("\n--- XSS Detection Model Demonstration ---")

    try:
        # Load and validate the raw samples.
        samples, targets = load_and_clean_data(dataset_path)

        # Build the detector and train it with the demo hyper-parameters.
        xss_detector = XSSDetector(max_len=100)
        xss_detector.train(
            texts=samples,
            labels=targets,
            epochs=20,
            batch_size=16,
            learning_rates=[0.001, 0.002, 0.01, 0.02, 0.05],
        )

    except Exception as err:
        print(f"Demonstration failed: {err}")
        troubleshooting = (
            "Possible issues:",
            "1. Ensure correct dataset path",
            "2. Check dataset format",
            "3. Verify required libraries are installed",
        )
        for line in troubleshooting:
            print(line)

In [7]:
# Entry point: print the runtime environment first, then run the full
# training demonstration with its default dataset path.
if __name__ == "__main__":
    check_environment()
    demo_detector()


--- Environment Check ---
PyTorch version: 2.5.1+cu124
CUDA available: True
CUDA device count: 1
Current CUDA device: 0

--- XSS Detection Model Demonstration ---
Loaded 88310 samples successfully
Number of positive samples: 50590
Number of negative samples: 37720
Using device: cuda



Dataset splits:
Training: 61816 samples
Validation: 17662 samples
Test: 8832 samples

Training with learning rate: 0.001
Current learning rate: 0.001




Epoch 1, Current LR: 0.001000


Epoch 2, Current LR: 0.001000


Epoch 3, Current LR: 0.001000


Epoch 4, Current LR: 0.001000


Epoch 5, Current LR: 0.001000
Epoch 5/20:
Training Loss: 0.0284
Training Accuracy: 99.05%
Validation Loss: 0.0400
Validation Accuracy: 98.71%


Epoch 6, Current LR: 0.001000


Epoch 7, Current LR: 0.001000


Epoch 8, Current LR: 0.001000


Epoch 9, Current LR: 0.001000


Epoch 10, Current LR: 0.000500
Epoch 10/20:
Training Loss: 0.0245
Training Accuracy: 99.18%
Validation Loss: 0.0383
Validation Accuracy: 98.83%


Epoch 11, Current LR: 0.000500


Epoch 12, Current LR: 0.000500
Early stopping triggered

Training with learning rate: 0.002
Current learning rate: 0.002




Epoch 1, Current LR: 0.002000


Epoch 2, Current LR: 0.002000


Epoch 3, Current LR: 0.002000


Epoch 4, Current LR: 0.002000


Epoch 5, Current LR: 0.002000
Epoch 5/20:
Training Loss: 0.0290
Training Accuracy: 99.04%
Validation Loss: 0.0430
Validation Accuracy: 98.64%


Epoch 6, Current LR: 0.002000


Epoch 7, Current LR: 0.002000


Epoch 8, Current LR: 0.002000


Epoch 9, Current LR: 0.002000


Epoch 10, Current LR: 0.002000
Epoch 10/20:
Training Loss: 0.0275
Training Accuracy: 99.09%
Validation Loss: 0.0345
Validation Accuracy: 98.98%


Epoch 11, Current LR: 0.002000


Epoch 12, Current LR: 0.001000


Epoch 13, Current LR: 0.001000


Epoch 14, Current LR: 0.001000
Early stopping triggered

Training with learning rate: 0.01
Current learning rate: 0.01




Epoch 1, Current LR: 0.010000


Epoch 2, Current LR: 0.010000


Epoch 3, Current LR: 0.010000


Epoch 4, Current LR: 0.010000


Epoch 5, Current LR: 0.010000
Epoch 5/20:
Training Loss: 0.0498
Training Accuracy: 98.46%
Validation Loss: 0.0521
Validation Accuracy: 98.46%


Epoch 6, Current LR: 0.010000


Epoch 7, Current LR: 0.010000


Epoch 8, Current LR: 0.010000


Epoch 9, Current LR: 0.010000


Epoch 10, Current LR: 0.010000
Epoch 10/20:
Training Loss: 0.0459
Training Accuracy: 98.57%
Validation Loss: 0.0496
Validation Accuracy: 98.56%


Epoch 11, Current LR: 0.010000


Epoch 12, Current LR: 0.010000


Epoch 13, Current LR: 0.005000


Epoch 14, Current LR: 0.005000


Epoch 15, Current LR: 0.005000
Epoch 15/20:
Training Loss: 0.0332
Training Accuracy: 98.91%
Validation Loss: 0.0401
Validation Accuracy: 98.79%


Epoch 16, Current LR: 0.005000


Epoch 17, Current LR: 0.005000


Epoch 18, Current LR: 0.005000


Epoch 19, Current LR: 0.005000


Epoch 20, Current LR: 0.005000
Epoch 20/20:
Training Loss: 0.0305
Training Accuracy: 99.00%
Validation Loss: 0.0386
Validation Accuracy: 98.73%

Training with learning rate: 0.02
Current learning rate: 0.02




Epoch 1, Current LR: 0.020000


Epoch 2, Current LR: 0.020000


Epoch 3, Current LR: 0.020000


Epoch 4, Current LR: 0.020000


Epoch 5, Current LR: 0.020000
Epoch 5/20:
Training Loss: 0.0842
Training Accuracy: 97.39%
Validation Loss: 0.0641
Validation Accuracy: 97.98%


Epoch 6, Current LR: 0.020000


Epoch 7, Current LR: 0.020000


Epoch 8, Current LR: 0.020000


Epoch 9, Current LR: 0.020000


Epoch 10, Current LR: 0.020000
Epoch 10/20:
Training Loss: 0.0791
Training Accuracy: 97.61%
Validation Loss: 0.0636
Validation Accuracy: 98.07%


Epoch 11, Current LR: 0.020000


Epoch 12, Current LR: 0.010000


Epoch 13, Current LR: 0.010000


Epoch 14, Current LR: 0.010000


Epoch 15, Current LR: 0.010000
Epoch 15/20:
Training Loss: 0.0598
Training Accuracy: 98.14%
Validation Loss: 0.0474
Validation Accuracy: 98.61%


Epoch 16, Current LR: 0.010000


Epoch 17, Current LR: 0.010000


Epoch 18, Current LR: 0.010000


Epoch 19, Current LR: 0.010000


Epoch 20, Current LR: 0.010000
Epoch 20/20:
Training Loss: 0.0557
Training Accuracy: 98.40%
Validation Loss: 0.0474
Validation Accuracy: 98.58%

Training with learning rate: 0.05
Current learning rate: 0.05




Epoch 1, Current LR: 0.050000


Epoch 2, Current LR: 0.050000


Epoch 3, Current LR: 0.050000


Epoch 4, Current LR: 0.050000


Epoch 5, Current LR: 0.050000
Epoch 5/20:
Training Loss: 0.1770
Training Accuracy: 94.66%
Validation Loss: 0.1023
Validation Accuracy: 96.72%


Epoch 6, Current LR: 0.050000


Epoch 7, Current LR: 0.050000


Epoch 8, Current LR: 0.050000


Epoch 9, Current LR: 0.025000


Epoch 10, Current LR: 0.025000
Epoch 10/20:
Training Loss: 0.1176
Training Accuracy: 96.45%
Validation Loss: 0.0919
Validation Accuracy: 97.51%


Epoch 11, Current LR: 0.025000


Epoch 12, Current LR: 0.025000


Epoch 13, Current LR: 0.025000


Epoch 14, Current LR: 0.025000


Epoch 15, Current LR: 0.025000
Epoch 15/20:
Training Loss: 0.1073
Training Accuracy: 96.86%
Validation Loss: 0.0842
Validation Accuracy: 97.61%


Epoch 16, Current LR: 0.025000


Epoch 17, Current LR: 0.025000


Epoch 18, Current LR: 0.012500


Epoch 19, Current LR: 0.012500


Epoch 20, Current LR: 0.012500
Epoch 20/20:
Training Loss: 0.0865
Training Accuracy: 97.41%
Validation Loss: 0.0748
Validation Accuracy: 97.71%

Final Results for All Learning Rates:

LR      Train Loss  Val Loss    Train Acc   Val Acc    F1 Score
-----------------------------------------------------------------
0.001  0.0210     0.0389     99.28%     98.91%     0.9904
0.002  0.0230     0.0349     99.21%     98.92%     0.9904
0.010  0.0305     0.0386     99.00%     98.73%     0.9888
0.020  0.0557     0.0474     98.40%     98.58%     0.9875
0.050  0.0865     0.0748     97.41%     97.71%     0.9796
