In [1]:
# Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset  
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import argparse
import os

In [2]:
# Load Dataset
def load_dataset():
    df = pd.read_csv('dataset/XSS_dataset.csv', encoding='utf-8-sig')
    df = df[df.columns[-2:]]  # Only get sentence and labels

    # Get Sentences data from data frame
    sentences = df['Sentence'].values
    labels = df['Label'].values

    return sentences, labels

In [3]:
# Convert to ASCII
def convert_to_ascii(sentence):
    sentence_ascii = []
    for char in sentence:
        if ord(char) < 8222:
            if ord(char) == 8217:  # '
                sentence_ascii.append(134)
            elif ord(char) == 8221:  # "
                sentence_ascii.append(129)
            elif ord(char) == 8220:  # "
                sentence_ascii.append(130)
            elif ord(char) == 8216:  # '
                sentence_ascii.append(131)
            elif ord(char) == 8211:  # –
                sentence_ascii.append(133)
            if ord(char) <= 128:
                sentence_ascii.append(ord(char))
    zer = np.zeros((10000,))
    for i in range(len(sentence_ascii)):
        zer[i] = sentence_ascii[i]
    zer.shape = (100, 100)
    return zer


In [4]:
# Prepare Data
def prepare_data(sentences):
    arr = np.zeros((len(sentences), 100, 100))
    for i in range(len(sentences)):
        image = convert_to_ascii(sentences[i])
        x = np.asarray(image, dtype='float')
        image = cv2.resize(x, dsize=(5, 5), interpolation=cv2.INTER_CUBIC)
        image /= 128
        arr[i] = image

    # Reshape data for input to CNN
    data = arr.reshape(arr.shape[0], 1, 100, 100)
    return data

In [5]:
# Create PyTorch Dataset
class XSSDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

In [6]:
# Define CNN Model
class CNNModel(nn.Module):
    def __init__(self):
        super(CNNModel, self).__init__()
        self.conv1 = nn.Conv2d(1, 64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(256 * 12 * 12, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.pool(nn.ReLU()(self.conv1(x)))
        x = self.pool(nn.ReLU()(self.conv2(x)))
        x = self.pool(nn.ReLU()(self.conv3(x)))
        x = x.view(x.size(0), -1)
        x = nn.ReLU()(self.fc1(x))
        x = nn.ReLU()(self.fc2(x))
        x = nn.ReLU()(self.fc3(x))
        x = self.sigmoid(self.fc4(x))
        return x


In [7]:
def train_model(learning_rate=0.001, num_epochs=18):
    # Load and prepare data
    sentences, labels = load_dataset()
    
    # Prepare data
    data = prepare_data(sentences)

    # Split data: train 70%, verify 20%, test 10%
    train_data, temp_data, train_labels, temp_labels = train_test_split(data, labels, test_size=0.3, random_state=42)
    verify_data, test_data, verify_labels, test_labels = train_test_split(temp_data, temp_labels, test_size=1/3, random_state=42)

    # Create datasets and loaders
    train_dataset = XSSDataset(train_data, train_labels)
    verify_dataset = XSSDataset(verify_data, verify_labels)
    test_dataset = XSSDataset(test_data, test_labels)

    train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
    verify_loader = DataLoader(verify_dataset, batch_size=128, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

    # Initialize model, loss, and optimizer
    model = CNNModel()
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    loss_train = []
    loss_verify = []

    # Training loop
    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs.float())
            loss = criterion(outputs.squeeze(), labels.float())
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        loss_train.append(epoch_loss / len(train_loader))

        # Validation loss
        model.eval()
        verify_loss = 0
        with torch.no_grad():
            for inputs, labels in verify_loader:
                outputs = model(inputs.float())
                loss = criterion(outputs.squeeze(), labels.float())
                verify_loss += loss.item()
        loss_verify.append(verify_loss / len(verify_loader))

        print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {loss_train[-1]:.4f}, Verify Loss: {loss_verify[-1]:.4f}")

    # Save model weights
    os.makedirs('models', exist_ok=True)
    torch.save(model.state_dict(), 'models/xss_detection_model.pth')
    print("Model weights saved to models/xss_detection_model.pth")

    # Evaluate on test set
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs.float())
            preds = (outputs.squeeze() > 0.5).float()
            all_preds.extend(preds.numpy())
            all_labels.extend(labels.numpy())

    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)

    print("\nTest Set Metrics:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")


In [8]:
def predict(input_sentence):
    # Load the model
    model = CNNModel()
    model.load_state_dict(torch.load('models/xss_detection_model.pth'))
    model.eval()

    # Prepare input
    image = convert_to_ascii(input_sentence)
    x = np.asarray(image, dtype='float')
    image = cv2.resize(x, dsize=(100, 100), interpolation=cv2.INTER_CUBIC)
    image /= 128
    input_data = image.reshape(1, 1, 100, 100)
    input_tensor = torch.tensor(input_data, dtype=torch.float32)

    # Predict
    with torch.no_grad():
        output = model(input_tensor)
        prediction = output.squeeze().item()

    return prediction > 0.5, prediction

In [9]:
def main():
    parser = argparse.ArgumentParser(description='XSS Detection CNN')
    parser.add_argument('--train', action='store_true', help='Train the model')
    parser.add_argument('--predict', type=str, help='Predict XSS for a given sentence')
    parser.add_argument('--lr', type=float, default=0.001, help='Learning rate for training')
    args = parser.parse_args()

    if args.train:
        train_model(learning_rate=args.lr)
    
    if args.predict:
        is_xss, confidence = predict(args.predict)
        print(f"Prediction: {'XSS' if is_xss else 'Not XSS'}")
        print(f"Confidence: {confidence:.4f}")

In [10]:
if __name__ == "__main__":
    main()

usage: ipykernel_launcher.py [-h] [--train] [--predict PREDICT] [--lr LR]
ipykernel_launcher.py: error: unrecognized arguments: --f=/home/is1ab/.local/share/jupyter/runtime/kernel-v2-1387757vHpZ6LXZXBno.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
