In [1]:
import torch
from transformers import AutoTokenizer
from urllib.parse import urlparse
import re
import unicodedata
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch.nn as nn
from torch.utils.data import TensorDataset,DataLoader
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from peft import LoraConfig, get_peft_model,PeftModel
from transformers import BitsAndBytesConfig
from datetime import datetime
import numpy as np 

In [2]:
 def hasMisleadingChars(url):
  for char in url:
    if not (char.isascii() or char.isspace()):
      category = unicodedata.category(char)
      if category.startswith("L") and not unicodedata.combining(char):
        return True
  return  False

In [4]:
data = pd.read_csv("train_data.csv")

In [3]:
def preprocess_url(url):
    url = re.sub(r'https?://', '', url)
    parts = url.split('/', 1)
    domain = parts[0]
    path = parts[1] if len(parts) > 1 else ""
    text_rep = f"{domain} {path.replace('/', ' ')}"

    return text_rep

In [4]:
torch.device("cuda" if torch.cuda.is_available() else "cpu")

device(type='cuda')

In [7]:
data.head()

Unnamed: 0.1,Unnamed: 0,url,type
0,131349,youtube.com/watch?v=PLXMkdAXDZw,benign
1,132471,http://www.controlesuasvendas.com/controle-de-...,defacement
2,400199,manta.com/c/mmf3v8b/commerce-bancshares-inc,benign
3,63241,http://www.osn-solutions.nl/index.php/nl.1,defacement
4,296957,www.ectc.org/resources/ClassRECP04.pdf,phishing


In [8]:
data['type'] = data['type'].map(lambda x: 1 if x == 'benign' else 0)

In [10]:
def finetune_deberta(urls, labels, model_name="microsoft/deberta-base", epochs=3, batch_size=8,qlora=False,lora_r=0,lora_alpha=0):

    processed_urls = [preprocess_url(url) for url in urls]

    train_texts, val_texts, train_labels, val_labels = train_test_split(
        processed_urls, labels, test_size=0.15, stratify=labels, random_state=1
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = None

    if qlora:

        quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",  # NormalFloat4 quantization (best for LLMs)
        bnb_4bit_compute_dtype=torch.float16,  # Use FP16 for computation
        bnb_4bit_use_double_quant=True,  # Further reduces memory usage
        )

        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=2,
            quantization_config=quantization_config,  # Use the correct config format
            device_map="auto",
        )
        
        lora_config = LoraConfig(
            r=lora_r,  
            lora_alpha=lora_alpha, 
            lora_dropout=0.1,
            bias="none",
            target_modules=[f"deberta.encoder.layer.{i}.attention.self.in_proj" for i in range(12)]
         )
        model = get_peft_model(model, lora_config)
        print("using qlora")
    else:
        model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=2
    )

    train_encodings = tokenizer(
        train_texts,
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors="pt"
    )

    val_encodings = tokenizer(
        val_texts,
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors="pt"
    )

    train_labels = torch.tensor(train_labels.to_numpy())
    val_labels = torch.tensor(val_labels.to_numpy())

    train_dataset = TensorDataset(
        train_encodings['input_ids'],
        train_encodings['attention_mask'],
        train_labels
    )

    val_dataset = TensorDataset(
        val_encodings['input_ids'],
        val_encodings['attention_mask'],
        val_labels
    )

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size*2)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=2e-5)

    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0.05*total_steps,
        num_training_steps=total_steps
    )

    train_losses = []
    val_accuracies = []
    
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        batch_number = 0
        batch_start = datetime.now()

        for batch in train_loader:
            batch_number += 1
            if batch_number%1000 == 0:
                print(f"{batch_number} batches have completed in epoch {epoch}")
                print(f"time taken: {datetime.now()-batch_start}")
                batch_start = datetime.now()

                if batch_number%10000== 0:
                    path = f"/kaggle/working/saves/deberta{batch//10000}_{epoch}"
                    os.makedirs(model_path, exist_ok=True)
                    model.save_pretrained(path)
                    tokenizer.save_pretrained(path)

            optimizer.zero_grad()

            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            loss.backward()
            optimizer.step()
            scheduler.step()

            epoch_loss += loss.item()

        avg_train_loss = epoch_loss / len(train_loader)
        train_losses.append(avg_train_loss)

        model.eval()
        val_accuracy = 0

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch[0].to(device)
                attention_mask = batch[1].to(device)
                labels = batch[2].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                predictions = torch.argmax(outputs.logits, dim=-1)
                val_accuracy += (predictions == labels).sum().item()

        val_accuracy /= len(val_dataset)
        val_accuracies.append(val_accuracy)

        print(f"Epoch {epoch+1}/{epochs} - Loss: {avg_train_loss:.4f} - Val Accuracy: {val_accuracy:.4f}")

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(train_losses)
    plt.title('Training Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')

    plt.subplot(1, 2, 2)
    plt.plot(val_accuracies)
    plt.title('Validation Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')

    plt.tight_layout()
    plt.savefig('deberta_training_progress.png')

    model_path = "./finetuned-deberta-url-classifier"
    model.save_pretrained(model_path)
    tokenizer.save_pretrained(model_path)

    return model, tokenizer, model_path

In [None]:
deberta_model, tokenizer, model_path = finetune_deberta(
        data['url'], data['type'], model_name="microsoft/deberta-base", epochs=3, qlora=True,lora_r=8,lora_alpha=32,batch_size=16
    )

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using qlora


In [5]:
model_path = "deberta_v3-1"
base_model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-v3-base")
model = PeftModel.from_pretrained(base_model, model_path, is_trainable=False)
tokenizer = AutoTokenizer.from_pretrained(model_path)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PeftModel(
  (base_model): LoraModel(
    (model): DebertaV2ForSequenceClassification(
      (deberta): DebertaV2Model(
        (embeddings): DebertaV2Embeddings(
          (word_embeddings): Embedding(128100, 768, padding_idx=0)
          (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): DebertaV2Encoder(
          (layer): ModuleList(
            (0-11): 12 x DebertaV2Layer(
              (attention): DebertaV2Attention(
                (self): DisentangledSelfAttention(
                  (query_proj): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_features=8, bias=False)
                    )
           

In [6]:
data = pd.read_csv("train_data.csv")
X = [preprocess_url(url) for url in data['url']]
y = [label for label in data['type']]
batch_size = 16
dataloader = DataLoader(list(zip(X, y)), batch_size=batch_size, shuffle=False)

In [11]:
print(torch.cuda.get_device_name(0))
x = torch.rand(1).to("cuda")
print(x.device)
print(next(model.parameters()).device)
torch.backends.cudnn.benchmark = True

NVIDIA GeForce RTX 3050 Laptop GPU
cuda:0
cuda:0


In [12]:
model.eval()
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch_texts, batch_labels in dataloader:
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
        outputs = model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1).tolist()

        all_predictions.extend(predictions)
        all_labels.extend(batch_labels)


In [13]:
#To improve the training speed, I am extracting the outputs of the DeBert model and storing it to disk for later use
data["bert_output"] = all_predictions
data.to_csv("bertDataV3.csv")

In [14]:
def extract_url_features(url,bert_output):
    url = url.strip().lower()
    protocol = 1 if urlparse(url).scheme == 'https' else 0
    url = re.sub(r"https?://","",url)
    parts = url.split("/",1)
    domain = parts[0]
    path = parts[1] if len(parts)>1 else ""

    features = {
        "domain_length" : len(domain),
        "subdomains" : domain.count('.'),
        'num_dots': url.count('.'),
         'num_equals': url.count('='),
         'protocol': protocol,
        "missing_chars": hasMisleadingChars(url),
        'bert_output': bert_output
    }

    return np.array(list(features.values()))

In [15]:
nn_data = pd.read_csv("./bertDataV3.csv")
nn_data['type']  = nn_data['type'].map(lambda x: 1 if x == 'benign' else 0)

In [16]:
nn_data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,url,type,bert_output
0,0,131349,youtube.com/watch?v=PLXMkdAXDZw,1,0
1,1,132471,http://www.controlesuasvendas.com/controle-de-...,0,1
2,2,400199,manta.com/c/mmf3v8b/commerce-bancshares-inc,1,0
3,3,63241,http://www.osn-solutions.nl/index.php/nl.1,0,0
4,4,296957,www.ectc.org/resources/ClassRECP04.pdf,0,1


In [17]:
from xgboost import XGBClassifier
X = nn_data.apply(lambda row: extract_url_features(row['url'], row['bert_output']), axis=1).tolist()
y = nn_data['type'].tolist()
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=0,test_size=0.15)

In [18]:
len(X_train),len(X_test)

(271624, 47934)

In [19]:
xgb = XGBClassifier()
xgb.fit(X_train,y_train)

In [None]:
from sklearn.metrics import accuracy_score,precision_recall_fscore_support,classification_report
predictions = xgb.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, predictions, average="binary")
report = classification_report(y_test, predictions, target_names=["Negative", "Positive"])

# Print results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print("\nClassification Report:\n", report)

In [21]:
#  removing bert from gpu memory
del model
torch.cuda.empty_cache()

In [25]:
print(xgb.feature_importances_)

[0.01779902 0.12762356 0.03104141 0.02038754 0.11529641 0.01909503
 0.668757  ]


The bert output is overwhelmingly important <br>
Subdomains and protocol have some influence<br>
Rest of the features don't have much influence.<br>
Misleading characters is only present in a very small part of the dataset,so its importance is undermined<br>
Protocol is https in majority of the database.<br>
domain_length, subdomains and numdots are not influencing the output.<br>

<h2>Multi Input Neural network</h2>
As bert_output can overshadow the importance of other features, we first 