In [1]:
import pandas as pd
import random

# Load datasets
# phish_df: phishing report CSV; only its "url" column is used further down.
phish_df = pd.read_csv("data/raw_data/verified_online.csv")
# tranco: read with header=None, so the two column names are supplied here.
tranco = pd.read_csv("data/raw_data/top-1m.csv", header=None, names=["rank", "domain"])

# Take top 100k domains
# (first 100,000 rows in file order -- presumably already sorted by rank; TODO confirm)
benign_domains = tranco.iloc[:100000]["domain"]
# NOTE: benign_urls / benign_df built here are provisional. Both names are
# reassigned in the next cell, where each domain is wrapped in a random pattern.
benign_urls = "https://" + benign_domains
benign_df = pd.DataFrame({"url": benign_urls, "label": 0})


phish_df.head()

Unnamed: 0,phish_id,url,phish_detail_url,submission_time,verified,verification_time,online,target
0,9289582,https://hlaly.rouseauhealth.com/qiwlr,http://www.phishtank.com/phish_detail.php?phis...,2025-12-13T05:33:02+00:00,yes,2025-12-13T06:02:44+00:00,yes,Other
1,9289552,https://allegro.pl-oferta814932.sbs,http://www.phishtank.com/phish_detail.php?phis...,2025-12-13T05:23:29+00:00,yes,2025-12-13T05:41:57+00:00,yes,Allegro
2,9289520,https://hinnf.rouseauhealth.com/qiwlr,http://www.phishtank.com/phish_detail.php?phis...,2025-12-13T05:05:55+00:00,yes,2025-12-13T05:12:05+00:00,yes,Other
3,9289519,http://allegrolokalnie.oferty-kategoria-propon...,http://www.phishtank.com/phish_detail.php?phis...,2025-12-13T05:00:04+00:00,yes,2025-12-13T05:02:53+00:00,yes,Allegro
4,9289518,http://allegrolokalnie.oferty-kategoria-propon...,http://www.phishtank.com/phish_detail.php?phis...,2025-12-13T04:59:59+00:00,yes,2025-12-13T05:02:53+00:00,yes,Allegro


In [2]:
# URL templates used to turn a bare benign domain into a full URL.
# "{}" is replaced by the domain.
patterns = [
    "https://{}",
    "https://www.{}/",
    "https://{}/home",
    "https://{}/login",
    "https://{}/account",
    "https://www.{}/products",
]

def generate_url(domain, rng=None):
    """Wrap a bare domain in one randomly chosen template from ``patterns``.

    Parameters
    ----------
    domain : str
        Domain to substitute into the template.
    rng : object with a ``choice(seq)`` method, optional
        Source of randomness, e.g. ``random.Random(42)``. Passing a seeded
        instance makes the generated benign URLs reproducible across kernel
        restarts. When omitted, the module-level ``random`` generator is used,
        exactly as before.

    Returns
    -------
    str
        The formatted URL.
    """
    chooser = random if rng is None else rng
    return chooser.choice(patterns).format(domain)

# Benign set: one synthetic URL per benign domain, labelled 0.
benign_urls = [generate_url(d) for d in benign_domains]
benign_df = pd.DataFrame({"url": benign_urls, "label": 0})

# Select only URL column
phish_df = phish_df[["url"]].copy()

# Drop rows with missing URLs
phish_df = phish_df.dropna(subset=["url"])

# Normalise BEFORE validating. Previously the scheme check ran first, so a URL
# with leading whitespace or an upper-case scheme was discarded even though
# the clean-up below would have repaired it.
phish_df["url"] = (
    phish_df["url"]
    .astype(str)
    .str.strip()
    .str.replace(" ", "", regex=False)  # literal space removal, not a regex
    .str.lower()
)

# Keep only rows that really start with http:// or https://.
# (startswith(("http", "https")) also accepted strings such as "httpfoo".)
# .copy() gives an independent frame so the label assignment below does not
# write into a slice of the unfiltered data.
phish_df = phish_df[phish_df["url"].str.match(r"https?://")].copy()

# Add label = 1 for phishing
phish_df["label"] = 1

print(len(phish_df))

phish_df.head()


46919


Unnamed: 0,url,label
0,https://hlaly.rouseauhealth.com/qiwlr,1
1,https://allegro.pl-oferta814932.sbs,1
2,https://hinnf.rouseauhealth.com/qiwlr,1
3,http://allegrolokalnie.oferty-kategoria-propon...,1
4,http://allegrolokalnie.oferty-kategoria-propon...,1


In [3]:
# Join datasets together
# Phishing rows (label 1) come first, then benign rows (label 0);
# ignore_index=True gives the combined frame a fresh 0..n-1 index.
df = pd.concat([phish_df, benign_df], ignore_index=True)
df.shape

(146919, 2)

## Tier 1 feature engineering

In [4]:
# Import all dependencies
import re
from urllib.parse import urlparse
from math import log2

# Suspicious keywords
# extract_features tests each entry as a plain substring of the lower-cased
# URL (keyword_flag), so e.g. "safe" also matches a URL containing "safety".
SUSPICIOUS_KEYWORDS = [
    "login", "verify", "account", "update", "secure",
    "bank", "signin", "password", "confirm", "safe"
]

# ---- Helper functions ----

def is_ip(domain):
    """Return True if ``domain`` is a dotted-quad IPv4 address.

    ``domain`` is expected to be a URL netloc, so an optional ``user@`` prefix
    and ``:port`` suffix are ignored before matching. Each of the four octets
    must be in the range 0-255. Empty input returns False.
    """
    if not domain:
        return False

    # A netloc may look like "user:pw@1.2.3.4:8080"; keep only the host part.
    host = domain.rsplit("@", 1)[-1].split(":", 1)[0]

    # fullmatch (rather than match + "$") so a trailing newline is rejected.
    if re.fullmatch(r"\d{1,3}(?:\.\d{1,3}){3}", host) is None:
        return False

    return all(int(octet) <= 255 for octet in host.split("."))

def entropy(s):
    """Shannon entropy (in bits) of the character distribution of ``s``.

    Returns 0 for an empty string.
    """
    n = len(s)
    if n == 0:
        return 0
    # Relative frequency of every distinct character.
    freqs = (s.count(ch) / n for ch in set(s))
    return -sum(f * log2(f) for f in freqs)

def extract_features(url):
    """Extract Level-1 (lexical) features from a URL.

    Returns a positional ``pd.Series`` of 13 values in this order:
    url_length, num_digits, num_special_chars, dot_count, hyphen_in_domain,
    at_symbol, double_slash, subdomain_count, domain_length, path_length,
    ip_flag, keyword_flag, url_entropy.

    If the URL cannot be parsed, a Series of 13 ``None`` placeholders is
    returned so the row still lines up with the 13 feature columns.
    """
    n_features = 13  # must equal the number of values returned at the bottom

    try:
        parsed = urlparse(url)
    except (ValueError, TypeError, AttributeError):
        # ValueError is what urlparse raises for a malformed netloc (e.g. an
        # unbalanced "["). TypeError/AttributeError are caught too so that a
        # non-string cell stays best-effort instead of aborting the apply.
        # The placeholder previously had 15 entries, not 13.
        return pd.Series([None] * n_features)

    domain = parsed.netloc
    path = parsed.path

    # Basic numeric features
    url_length        = len(url)
    num_digits        = sum(c.isdigit() for c in url)
    num_special_chars = sum(not c.isalnum() for c in url)
    dot_count         = url.count(".")
    hyphen_in_domain  = 1 if "-" in domain else 0
    at_symbol         = 1 if "@" in url else 0
    # More than one "//" means a second one appears after the scheme separator.
    double_slash      = 1 if url.count("//") > 1 else 0

    # Domain / Path features
    # Clamp at 0: a dot-less host such as "localhost" would otherwise yield -1.
    subdomain_count   = max(0, domain.count(".") - 1) if domain else 0
    domain_length     = len(domain)
    path_length       = len(path)

    # IP address check
    ip_flag = 1 if is_ip(domain) else 0

    # Suspicious keywords (substring match; lower-case the URL only once)
    url_lower = url.lower()
    keyword_flag = 1 if any(k in url_lower for k in SUSPICIOUS_KEYWORDS) else 0

    # Entropy
    url_entropy = entropy(url)

    return pd.Series([
        url_length,
        num_digits,
        num_special_chars,
        dot_count,
        hyphen_in_domain,
        at_symbol,
        double_slash,
        subdomain_count,
        domain_length,
        path_length,
        ip_flag,
        keyword_flag,
        url_entropy
    ])

# ---- Apply to dataset ----

# Column names for the values produced by extract_features, listed in the same
# order as the pd.Series that function builds. Keep the two in sync.
TIER1_COLS = [
    "url_length",
    "num_digits",
    "num_special_chars",
    "dot_count",
    "hyphen_in_domain",
    "at_symbol",
    "double_slash",
    "subdomain_count",
    "domain_length",
    "path_length",
    "ip_flag",
    "keyword_flag",
    "url_entropy"
]

# One extract_features call per URL; the resulting columns are written into df.
df[TIER1_COLS] = df["url"].apply(extract_features)

df.head()


Unnamed: 0,url,label,url_length,num_digits,num_special_chars,dot_count,hyphen_in_domain,at_symbol,double_slash,subdomain_count,domain_length,path_length,ip_flag,keyword_flag,url_entropy
0,https://hlaly.rouseauhealth.com/qiwlr,1,37.0,0.0,6.0,2.0,0.0,0.0,0.0,1.0,23.0,6.0,0.0,0.0,4.067165
1,https://allegro.pl-oferta814932.sbs,1,35.0,6.0,6.0,2.0,1.0,0.0,0.0,1.0,27.0,0.0,0.0,0.0,4.321721
2,https://hinnf.rouseauhealth.com/qiwlr,1,37.0,0.0,6.0,2.0,0.0,0.0,0.0,1.0,23.0,6.0,0.0,0.0,4.195676
3,http://allegrolokalnie.oferty-kategoria-propon...,1,73.0,5.0,10.0,2.0,1.0,0.0,0.0,1.0,66.0,0.0,0.0,0.0,4.394598
4,http://allegrolokalnie.oferty-kategoria-propon...,1,73.0,5.0,10.0,2.0,1.0,0.0,0.0,1.0,66.0,0.0,0.0,0.0,4.356859


## Tier 2 feature engineering

In [5]:
# 11. Subdomain count
def count_subdomains(domain):
    """Number of dot-separated labels beyond the last two; never negative.

    An empty or missing domain counts as 0.
    """
    if domain:
        # n labels are joined by n - 1 dots, so labels - 2 == dots - 1.
        return max(0, domain.count(".") - 1)
    return 0

# 13. Domain length + TLD flag
# ---------------------------
# TLDs treated as suspicious; compared against the lower-cased last host label.
malicious_tlds = {".ru", ".tk", ".ml", ".xyz", ".info", ".top", ".ga", ".gq", ".cf"}

def domain_features(url):
    """Return ``pd.Series([domain_length, malicious_tld_flag])`` for a URL.

    ``domain_length`` is the length of the netloc (userinfo and port included).
    ``malicious_tld_flag`` is 1 when the host's last label is in
    ``malicious_tlds``. The TLD is read from the parsed hostname, not the raw
    netloc, so a ``:port`` suffix, upper-case letters or a trailing dot no
    longer hide a listed TLD.

    A URL with no netloc yields ``[0, 0]``.
    """
    parsed = urlparse(url)
    domain = parsed.netloc

    if not domain:
        return pd.Series([0, 0])

    domain_len = len(domain)

    # hostname is lower-cased and stripped of userinfo/port by urllib;
    # it can be None for odd netlocs, hence the "or".
    host = (parsed.hostname or "").rstrip(".")

    # Extract TLD (.com, .net, .ru, etc.)
    tld = "." + host.rsplit(".", 1)[-1]

    tld_flag = 1 if tld in malicious_tlds else 0

    return pd.Series([domain_len, tld_flag])

# 14. Encoded characters flag
def encoded_char_flag(url):
    """Return 1 if the URL contains one of a fixed set of percent-escapes, else 0.

    Escapes checked: space, "/", "=", "?", "@" and "%". Hex digits in a
    percent-escape are case-insensitive, so the comparison is done on a
    lower-cased copy of the URL. This matters for URLs that were lower-cased
    during cleaning, where "%2F" has become "%2f".
    """
    encodings = ("%20", "%2f", "%3d", "%3f", "%40", "%25")
    lowered = url.lower()
    return 1 if any(code in lowered for code in encodings) else 0


# 15. Path length
def path_length(url):
    """Length of the URL's path component (0 when there is no path)."""
    return len(urlparse(url).path)

# ---------------------------
# Apply Tier-2 features
# NOTE: url_entropy is already written by the Tier-1 step using the same
# entropy() call on the full URL; this line recomputes and overwrites it.
df["url_entropy"] = df["url"].apply(entropy)

# Replaces the subdomain_count column written by the Tier-1 step with
# count_subdomains() applied to the URL's netloc.
df["subdomain_count"] = df["url"].apply(
    lambda x: count_subdomains(urlparse(x).netloc)
)
# domain_length_t2 is len(netloc), the same quantity as Tier-1 domain_length.
df[["domain_length_t2", "malicious_tld_flag"]] = df["url"].apply(domain_features)

df["encoded_flag"] = df["url"].apply(encoded_char_flag)

# path_length_t2 is len(path), the same quantity as Tier-1 path_length.
df["path_length_t2"] = df["url"].apply(path_length)


## Tier 3 feature engineering

In [22]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from tqdm import tqdm

# Load model
# Checkpoint name passed to both the tokenizer and the model loader.
MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Send model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Evaluation mode: the model is only run forward (under no_grad) in this notebook.
model.eval()

def get_batch_embeddings(text_batch):
    """Return the first-token (CLS) hidden state for every text in the batch.

    The batch is tokenized with padding and truncation to at most 128 tokens,
    run through the module-level ``model`` on ``device`` without gradient
    tracking, and the result is handed back as a NumPy array on the CPU with
    one row per input text.
    """
    encoded = tokenizer(
        text_batch,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128
    )
    encoded = encoded.to(device)

    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Position 0 along the sequence axis is the CLS token.
    return hidden_states[:, 0, :].cpu().numpy()


# ------------------------------
# Batched processing
# ------------------------------
# Number of URLs sent through the model per forward pass.
BATCH_SIZE = 64
urls = df["url"].tolist()

# One array per batch, collected in order so that row i of the stacked
# result corresponds to urls[i].
all_embeddings = []

for i in tqdm(range(0, len(urls), BATCH_SIZE), desc="Embedding URLs"):
    batch_urls = urls[i : i + BATCH_SIZE]
    batch_emb = get_batch_embeddings(batch_urls)
    all_embeddings.append(batch_emb)

# Stack all into final array
embeddings = np.vstack(all_embeddings)
# Saved relative to the current working directory.
np.save("embeddings.npy", embeddings)
print("Final embedding shape:", embeddings.shape)


Embedding URLs: 100%|██████████| 2296/2296 [06:27<00:00,  5.92it/s]


Final embedding shape: (146919, 768)


In [17]:
df.head()

Unnamed: 0,url,label,url_length,num_digits,num_special_chars,dot_count,hyphen_in_domain,at_symbol,double_slash,subdomain_count,domain_length,path_length,ip_flag,keyword_flag,url_entropy,domain_length_t2,malicious_tld_flag,encoded_flag,path_length_t2
0,https://hlaly.rouseauhealth.com/qiwlr,1,37.0,0.0,6.0,2.0,0.0,0.0,0.0,1,23.0,6.0,0.0,0.0,4.067165,23,0,0,6
1,https://allegro.pl-oferta814932.sbs,1,35.0,6.0,6.0,2.0,1.0,0.0,0.0,1,27.0,0.0,0.0,0.0,4.321721,27,0,0,0
2,https://hinnf.rouseauhealth.com/qiwlr,1,37.0,0.0,6.0,2.0,0.0,0.0,0.0,1,23.0,6.0,0.0,0.0,4.195676,23,0,0,6
3,http://allegrolokalnie.oferty-kategoria-propon...,1,73.0,5.0,10.0,2.0,1.0,0.0,0.0,1,66.0,0.0,0.0,0.0,4.394598,66,0,0,0
4,http://allegrolokalnie.oferty-kategoria-propon...,1,73.0,5.0,10.0,2.0,1.0,0.0,0.0,1,66.0,0.0,0.0,0.0,4.356859,66,0,0,0


## Evaluation utilities

In [15]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, average_precision_score, confusion_matrix, precision_recall_curve
)
import numpy as np

class MetricsCalculator:
    """Classification metrics plus confusion-matrix and precision-recall plots."""

    def __init__(self):
        """Nothing to configure; the instance carries no state."""

    def compute_metrics(self, y_true, y_pred, y_scores=None):
        """Return a dict with accuracy, precision, recall and F1.

        When ``y_scores`` is given, ``average_precision`` is added as well.
        """
        results = {
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred),
            'recall': recall_score(y_true, y_pred),
            'f1': f1_score(y_true, y_pred),
        }
        if y_scores is None:
            return results

        results['average_precision'] = average_precision_score(y_true, y_scores)
        return results

    def plot_confusion_matrix(self, y_true, y_pred, labels=None):
        """Draw the confusion matrix as an annotated seaborn heatmap.

        ``labels`` are used as tick labels on both axes.
        """
        matrix = confusion_matrix(y_true, y_pred)
        _, ax = plt.subplots(figsize=(6, 4))
        sns.heatmap(
            matrix,
            annot=True,
            fmt='d',
            cmap='Blues',
            cbar=False,
            xticklabels=labels,
            yticklabels=labels,
            ax=ax,
        )
        ax.set_ylabel('True label')
        ax.set_xlabel('Predicted label')
        ax.set_title('Confusion Matrix')
        plt.show()

    def plot_precision_recall_curve(self, y_true, y_scores):
        """Draw precision (y-axis) against recall (x-axis) for the given scores."""
        precision, recall, _ = precision_recall_curve(y_true, y_scores)
        _, ax = plt.subplots(figsize=(6, 4))
        ax.plot(recall, precision, marker='.')
        ax.set_xlabel('Recall')
        ax.set_ylabel('Precision')
        ax.set_title('Precision-Recall Curve')
        ax.grid(True)
        plt.show()

    def evaluate(self, y_true, y_pred, y_scores=None):
        """Print every metric to four decimals, then show the plots.

        The precision-recall curve is drawn only when ``y_scores`` is supplied.
        """
        print("Metrics:")
        for key, value in self.compute_metrics(y_true, y_pred, y_scores).items():
            print(f"{key}: {value:.4f}")

        # Confusion matrix is always shown.
        self.plot_confusion_matrix(y_true, y_pred, labels=["Benign", "Phishing"])

        # The curve needs continuous scores, so skip it without them.
        if y_scores is None:
            return
        self.plot_precision_recall_curve(y_true, y_scores)


In [16]:
df.columns

Index(['url', 'label', 'url_length', 'num_digits', 'num_special_chars',
       'dot_count', 'hyphen_in_domain', 'at_symbol', 'double_slash',
       'subdomain_count', 'domain_length', 'path_length', 'ip_flag',
       'keyword_flag', 'url_entropy', 'domain_length_t2', 'malicious_tld_flag',
       'encoded_flag', 'path_length_t2'],
      dtype='object')