In [26]:
import os

import numpy as np
import pandas as pd

RANDOM_STATE = 42

# 1. LOAD DATA
# Recent phishing (label 1)
recent_phish = pd.read_csv("verified_online.csv")
if "url" in recent_phish.columns:
    recent_phish = recent_phish["url"]
elif recent_phish.shape[1] == 1:
    # Single column under another header: use it as the URL column. Without this
    # branch the whole DataFrame would be passed as the "url" value below.
    recent_phish = recent_phish.iloc[:, 0]
else:
    raise KeyError(
        "verified_online.csv has no 'url' column; found columns: "
        + ", ".join(map(str, recent_phish.columns))
    )
recent_phish_df = pd.DataFrame({"url": recent_phish, "label": 1})

# Old data with labels (0/1); a stray saved index column is dropped if present
old_data = pd.read_csv("more_data.csv").drop(columns=["Unnamed: 0"], errors="ignore")
old_phish_df = old_data[old_data["label"] == 1][["url"]].copy()
old_phish_df["label"] = 1
old_benign_df = old_data[old_data["label"] == 0][["url"]].copy()
old_benign_df["label"] = 0

# Benign from Tranco top 100k: the file has no header row, so name the columns here.
# Each domain is turned into a bare "https://<domain>" URL.
tranco = pd.read_csv("top-1m.csv", header=None, names=["rank", "domain"])
benign_urls_tranco = "https://" + tranco.iloc[:100000]["domain"].astype(str).str.strip()
benign_tranco_df = pd.DataFrame({"url": benign_urls_tranco, "label": 0})

# Combine benign sources (Tranco + old benign)
benign_df = pd.concat([benign_tranco_df, old_benign_df], ignore_index=True)


# 2. DEDUP & CLEAN COLLISIONS
# Drop rows whose URL is missing before anything else: drop_duplicates would keep
# one NaN row per pool, and a missing URL cannot be featurized later.
recent_phish_df = recent_phish_df.dropna(subset=["url"])
old_phish_df    = old_phish_df.dropna(subset=["url"])
benign_df       = benign_df.dropna(subset=["url"])

# Deduplicate within each pool (exact string match on the URL)
recent_phish_df = recent_phish_df.drop_duplicates(subset="url")
old_phish_df    = old_phish_df.drop_duplicates(subset="url")
benign_df       = benign_df.drop_duplicates(subset="url")

# Remove any benign URLs that accidentally appear in phishing sets
phish_urls_all = pd.concat([recent_phish_df[["url"]], old_phish_df[["url"]]])["url"].unique()
benign_df = benign_df[~benign_df["url"].isin(phish_urls_all)].copy()

# Remove overlap between old + recent phishing (prioritize "recent" as phishing)
recent_urls = set(recent_phish_df["url"])
old_phish_df = old_phish_df[~old_phish_df["url"].isin(recent_urls)].copy()

print("Recent phishing:", len(recent_phish_df))
print("Old phishing   :", len(old_phish_df))
print("Benign total   :", len(benign_df))


# 3. SPLIT RECENT PHISH INTO TRAIN / VAL / TEST
# Shuffle once with the fixed seed, then cut the shuffled frame into three
# contiguous slices laid out as [ val | test | train ].
recent_phish_df = (
    recent_phish_df
    .sample(frac=1, random_state=RANDOM_STATE)
    .reset_index(drop=True)
)
n_recent = len(recent_phish_df)

# Tunable split fractions; whatever is left after val + test goes to training
val_frac  = 0.10
test_frac = 0.20
n_val_phish  = int(n_recent * val_frac)
n_test_phish = int(n_recent * test_frac)
n_train_recent_phish = n_recent - (n_val_phish + n_test_phish)
if not n_train_recent_phish > 0:
    raise ValueError("Not enough recent phishing URLs for chosen splits.")

# Slice boundaries inside the shuffled frame
val_end  = n_val_phish
test_end = val_end + n_test_phish

val_recent_df   = recent_phish_df.iloc[:val_end]
test_recent_df  = recent_phish_df.iloc[val_end:test_end]
train_recent_df = recent_phish_df.iloc[test_end:]

print("\nRecent phishing split:")
for caption, part in (
    ("  Train recent phish:", train_recent_df),
    ("  Val phish         :", val_recent_df),
    ("  Test phish        :", test_recent_df),
):
    print(caption, len(part))


# 4. BUILD PHISH TRAIN POOL
# All old phishing plus the recent phishing left over after val/test,
# stacked together and shuffled with the fixed seed.
phish_train_pool = (
    pd.concat([old_phish_df, train_recent_df], ignore_index=True)
    .sample(frac=1, random_state=RANDOM_STATE)
    .reset_index(drop=True)
)
print("\nTotal phishing available for TRAIN:", len(phish_train_pool))

# 5. ALLOCATE BENIGN TO VAL & TEST FIRST (NO OVERLAP)
# The shuffled benign pool is cut into three consecutive, non-overlapping slices:
# [ val | test | remaining-for-train ]. Each quota is capped by what is still left.
benign_df = benign_df.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)
n_benign_total = len(benign_df)

# VAL: 1 : 5 (phish : benign)
n_val_benign_target = 5 * len(val_recent_df)
n_val_benign = min(n_val_benign_target, n_benign_total)

# TEST: 1 : 10 (phish : benign), taken from what VAL did not use
n_test_benign_target = 10 * len(test_recent_df)
n_test_benign = min(n_test_benign_target, n_benign_total - n_val_benign)

benign_val_end  = n_val_benign
benign_test_end = benign_val_end + n_test_benign

val_benign_df    = benign_df.iloc[:benign_val_end].copy()
test_benign_df   = benign_df.iloc[benign_val_end:benign_test_end].copy()
benign_remaining = benign_df.iloc[benign_test_end:].copy()

print("\nVAL benign:", len(val_benign_df), " (target:", n_val_benign_target, ")")
print("TEST benign:", len(test_benign_df), " (target:", n_test_benign_target, ")")
print("Benign remaining for TRAIN:", len(benign_remaining))


# 6. BUILD TRAIN SET (1:1 BALANCED)
# The smaller of the two pools decides how many phish/benign pairs can be formed;
# both pools are already shuffled, so taking the leading rows is a random draw.
max_train_pairs = min(len(phish_train_pool), len(benign_remaining))

train_phish_df  = phish_train_pool.head(max_train_pairs).copy()
train_benign_df = benign_remaining.head(max_train_pairs).copy()

print("\nTRAIN phishing:", len(train_phish_df))
print("TRAIN benign  :", len(train_benign_df))


# 7. CONCAT & SHUFFLE FINAL SPLITS
def _stack_and_shuffle(phish_part, benign_part):
    """Stack the phishing and benign rows of one split and shuffle them with the fixed seed."""
    stacked = pd.concat([phish_part, benign_part], ignore_index=True)
    return stacked.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)


train_df = _stack_and_shuffle(train_phish_df, train_benign_df)
val_df   = _stack_and_shuffle(val_recent_df, val_benign_df)
test_df  = _stack_and_shuffle(test_recent_df, test_benign_df)

print("\nFINAL SPLIT SIZES:")
for caption, split_frame in (
    ("  TRAIN:", train_df),
    ("  VAL  :", val_df),
    ("  TEST :", test_df),
):
    n_phish  = (split_frame['label'] == 1).sum()
    n_benign = (split_frame['label'] == 0).sum()
    print(caption, len(split_frame), " (phish:", n_phish,
          ", benign:", n_benign, ")")


# 8. SAVE
# Create the output directory first so the cell also works on a fresh checkout.
BASIC_DIR = "../processed_data/basic_data/"
os.makedirs(BASIC_DIR, exist_ok=True)

train_df.to_csv(BASIC_DIR + "train.csv", index=False)
val_df.to_csv(BASIC_DIR + "val.csv", index=False)
test_df.to_csv(BASIC_DIR + "testA.csv", index=False)

# Report the file names that were actually written (the test split is testA.csv)
print("\nSaved train.csv, val.csv, testA.csv")


Recent phishing: 46911
Old phishing   : 114299
Benign total   : 492895

Recent phishing split:
  Train recent phish: 32838
  Val phish         : 4691
  Test phish        : 9382

Total phishing available for TRAIN: 147137

VAL benign: 23455  (target: 23455 )
TEST benign: 93820  (target: 93820 )
Benign remaining for TRAIN: 375620

TRAIN phishing: 147137
TRAIN benign  : 147137

FINAL SPLIT SIZES:
  TRAIN: 294274  (phish: 147137 , benign: 147137 )
  VAL  : 28146  (phish: 4691 , benign: 23455 )
  TEST : 103202  (phish: 9382 , benign: 93820 )

Saved train.csv, val.csv, test.csv


In [27]:
import pandas as pd
import re
from urllib.parse import urlparse
from math import log2
import os


# Locations: RAW_DIR holds the basic train/val/testA splits that are read below,
# OUT_DIR receives the engineered versions. Both end in "/" because file names
# are appended by plain string concatenation.
RAW_DIR = "../processed_data/basic_data/"
OUT_DIR = "../processed_data/engineered_data/"

# Only the output location is created; the input location is expected to exist.
os.makedirs(name=OUT_DIR, exist_ok=True)

# Substrings whose presence anywhere in the lower-cased URL sets keyword_flag
SUSPICIOUS_KEYWORDS = [
    "login",
    "verify",
    "account",
    "update",
    "secure",
    "bank",
    "signin",
    "password",
    "confirm",
    "safe",
]

# TLDs (with leading dot) that set malicious_tld_flag
malicious_tlds = {
    ".cf",
    ".ga",
    ".gq",
    ".info",
    ".ml",
    ".ru",
    ".tk",
    ".top",
    ".xyz",
}

# Feature Functions
def is_ip(domain):
    """Return True when the host part of ``domain`` is a dotted-quad IPv4 address.

    ``domain`` may be a full netloc: an optional ``user[:password]@`` prefix and an
    optional ``:port`` suffix are ignored, so ``"1.2.3.4:8080"`` counts as an IP.
    Each octet must be 1-3 ASCII digits with a value of at most 255.
    Empty or ``None`` input returns False. IPv6 literals are not recognised.
    """
    if not domain:
        return False
    host = domain.rsplit("@", 1)[-1]   # drop userinfo, if any
    host = host.split(":", 1)[0]       # drop port, if any
    # fullmatch (not match + "$") so a trailing newline cannot slip through
    if not re.fullmatch(r"[0-9]{1,3}(?:\.[0-9]{1,3}){3}", host):
        return False
    return all(int(octet) <= 255 for octet in host.split("."))

def entropy(s):
    """Shannon entropy (in bits) of the character distribution of ``s``.

    Returns 0 for an empty/falsy input.
    """
    if not s:
        return 0
    total = len(s)
    weighted_logs = 0
    for ch in set(s):
        prob = s.count(ch) / total
        weighted_logs += prob * log2(prob)
    return -weighted_logs


def count_subdomains(domain):
    """Number of dot-separated labels in ``domain`` beyond the last two, never negative.

    An empty/falsy ``domain`` yields 0.
    """
    if domain:
        # n dots -> n + 1 labels; the last two labels are not counted
        extra_labels = domain.count(".") - 1
        return extra_labels if extra_labels > 0 else 0
    return 0


def domain_features(url):
    """Return ``pd.Series([domain_length_t2, malicious_tld_flag])`` for ``url``.

    * ``domain_length_t2`` is the length of the raw netloc (including any port or
      userinfo).
    * ``malicious_tld_flag`` is 1 when the host's last label, prefixed with a dot,
      is in ``malicious_tlds``. The check uses ``parsed.hostname`` (lower-cased,
      without port/userinfo), so ``EXAMPLE.TK:8080`` is flagged like ``example.tk``.

    URLs with no netloc, and URLs that ``urlparse`` rejects with ``ValueError``,
    yield ``[0, 0]`` rather than aborting a whole ``Series.apply``.
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        return pd.Series([0, 0])

    domain = parsed.netloc
    if not domain:
        return pd.Series([0, 0])  # domain_length_t2, malicious_tld_flag

    domain_len = len(domain)

    # hostname is None when the netloc has no host; rstrip drops a root-label dot
    host = (parsed.hostname or "").rstrip(".")
    tld = "." + host.rsplit(".", 1)[-1]
    tld_flag = 1 if tld in malicious_tlds else 0

    return pd.Series([domain_len, tld_flag])


def encoded_char_flag(url):
    """Return 1 if ``url`` contains one of a fixed set of percent-escapes, else 0.

    The escapes checked are space, ``/``, ``=``, ``?``, ``@`` and ``%``.
    Hex digits in percent-escapes are case-insensitive, so the URL is upper-cased
    before matching; ``%2f`` is detected just like ``%2F``.
    """
    encodings = ("%20", "%2F", "%3D", "%3F", "%40", "%25")
    url_upper = url.upper()
    return 1 if any(e in url_upper for e in encodings) else 0


def path_length_tier2(url):
    """Length of the path component of ``url``; 0 when there is no path."""
    # len("") is already 0, so an empty path needs no special case
    return len(urlparse(url).path)


def extract_tier1_features(url):
    """Extract the 13 Level-1 (TIER-1) lexical features of ``url``.

    Returns a ``pd.Series`` whose positions are, in order: url_length, num_digits,
    num_special_chars, dot_count, hyphen_in_domain, at_symbol, double_slash,
    subdomain_count, domain_length, path_length, ip_flag, keyword_flag,
    url_entropy.

    If ``urlparse`` fails on the input, a Series of 13 ``None`` values is returned
    so that one bad row does not abort a whole ``Series.apply``.
    """
    try:
        parsed = urlparse(url)
    except Exception:
        # Not a bare ``except:`` so KeyboardInterrupt/SystemExit still propagate
        return pd.Series([None] * 13)

    domain = parsed.netloc
    path = parsed.path
    url_lower = url.lower()  # computed once for the keyword scan

    url_length        = len(url)
    num_digits        = sum(c.isdigit() for c in url)
    num_special_chars = sum(not c.isalnum() for c in url)
    dot_count         = url.count(".")
    hyphen_in_domain  = 1 if "-" in domain else 0
    at_symbol         = 1 if "@" in url else 0
    # more than one "//" means there is one beyond the scheme separator
    double_slash      = 1 if url.count("//") > 1 else 0

    # Clamped at 0: a single-label host (no dots) has zero subdomains, not -1.
    # This matches the clamping done by count_subdomains.
    subdomain_count   = max(0, domain.count(".") - 1) if domain else 0
    domain_length     = len(domain)
    path_length       = len(path)
    ip_flag           = 1 if is_ip(domain) else 0
    keyword_flag      = 1 if any(k in url_lower for k in SUSPICIOUS_KEYWORDS) else 0
    url_entropy       = entropy(url)

    return pd.Series([
        url_length, num_digits, num_special_chars, dot_count,
        hyphen_in_domain, at_symbol, double_slash,
        subdomain_count, domain_length, path_length,
        ip_flag, keyword_flag, url_entropy
    ])

# Column names for the Series returned by extract_tier1_features, in the same order
TIER1_COLS = [
    # counts over the whole URL
    "url_length", "num_digits", "num_special_chars", "dot_count",
    # binary flags
    "hyphen_in_domain", "at_symbol", "double_slash",
    # domain / path measurements
    "subdomain_count_t1", "domain_length_t1", "path_length_t1",
    # remaining flags and entropy
    "ip_flag", "keyword_flag", "url_entropy_t1",
]



# APPLY FEATURES
def apply_features(df):
    """Return a copy of ``df`` with the Tier-1 and Tier-2 URL feature columns appended.

    The input frame must have a ``url`` column and is not modified.
    """
    enriched = df.copy()

    # --- Tier 1: one Series per URL, expanded into a frame and renamed ---
    tier1_frame = enriched["url"].apply(extract_tier1_features)
    tier1_frame.columns = TIER1_COLS
    enriched = pd.concat([enriched, tier1_frame], axis=1)

    # --- Tier 2: columns are added in this fixed order ---
    urls = enriched["url"]
    enriched["url_entropy"] = urls.apply(entropy)
    enriched["subdomain_count"] = urls.apply(
        lambda raw_url: count_subdomains(urlparse(raw_url).netloc)
    )

    # domain_features yields two values per URL, which land in two columns
    domain_pairs = urls.apply(domain_features)
    enriched[["domain_length_t2", "malicious_tld_flag"]] = domain_pairs

    enriched["encoded_flag"] = urls.apply(encoded_char_flag)
    enriched["path_length_t2"] = urls.apply(path_length_tier2)

    return enriched


# LOAD SPLITS
# Read the three basic splits produced earlier, in the order train, val, testA
train, val, testA = (
    pd.read_csv(RAW_DIR + file_name)
    for file_name in ("train.csv", "val.csv", "testA.csv")
)

print("Loaded:")
for loaded_frame, tag in ((train, "train"), (val, "val"), (testA, "testA")):
    print(len(loaded_frame), tag)


# APPLY TO ALL SPLITS
# Features are computed row by row from the URL, independently for each split
train_eng, val_eng, test_eng = map(apply_features, (train, val, testA))


# SAVE
# Engineered files keep the same names as their inputs, under OUT_DIR
for engineered_frame, file_name in (
    (train_eng, "train.csv"),
    (val_eng, "val.csv"),
    (test_eng, "testA.csv"),
):
    engineered_frame.to_csv(OUT_DIR + file_name, index=False)

print("\nSaved engineered data to:", OUT_DIR)


Loaded:
294274 train
28146 val
103202 testA

Saved engineered data to: ../processed_data/engineered_data/


In [35]:
# Load the PhiUSIIL dataset and lower-case its column names so they can be
# addressed as "url" / "label".
df = pd.read_csv("PhiUSIIL_Phishing_URL_Dataset.csv")
df.columns = df.columns.map(str.lower)

# Class balance of the raw labels.
# NOTE(review): the dataset's UCI description says label 1 = legitimate and
# 0 = phishing, the reverse of the 1 = phishing convention used for
# train/val/testA in this notebook; confirm before scoring on it.
df["label"].value_counts()

label
1    134850
0    100945
Name: count, dtype: int64

In [37]:
# Keep only the two columns shared with the train/val/testA files.
df = df[["url", "label"]]

# PhiUSIIL encodes label 1 = legitimate and 0 = phishing (per the dataset's UCI
# description), the reverse of the 1 = phishing convention used when building
# train/val/testA earlier in this notebook. Invert the labels so testB is scored
# with the same positive class. `df` itself keeps the raw labels, so re-running
# this cell does not flip them twice.
# NOTE(review): if a downstream consumer already inverts testB labels, remove that step.
testB = df.assign(label=1 - df["label"])

# index=False matches the other saved splits and avoids writing the row index
# as an extra unnamed column.
testB.to_csv("../processed_data/basic_data/testB.csv", index=False)