<a href="https://colab.research.google.com/github/Dhanush-sai-reddy/ml-uci-phishing/blob/main/0.977.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:

import os
import re
import pickle
import warnings
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import kagglehub

warnings.filterwarnings("ignore")

# ============================
# 1. LOAD UCI DATASET
# ============================

def load_dataset():
    """Download the phishing dataset through KaggleHub and load its CSV.

    The dataset folder returned by ``kagglehub.dataset_download`` is scanned
    for ``.csv`` files and the first one in sorted filename order is read.

    Returns:
        pandas.DataFrame: the contents of the selected CSV file.

    Raises:
        FileNotFoundError: if the downloaded folder contains no CSV file.
    """
    print("Downloading UCI dataset from KaggleHub...")
    path = kagglehub.dataset_download("isatish/phishing-dataset-uci-ml-csv")

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # choice of file reproducible when the folder holds more than one CSV.
    csv_files = sorted(
        name for name in os.listdir(path) if name.lower().endswith(".csv")
    )

    if not csv_files:
        # FileNotFoundError is still an Exception, so existing handlers work.
        raise FileNotFoundError("No CSV found in downloaded dataset folder.")

    df = pd.read_csv(os.path.join(path, csv_files[0]))
    print("Dataset Loaded:", df.shape)
    return df


# ============================
# 2. TRAIN MODEL
# ============================

def train_model(df):
    """Train a soft-voting RandomForest + XGBoost ensemble and report accuracy.

    The ``Result`` column is the label: rows where it equals 1 become class 1
    and every other value becomes class 0. All remaining columns except
    ``id`` (if present) are used as features. The data is split 80/20 with
    stratification, and the test accuracy of each base model and of the
    ensemble is printed.

    Args:
        df: DataFrame containing the feature columns and a ``Result`` column.

    Returns:
        tuple: ``(fitted VotingClassifier, list of feature column names)``.
    """
    # Exclude the label and, when present, the 'id' column from the features.
    X = df.drop(columns=["Result"]).drop(columns=["id"], errors="ignore")
    y = (df["Result"] == 1).astype(int)  # convert {-1, 1} -> {0, 1}

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    voting = VotingClassifier(
        estimators=[
            ("rf", RandomForestClassifier(n_estimators=200, random_state=42)),
            (
                "xgb",
                XGBClassifier(
                    n_estimators=250,
                    max_depth=7,
                    learning_rate=0.1,
                    eval_metric="logloss",
                    random_state=42,
                ),
            ),
        ],
        voting="soft",
    )
    # VotingClassifier.fit clones and fits every base estimator itself, so
    # fitting them separately beforehand would train each model twice.
    voting.fit(X_train, y_train)

    # Fitted clones of the base estimators, used for the per-model report.
    rf = voting.named_estimators_["rf"]
    xgb = voting.named_estimators_["xgb"]

    print("RandomForest:", accuracy_score(y_test, rf.predict(X_test)))
    print("XGBoost:     ", accuracy_score(y_test, xgb.predict(X_test)))
    print("VotingClf:   ", accuracy_score(y_test, voting.predict(X_test)))

    return voting, X.columns.tolist()


# ============================
# 3. SAVE MODEL
# ============================

def save_model(model, filename="model.pkl"):
    """Pickle ``model`` to ``filename`` and print the path that was written.

    Args:
        model: any picklable object (here, the trained classifier).
        filename: destination path; it is opened in binary write mode, so an
            existing file is overwritten.
    """
    with open(filename, mode="wb") as handle:
        pickle.dump(model, handle)
    print(f"Model saved as: {filename}")


# ============================
# 4. FEATURE EXTRACTION (URL + HTML/JS)
# ============================

# Names of every feature column the extractor must produce.
UCI_FEATURES = [
    "having_IP_Address", "URL_Length", "Shortining_Service", "having_At_Symbol",
    "double_slash_redirecting", "Prefix_Suffix", "having_Sub_Domain",
    "SSLfinal_State", "Domain_registeration_length", "Favicon", "port",
    "HTTPS_token", "Request_URL", "URL_of_Anchor", "Links_in_tags", "SFH",
    "Submitting_to_email", "Abnormal_URL", "Redirect", "on_mouseover",
    "RightClick", "popUpWidnow", "Iframe", "age_of_domain", "DNSRecord",
    "web_traffic", "Page_Rank", "Google_Index", "Links_pointing_to_page",
    "Statistical_report"
]

# Hostnames (without a leading "www.") of URL-shortening services.
_SHORTENER_HOSTS = frozenset({"bit.ly", "goo.gl", "tinyurl.com", "is.gd", "t.co"})

# Dotted-quad host such as "198.54.23.11"; must match the whole hostname.
_IPV4_RE = re.compile(r"\d+\.\d+\.\d+\.\d+")

# A <meta http-equiv="refresh" ...> tag, quoted or unquoted, any case.
_META_REFRESH_RE = re.compile(
    r"<meta[^>]*http-equiv\s*=\s*[\"']?refresh", re.IGNORECASE
)


def _bucket(ratio, high, low):
    """Map a ratio to 1 (above ``high``), 0 (at least ``low``) or -1."""
    if ratio > high:
        return 1
    return 0 if ratio >= low else -1


def _external_ratio(urls, dom):
    """Return the fraction of ``urls`` that are absolute and do not mention ``dom``."""
    if not urls:
        return 0
    external = sum(
        1 for u in urls if u.startswith("http") and dom not in u.lower()
    )
    return external / len(urls)


def _fetch_page(url):
    """Download ``url`` and parse it; return ``(html, soup)`` or ``None`` on failure."""
    try:
        response = requests.get(url, timeout=4)
        html = response.text
        return html, BeautifulSoup(html, "lxml")
    except Exception:
        # Deliberately best-effort: any failure (network error, invalid URL,
        # unavailable parser) just means no HTML features can be computed.
        # Unlike a bare "except:", Ctrl-C and SystemExit still propagate.
        return None


def _html_features(html, soup, dom):
    """Compute the features that depend on the downloaded page content."""
    features = {}
    html_lower = html.lower()

    anchors = soup.find_all("a", href=True)
    imgs = soup.find_all("img")
    scripts = soup.find_all("script")
    links = soup.find_all("link")
    iframes = soup.find_all("iframe")
    forms = soup.find_all("form")

    # Request_URL: share of images/scripts loaded from another host.
    sources = [tag.get("src") or "" for tag in imgs + scripts]
    features["Request_URL"] = _bucket(_external_ratio(sources, dom), 0.61, 0.22)

    # URL_of_Anchor: share of anchors pointing to another host.
    hrefs = [a["href"] for a in anchors]
    features["URL_of_Anchor"] = _bucket(_external_ratio(hrefs, dom), 0.67, 0.31)

    # Links_in_tags: share of <script>/<link> resources from another host.
    resources = [
        tag.get("src") or tag.get("href") or "" for tag in scripts + links
    ]
    features["Links_in_tags"] = _bucket(_external_ratio(resources, dom), 0.61, 0.22)

    # SFH (server form handler), judged on the first form of the page.
    if not forms:
        features["SFH"] = 1
    else:
        action = (forms[0].get("action") or "").lower()
        if action in ("", "about:blank"):
            features["SFH"] = 1
        elif action.startswith("http") and dom not in action:
            # Absolute action on a different host.
            features["SFH"] = 1
        else:
            # Relative actions (e.g. "/login") stay on the same host, in line
            # with how relative links are treated by the ratio features.
            features["SFH"] = -1

    features["Submitting_to_email"] = 1 if "mailto:" in html else -1
    features["Abnormal_URL"] = -1 if dom in html_lower else 1
    features["Redirect"] = (
        1 if ("window.location" in html or _META_REFRESH_RE.search(html)) else -1
    )
    features["on_mouseover"] = 1 if "onmouseover" in html_lower else -1
    features["RightClick"] = 1 if "event.button==2" in html else -1
    features["popUpWidnow"] = 1 if "window.open" in html else -1
    features["Iframe"] = 1 if len(iframes) else -1
    return features


def extract_features(url):
    """Build the feature dict for ``url`` from the URL text and its HTML page.

    URL-based features are always computed. HTML-based features are computed
    only if the page can be downloaded and parsed. Every name in
    ``UCI_FEATURES`` that could not be computed is set to 0.

    NOTE(review): most features below use 1 for the suspicious condition,
    whereas ``SSLfinal_State`` uses 1 for an ``https`` scheme. Confirm both
    against the sign convention of the training data.

    Args:
        url: the URL to analyse, as a string.

    Returns:
        dict: feature name -> value in {-1, 0, 1}, one entry per name in
        ``UCI_FEATURES``.
    """
    features = {}
    p = urlparse(url)
    # Use the bare hostname: netloc may also carry "user:pass@" and ":port".
    host = p.hostname or ""
    # Strip only a leading "www."; a replace() would also hit inner matches.
    dom = host[4:] if host.startswith("www.") else host

    # ---------- URL-based ----------
    # fullmatch: a name that merely starts with digits and dots is not an IP.
    features["having_IP_Address"] = 1 if _IPV4_RE.fullmatch(host) else -1

    L = len(url)
    features["URL_Length"] = 1 if L > 75 else (0 if L >= 54 else -1)

    # Compare the hostname, not the whole URL: a substring search for "t.co"
    # would also flag hosts such as "microsoft.com".
    features["Shortining_Service"] = 1 if dom in _SHORTENER_HOSTS else -1
    features["having_At_Symbol"] = 1 if "@" in url else -1
    features["double_slash_redirecting"] = 1 if url.count("//") > 1 else -1
    features["Prefix_Suffix"] = 1 if "-" in host else -1

    dots = host.count(".")
    features["having_Sub_Domain"] = 1 if dots >= 3 else (0 if dots == 2 else -1)

    features["SSLfinal_State"] = 1 if p.scheme == "https" else -1
    features["HTTPS_token"] = 1 if "https" in host else -1

    try:
        standard_port = p.port in (80, 443, None)
    except ValueError:
        # urlparse raises lazily for a non-numeric or out-of-range port.
        standard_port = False
    features["port"] = -1 if standard_port else 1

    # Placeholder features (dataset-only): not derivable here, so set to 0.
    placeholder_features = [
        "Domain_registeration_length", "Favicon", "age_of_domain", "DNSRecord",
        "web_traffic", "Page_Rank", "Google_Index", "Links_pointing_to_page",
        "Statistical_report"
    ]
    for pf in placeholder_features:
        features.setdefault(pf, 0)

    # ---------- HTML-based ----------
    page = _fetch_page(url)
    if page is not None:
        html, soup = page
        features.update(_html_features(html, soup, dom))

    # Ensure all features exist
    for f in UCI_FEATURES:
        features.setdefault(f, 0)

    return features


# ============================
# 5. PREDICT
# ============================

def predict_url(url, model, feature_cols):
    """Classify a single URL with a trained model.

    Args:
        url: the URL to analyse.
        model: a fitted classifier exposing ``predict`` and ``predict_proba``.
        feature_cols: feature names, in the column order the model expects.

    Returns:
        tuple: ``(predicted label, probability assigned to class index 1)``.
    """
    # One-row frame, with columns selected and ordered by feature_cols.
    row = pd.DataFrame([extract_features(url)])[feature_cols]
    label = model.predict(row)[0]
    class1_prob = model.predict_proba(row)[0][1]
    return label, class1_prob


# ============================
# MAIN
# ============================

if __name__ == "__main__":
    # Train on the downloaded dataset and persist the fitted ensemble.
    df = load_dataset()
    model, cols = train_model(df)
    save_model(model)

    # Test examples
    test_urls = [
        "https://google.com",
        "http://198.54.23.11/login/update",
        "https://paypal-security-alert.com/verify",
        "http://bit.ly/2fSdq",
    ]

    # Report the predicted label and the class-1 probability for each URL.
    for u in test_urls:
        pred, prob = predict_url(u, model, cols)
        if pred == 1:
            label = "PHISHING"
        else:
            label = "LEGITIMATE"
        print("\nURL:", u)
        print("Prediction:", label)
        print("Probability:", prob)

Downloading UCI dataset from KaggleHub...
Using Colab cache for faster access to the 'phishing-dataset-uci-ml-csv' dataset.
Dataset Loaded: (11055, 32)
RandomForest: 0.9742198100407056
XGBoost:      0.9751243781094527
VotingClf:    0.9773857982813207
Model saved as: model.pkl

URL: https://google.com
Prediction: LEGITIMATE
Probability: 0.10271844291768502

URL: http://198.54.23.11/login/update
Prediction: LEGITIMATE
Probability: 0.3127016168832779

URL: https://paypal-security-alert.com/verify
Prediction: PHISHING
Probability: 0.897551656961441

URL: http://bit.ly/2fSdq
Prediction: LEGITIMATE
Probability: 0.09514944525901228
