<a href="https://colab.research.google.com/github/Dhanush-sai-reddy/ml-uci-phishing/blob/main/Newone.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
Phishing Detection using UCI Phishing Websites Dataset

- Downloads dataset from Kaggle via kagglehub
- Trains RandomForest + XGBoost + VotingClassifier
- Extracts URL + HTML/JS-based features for real-time prediction
"""

import os
import re
import warnings
from urllib.parse import urlparse

import kagglehub
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Silence every Python warning process-wide; this also hides warnings raised by
# the libraries imported above, not only those triggered by this file.
warnings.filterwarnings("ignore")


def load_dataset():
    """Download the phishing dataset from Kaggle and load it as a DataFrame.

    Returns:
        The DataFrame read from the first CSV file (in sorted name order)
        found directly inside the downloaded dataset directory.

    Raises:
        FileNotFoundError: If the downloaded directory contains no CSV file.
    """
    path = kagglehub.dataset_download("isatish/phishing-dataset-uci-ml-csv")
    # os.listdir() returns entries in arbitrary order, so sort to make sure the
    # same file is picked on every run/platform if several CSVs are present.
    csv_files = sorted(
        name for name in os.listdir(path) if name.lower().endswith(".csv")
    )
    if not csv_files:
        raise FileNotFoundError("No CSV file found in downloaded Kaggle dataset path.")
    return pd.read_csv(os.path.join(path, csv_files[0]))


def train_models(df):
    """Train a soft-voting ensemble of RandomForest and XGBoost.

    The base estimators are fitted exactly once, as part of fitting the
    VotingClassifier (which clones and fits the estimators it is given).
    Their individual hold-out accuracies are reported from those fitted
    clones, so no model is trained twice.

    Args:
        df: DataFrame holding the feature columns plus a "Result" column.

    Returns:
        (X_columns, voting_model): the list of feature column names in
        training order, and the fitted VotingClassifier.
    """
    X = df.drop("Result", axis=1)
    # Convert {1, -1} -> {1, 0}.
    # NOTE(review): predict_url reports class 1 as "PHISHING", which assumes
    # Result == 1 marks phishing rows. The UCI encoding is believed to use -1
    # for phishing -- TODO confirm against the dataset documentation.
    y = (df["Result"] == 1).astype(int)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    rf = RandomForestClassifier(
        n_estimators=150, random_state=42, n_jobs=-1
    )
    xgb = XGBClassifier(
        n_estimators=150,
        max_depth=5,
        learning_rate=0.1,
        eval_metric="logloss",
        random_state=42,
    )

    voting = VotingClassifier(
        estimators=[("rf", rf), ("xgb", xgb)],
        voting="soft",
    )
    voting.fit(X_train, y_train)

    # Fitted clones of the base estimators, keyed by the names given above.
    fitted = voting.named_estimators_
    print("RandomForest accuracy:", accuracy_score(y_test, fitted["rf"].predict(X_test)))
    print("XGBoost accuracy:     ", accuracy_score(y_test, fitted["xgb"].predict(X_test)))
    print("Voting clf accuracy:  ", accuracy_score(y_test, voting.predict(X_test)))

    return X.columns.tolist(), voting


# Hostnames of common URL-shortening services. The pattern is anchored to whole
# host labels so that e.g. "microsoft.com" is not mistaken for "t.co".
_SHORTENER_HOST_RE = re.compile(
    r"(?:^|\.)(?:bit\.ly|goo\.gl|tinyurl\.com|shorte\.st|t\.co|is\.gd|ow\.ly)$"
)
# Dotted-quad host; applied with fullmatch() so "1.2.3.4.example.com" fails.
_IPV4_HOST_RE = re.compile(r"\d+\.\d+\.\d+\.\d+")
# Right-click detection in inline JS, tolerant of whitespace around "==".
_RIGHT_CLICK_RE = re.compile(r"event\.button\s*==\s*2")
# Value of the http-equiv attribute of a <meta> refresh tag.
_META_REFRESH_RE = re.compile(r"^\s*refresh\s*$", re.IGNORECASE)


def _strip_www(netloc):
    """Return *netloc* without a leading ``www.`` (only at the very start)."""
    return netloc[4:] if netloc.startswith("www.") else netloc


def _bucket(value, low, high):
    """Return 1 if value > high, 0 if low <= value <= high, otherwise -1."""
    if value > high:
        return 1
    if value >= low:
        return 0
    return -1


def _external_ratio(targets, domain_only):
    """Return the share of *targets* that are absolute off-domain http(s) URLs."""
    if not targets:
        return 0
    external = sum(
        1
        for target in targets
        if target and target.startswith("http") and domain_only not in target
    )
    return external / len(targets)


def _fill_missing(features, feature_names):
    """Set every name in *feature_names* that is still absent to 0."""
    for col in feature_names:
        features.setdefault(col, 0)
    return features


def extract_features(url, feature_names):
    """Extract URL-based and HTML/JS-based features for a given URL.

    URL features are computed from the URL text alone. The page is then
    fetched with a live HTTP GET (4 second timeout) to compute the HTML/JS
    features; if the fetch or parse fails for any reason, only the URL
    features are kept.

    Args:
        url: The URL to analyse.
        feature_names: Dataset feature names; any of them not computed here
            is set to 0 in the result.

    Returns:
        A dict mapping feature name to -1, 0 or 1.

    NOTE(review): most features below use 1 for the suspicious case; verify
    that this polarity matches the encoding of the training data -- TODO
    confirm.
    """
    features = {}
    parsed = urlparse(url)
    domain = parsed.netloc
    # hostname is lower-cased and excludes userinfo and port, unlike netloc.
    host = parsed.hostname or ""
    domain_only = _strip_www(domain)

    # ----------------- URL-BASED FEATURES -----------------
    # having_IP_Address: the host is a bare dotted-quad address.
    features["having_IP_Address"] = 1 if _IPV4_HOST_RE.fullmatch(host) else -1

    # URL_Length: > 75 -> 1, 54..75 -> 0, shorter -> -1.
    features["URL_Length"] = _bucket(len(url), 54, 75)

    # Shortining_Service: the host belongs to a known URL shortener.
    features["Shortining_Service"] = 1 if _SHORTENER_HOST_RE.search(host) else -1

    # having_At_Symbol
    features["having_At_Symbol"] = 1 if "@" in url else -1

    # double_slash_redirecting: a second "//" beyond the scheme separator.
    features["double_slash_redirecting"] = 1 if url.count("//") > 1 else -1

    # Prefix_Suffix
    features["Prefix_Suffix"] = 1 if "-" in domain else -1

    # having_Sub_Domain: based on the number of dots in the netloc.
    dots = domain.count(".")
    if dots >= 3:
        features["having_Sub_Domain"] = 1
    elif dots == 2:
        features["having_Sub_Domain"] = 0
    else:
        features["having_Sub_Domain"] = -1

    # HTTPS_token: the literal "https" appears inside the netloc itself.
    features["HTTPS_token"] = 1 if "https" in domain.lower() else -1

    # port: urlparse raises ValueError for a malformed or out-of-range port;
    # treat that like any other non-standard port instead of crashing.
    try:
        port = parsed.port
    except ValueError:
        features["port"] = 1
    else:
        features["port"] = 1 if port not in (80, 443, None) else -1

    # SSLfinal_State (approx from scheme only)
    # NOTE(review): here 1 marks the https case, the opposite polarity of the
    # other URL features above -- confirm this is intended.
    features["SSLfinal_State"] = 1 if parsed.scheme == "https" else -1

    # ----------------- HTML / JS FEATURES -----------------
    try:
        response = requests.get(url, timeout=4)
        html = response.text
        soup = BeautifulSoup(html, "lxml")
    except Exception:
        # Deliberate best effort: if the page can't be loaded or parsed,
        # every remaining feature falls back to 0.
        return _fill_missing(features, feature_names)

    html_lower = html.lower()
    anchors = soup.find_all("a", href=True)
    imgs = soup.find_all("img", src=True)
    scripts = soup.find_all("script", src=True)
    links = soup.find_all("link", href=True)
    iframes = soup.find_all("iframe")

    # Request_URL: share of <img>/<script> sources loaded from other domains.
    ratio = _external_ratio(
        [tag.get("src") for tag in imgs + scripts], domain_only
    )
    features["Request_URL"] = _bucket(ratio, 0.22, 0.61)

    # URL_of_Anchor: share of <a href> targets pointing to other domains.
    ratio = _external_ratio([a["href"] for a in anchors], domain_only)
    features["URL_of_Anchor"] = _bucket(ratio, 0.31, 0.67)

    # Links_in_tags: share of <script src>/<link href> on other domains.
    ratio = _external_ratio(
        [tag.get("src") or tag.get("href") for tag in scripts + links],
        domain_only,
    )
    features["Links_in_tags"] = _bucket(ratio, 0.22, 0.61)

    # SFH: -1 only when the first form's action mentions this domain.
    # NOTE(review): pages without forms and forms with relative actions
    # (e.g. "/login") also yield 1 -- confirm this is intended.
    forms = soup.find_all("form")
    if not forms:
        features["SFH"] = 1
    else:
        action = forms[0].get("action", "")
        if action in ("", "about:blank") or domain_only not in action:
            features["SFH"] = 1
        else:
            features["SFH"] = -1

    # Submitting_to_email
    features["Submitting_to_email"] = 1 if "mailto:" in html else -1

    # Abnormal_URL
    features["Abnormal_URL"] = -1 if domain_only in html else 1

    # Redirect: JS redirects, or a <meta http-equiv="refresh"> tag. The plain
    # substring "meta refresh" is kept for compatibility, but real refresh
    # tags never contain it, hence the explicit tag lookup.
    has_meta_refresh = (
        soup.find("meta", attrs={"http-equiv": _META_REFRESH_RE}) is not None
    )
    if "window.location" in html or "meta refresh" in html_lower or has_meta_refresh:
        features["Redirect"] = 1
    else:
        features["Redirect"] = -1

    # on_mouseover
    features["on_mouseover"] = 1 if "onmouseover" in html_lower else -1

    # RightClick
    features["RightClick"] = 1 if _RIGHT_CLICK_RE.search(html) else -1

    # popUpWidnow
    features["popUpWidnow"] = 1 if "window.open" in html else -1

    # Iframe
    features["Iframe"] = 1 if iframes else -1

    # Any dataset features not explicitly set: default 0
    return _fill_missing(features, feature_names)


def predict_url(url, feature_names, model):
    """Classify a URL with the trained model.

    Builds a one-row DataFrame from the extracted features, ordered as
    ``feature_names``, and queries the model for both the class-1
    probability and the predicted class.

    Returns:
        (label, probability): "PHISHING" when the predicted class is 1,
        otherwise "LEGITIMATE", plus the class-1 probability as a float.

    NOTE(review): class 1 is reported as phishing here; confirm this agrees
    with how train_models encodes the "Result" column.
    """
    row = extract_features(url, feature_names)
    frame = pd.DataFrame([row])[feature_names]
    class_one_prob = model.predict_proba(frame)[0, 1]
    predicted_class = model.predict(frame)[0]
    if predicted_class == 1:
        verdict = "PHISHING"
    else:
        verdict = "LEGITIMATE"
    return verdict, float(class_one_prob)


def main():
    """Load the dataset, train the ensemble and score a few sample URLs."""
    dataset = load_dataset()
    print("Dataset loaded:", dataset.shape)

    columns, ensemble = train_models(dataset)

    # Example test URLs
    samples = (
        "https://google.com",
        "http://198.54.23.11/login/secure",
        "https://paypal-security-update.com/login",
        "http://bit.ly/3xYz",
    )

    for u in samples:
        label, prob = predict_url(u, columns, ensemble)
        print(f"\nURL: {u}")
        print(f"Prediction: {label}")
        print(f"Phishing probability: {prob:.4f}")


# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()