<a href="https://colab.research.google.com/github/Dhanush-sai-reddy/ml-uci-phishing/blob/main/Newone.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ============================================================
#  COMPLETE PHISHING DETECTION SYSTEM IN ONE CELL
# ============================================================

!pip install kagglehub xgboost requests beautifulsoup4 lxml --quiet

import kagglehub
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import re
import warnings

from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

warnings.filterwarnings("ignore")

# ============================================================
# 1. DOWNLOAD DATASET
# ============================================================
import os

# Download the Kaggle dataset; `path` is treated below as a local directory.
path = kagglehub.dataset_download("isatish/phishing-dataset-uci-ml-csv")

# os.listdir() returns entries in arbitrary order, so sort to make the file
# that gets loaded deterministic across runs and machines.
csv_files = sorted(f for f in os.listdir(path) if f.lower().endswith(".csv"))
if not csv_files:
    # Fail with an explicit message instead of an opaque IndexError below.
    raise FileNotFoundError(
        "No CSV file found in downloaded dataset directory: {}".format(path)
    )
df = pd.read_csv(os.path.join(path, csv_files[0]))

print("Dataset Loaded:", df.shape)
print("Columns:", df.columns.tolist())

# ============================================================
# 2. DATA PREP
# ============================================================
# train_test_split is (re-)imported here, next to its only use.
from sklearn.model_selection import train_test_split

# Feature matrix: every column except the label column "Result".
X = df.drop(columns="Result")

# Binary target: Result == 1 -> 1, any other value (i.e. -1) -> 0.
y = df["Result"].eq(1).astype(int)

# Stratified 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# ============================================================
# 3. TRAIN MODELS
# ============================================================

# Base learners are only configured here. VotingClassifier.fit() trains its
# own clones of them, so fitting them separately first would train every
# model twice.
rf = RandomForestClassifier(n_estimators=150, random_state=42, n_jobs=-1)

xgb = XGBClassifier(
    n_estimators=150, max_depth=5, learning_rate=0.1,
    eval_metric="logloss", random_state=42
)

# Soft voting averages the class probabilities of the base learners.
voting = VotingClassifier(
    estimators=[("rf", rf), ("xgb", xgb)],
    voting="soft"
)
voting.fit(X_train, y_train)

# Rebind the names to the fitted clones so `rf` and `xgb` remain usable as
# trained models, exactly as before, without a second training pass.
rf = voting.named_estimators_["rf"]
xgb = voting.named_estimators_["xgb"]

print("\nTraining Complete!")
print("RF:", accuracy_score(y_test, rf.predict(X_test)))
print("XGB:", accuracy_score(y_test, xgb.predict(X_test)))
print("Voting:", accuracy_score(y_test, voting.predict(X_test)))

# ============================================================
# 4. FEATURE EXTRACTOR (URL + HTML Content)
# ============================================================

# Hostnames of well-known URL shorteners. The pattern is anchored on label
# boundaries so that, for example, "t.co" does not match inside "microsoft.com".
_SHORTENER_HOST_RE = re.compile(
    r"(?:^|\.)(?:bit\.ly|goo\.gl|tinyurl\.com|shorte\.st|t\.co|is\.gd|ow\.ly)$"
)
# Dotted-quad host made of four decimal groups; used with fullmatch().
_IPV4_HOST_RE = re.compile(r"\d+\.\d+\.\d+\.\d+")
# Value of <meta http-equiv="..."> that triggers a client-side redirect.
_META_REFRESH_RE = re.compile(r"^\s*refresh\s*$", re.IGNORECASE)
# Right-click check in inline JavaScript, tolerant of spaces around "==".
_RIGHT_CLICK_RE = re.compile(r"event\.button\s*==\s*2")


def _fill_missing(features):
    """Set every training column that is absent from *features* to 0."""
    for col in X.columns:
        features.setdefault(col, 0)
    return features


def _resolved_host(link, base_url):
    """Return the lowercase hostname *link* points to, resolved against *base_url*.

    Returns "" when the link carries no host (e.g. ``mailto:``) or cannot be
    parsed.
    """
    try:
        return urlparse(urljoin(base_url, link)).hostname or ""
    except ValueError:
        return ""


def _is_external(link, base_url, site_host):
    """Tell whether *link* resolves to a host outside *site_host* and its subdomains."""
    link_host = _resolved_host(link, base_url)
    if not link_host or not site_host:
        return False
    return link_host != site_host and not link_host.endswith("." + site_host)


def _ratio_level(external, total, high, low):
    """Map the external/total share to 1 (> high), 0 (>= low) or -1 (below low)."""
    ratio = external / total if total else 0
    if ratio > high:
        return 1
    return 0 if ratio >= low else -1


def extract_features(url):
    """Build the feature dict for *url* from its text and its fetched HTML.

    Every feature computed here uses 1 for the suspicious trait, -1 for the
    benign one and, where applicable, 0 for an in-between value. Columns of
    the training frame ``X`` that are not computed here are set to 0; if the
    page cannot be fetched or parsed, all HTML-based features are 0 as well.

    NOTE(review): confirm that "1 = suspicious" is the encoding used by the
    training data; the dataset's own convention is not visible in this file.

    Args:
        url: Absolute URL to analyse. The page is downloaded with a 4 second
            timeout, so only call this on URLs you are willing to contact.

    Returns:
        dict mapping feature name to -1, 0 or 1, containing at least every
        column of ``X``.
    """
    features = {}
    parsed = urlparse(url)

    # Use the bare hostname (no port, no user-info) for all host checks and
    # strip "www." only when it is a real prefix.
    full_host = parsed.hostname or ""
    site_host = full_host[4:] if full_host.startswith("www.") else full_host

    # --- URL FEATURES ---
    # fullmatch: "1.2.3.4.evil.com" is a domain name, not an IP address.
    features['having_IP_Address'] = 1 if _IPV4_HOST_RE.fullmatch(full_host) else -1

    url_len = len(url)
    features['URL_Length'] = 1 if url_len > 75 else (0 if url_len >= 54 else -1)

    features['Shortining_Service'] = 1 if _SHORTENER_HOST_RE.search(full_host) else -1

    features['having_At_Symbol'] = 1 if "@" in url else -1
    features['double_slash_redirecting'] = 1 if url.count("//") > 1 else -1
    features['Prefix_Suffix'] = 1 if "-" in full_host else -1

    dots = full_host.count(".")
    features['having_Sub_Domain'] = 1 if dots >= 3 else (0 if dots == 2 else -1)

    features['HTTPS_token'] = 1 if "https" in full_host else -1

    try:
        port = parsed.port
    except ValueError:
        # Malformed or out-of-range port: treat it as a non-standard port.
        port = -1
    features['port'] = 1 if port not in (80, 443, None) else -1

    # https is the benign case, so it maps to -1 like every other benign trait.
    features['SSLfinal_State'] = -1 if parsed.scheme == "https" else 1

    # --- HTML FETCH ---
    try:
        response = requests.get(url, timeout=4)
        html = response.text
        soup = BeautifulSoup(html, "lxml")
    except Exception:
        # Best effort: any network or parser failure leaves the HTML features
        # at 0. KeyboardInterrupt/SystemExit are deliberately not swallowed.
        return _fill_missing(features)

    html_lower = html.lower()

    # ------------------------------
    # HTML CONTENT FEATURES
    # ------------------------------

    # Collect tags
    anchors = soup.find_all("a", href=True)
    imgs = soup.find_all("img", src=True)
    scripts = soup.find_all("script", src=True)
    links = soup.find_all("link", href=True)
    iframes = soup.find_all("iframe")

    # --- Request_URL: share of images/scripts loaded from another host ---
    resources = imgs + scripts
    external = sum(
        1 for tag in resources if _is_external(tag.get("src"), url, site_host)
    )
    features["Request_URL"] = _ratio_level(external, len(resources), 0.61, 0.22)

    # --- URL_of_Anchor: share of anchors pointing to another host ---
    external = sum(1 for a in anchors if _is_external(a["href"], url, site_host))
    features["URL_of_Anchor"] = _ratio_level(external, len(anchors), 0.67, 0.31)

    # --- Links_in_tags: share of <script>/<link> targets on another host ---
    tagged = scripts + links
    external = sum(
        1 for tag in tagged
        if _is_external(tag.get("src") or tag.get("href"), url, site_host)
    )
    features["Links_in_tags"] = _ratio_level(external, len(tagged), 0.61, 0.22)

    # --- SFH (server form handler) ---
    forms = soup.find_all("form")
    if not forms:
        # NOTE(review): a page without any form is scored 1 here, same as
        # the original logic; confirm this is intended.
        features["SFH"] = 1
    else:
        action = (forms[0].get("action") or "").strip()
        if action in ("", "about:blank"):
            features["SFH"] = 1
        elif _is_external(action, url, site_host):
            features["SFH"] = 1
        else:
            # Relative actions resolve to the page's own host and land here.
            features["SFH"] = -1

    features["Submitting_to_email"] = 1 if "mailto:" in html else -1
    features["Abnormal_URL"] = -1 if site_host in html_lower else 1

    # A literal "meta refresh" never occurs in markup, so also look for the
    # actual <meta http-equiv="refresh"> tag.
    has_meta_refresh = (
        soup.find("meta", attrs={"http-equiv": _META_REFRESH_RE}) is not None
    )
    redirects = (
        "window.location" in html or "meta refresh" in html_lower or has_meta_refresh
    )
    features["Redirect"] = 1 if redirects else -1
    features["on_mouseover"] = 1 if "onmouseover" in html_lower else -1
    features["RightClick"] = 1 if _RIGHT_CLICK_RE.search(html) else -1
    features["popUpWidnow"] = 1 if "window.open" in html else -1
    features["Iframe"] = 1 if iframes else -1

    # Remaining training columns that are not derived above default to 0.
    return _fill_missing(features)

# ============================================================
# 5. FINAL PREDICTION FUNCTION
# ============================================================

def predict_url(url):
    """Classify one URL with the voting ensemble and print a short report.

    Features come from ``extract_features`` and are arranged in the column
    order of the training frame ``X`` before being passed to ``voting``.

    Returns:
        dict with the keys ``url``, ``prob`` (second entry of the
        ``predict_proba`` row) and ``prediction`` ("PHISHING" when the
        predicted class is 1, otherwise "LEGITIMATE").
    """
    row = pd.DataFrame([extract_features(url)])[X.columns]

    class_probs = voting.predict_proba(row)[0]
    predicted_class = voting.predict(row)[0]

    phishing_prob = class_probs[1]
    if predicted_class == 1:
        verdict = "PHISHING"
    else:
        verdict = "LEGITIMATE"

    print("\n======================================")
    print("URL:", url)
    print("Phishing Probability:", phishing_prob)
    print("Prediction:", verdict)
    print("======================================\n")

    return {"url": url, "prob": phishing_prob, "prediction": verdict}

# ============================================================
# 6. TEST URLs
# ============================================================

# Sample URLs that are classified once when this cell runs. Each call
# fetches the page, so this section performs live network requests.
test_urls = [
    "https://google.com",
    "http://198.54.23.11/login/secure",
    "https://paypal-security-update.com/login",
    "http://bit.ly/3xYz",
]

for sample_url in test_urls:
    predict_url(sample_url)

print("System Ready. Use predict_url('your_url') to check any site.")