<a href="https://colab.research.google.com/github/Dhanush-sai-reddy/ml-uci-phishing/blob/main/ml-uci-phishing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import os

import kagglehub
import pandas as pd

# Download latest version; the returned path is the local directory that is
# listed below to find the CSV file.
path = kagglehub.dataset_download("isatish/phishing-dataset-uci-ml-csv")

print("Path to dataset files:", path)

# Take the first CSV in sorted order so the choice does not depend on the
# arbitrary order of os.listdir, and fail with a clear message (instead of a
# NameError on csv_path) when the directory holds no CSV at all.
csv_files = sorted(name for name in os.listdir(path) if name.endswith(".csv"))
if not csv_files:
    raise FileNotFoundError(f"No .csv file found in {path!r}")
csv_path = os.path.join(path, csv_files[0])

# Load dataset
df = pd.read_csv(csv_path)

df.head()


In [None]:
def predict_combined_url(url):
    """Classify *url* with the hybrid model that uses URL and content features.

    Builds a single feature row from ``extract_url_features`` and
    ``extract_content_features``, orders its columns by ``all_features`` and
    prints the verdict.

    Args:
        url: The URL to classify.

    Returns:
        None. The verdict is printed.
    """
    url_feats = extract_url_features(url)
    content_feats = extract_content_features(url)

    # Merge both feature dicts into one row (content values win on a key clash).
    combined_data = {**url_feats, **content_feats}
    # Select the columns in `all_features` order so the row lines up with the
    # frame the model was fitted on.
    df_predict = pd.DataFrame([combined_data])[all_features]

    # Make prediction using the hybrid model
    prediction = hybrid_combined.predict(df_predict)[0]

    # hybrid_combined is fitted on labels where -1 was remapped to 0, so class
    # 0 is reported as phishing here.
    # The emoji are written as escapes: the previous literals were mojibake
    # (UTF-8 bytes decoded as cp1252) and printed as garbage characters.
    if prediction == 0:
        print("\U0001F534 PHISHING URL")
    else:
        print("\U0001F7E2 LEGITIMATE URL")

Let's test the new `predict_combined_url` function with a known phishing URL and a legitimate URL.

In [None]:
# Remove the "id" column before selecting model inputs, then preview the frame.
df = df.drop(columns="id")
df.head()


In [None]:
# Dataset columns used as inputs of the URL-only models (ten names).
# The list order is also the column order of the training frame.
url_features = [
    "having_IP_Address", "URL_Length",
    "Shortining_Service", "having_At_Symbol",
    "double_slash_redirecting", "Prefix_Suffix",
    "having_Sub_Domain", "SSLfinal_State",
    "Domain_registeration_length", "HTTPS_token",
]


In [None]:
# Target column and the feature matrix restricted to the URL-based columns.
y = df["Result"]
X = df[url_features]

X.head()


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so repeated runs build the same model; the train/test split
# is already seeded with 42, so the reported scores become reproducible.
rf = RandomForestClassifier(n_estimators=150, random_state=42)
rf.fit(X_train, y_train)


In [None]:
# Predict labels for the held-out test rows with the fitted random forest.
pred = rf.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the random forest on the test split.
print(classification_report(y_true=y_test, y_pred=pred))

In [None]:
!pip install tldextract


In [None]:
import re
import tldextract
import requests
from datetime import datetime
import pandas as pd

def extract_url_features(url):
    """Approximate the ten URL-based dataset features from a raw URL string.

    Every value follows the coding used throughout this notebook:
    ``1`` = legitimate, ``0`` = suspicious, ``-1`` = phishing.

    Host-level checks (IP address, shortener, hyphen, "https" token) look only
    at the host name, so text in the path or query string cannot trigger them.

    Args:
        url: URL to analyse, e.g. ``"https://example.com/login"``.

    Returns:
        dict keyed by the names in ``url_features`` (in the same order), each
        value being ``-1``, ``0`` or ``1``.
    """
    ext = tldextract.extract(url)
    # Rebuild the host and the registered domain from the parsed parts; empty
    # parts are skipped so no stray dots appear.
    host = ".".join(
        part for part in (ext.subdomain, ext.domain, ext.suffix) if part
    ).lower()
    registered_domain = ".".join(
        part for part in (ext.domain, ext.suffix) if part
    ).lower()

    # -------------------------------
    # 1. having_IP_Address
    # -------------------------------
    # Match the whole host only: a dotted number in the path (for example a
    # version string) must not count as an IP-address host.
    ip_pattern = r'\d{1,3}(?:\.\d{1,3}){3}'
    having_ip = -1 if re.fullmatch(ip_pattern, host) else 1

    # -------------------------------
    # 2. URL_Length
    # -------------------------------
    url_len = len(url)
    if url_len < 54:
        url_length = 1
    elif url_len <= 75:
        url_length = 0
    else:
        url_length = -1

    # -------------------------------
    # 3. Shortining_Service
    # -------------------------------
    # Compare the registered domain exactly. A substring test such as
    # `"t.co" in url` also matches unrelated hosts like "microsoft.com".
    shorteners = {"bit.ly", "tinyurl.com", "goo.gl", "t.co", "is.gd", "buff.ly"}
    short_service = -1 if registered_domain in shorteners else 1

    # -------------------------------
    # 4. having_At_Symbol
    # -------------------------------
    at_symbol = -1 if "@" in url else 1

    # -------------------------------
    # 5. double_slash_redirecting
    # -------------------------------
    # Use the LAST "//": the first one is the scheme separator (index 5 for
    # http, 6 for https), so searching from the left could never exceed 6.
    pos = url.rfind("//")
    double_slash = -1 if pos > 6 else 1

    # -------------------------------
    # 6. Prefix_Suffix (- in domain)
    # -------------------------------
    # Only a hyphen in the host counts; hyphens in paths are common and benign.
    prefix_suffix = -1 if "-" in host else 1

    # -------------------------------
    # 7. having_Sub_Domain
    # -------------------------------
    # A leading "www" is ignored so that plain "www.example.com" is not
    # treated as having a subdomain.
    sub = ext.subdomain.lower()
    if sub == "www":
        sub = ""
    elif sub.startswith("www."):
        sub = sub[4:]

    if sub == "":
        subdomain = 1
    elif sub.count(".") == 0:
        subdomain = 0
    else:
        subdomain = -1

    # -------------------------------
    # 8. SSLfinal_State (https or http)
    # -------------------------------
    # Require the full "https://" prefix (case-insensitive) so that a host
    # merely starting with the letters "https" is not taken as secure.
    if url.lower().startswith("https://"):
        SSLfinal_State = 1   # secure
    else:
        SSLfinal_State = -1  # insecure

    # -------------------------------
    # 9. Domain_registeration_length (approximation)
    # -------------------------------
    # Real registration length requires WHOIS; this keeps the original
    # stand-in based on the length of "<domain>.<suffix>".
    domain = ext.domain + "." + ext.suffix
    if len(domain) < 5:
        Domain_registration_length = -1
    else:
        Domain_registration_length = 1

    # -------------------------------
    # 10. HTTPS_token (fake https inside URL)
    # -------------------------------
    # Look for the token in the host itself. Slicing the URL at a fixed offset
    # missed "http://https-..." and also flagged "https" found in paths.
    if "https" in host:
        HTTPS_token = -1
    else:
        HTTPS_token = 1

    return {
        "having_IP_Address": having_ip,
        "URL_Length": url_length,
        "Shortining_Service": short_service,
        "having_At_Symbol": at_symbol,
        "double_slash_redirecting": double_slash,
        "Prefix_Suffix": prefix_suffix,
        "having_Sub_Domain": subdomain,
        "SSLfinal_State": SSLfinal_State,
        "Domain_registeration_length": Domain_registration_length,
        "HTTPS_token": HTTPS_token
    }


In [None]:
def predict_url(url):
    """Classify *url* with the URL-only random forest and print the verdict.

    Args:
        url: The URL to classify.

    Returns:
        None. The verdict is printed.
    """
    features = extract_url_features(url)
    # Order the columns explicitly like the training frame (df[url_features])
    # instead of relying on the key order of the returned dict.
    df_test = pd.DataFrame([features])[url_features]
    result = rf.predict(df_test)[0]

    # rf is fitted on the dataset's own labels; -1 is reported as phishing.
    # The emoji are written as escapes: the previous literals were mojibake
    # (UTF-8 bytes decoded as cp1252) and printed as garbage characters.
    if result == -1:
        print("\U0001F534 PHISHING URL")
    else:
        print("\U0001F7E2 LEGITIMATE URL")


In [None]:
# Demo: phishing-style URL (brand name placed in front of an unrelated, hyphenated domain).
predict_url("http://paypal.com.verify-update-security-login.com")


In [None]:
# NOTE(review): this URL has no scheme, so extract_url_features sets
# SSLfinal_State to -1 (not HTTPS) for it — confirm that is intended.
predict_url("www.google.com")

In [None]:
# Demo: HTTPS URL whose path ends in a long hyphenated identifier.
predict_url("https://chatgpt.com/c/6917625d-fd3c-8324-99f2-e556b96116fe")

In [None]:
# Demo: long HTTPS product URL (over 75 characters, so URL_Length is -1).
predict_url("https://www.lenovo.com/in/en/p/accessories-and-software/chargers-and-batteries/chargers/gx20p92532")

In [None]:
# Dataset columns used as content-based model inputs (twelve names).
# extract_content_features returns a dict with exactly these keys.
content_features = [
    "Request_URL", "URL_of_Anchor", "Links_in_tags", "SFH",
    "Redirect", "popUpWidnow", "Iframe",
    "age_of_domain", "DNSRecord", "web_traffic", "Page_Rank", "Google_Index",
]


In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees for the URL-only features; only configured here,
# the model is fitted in a later cell.
xgb = XGBClassifier(
    eval_metric='logloss',
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    colsample_bytree=0.8,
    subsample=0.8,
)


In [None]:
# XGBoost and the hybrid model are given 0/1 labels, so the dataset's -1
# label is mapped to 0 in both splits (label 1 stays as it is).
y_train_xgb, y_test_xgb = (
    labels.replace({-1: 0}) for labels in (y_train, y_test)
)


In [None]:
from sklearn.ensemble import VotingClassifier

xgb.fit(X_train, y_train_xgb)

# Soft voting averages the predicted class probabilities of both models.
hybrid = VotingClassifier(
    estimators=[("rf", rf), ("xgb", xgb)],
    voting="soft"
)

# Remember: XGBoost needs labels 0/1, not -1/1
y_hybrid = y.replace({-1: 0})

# Fit on the training split only. Fitting on the full X would train on the
# rows of X_test, which the next cell uses to score this model, and the
# reported accuracy would be inflated by that leakage.
hybrid.fit(X_train, y_train_xgb)


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# The hybrid model predicts 0/1; map 0 back to -1 so the predictions use the
# same label values as y_test before scoring.
pred = hybrid.predict(X_test)
pred_final = pd.Series(pred).replace({0: -1})

print("Accuracy:", accuracy_score(y_test, pred_final))
print(classification_report(y_test, pred_final))


In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse


In [None]:
def scrape_page(url):
    """Fetch *url* and return the response body as text, or ``None`` on failure.

    This is best-effort: request problems (invalid URL, connection error,
    timeout) give ``None`` instead of an exception. The HTTP status code is
    not checked, so the body of an error page is returned as well.

    Args:
        url: Address to download.

    Returns:
        The decoded response text, or ``None`` if the request failed.
    """
    try:
        r = requests.get(url, timeout=5, headers={"User-Agent": "Mozilla/5.0"})
        return r.text
    except (requests.RequestException, ValueError):
        # Only request/decoding failures are swallowed; a bare `except:` would
        # also hide KeyboardInterrupt and SystemExit.
        return None


In [None]:
def get_soup(url):
    """Download *url* and parse it with ``html.parser``.

    Returns the parsed page, or ``None`` when ``scrape_page`` returned ``None``.
    """
    page_source = scrape_page(url)
    return None if page_source is None else BeautifulSoup(page_source, "html.parser")


In [None]:
def count_external_links(soup, domain):
    """Count ``<a href>`` links that point to a host other than *domain*.

    Only absolute links (those that name a host) can be external. Relative
    links, fragments such as ``#top`` and ``mailto:`` links are not counted.

    Args:
        soup: Parsed page, or ``None`` when the page could not be fetched.
        domain: Host name of the page being analysed. A link host that
            contains this string (e.g. a ``www.`` variant) counts as internal.

    Returns:
        Number of external anchors; ``0`` when *soup* is ``None``.
    """
    if soup is None:
        return 0

    count = 0
    for tag in soup.find_all("a", href=True):
        try:
            link_host = urlparse(tag["href"]).netloc
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal): skip it rather
            # than guess which host it belongs to.
            continue
        if link_host and domain not in link_host:
            count += 1
    return count


In [None]:
def count_iframes(soup):
    """Return the number of ``<iframe>`` tags in the page (``0`` if *soup* is ``None``)."""
    iframes = [] if soup is None else soup.find_all("iframe")
    return len(iframes)


In [None]:
def count_popups(soup):
    """Count ``<script>`` tags whose text contains ``window.open`` or ``alert(``.

    The comparison is case-insensitive. Returns ``0`` if *soup* is ``None``.
    """
    if soup is None:
        return 0
    markers = ("window.open", "alert(")
    return sum(
        1
        for script in soup.find_all("script")
        if any(marker in script.text.lower() for marker in markers)
    )


In [None]:
def right_click_disabled(soup):
    """Return ``1`` if the page markup mentions a right-click handler, else ``0``.

    The lower-cased markup is searched for ``event.button==2`` and
    ``contextmenu``. Returns ``0`` if *soup* is ``None``.
    """
    if soup is None:
        return 0
    markup = str(soup).lower()
    for needle in ("event.button==2", "contextmenu"):
        if needle in markup:
            return 1
    return 0


In [None]:
def detect_redirect(soup):
    """Return ``1`` when the page has a ``<meta http-equiv="refresh">`` tag, else ``0``.

    Returns ``0`` if *soup* is ``None``.
    """
    if soup is None:
        return 0
    refresh_tag = soup.find("meta", attrs={"http-equiv": "refresh"})
    if refresh_tag:
        return 1
    return 0


In [None]:
def _is_external(link, domain):
    """Return True when *link* is an absolute URL whose host differs from *domain*."""
    try:
        netloc = urlparse(link).netloc
    except ValueError:
        # Malformed URL: it cannot be attributed to another host.
        return False
    return bool(netloc) and netloc != domain


def _ratio_flag(external, total, phishing_above, suspicious_above):
    """Map the share external/total to the -1 / 0 / 1 feature coding."""
    if total == 0:
        return 1  # Nothing to measure, consider legitimate
    ratio = external / total
    if ratio > phishing_above:
        return -1
    if ratio > suspicious_above:
        return 0
    return 1


def extract_content_features(url):
    """Approximate the content-based dataset features for the page at *url*.

    The page is downloaded with ``get_soup``. If that fails, every
    page-derived feature keeps its legitimate default. Values use the coding
    of this notebook: ``1`` legitimate, ``0`` suspicious, ``-1`` phishing.

    Args:
        url: Address of the page to analyse.

    Returns:
        dict with one entry per name in ``content_features``.
    """
    soup = get_soup(url)
    domain = urlparse(url).netloc

    # Default values for features that are hard to extract without external APIs
    # Assigning '1' as a neutral/legitimate indicator for now,
    # as the original dataset uses -1, 0, 1 and '1' generally means legitimate.
    # These would ideally require more sophisticated methods (WHOIS, DNS lookup, etc.)
    age_of_domain = 1
    DNSRecord = 1
    web_traffic = 1
    Page_Rank = 1
    Google_Index = 1

    # --- Features that can be extracted or approximated from content ----
    # count_iframes / count_popups return raw counts. The models are trained
    # on -1/1 coded columns, so a count such as 0 or 5 would land on the wrong
    # side of the learned splits. Convert: any occurrence -> -1, none -> 1.
    iframe_feature = -1 if count_iframes(soup) > 0 else 1
    popup_feature = -1 if count_popups(soup) > 0 else 1
    # NOTE(review): kept as the 0/1 flag from detect_redirect — confirm this
    # matches the coding of the dataset's Redirect column.
    redirect_feature = detect_redirect(soup)

    Request_URL = 1 # Simplified: default to legitimate
    URL_of_Anchor = 1 # Simplified: default to legitimate
    Links_in_tags = 1 # Simplified: default to legitimate
    SFH = 1 # Simplified: default to legitimate

    if soup:
        # Request_URL: share of page objects (img / script / link) that are
        # loaded from another host. Tags without src/href are ignored.
        resource_links = [
            tag.get('src') or tag.get('href')
            for tag in soup.find_all(['img', 'script', 'link'])
        ]
        resource_links = [link for link in resource_links if link]
        Request_URL = _ratio_flag(
            sum(_is_external(link, domain) for link in resource_links),
            len(resource_links),
            0.5,  # Arbitrary threshold
            0,
        )

        # URL_of_Anchor: share of anchor tags pointing to a different host.
        anchor_links = [anchor['href'] for anchor in soup.find_all('a', href=True)]
        URL_of_Anchor = _ratio_flag(
            sum(_is_external(link, domain) for link in anchor_links),
            len(anchor_links),
            0.6,  # Arbitrary threshold
            0.3,
        )

        # Links_in_tags: share of links in <meta>, <script> and <link> tags
        # that point to a different host. Every matched tag counts towards
        # the total, even when its href/src is empty.
        tagged = (
            soup.find_all(['meta', 'script', 'link'], href=True)
            + soup.find_all('script', src=True)
        )
        tagged_links = [tag.get('href') or tag.get('src') for tag in tagged]
        Links_in_tags = _ratio_flag(
            sum(_is_external(link, domain) for link in tagged_links if link),
            len(tagged_links),
            0.5,
            0,
        )

        # SFH (Server Form Handler): a form whose action is blank or goes to
        # another host marks the page as phishing; one such form is enough.
        for form in soup.find_all('form'):
            action = form.get('action')
            if not action or _is_external(action, domain):
                SFH = -1
                break

    return {
        "Request_URL": Request_URL,
        "URL_of_Anchor": URL_of_Anchor,
        "Links_in_tags": Links_in_tags,
        "SFH": SFH,
        "Redirect": redirect_feature,
        "popUpWidnow": popup_feature,
        "Iframe": iframe_feature,
        "age_of_domain": age_of_domain,
        "DNSRecord": DNSRecord,
        "web_traffic": web_traffic,
        "Page_Rank": Page_Rank,
        "Google_Index": Google_Index
    }

In [None]:
# Smoke test: run the content-feature extraction on a live page and display
# the resulting dict (the page is fetched over HTTP by scrape_page).
url = "https://www.google.com"
features = extract_content_features(url)
features


In [None]:
# Merge both feature lists, dropping duplicates while keeping first-seen order.
# dict.fromkeys is used instead of set(): the iteration order of a set of
# strings changes between interpreter sessions, which would reshuffle the
# model's column order (and therefore its results) from run to run.
all_features = list(dict.fromkeys(url_features + content_features))

print("Combined features (no duplicates):", all_features)

In [None]:
# Target column and the feature matrix holding URL and content columns.
y_combined = df["Result"]
X_combined = df[all_features]

X_combined.head()

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split of the combined features with the same seed (42) as the
# URL-only split.
(X_train_combined, X_test_combined,
 y_train_combined, y_test_combined) = train_test_split(
    X_combined, y_combined, random_state=42, test_size=0.2
)

print("Data split into training and testing sets.")

In [None]:
from xgboost import XGBClassifier

# Map the dataset's -1 label to 0 in both splits (label 1 stays as it is);
# the model below is trained on these 0/1 labels.
y_train_combined_xgb, y_test_combined_xgb = (
    labels.replace({-1: 0}) for labels in (y_train_combined, y_test_combined)
)

# Same hyper-parameters as the URL-only XGBoost model.
xgb_combined = XGBClassifier(
    eval_metric='logloss',
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    colsample_bytree=0.8,
    subsample=0.8,
)
xgb_combined.fit(X_train_combined, y_train_combined_xgb)
print("XGBoost model trained on combined features.")

In [None]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

y_hybrid_combined = y_combined.replace({-1: 0})

# No definition of rf_combined is visible in this notebook, so the cell used
# to fail with a NameError. Create it here with the same number of trees as
# `rf`; it does not need to be fitted first because VotingClassifier.fit
# trains its own copies of the listed estimators.
# NOTE(review): if rf_combined is meant to come from another cell with
# different settings, move those settings here.
rf_combined = RandomForestClassifier(n_estimators=150, random_state=42)

hybrid_combined = VotingClassifier(
    estimators=[("rf_combined", rf_combined), ("xgb_combined", xgb_combined)],
    voting="soft"
)

# Fit on the training split only. Fitting on the full X_combined would train
# on the rows of X_test_combined, which the next cell uses to score this
# model, and the reported accuracy would be inflated by that leakage.
hybrid_combined.fit(X_train_combined, y_train_combined_xgb)
print("Hybrid model trained on combined features.")

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# The hybrid model predicts 0/1; map 0 back to -1 so the predictions use the
# same label values as y_test_combined before scoring.
pred_hybrid_combined = hybrid_combined.predict(X_test_combined)
pred_hybrid_combined_final = pd.Series(pred_hybrid_combined).replace({0: -1})

print("Accuracy:", accuracy_score(y_test_combined, pred_hybrid_combined_final))
print(classification_report(y_test_combined, pred_hybrid_combined_final))