In [None]:
import pandas as pd
import numpy as np
import re
import math
from collections import Counter
from urllib.parse import urlparse
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [None]:
df = pd.read_csv('new_urls.csv')  # load the URL dataset; later cells read its 'url' and 'status' columns
print(df.head())  # preview the first rows
print(f"\nDataset shape: {df.shape}")

print(df['status'].value_counts())  # class balance of the 'status' label

                                       url  status
0  0000111servicehelpdesk.godaddysites.com       0
1     000011accesswebform.godaddysites.com       0
2                             00003.online       0
3      0009servicedeskowa.godaddysites.com       0
4                     000n38p.wcomhost.com       0

Dataset shape: (822010, 2)
status
1    427028
0    394982
Name: count, dtype: int64


In [None]:
def calculate_entropy(s):
    """Return the Shannon entropy (bits per character) of the string ``s``.

    Falsy values (empty string, None, 0) and non-string values yield 0.
    """
    if not (s and isinstance(s, str)):
        return 0
    total_chars = len(s)
    # Relative frequency of each distinct character, most frequent first.
    shares = [occurrences / total_chars for _char, occurrences in Counter(s).most_common()]
    return -sum(share * math.log2(share) for share in shares if share > 0)

# Well-known domains. The feature extractor compares the last two dot-separated
# labels of a URL's host against this set to compute the is_top_domain feature.
TOP_DOMAINS = {
    "amazon.com",
    "apple.com",
    "bankofamerica.com",
    "facebook.com",
    "github.com",
    "google.com",
    "instagram.com",
    "linkedin.com",
    "microsoft.com",
    "netflix.com",
    "office.com",
    "paypal.com",
    "reddit.com",
    "twitter.com",
    "wikipedia.org",
    "yahoo.com",
    "youtube.com",
}

def extract_features_final_robust(url): #extracting all the 24 features
    """Convert one URL into the 24 numeric features named in ``final_feature_names``.

    Args:
        url: URL text, with or without a scheme. ``http://`` is prepended when
            the text does not start with ``http://`` or ``https://`` (checked
            case-insensitively).

    Returns:
        list: 24 numbers, in the order of ``final_feature_names``. A list of
        24 zeros is returned when ``url`` is not a string, or when parsing or
        any feature computation raises.
    """
    n_features = 24

    # Non-string rows (None, NaN from a blank CSV cell, numbers) would make
    # re.match raise TypeError before the try block is entered; give them the
    # same all-zero row that every other failure produces.
    if not isinstance(url, str):
        return [0] * n_features

    # URL schemes are case-insensitive, so "HTTP://host" must not get a second
    # scheme prepended (which would make urlparse read "HTTP:" as the host).
    if not re.match(r'^https?://', url, re.IGNORECASE):
        url = 'http://' + url

    try:
        parsed_url = urlparse(url)
        # NOTE(review): netloc keeps any "user@" prefix and ":port" suffix, so the
        # host-based features below include those characters. Left unchanged so
        # the feature definitions stay as they are -- confirm before switching
        # to parsed_url.hostname.
        hostname = parsed_url.netloc or ''
        path = parsed_url.path or ''
        query = parsed_url.query or ''

        # 1-5: lengths of the full URL and of its parsed parts.
        features = [len(url), len(hostname), len(path), len(query), len(parsed_url.fragment)]
        # 6-11: occurrences of '-', '@', '?', '&', '=', '.' in the full URL.
        features.extend(url.count(ch) for ch in '-@?&=.')

        # 12: dots in the host after dropping a leading "www.". Only the prefix
        # is removed; str.replace would also delete "www." further inside the
        # host (e.g. "www.awww.org" -> "aorg").
        cleaned_hostname = hostname[len('www.'):] if hostname.startswith('www.') else hostname
        features.append(cleaned_hostname.count('.'))

        # 13: 1 for https (urlparse lower-cases the scheme).
        features.append(1 if parsed_url.scheme == 'https' else 0)

        # 14: a dotted-quad number pattern anywhere in the host part.
        has_ip = re.search(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', hostname)
        features.append(1 if has_ip else 0)

        # 15: explicit port other than 80/443. Reading .port raises ValueError
        # for a malformed port; the handler below turns that into the zero row.
        port = parsed_url.port
        features.append(1 if port and port not in [80, 443] else 0)

        # 16: host starts with the punycode prefix.
        features.append(1 if hostname.startswith('xn--') else 0)
        # 17: character entropy of the host part.
        features.append(calculate_entropy(hostname))
        # 18: digit count in the host part, forced to 0 when it looks like an IP.
        features.append(sum(c.isdigit() for c in hostname) if not has_ip else 0)

        # 19: a brand word appears in the host but the host does not start with it.
        brand_keywords = ['google', 'facebook', 'apple', 'paypal', 'amazon', 'microsoft', 'bank', 'ebay']
        features.append(1 if any(b in cleaned_hostname and not cleaned_hostname.startswith(b) for b in brand_keywords) else 0)

        # 20: URL-shortener marker.
        # NOTE(review): this is a plain substring test, so 't.co' also matches
        # hosts such as "microsoft.com"; left unchanged to keep the feature's
        # definition -- confirm whether exact-domain matching is wanted.
        shortener_keywords = ['bit.ly', 'goo.gl', 't.co', 'ow.ly', 'tinyurl']
        features.append(1 if any(sk in hostname for sk in shortener_keywords) else 0)

        # 21: the last two dot-separated labels form one of TOP_DOMAINS.
        parts = hostname.split('.')
        main_domain = ".".join(parts[-2:]) if len(parts) > 1 else hostname
        features.append(1 if main_domain in TOP_DOMAINS else 0)

        # 22: vowel/consonant ratio; the epsilon avoids division by zero.
        # Only lower-case vowels count as vowels here.
        vowels = "aeiou"
        num_vowels = sum(1 for char in hostname if char in vowels)
        num_consonants = sum(1 for char in hostname if char.isalpha() and char not in vowels)
        features.append(num_vowels / (num_consonants + 1e-6))

        # 23: length of the longest run of digits in the host part.
        digit_sequences = re.findall(r'\d+', hostname)
        features.append(max((len(s) for s in digit_sequences), default=0))

        # 24: length of the longest run of non-vowel letters (case-insensitive).
        consonant_sequences = re.findall(r'[^aeiou\d\W_]+', hostname, re.IGNORECASE)
        features.append(max((len(s) for s in consonant_sequences), default=0))

        # Guard against the row drifting out of sync with final_feature_names.
        if len(features) != n_features:
            return [0] * n_features
        return features

    except Exception:
        # Deliberate best-effort: any parsing or computation failure yields the zero row.
        return [0] * n_features

# Column names for the feature matrix, listed in the order the extractor
# appends the corresponding values.
final_feature_names = [
    # lengths of the URL and its parsed parts
    'url_length',
    'hostname_length',
    'path_length',
    'query_length',
    'fragment_length',
    # special-character counts over the whole URL
    'count_-',
    'count_@',
    'count_?',
    'count_&',
    'count_=',
    'count_.',
    # host structure and scheme flags
    'num_subdomains',
    'has_https',
    'has_ip',
    'has_uncommon_port',
    'has_punycode',
    # host content statistics and keyword flags
    'hostname_entropy',
    'digits_in_hostname',
    'contains_deceptive_brand',
    'has_shortener',
    'is_top_domain',
    'vowel_consonant_ratio',
    'longest_digit_seq',
    'longest_consonant_seq',
]
print("feature extraction function is defined.")

print("\nExtracting final set of 24 features.")
# Build one feature row per URL, keeping the dataframe's row order.
features_list = [extract_features_final_robust(raw_url) for raw_url in df['url']]
features_df = pd.DataFrame(features_list, columns=final_feature_names)
print("Feature extraction complete.")

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X, y = features_df, df['status']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the random forest on the training portion.
print("\ntraining a stable RandomForestClassifier model")
model = RandomForestClassifier(
    n_estimators=200,
    max_depth=30,
    random_state=42,
    n_jobs=-1,
)
model.fit(X_train, y_train)
print("model training complete")

# Evaluate on the held-out rows.
y_pred_final = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred_final):.4f}")
print("\nclassification report:")
# The report names label 0 'Phishing' and label 1 'Legitimate'.
print(classification_report(y_test, y_pred_final, target_names=['Phishing', 'Legitimate']))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_final))
print("final model is ready")

feature extraction function is defined.

Extracting final set of 24 features.
Feature extraction complete.

training a stable RandomForestClassifier model
model training complete
Accuracy: 0.9085

classification report:
              precision    recall  f1-score   support

    Phishing       0.95      0.86      0.90     79122
  Legitimate       0.88      0.96      0.92     85280

    accuracy                           0.91    164402
   macro avg       0.91      0.91      0.91    164402
weighted avg       0.91      0.91      0.91    164402


Confusion Matrix:
 [[67778 11344]
 [ 3694 81586]]
final model is ready


In [None]:
def predict_url(url_to_check):# returns probability of phishing
    """Return the model's phishing probability for a single URL.

    The URL is converted to one feature row with
    ``extract_features_final_robust`` and scored by the global ``model``.
    The value returned is the first column of ``predict_proba``; this assumes
    the model's first class is label 0, which the evaluation cell reports as
    'Phishing' -- verify against ``model.classes_`` if the labels change.
    """
    feature_row = pd.DataFrame(
        [extract_features_final_robust(url_to_check)],
        columns=final_feature_names,
    )
    class_probabilities = model.predict_proba(feature_row)
    return class_probabilities[0][0]

     #testing
# Hand-picked URLs for a manual sanity check of the trained model. The group
# comments record the author's expectation, not a dataset label.
test_urls = [
    # Expected legitimate
    "https://www.unicef.org/",
    "https://en.wikipedia.org/wiki/Machine_learning",
    "https://github.com/pandas-dev/pandas",

    # Expected phishing
    "http://145.14.144.99/login.html",
    "http://my-bank-online.com@123.45.67.89/login",
    "https://update-account-verification-required.com/login",
    "http://secure-login-apple-id.com-input.info/private/index.html",

    # Tricky cases: brand look-alikes mixed with the genuine brand sites
    "https://www.amaz0n-support.com/update-details/", # digit 0 in place of the letter o
    "https://www.google.com.login-account-security.info/update", # real brand used as a deceptive subdomain
    "https://microsaft-secure.net/office365/login/", # misspelled brand name
    "https://wells-fargo-online.secure-access-point.com/", # legitimate-looking words, suspicious structure
    "https://facebook-login-page.io/index.html", # looks plausible but is not the real domain
    "https://www.google.com",
    "http://bankofamerica-verification.co",
    "https://www.facebook.com",
    "https://www.amazon.com",
    "http://update-account-verification.com/login",
    "https://www.microsoft.com",
    "http://googIe-verification.com/login",  # capital I in place of lower-case l
    "http://amazon-checkout-security.com",
    "http://facebook-login-alert.net",
    "https://www.netflix.com",
    "https://www.github.com",
    "https://www.paypal.com",
    "http://secure-login-paypal.com/login",
    "https://www.apple.com",
    "http://apple-secure-login.com",
    "http://microsoftsupport-security-alert.com",
    "https://www.wikipedia.org",
    "https://www.linkedin.com",
    "http://netflix-account-update.com",
    "http://linkedin-authenticate.com/login",
]

# Score every curated URL and print a verdict using a 50% cut-off.
print("\nTesting Final Model on Curated URLs")
for url in test_urls:
    phishing_prob = predict_url(url)
    if phishing_prob > 0.5:
        decision = "Likely Phishing "
    else:
        decision = "Likely Legitimate "

    print(f"URL: {url}")
    print(f"Phishing Probability: {phishing_prob:.2%}")
    print(f"Result: {decision}\n" + "-"*30)


Testing Final Model on Curated URLs
URL: https://www.unicef.org/
Phishing Probability: 31.55%
Result: Likely Legitimate 
------------------------------
URL: https://en.wikipedia.org/wiki/Machine_learning
Phishing Probability: 0.00%
Result: Likely Legitimate 
------------------------------
URL: https://github.com/pandas-dev/pandas
Phishing Probability: 0.50%
Result: Likely Legitimate 
------------------------------
URL: http://145.14.144.99/login.html
Phishing Probability: 100.00%
Result: Likely Phishing 
------------------------------
URL: http://my-bank-online.com@123.45.67.89/login
Phishing Probability: 91.00%
Result: Likely Phishing 
------------------------------
URL: https://update-account-verification-required.com/login
Phishing Probability: 100.00%
Result: Likely Phishing 
------------------------------
URL: http://secure-login-apple-id.com-input.info/private/index.html
Phishing Probability: 65.49%
Result: Likely Phishing 
------------------------------
URL: https://www.amaz0n-

In [None]:
import joblib
# Output file names for the two persisted artefacts.
model_filename = 'phishing_url_detector_model.joblib'
features_filename = 'phishing_url_detector_features.joblib'

# Persist the trained classifier.
joblib.dump(model, model_filename)
print(f"model saved successfully to '{model_filename}'")

# Persist the ordered feature-name list next to the model -- presumably so a
# loader can rebuild the feature columns in the same order.
joblib.dump(final_feature_names, features_filename)
print(f"feature names saved successfully to '{features_filename}'")

model saved successfully to 'phishing_url_detector_model.joblib'
feature names saved successfully to 'phishing_url_detector_features.joblib'
