<a href="https://colab.research.google.com/github/Dhanush-sai-reddy/ml-uci-phishing/blob/main/rfxgvtcf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:

import logging
import os
import pickle
import re
import warnings
from urllib.parse import urlparse

import kagglehub
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Silence every warning category for the whole process so console output stays readable.
# NOTE(review): this presumably also hides the InsecureRequestWarning that the
# `verify=False` request in extract_features would otherwise emit — confirm that is intended.
warnings.filterwarnings("ignore")

# ============================
# 1. LOAD UCI DATASET
# ============================

def load_dataset():
    """Download the UCI phishing dataset via KaggleHub and load it as a DataFrame.

    The first CSV file (in sorted name order) found in the downloaded folder
    is read; the shape and the distribution of the ``Result`` column are
    printed as a quick sanity check.

    Returns:
        pandas.DataFrame: the raw dataset, including the ``Result`` column.

    Raises:
        FileNotFoundError: if the downloaded folder contains no ``.csv`` file.
    """
    print("Downloading UCI dataset from KaggleHub...")
    path = kagglehub.dataset_download("isatish/phishing-dataset-uci-ml-csv")

    # os.listdir returns entries in arbitrary order; sort so the same file is
    # picked on every run even if the folder ever holds more than one CSV.
    csv_files = sorted(f for f in os.listdir(path) if f.endswith(".csv"))
    if not csv_files:
        raise FileNotFoundError("No CSV found in downloaded dataset folder.")

    df = pd.read_csv(os.path.join(path, csv_files[0]))
    print(f"Dataset Loaded: {df.shape}")
    print(f"Class distribution:\n{df['Result'].value_counts()}")
    print("-1: Legitimate, 1: Phishing")
    return df


# ============================
# 2. TRAIN MODEL
# ============================

def train_model(df):
    """Train a soft-voting RandomForest + XGBoost ensemble on the dataset.

    The frame is split 80/20 (stratified, ``random_state=42``). Each base
    model is fitted and scored on its own for reporting, then the voting
    ensemble is fitted and evaluated on the same hold-out split.

    NOTE(review): the target is encoded here as ``Result == 1`` -> phishing.
    The UCI documentation for this dataset is commonly read the other way
    round (-1 = phishing, 1 = legitimate) — confirm the polarity before
    relying on the "Phishing"/"Legitimate" labels.

    Args:
        df: DataFrame with a ``Result`` column holding -1/1 labels and the
            feature columns; an ``id`` column, when present, is dropped.

    Returns:
        tuple: ``(fitted VotingClassifier, list of feature column names)``.

    Raises:
        KeyError: if ``df`` has no ``Result`` column.
    """
    print("\n" + "="*50)
    print("TRAINING MODEL")
    print("="*50)

    # Prepare features and target. The row identifier is not a feature; some
    # exports of the dataset do not ship it, so its absence is tolerated.
    X = df.drop(columns=["Result"]).drop(columns=["id"], errors="ignore")
    y = (df["Result"] == 1).astype(int)    # Convert {-1,1} -> {0,1} where 1=Phishing

    print(f"Features: {X.shape[1]}")
    print(f"Samples: {X.shape[0]}")
    print(f"Phishing samples: {y.sum()} ({y.sum()/len(y)*100:.1f}%)")

    # Stratify so both classes keep their proportions in train and test.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Random Forest, scored on its own for comparison with the ensemble.
    print("\nTraining RandomForest...")
    rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
    rf.fit(X_train, y_train)
    rf_acc = accuracy_score(y_test, rf.predict(X_test))
    print(f"RandomForest Accuracy: {rf_acc:.4f}")

    # XGBoost, scored on its own for comparison with the ensemble.
    print("\nTraining XGBoost...")
    xgb = XGBClassifier(
        n_estimators=250,
        max_depth=7,
        learning_rate=0.1,
        eval_metric="logloss",
        random_state=42,
        n_jobs=-1,
    )
    xgb.fit(X_train, y_train)
    xgb_acc = accuracy_score(y_test, xgb.predict(X_test))
    print(f"XGBoost Accuracy: {xgb_acc:.4f}")

    # Soft voting averages the predicted class probabilities of both models.
    print("\nTraining Ensemble Model...")
    voting = VotingClassifier(
        estimators=[("rf", rf), ("xgb", xgb)],
        voting="soft",
    )
    voting.fit(X_train, y_train)
    voting_pred = voting.predict(X_test)
    voting_acc = accuracy_score(y_test, voting_pred)
    print(f"Ensemble Accuracy: {voting_acc:.4f}")

    # Detailed per-class report for the ensemble.
    print("\n" + "="*50)
    print("ENSEMBLE MODEL PERFORMANCE")
    print("="*50)
    print(classification_report(y_test, voting_pred,
                                target_names=["Legitimate", "Phishing"]))

    return voting, X.columns.tolist()


# ============================
# 3. SAVE/LOAD MODEL
# ============================

def save_model(model, filename="phishing_model.pkl"):
    """Pickle ``model`` into ``filename`` and return the path that was written."""
    with open(filename, "wb") as sink:
        sink.write(pickle.dumps(model))
    print(f"\nModel saved as: {filename}")
    return filename

def load_saved_model(filename="phishing_model.pkl"):
    """Read ``filename`` and return the object unpickled from it.

    WARNING: unpickling can execute arbitrary code; only load files that
    were produced by a trusted source.
    """
    with open(filename, "rb") as source:
        restored = pickle.loads(source.read())
    print(f"Model loaded from: {filename}")
    return restored


# ============================
# 4. FEATURE EXTRACTION (URL + HTML/JS)
# ============================

# Column names of the 30 dataset features (spelling follows the dataset).
UCI_FEATURES = [
    "having_IP_Address", "URL_Length", "Shortining_Service", "having_At_Symbol",
    "double_slash_redirecting", "Prefix_Suffix", "having_Sub_Domain",
    "SSLfinal_State", "Domain_registeration_length", "Favicon", "port",
    "HTTPS_token", "Request_URL", "URL_of_Anchor", "Links_in_tags", "SFH",
    "Submitting_to_email", "Abnormal_URL", "Redirect", "on_mouseover",
    "RightClick", "popUpWidnow", "Iframe", "age_of_domain", "DNSRecord",
    "web_traffic", "Page_Rank", "Google_Index", "Links_pointing_to_page",
    "Statistical_report"
]

# Hosts of known URL-shortening services. A URL counts as shortened only when
# its host IS one of these or a sub-domain of one; searching the whole URL for
# the bare pattern "t.co" also matched hosts such as "microsoft.com".
_SHORTENER_HOSTS = frozenset({
    "bit.ly", "goo.gl", "tinyurl.com", "is.gd", "t.co", "ow.ly",
    "buff.ly", "adf.ly", "bitly.com", "shorte.st",
})

_IPV4_RE = re.compile(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$')

# Features that need external services (WHOIS, ranking, ...) and cannot be
# derived from the URL or the page; they default to -1.
_PLACEHOLDER_FEATURES = (
    "Domain_registeration_length", "Favicon", "age_of_domain", "DNSRecord",
    "web_traffic", "Page_Rank", "Google_Index", "Links_pointing_to_page",
    "Statistical_report",
)

# Features derived from the fetched page; they stay 0 when the fetch fails.
_HTML_FEATURES = (
    "Request_URL", "URL_of_Anchor", "Links_in_tags", "SFH",
    "Submitting_to_email", "Abnormal_URL", "Redirect", "on_mouseover",
    "RightClick", "popUpWidnow", "Iframe",
)

_REDIRECT_PATTERNS = (
    'window.location', 'window.open', 'location.href',
    'location.replace', 'http-equiv="refresh"',
)


def _is_shortener(host):
    """Return True when ``host`` is a known shortener or a sub-domain of one."""
    return any(host == s or host.endswith("." + s) for s in _SHORTENER_HOSTS)


def _is_external(link, dom):
    """Return True for an absolute http(s) link that does not mention ``dom``."""
    return bool(link) and link.startswith('http') and dom not in link


def _bucket_ratio(ratio, low, high):
    """Map a ratio to -1 (below ``low``), 0 (up to ``high``) or 1 (above)."""
    if ratio < low:
        return -1
    if ratio <= high:
        return 0
    return 1


def _external_share(links, dom, low, high):
    """Bucket the share of external links; 0 when there are no links at all."""
    if not links:
        return 0
    external = sum(1 for link in links if _is_external(link, dom))
    return _bucket_ratio(external / len(links), low, high)


def _url_features(url, parsed, host):
    """Compute the ten features that depend only on the URL text."""
    out = {}

    # 1. Having IP Address — matched on the bare host so that an explicit
    #    port ("1.2.3.4:8080") no longer hides the address.
    out["having_IP_Address"] = 1 if _IPV4_RE.match(host) else -1

    # 2. URL Length: < 54 legitimate, 54..75 suspicious, longer phishing.
    url_length = len(url)
    if url_length < 54:
        out["URL_Length"] = -1
    elif url_length <= 75:
        out["URL_Length"] = 0
    else:
        out["URL_Length"] = 1

    # 3. Shortening Service
    out["Shortining_Service"] = 1 if _is_shortener(host) else -1

    # 4. Having @ Symbol
    out["having_At_Symbol"] = 1 if "@" in url else -1

    # 5. Double Slash Redirecting ("//" beyond the one after the scheme)
    out["double_slash_redirecting"] = 1 if url.count("//") > 1 else -1

    # 6. Prefix/Suffix ("-" in the host name)
    out["Prefix_Suffix"] = 1 if "-" in host else -1

    # 7. Having Sub Domain: one dot legitimate (example.com), two suspicious
    #    (www.example.com), anything else phishing.
    dots = host.count(".")
    if dots == 1:
        out["having_Sub_Domain"] = -1
    elif dots == 2:
        out["having_Sub_Domain"] = 0
    else:
        out["having_Sub_Domain"] = 1

    # 8. SSL Final State
    # NOTE(review): unlike the other URL features, 1 here marks the https
    # case and -1 the plain-http case; the interactive report in this file
    # reads -1 as "No HTTPS". Confirm the intended polarity against the
    # dataset encoding before changing it.
    out["SSLfinal_State"] = 1 if parsed.scheme == "https" else -1

    # 9. HTTPS Token in Domain
    out["HTTPS_token"] = 1 if "https" in host else -1

    # 10. Port — reading .port raises ValueError for a malformed or
    #     out-of-range port; treat that like a non-standard port.
    try:
        port = parsed.port
    except ValueError:
        out["port"] = 1
    else:
        out["port"] = 1 if port not in (80, 443, None) else -1

    return out


def _html_features(html, dom):
    """Compute the eleven page-content features from the raw HTML text."""
    soup = BeautifulSoup(html, 'html.parser')
    lowered = html.lower()

    anchors = soup.find_all('a', href=True)
    imgs = soup.find_all('img')
    scripts = soup.find_all('script')
    links = soup.find_all('link')
    iframes = soup.find_all('iframe')
    forms = soup.find_all('form')
    meta_tags = soup.find_all('meta')

    out = {}

    # 11. Request URL: share of images/scripts loaded from another host.
    out["Request_URL"] = _external_share(
        [tag.get('src', '') for tag in imgs + scripts], dom, 0.22, 0.61)

    # 12. URL of Anchor: share of anchors pointing to another host.
    out["URL_of_Anchor"] = _external_share(
        [a.get('href', '') for a in anchors], dom, 0.31, 0.67)

    # 13. Links in Tags (script, link and meta tags).
    out["Links_in_tags"] = _external_share(
        [tag.get('src', '') or tag.get('href', '') or tag.get('content', '')
         for tag in scripts + links + meta_tags],
        dom, 0.17, 0.81)

    # 14. SFH (Server Form Handler) — only the first form is inspected.
    if not forms:
        out["SFH"] = -1
    else:
        form_action = forms[0].get('action', '').lower()
        if not form_action or form_action == 'about:blank':
            out["SFH"] = 1   # Phishing
        elif form_action.startswith('http') and dom not in form_action:
            out["SFH"] = 0   # Suspicious
        else:
            out["SFH"] = -1  # Legitimate

    # 15. Submitting to Email
    out["Submitting_to_email"] = 1 if 'mailto:' in lowered else -1

    # 16. Abnormal URL (host name absent from the page content)
    out["Abnormal_URL"] = 1 if dom not in lowered else -1

    # 17. Redirect
    has_redirect = any(pattern in lowered for pattern in _REDIRECT_PATTERNS)
    out["Redirect"] = 1 if has_redirect else -1

    # 18. On Mouse Over
    out["on_mouseover"] = 1 if 'onmouseover' in lowered else -1

    # 19. Right Click Disabled
    out["RightClick"] = 1 if 'event.button==2' in html or 'contextmenu' in lowered else -1

    # 20. Popup Window
    out["popUpWidnow"] = 1 if 'window.open' in html or 'alert(' in html else -1

    # 21. IFrame
    out["Iframe"] = 1 if iframes else -1

    return out


def extract_features(url):
    """Extract the 30 UCI-style phishing features for ``url``.

    URL-based features are computed from the parsed URL. Features that need
    external services get the placeholder value -1. Page-based features are
    computed from the fetched HTML and stay at the neutral value 0 whenever
    the page cannot be fetched or analysed (best effort: the failure is
    logged at DEBUG level and never raised).

    Args:
        url: absolute URL, including the scheme.

    Returns:
        dict: feature name -> value in {-1, 0, 1}, containing every name in
        ``UCI_FEATURES``.

    Raises:
        ValueError: may propagate from ``urlparse`` for URLs it cannot split
            (e.g. unbalanced IPv6 brackets).
    """
    p = urlparse(url)
    # .hostname is lower-cased and free of user-info and port.
    host = (p.hostname or "").lower()
    # Strip only a leading "www." (not occurrences inside the name).
    dom = host[4:] if host.startswith("www.") else host

    features = _url_features(url, p, host)

    for name in _PLACEHOLDER_FEATURES:
        features[name] = -1  # Default to legitimate (can't verify)

    for name in _HTML_FEATURES:
        features[name] = 0   # Neutral until the page has been analysed

    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        # Certificate verification is disabled on purpose: the page is only
        # inspected, and sites with broken certificates should still be analysed.
        response = requests.get(url, timeout=5, headers=headers, verify=False)
        response.raise_for_status()
        # Merged only after the whole page was analysed, so a failure half-way
        # through never leaves a mix of computed and default values.
        features.update(_html_features(response.text, dom))
    except Exception as exc:
        # Deliberate best effort: unreachable or unparsable pages keep the
        # neutral defaults, but the reason is recorded for debugging.
        logging.getLogger(__name__).debug(
            "HTML features unavailable for %s: %s", url, exc)

    # Ensure all features are present
    for name in UCI_FEATURES:
        features.setdefault(name, 0)

    return features


# ============================
# 5. PREDICT WITH RULE-BASED OVERRIDES
# ============================

def predict_url(url, model, feature_cols, use_rules=True):
    """Classify ``url`` with the model, optionally adjusted by fixed rules.

    Rules, evaluated in order when ``use_rules`` is true (first match wins):

    1. IP address as host      -> phishing, probability at least 0.9
    2. URL shortening service  -> phishing, probability at least 0.8
    3. Very long URL           -> phishing, probability at least 0.7
    4. Otherwise, with at least 20 features at -1 and at most 2 at 1
                               -> legitimate, probability at most 0.1

    Rule 4 is applied only when none of rules 1-3 fired; previously it ran
    unconditionally afterwards and could silently turn a forced phishing
    verdict back into "legitimate".

    Args:
        url: URL to analyse.
        model: fitted classifier exposing ``predict`` and ``predict_proba``.
        feature_cols: feature column names in the order the model expects.
        use_rules: apply the rule-based overrides when true.

    Returns:
        tuple: ``(prediction, probability, features)`` where prediction is
        1 for phishing / 0 for legitimate, probability is the (possibly
        adjusted) phishing probability and features is the extracted dict.
    """
    features = extract_features(url)

    # Single-row frame with the columns in the order used for training.
    row = pd.DataFrame([features])[feature_cols]

    prob = model.predict_proba(row)[0][1]  # Probability of being phishing
    pred = model.predict(row)[0]           # 1=Phishing, 0=Legitimate

    if not use_rules:
        return pred, prob, features

    if features["having_IP_Address"] == 1:
        # Rule 1: IP address in URL is highly suspicious
        pred = 1
        prob = max(prob, 0.9)
    elif features["Shortining_Service"] == 1:
        # Rule 2: shortened URLs are suspicious
        pred = 1
        prob = max(prob, 0.8)
    elif features["URL_Length"] == 1:
        # Rule 3: very long URLs are suspicious
        pred = 1
        prob = max(prob, 0.7)
    else:
        # Rule 4: many legitimate indicators and hardly any phishing ones
        legit_indicators = sum(1 for v in features.values() if v == -1)
        phishing_indicators = sum(1 for v in features.values() if v == 1)
        if legit_indicators >= 20 and phishing_indicators <= 2:
            pred = 0
            prob = min(prob, 0.1)

    return pred, prob, features


# ============================
# 6. DEBUG & ANALYSIS TOOLS
# ============================

def analyze_features(features):
    """Print the features grouped by value and return the three group sizes.

    Features equal to 1 are listed as phishing indicators, 0 as
    suspicious/neutral and -1 as legitimate indicators; the latter two
    listings show at most ten names.

    Returns:
        tuple: ``(phishing count, suspicious count, legitimate count)``.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("FEATURE ANALYSIS")
    print(banner)

    # One pass over the dict, collecting names per feature value.
    grouped = {1: [], 0: [], -1: []}
    for name, value in features.items():
        for code, names in grouped.items():
            if value == code:
                names.append(name)

    flagged, neutral, clean = grouped[1], grouped[0], grouped[-1]

    print(f"\nPhishing Indicators ({len(flagged)}):")
    if flagged:
        print(", ".join(flagged))
    else:
        print("None")

    print(f"\nSuspicious/Neutral ({len(neutral)}):")
    neutral_tail = "..." if len(neutral) > 10 else ""
    print(", ".join(neutral[:10]) + neutral_tail)

    print(f"\nLegitimate Indicators ({len(clean)}):")
    clean_tail = "..." if len(clean) > 10 else ""
    print(", ".join(clean[:10]) + clean_tail)

    return len(flagged), len(neutral), len(clean)


def test_urls(urls, model, feature_cols):
    """Classify every URL in ``urls``, print a report and collect the results.

    A URL whose analysis raises is reported and skipped, so one bad entry
    does not abort the whole batch (same policy as the interactive loop).

    Args:
        urls: iterable of URLs to check.
        model: fitted classifier passed on to ``predict_url``.
        feature_cols: feature column names passed on to ``predict_url``.

    Returns:
        list[dict]: one entry per successfully analysed URL with the keys
        ``url``, ``prediction``, ``confidence``, ``phishing_indicators`` and
        ``legitimate_indicators``.
    """
    # Status glyphs written as escapes so they survive any re-encoding of
    # this file (the literals had been garbled into mojibake).
    phishing_label = "\U0001F6A8 PHISHING"
    legitimate_label = "\u2705 LEGITIMATE"

    print("\n" + "="*60)
    print("URL TESTING RESULTS")
    print("="*60)

    results = []
    for url in urls:
        try:
            pred, prob, features = predict_url(url, model, feature_cols)
        except Exception as exc:
            print(f"\nError analyzing URL {url}: {exc}")
            continue

        is_phishing = pred == 1

        print(f"\n{'='*40}")
        print(f"URL: {url}")
        print(f"{'='*40}")
        print(f"PREDICTION: {phishing_label if is_phishing else legitimate_label}")
        print(f"CONFIDENCE: {prob:.1%}")

        # Per-value feature breakdown; the neutral count is not stored.
        phishing, _suspicious, legitimate = analyze_features(features)

        results.append({
            'url': url,
            'prediction': 'Phishing' if is_phishing else 'Legitimate',
            'confidence': prob,
            'phishing_indicators': phishing,
            'legitimate_indicators': legitimate,
        })

    return results


# ============================
# 7. MAIN EXECUTION
# ============================

# Script entry point: obtain a model (train or load), run the built-in sample
# URLs, print a summary, then check URLs typed by the user until they quit.
if __name__ == "__main__":
    MODEL_PATH = "phishing_detector_model.pkl"
    COLUMNS_PATH = "feature_columns.pkl"

    # Status glyphs written as escapes so they survive any re-encoding of
    # this file (the literals had been garbled into mojibake).
    PHISHING_LABEL = "\U0001F6A8 PHISHING"
    LEGITIMATE_LABEL = "\u2705 LEGITIMATE"
    WARN = "\u26A0\uFE0F"

    print("="*60)
    print("PHISHING URL DETECTOR")
    print("="*60)

    train_new = input("\nTrain new model? (y/n): ").lower().strip() == 'y'

    if not train_new:
        # Load existing model together with its feature column list.
        try:
            model = load_saved_model(MODEL_PATH)
            with open(COLUMNS_PATH, "rb") as f:
                feature_cols = pickle.load(f)
            print(f"Loaded {len(feature_cols)} features")
        except FileNotFoundError:
            print("No saved model found. Training new model...")
            train_new = True

    if train_new:
        df = load_dataset()
        model, feature_cols = train_model(df)

        # Persist BOTH artifacts on every training path. The fallback path
        # used to save only the model, so the next start could not find the
        # feature columns and retrained again.
        save_model(model, MODEL_PATH)
        with open(COLUMNS_PATH, "wb") as f:
            pickle.dump(feature_cols, f)
        print("Feature columns saved.")

    # Test with example URLs
    test_urls_list = [
        "https://google.com",
        "http://198.54.23.11/login/update",  # IP address URL
        "https://paypal-security-alert.com/verify",  # Phishing-like
        "http://bit.ly/2fSdq",  # Shortened URL
        "https://github.com",
        "http://free-gift-cards-now.com/claim",  # Likely phishing
        "https://www.amazon.com",
        "http://192.168.1.100:8080/admin",  # Local IP
        "https://www.paypal.com.us.security.verify-account.com",  # Suspicious
        "https://www.wikipedia.org"
    ]

    results = test_urls(test_urls_list, model, feature_cols)

    # Summary
    print("\n" + "="*60)
    print("SUMMARY")
    print("="*60)
    phishing_count = sum(1 for r in results if r['prediction'] == 'Phishing')
    print(f"\nTotal URLs tested: {len(results)}")
    print(f"Phishing detected: {phishing_count}")
    print(f"Legitimate: {len(results) - phishing_count}")

    # Interactive mode
    while True:
        print("\n" + "-"*40)
        try:
            user_url = input("\nEnter URL to check (or 'quit' to exit): ").strip()
        except EOFError:
            # stdin closed (e.g. piped input exhausted): treat as "quit".
            print("Exiting...")
            break

        if user_url.lower() in ['quit', 'exit', 'q']:
            print("Exiting...")
            break

        if not user_url:
            # Nothing typed; do not analyse the bare scheme "http://".
            continue

        if not user_url.startswith(('http://', 'https://')):
            user_url = 'http://' + user_url

        try:
            pred, prob, features = predict_url(user_url, model, feature_cols)

            print(f"\n{'='*50}")
            print(f"RESULT: {PHISHING_LABEL if pred == 1 else LEGITIMATE_LABEL}")
            print(f"Confidence: {prob:.1%}")
            print(f"{'='*50}")

            # Show key indicators
            print("\nKEY INDICATORS:")
            if features["having_IP_Address"] == 1:
                print(f"  {WARN}  IP Address in URL")
            if features["Shortining_Service"] == 1:
                print(f"  {WARN}  URL Shortening Service")
            if features["SSLfinal_State"] == -1:
                print(f"  {WARN}  No HTTPS (HTTP only)")
            if features["having_At_Symbol"] == 1:
                print(f"  {WARN}  @ Symbol in URL")

            # Ask for detailed analysis
            if input("\nShow detailed analysis? (y/n): ").lower().strip() == 'y':
                analyze_features(features)

        except Exception as e:
            # Keep the session alive; report the failure for this URL only.
            print(f"Error analyzing URL: {e}")

PHISHING URL DETECTOR

Train new model? (y/n): y
Downloading UCI dataset from KaggleHub...
Using Colab cache for faster access to the 'phishing-dataset-uci-ml-csv' dataset.
Dataset Loaded: (11055, 32)
Class distribution:
Result
 1    6157
-1    4898
Name: count, dtype: int64
-1: Legitimate, 1: Phishing

TRAINING MODEL
Features: 30
Samples: 11055
Phishing samples: 6157 (55.7%)

Training RandomForest...
RandomForest Accuracy: 0.9742

Training XGBoost...
XGBoost Accuracy: 0.9751

Training Ensemble Model...
Ensemble Accuracy: 0.9774

ENSEMBLE MODEL PERFORMANCE
              precision    recall  f1-score   support

  Legitimate       0.98      0.97      0.97       980
    Phishing       0.97      0.99      0.98      1231

    accuracy                           0.98      2211
   macro avg       0.98      0.98      0.98      2211
weighted avg       0.98      0.98      0.98      2211


Model saved as: phishing_detector_model.pkl
Feature columns saved.

URL TESTING RESULTS

URL: https://google.

KeyboardInterrupt: Interrupted by user

In [7]:
# Preview the first five rows of the loaded dataset.
df.head(n=5)

Unnamed: 0,id,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,...,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
0,1,-1,1,1,1,-1,-1,-1,-1,-1,...,1,1,-1,-1,-1,-1,1,1,-1,-1
1,2,1,1,1,1,1,-1,0,1,-1,...,1,1,-1,-1,0,-1,1,1,1,-1
2,3,1,0,1,1,1,-1,-1,-1,-1,...,1,1,1,-1,1,-1,1,0,-1,-1
3,4,1,0,1,1,1,-1,-1,-1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
4,5,1,0,-1,1,1,-1,1,1,-1,...,-1,1,-1,-1,0,-1,1,1,1,1


In [6]:

# Rows where having_IP_Address is 1 while Result is -1.
df.loc[df["having_IP_Address"].eq(True) & df["Result"].eq(-1)]

Unnamed: 0,id,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,...,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
1,2,1,1,1,1,1,-1,0,1,-1,...,1,1,-1,-1,0,-1,1,1,1,-1
2,3,1,0,1,1,1,-1,-1,-1,-1,...,1,1,1,-1,1,-1,1,0,-1,-1
3,4,1,0,1,1,1,-1,-1,-1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
6,7,1,0,-1,1,1,-1,-1,-1,1,...,1,1,1,-1,-1,-1,1,0,-1,-1
7,8,1,0,1,1,1,-1,-1,-1,1,...,1,1,-1,-1,0,-1,1,0,1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11022,11023,1,-1,1,-1,1,-1,-1,-1,-1,...,-1,-1,1,1,1,-1,1,0,1,-1
11029,11030,1,-1,1,1,1,-1,1,-1,-1,...,1,1,1,1,1,-1,1,0,1,-1
11037,11038,1,-1,-1,1,-1,-1,-1,-1,-1,...,1,1,1,-1,0,-1,1,0,1,-1
11043,11044,1,-1,1,1,1,-1,-1,0,-1,...,1,1,1,1,0,-1,1,0,1,-1
