In [None]:
# Phishing URL Detection System
# Author: GitHub Copilot
# This notebook builds a phishing URL detection system using a CSV dataset of URLs.

import pandas as pd
import numpy as np
import re
from urllib.parse import urlparse
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from xgboost import XGBClassifier
import joblib

# 1. Load Dataset
# The path is relative to the current working directory. The code further
# down reads the 'url' column (feature extraction) and the 'label' column
# (target encoding) from the resulting DataFrame.
DATA_PATH = 'Training/malicious_phish.csv'  # Update if needed
df = pd.read_csv(DATA_PATH)

# 2. Feature Engineering

# One IPv4 octet: 0-255, leading zeros tolerated (e.g. "001").
_IPV4_OCTET = r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9]?[0-9])"
# Four octets separated by dots, not embedded in a longer run of digits.
_IPV4_PATTERN = re.compile(
    rf"(?<![0-9])(?:{_IPV4_OCTET}\.){{3}}{_IPV4_OCTET}(?![0-9])"
)


def has_ip(url):
    """Return 1 if *url* contains a dotted-quad IPv4 address, else 0.

    Every octet must lie in the range 0-255 and the address must not be part
    of a longer digit sequence, so strings such as ``999.999.999.999`` or
    ``1234.5.6.7890`` are not reported as IP addresses.

    Args:
        url: URL string to inspect.

    Returns:
        int: 1 when an IPv4 address appears anywhere in the URL, otherwise 0.
    """
    return 1 if _IPV4_PATTERN.search(url) else 0

def count_special_chars(url):
    """Count how many characters of *url* are one of ``@ ? - = _ % & /``."""
    special_char_pattern = r'[@\?\-=_%&/]'
    # Tally the matches lazily instead of materialising them in a list.
    return sum(1 for _ in re.finditer(special_char_pattern, url))

def count_digits(url):
    """Return the number of digit characters found in *url*."""
    digit_matches = re.finditer(r'\d', url)
    return sum(1 for _ in digit_matches)

def count_subdomains(url):
    """Return the number of dots in the hostname of *url*.

    The dot count is used as a proxy for subdomain depth
    (``www.example.com`` -> 2).  URLs written without a scheme, such as
    ``example.com/path``, are handled as well: ``urlparse`` only recognises a
    host after ``//``, so that marker is prepended when it is missing.

    Args:
        url: URL string, with or without a leading ``scheme://``.

    Returns:
        int: Number of ``.`` characters in the hostname, or 0 when *url* is
        not a string, has no hostname, or cannot be parsed.
    """
    if not isinstance(url, str):
        return 0
    # Without "//" urlparse puts the host into the path and reports no hostname.
    has_authority = url.startswith('//') or re.match(
        r'[A-Za-z][A-Za-z0-9+.\-]*://', url
    )
    candidate = url if has_authority else '//' + url
    try:
        hostname = urlparse(candidate).hostname
    except ValueError:
        # Raised by urlparse for malformed hosts, e.g. an unbalanced "[".
        return 0
    return hostname.count('.') if hostname else 0

def domain_length(url):
    """Return the length of the hostname of *url*.

    URLs written without a scheme, such as ``example.com/path``, are handled
    as well: ``urlparse`` only recognises a host after ``//``, so that marker
    is prepended when it is missing.

    Args:
        url: URL string, with or without a leading ``scheme://``.

    Returns:
        int: Number of characters in the hostname, or 0 when *url* is not a
        string, has no hostname, or cannot be parsed.
    """
    if not isinstance(url, str):
        return 0
    # Without "//" urlparse puts the host into the path and reports no hostname.
    has_authority = url.startswith('//') or re.match(
        r'[A-Za-z][A-Za-z0-9+.\-]*://', url
    )
    candidate = url if has_authority else '//' + url
    try:
        hostname = urlparse(candidate).hostname
    except ValueError:
        # Raised by urlparse for malformed hosts, e.g. an unbalanced "[".
        return 0
    return len(hostname) if hostname else 0

def extract_features(df):
    """Add the URL-derived feature columns to *df* and return it.

    Each feature is computed per row from the ``url`` column.  The columns are
    written onto the DataFrame that was passed in (it is modified in place),
    and that same object is returned.

    Args:
        df: DataFrame with a ``url`` column of strings.

    Returns:
        The same DataFrame, now carrying the eight feature columns.
    """
    # Column name -> per-URL function; dict order fixes the column order.
    per_url_features = {
        'url_length': len,
        'num_dots': lambda x: x.count('.'),
        'has_https': lambda x: 1 if x.startswith('https') else 0,
        'num_special_chars': count_special_chars,
        'num_digits': count_digits,
        'num_subdomains': count_subdomains,
        'domain_length': domain_length,
        'has_ip': has_ip,
    }
    for column, compute in per_url_features.items():
        df[column] = df['url'].apply(compute)
    return df

# Add the engineered feature columns to the loaded dataset.
df = extract_features(df)

# Encode label
# NOTE(review): assumes the CSV has a 'label' column holding 'phishing' /
# 'legitimate'. Any other value maps to NaN and the row is dropped below --
# confirm against the actual dataset.
label_map = {'phishing': 1, 'legitimate': 0}
df['label'] = df['label'].map(label_map)

# 3. Preprocessing
# Drop rows with missing values
features = ['url_length', 'num_dots', 'has_https', 'num_special_chars', 'num_digits', 'num_subdomains', 'domain_length', 'has_ip']
df = df.dropna(subset=features + ['label'])

X = df[features]
# map() produces floats whenever an unmapped label became NaN; after dropna
# the column is NaN-free, so restore integer class labels.
y = df['label'].astype(int)

# Split data
# The split happens BEFORE scaling so that the held-out rows never influence
# the scaler's mean/variance (fitting on all rows leaks test information).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept for any later code that still expects the fully scaled feature matrix.
X_scaled = scaler.transform(X)

# 4. Modeling
# NOTE(review): use_label_encoder is deprecated in recent xgboost releases --
# confirm against the installed version before removing it.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# Fit every candidate on the training split and record its test-split metrics,
# keyed by model name.
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    results[name] = {
        'model': model,
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1': f1,
        'confusion_matrix': cm
    }
    print(f"\nModel: {name}")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1-score: {f1:.4f}")
    print("Confusion Matrix:")
    print(cm)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# 5. Model Selection & Saving
# The model with the highest test-split F1 score is persisted together with
# the scaler that was fitted on the training rows.
best_model_name = max(results, key=lambda k: results[k]['f1'])
best_model = results[best_model_name]['model']
print(f"\nBest Model: {best_model_name}")
joblib.dump(best_model, 'Training/phishing_url_model.joblib')
joblib.dump(scaler, 'Training/phishing_url_scaler.joblib')

# 6. Prediction Function
def extract_features_from_url(url):
    """Build the single-row feature matrix for one URL.

    The values are emitted in this order: URL length, dot count, https-prefix
    flag, special-character count, digit count, hostname dot count, hostname
    length, IP flag.

    Args:
        url: URL string to featurise.

    Returns:
        numpy array of shape (1, 8) holding the feature values.
    """
    url_length = len(url)
    dot_count = url.count('.')
    https_flag = 1 if url.startswith('https') else 0
    row = (
        url_length,
        dot_count,
        https_flag,
        count_special_chars(url),
        count_digits(url),
        count_subdomains(url),
        domain_length(url),
        has_ip(url),
    )
    # A single sample must be presented as a 2-D array with one row.
    return np.array(row).reshape(1, -1)

def predict_url(url, *, model=None, scaler=None):
    """Classify *url* as ``'phishing'`` or ``'legitimate'``.

    Args:
        url: URL string to classify.
        model: Optional already-loaded classifier.  When omitted it is read
            from ``Training/phishing_url_model.joblib`` on every call; pass a
            preloaded model to avoid that disk read when scoring many URLs.
        scaler: Optional already-loaded scaler.  When omitted it is read from
            ``Training/phishing_url_scaler.joblib`` on every call.

    Returns:
        str: ``'phishing'`` when the model predicts class 1, otherwise
        ``'legitimate'``.
    """
    # NOTE: joblib.load unpickles its input -- only load artifacts you trust.
    if model is None:
        model = joblib.load('Training/phishing_url_model.joblib')
    if scaler is None:
        scaler = joblib.load('Training/phishing_url_scaler.joblib')
    features = extract_features_from_url(url)
    features_scaled = scaler.transform(features)
    pred = model.predict(features_scaled)[0]
    return 'phishing' if pred == 1 else 'legitimate'

# Example usage:
# print(predict_url('http://example.com/login?user=admin'))