In [None]:
import pandas as pd
import numpy as np
import re
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def extract_features(sample):
    """Convert one URL/HTML sample into a fixed-order list of numeric features.

    Parameters
    ----------
    sample : mapping
        Anything indexable by the keys ``'url'`` and ``'html_code'``
        (a dict, a pandas row, ...). A value that is not a string
        (for example ``None`` or a NaN coming from a CSV) is treated as
        an empty string instead of raising.

    Returns
    -------
    list
        Ten values, always in this order: url_length, has_ip, has_https,
        count_dots, count_at, has_iframe, has_onclick, has_form,
        has_mailto, script_count.

    Raises
    ------
    KeyError
        If ``sample`` is a dict lacking ``'url'`` or ``'html_code'``.
    """
    url = sample['url']
    html = sample['html_code']

    # Missing fields must not crash feature extraction; an empty string
    # yields all-zero features for that field.
    if not isinstance(url, str):
        url = ""
    if not isinstance(html, str):
        html = ""

    # Lower-case the markup once; every HTML check below is case-insensitive.
    html_lower = html.lower()

    features = {
        "url_length": len(url),
        # Four dot-separated digit groups anywhere in the URL.
        "has_ip": 1 if re.search(r"\d+\.\d+\.\d+\.\d+", url) else 0,
        # URI schemes are case-insensitive, so "HTTPS://" counts as https.
        "has_https": 1 if url[:8].lower() == "https://" else 0,
        "count_dots": url.count('.'),
        "count_at": url.count('@'),
        "has_iframe": 1 if "<iframe" in html_lower else 0,
        "has_onclick": 1 if "onclick=" in html_lower else 0,
        "has_form": 1 if "<form" in html_lower else 0,
        "has_mailto": 1 if "mailto:" in html_lower else 0,
        "script_count": html_lower.count("<script")
    }

    # Dicts preserve insertion order, so the list order matches the keys above.
    return list(features.values())


In [None]:
def load_dataset(path="phishing_dataset.csv"):
    """Load the phishing feature table, generating a sample CSV if none exists.

    If ``path`` does not exist, six hand-written URL/HTML samples are
    featurised with ``extract_features``, repeated 100 times (600 rows),
    written to ``path`` and returned. Otherwise the CSV at ``path`` is read
    as-is.

    Parameters
    ----------
    path : str
        Location of the CSV file to read or create.

    Returns
    -------
    pandas.DataFrame
        The feature columns plus the ``is_phishing`` label column
        (1 = phishing, 0 = safe) when the sample data is generated; whatever
        the CSV contains when it is read from disk.
    """
    if not os.path.exists(path):
        print("Creating sample phishing dataset...")

        samples = [
            {"url": "http://malicious.ru/login", "html_code": "<html><form action='steal'><input type='text'><iframe src='bad'></iframe></form>"},
            {"url": "https://google.com", "html_code": "<html><form action='/search'></form>"},
            {"url": "http://192.168.1.5/pay", "html_code": "<html><form><input type='password'></form><script>evil()</script>"},
            {"url": "https://secure.paypal.com", "html_code": "<html><form><input></form></html>"},
            {"url": "http://login.micr0soft-support.net", "html_code": "<html><iframe src='http://evil.com'></iframe><script>alert('hi')</script></html>"},
            {"url": "https://example.com/about", "html_code": "<html><h1>About Us</h1></html>"},
        ]

        labels = [1, 0, 1, 0, 1, 0]  # 1 = phishing, 0 = safe

        # Featurise each distinct sample once, then repeat the finished rows.
        # The row order is the same as featurising `samples * 100` directly,
        # without redoing identical work for every copy.
        base_rows = [extract_features(s) + [label] for s, label in zip(samples, labels)]
        data = base_rows * 100
        columns = [
            "url_length", "has_ip", "has_https", "count_dots", "count_at",
            "has_iframe", "has_onclick", "has_form", "has_mailto", "script_count", "is_phishing"
        ]

        df = pd.DataFrame(data, columns=columns)
        # Persist so later runs take the read branch below.
        df.to_csv(path, index=False)
    else:
        df = pd.read_csv(path)

    print(f"Dataset loaded: {df.shape}")
    return df

In [None]:
def preprocess(df):
    """Split a labelled frame into scaled features and target.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``is_phishing`` column; every other column is
        treated as a feature.

    Returns
    -------
    tuple
        ``(X_scaled, y, scaler)``: the min-max scaled feature matrix, the
        ``is_phishing`` column, and the fitted ``MinMaxScaler``.

    Notes
    -----
    The scaler is fitted on every row of ``df``, i.e. before any train/test
    split a caller performs afterwards.
    """
    target = df["is_phishing"]
    feature_frame = df.drop(columns=["is_phishing"])

    scaler = MinMaxScaler()
    scaled_features = scaler.fit_transform(feature_frame)

    return scaled_features, target, scaler


In [None]:
def train_model(X, y, test_size=0.2, random_state=42, n_estimators=150):
    """Fit a random forest on a train split and report hold-out performance.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Labels aligned with the rows of ``X``.
    test_size : float, default 0.2
        Fraction of rows held out for evaluation.
    random_state : int, default 42
        Seed used for BOTH the train/test split and the forest, so that a
        re-run on the same data gives the same model and the same report.
    n_estimators : int, default 150
        Number of trees in the forest.

    Returns
    -------
    RandomForestClassifier
        The classifier fitted on the training split.

    Side effects
    ------------
    Prints a classification report and shows a confusion-matrix heatmap for
    the held-out split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Seed the forest too: without it bootstrap/feature sampling differs on
    # every run even though the split itself is fixed.
    clf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

    # Draw on an explicit Axes so the plot does not depend on whatever
    # figure happens to be current in the kernel.
    fig, ax = plt.subplots()
    sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="d", cmap="Oranges", ax=ax)
    ax.set_title("Confusion Matrix")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    plt.show()
    return clf

In [None]:
def save_model(model, scaler):
    """Persist the trained model and its scaler under the ``models`` directory.

    Parameters
    ----------
    model : object
        Written to ``models/phishing_model.pkl`` with ``joblib.dump``.
    scaler : object
        Written to ``models/phishing_scaler.pkl`` with ``joblib.dump``.

    The directory is created when it does not exist yet; existing files are
    overwritten.
    """
    os.makedirs("models", exist_ok=True)

    artifacts = (
        (model, "models/phishing_model.pkl"),
        (scaler, "models/phishing_scaler.pkl"),
    )
    for artifact, destination in artifacts:
        joblib.dump(artifact, destination)

    print("Model & scaler saved.")


In [None]:
# End-to-end pipeline: load data -> scale features -> train/evaluate -> persist.
if __name__ == "__main__":
    # Reads phishing_dataset.csv, or generates the sample dataset if it is missing.
    df = load_dataset()
    # X is the scaled feature matrix, y the is_phishing labels; the fitted
    # scaler is kept so it can be saved next to the model.
    X, y, scaler = preprocess(df)
    # Trains the classifier and prints/plots its hold-out evaluation.
    model = train_model(X, y)
    # Writes both artifacts under ./models.
    save_model(model, scaler)