In [None]:
import pandas as pd
import numpy as np
import re
import joblib
import os
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def extract_features(url):
    """Convert a URL string into a fixed-order list of eight integer features.

    The order of the returned values is: url_length, has_ip, count_dots,
    count_hyphens, count_at, count_digits, has_https, count_special.

    Args:
        url: The URL as a string.

    Returns:
        A list of eight ints, in the order given above.
    """
    # Dotted-quad host directly after an http:// or https:// scheme.
    ip_match = re.search(r'http[s]?://\d+\.\d+\.\d+\.\d+', url)
    # Anything that is neither a word character nor whitespace.
    special_chars = re.findall(r'[^\w\s]', url)
    digit_total = sum(1 for ch in url if ch.isdigit())

    # Kept as (name, value) pairs so the position of each feature stays explicit.
    named_features = [
        ("url_length", len(url)),
        ("has_ip", int(ip_match is not None)),
        ("count_dots", url.count('.')),
        ("count_hyphens", url.count('-')),
        ("count_at", url.count('@')),
        ("count_digits", digit_total),
        ("has_https", int(url.startswith("https://"))),
        ("count_special", len(special_chars)),
    ]
    return [value for _name, value in named_features]


In [None]:
def load_dataset(path="malicious_urls.csv"):
    """Return the feature table stored at ``path``, building a sample one if absent.

    If ``path`` exists it is read with ``pd.read_csv``. Otherwise eight
    hard-coded example URLs (four labelled malicious, four safe) are repeated
    125 times, run through ``extract_features``, and the resulting 1000-row
    table is written to ``path`` as CSV (without the index) before being
    returned.

    Args:
        path: CSV file to read, or to create when it does not exist.

    Returns:
        A pandas DataFrame. The generated table has the eight feature columns
        followed by ``is_malicious``; a loaded file is returned as read.
    """
    if os.path.exists(path):
        df = pd.read_csv(path)
    else:
        print("No dataset found. Generating sample data...")

        # (url, label) pairs; label 1 = malicious, 0 = safe
        samples = [
            ("http://192.168.0.1/malware.exe", 1),
            ("https://secure-login.com/login", 0),
            ("http://phishingsite.ru/update", 1),
            ("https://accounts.google.com/session", 0),
            ("http://randomsite.xyz/?cmd=inject", 1),
            ("https://example.com", 0),
            ("http://bad-domain.net/index.php?cmd=exec", 1),
            ("https://mybank.com/reset", 0),
        ]
        repeats = 125

        # Cycle through the samples in their listed order, `repeats` times over.
        rows = []
        for sample_url, sample_label in samples * repeats:
            rows.append(extract_features(sample_url) + [sample_label])

        header = [
            "url_length", "has_ip", "count_dots", "count_hyphens",
            "count_at", "count_digits", "has_https", "count_special", "is_malicious"
        ]
        df = pd.DataFrame(rows, columns=header)
        df.to_csv(path, index=False)

    print(f"Loaded dataset: {df.shape}")
    return df

In [None]:
def preprocess(df):
    """Separate the ``is_malicious`` target from ``df`` and standardize the rest.

    Args:
        df: DataFrame containing an ``is_malicious`` column; every other
            column is treated as a feature.

    Returns:
        A 3-tuple ``(X_scaled, y, scaler)``: the output of
        ``StandardScaler.fit_transform`` on the feature columns, the
        ``is_malicious`` column, and the fitted scaler.

    Note:
        The scaler is fit on every row of ``df``. If the caller splits into
        train/test after this call, the scaling statistics will include the
        test rows.
    """
    feature_frame = df.drop(columns=["is_malicious"])
    target = df["is_malicious"]
    standardizer = StandardScaler()
    scaled_features = standardizer.fit_transform(feature_frame)
    return scaled_features, target, standardizer


In [None]:
def train_model(X_train, y_train, random_state=None, **model_params):
    """Fit a ``GradientBoostingClassifier`` on the training data.

    Args:
        X_train: Training feature matrix, passed straight to ``fit``.
        y_train: Training labels, passed straight to ``fit``.
        random_state: Seed forwarded to the classifier. The default ``None``
            matches constructing the estimator with no arguments; pass an int
            to make repeated runs reproducible.
        **model_params: Extra keyword arguments forwarded unchanged to the
            ``GradientBoostingClassifier`` constructor (for example
            ``n_estimators`` or ``learning_rate``).

    Returns:
        The fitted ``GradientBoostingClassifier`` instance.
    """
    # With no extra arguments this is equivalent to GradientBoostingClassifier().
    model = GradientBoostingClassifier(random_state=random_state, **model_params)
    model.fit(X_train, y_train)
    return model

def evaluate(model, X_test, y_test):
    """Print a classification report and show a confusion-matrix heatmap.

    Args:
        model: Object exposing ``predict(X_test)``.
        X_test: Feature matrix to predict on.
        y_test: True labels that the predictions are compared against.

    Returns:
        None. The report is printed and the figure is displayed via
        ``plt.show()``.
    """
    predictions = model.predict(X_test)

    report = classification_report(y_test, predictions)
    print("\nClassification Report:\n", report)

    matrix = confusion_matrix(y_test, predictions)
    # sns.heatmap returns the Axes it drew on; label that Axes directly.
    ax = sns.heatmap(matrix, annot=True, fmt="d", cmap="Reds")
    ax.set_title("Confusion Matrix")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    plt.show()



