In [None]:
# ===============================
# Imports for Voting Ensemble Training
# ===============================

import pandas as pd
import numpy as np
import joblib
import re
import math
from urllib.parse import urlparse, parse_qs

# Machine learning
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report


In [None]:
import pandas as pd

# Load the merged URL dataset and expose the label column under a lowercase name.
df = pd.read_csv(
    r"D:\QRusaderTrainedModel\zzznewTrainingModel\csvFiles\merged_url_datasets.csv"
).rename(columns={"Label": "label"})


In [None]:
# Summary of the loaded frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# First 10 rows of the dataset.
df.head(10)

In [None]:
# Last 10 rows of the dataset.
df.tail(10)

In [None]:
# Class balance: number of rows for each value of the label column.
count = df["label"].value_counts()
count

In [None]:
# ===============================
# Optimized Parallel Feature Extraction (Refined Features)
# ===============================

from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
from pathlib import Path
from urllib.parse import urlparse, urlunparse
import shelve
import math
import re

# -------------------------------
# Load dataset
# -------------------------------
# Reload the raw dataset and give the label / URL columns lowercase names.
df = pd.read_csv(
    r"D:\QRusaderTrainedModel\zzznewTrainingModel\csvFiles\merged_url_datasets.csv"
).rename(columns={"Label": "label", "URL": "url"})

# -------------------------------
# Helper functions
# -------------------------------
def normalize_url(url: str) -> str:
    """Return a canonical form of ``url`` used as input for feature extraction.

    Steps: strip surrounding whitespace, prepend ``http://`` when no http/https
    scheme is present, lowercase the scheme and network location, drop a leading
    ``www.`` and use ``/`` when the path is empty. Params, query and fragment are
    kept unchanged.

    Args:
        url: Raw URL string, with or without a scheme.

    Returns:
        The normalised URL string.

    Raises:
        AttributeError: If ``url`` is not a string (e.g. NaN from a CSV).
    """
    url = url.strip()
    # The scheme check must be case-insensitive: "HTTP://x" already has a scheme
    # and must not be turned into "http://HTTP://x".
    if not url.lower().startswith(("http://", "https://")):
        url = "http://" + url
    parsed = urlparse(url)
    domain = parsed.netloc.lower()
    if domain.startswith("www."):
        domain = domain[4:]
    path = parsed.path or "/"
    return urlunparse((parsed.scheme.lower(), domain, path, parsed.params, parsed.query, parsed.fragment))

def shannon_entropy(data):
    """Shannon entropy (base 2) of the symbols in ``data``.

    Returns 0 for empty/falsy input; otherwise ``-sum(p * log2(p))`` over the
    relative frequency ``p`` of each distinct symbol.
    """
    if not data:
        return 0
    total = len(data)
    frequencies = [float(data.count(symbol)) / total for symbol in set(data)]
    return -sum(freq * math.log2(freq) for freq in frequencies)

# Known URL-shortener hosts. Entries containing a dot are host names; a bare entry
# such as "tinyurl" is matched against any single dot-separated label of the host.
shortening_services = ["bit.ly","tinyurl","goo.gl","t.co","ow.ly","shorte.st","cutt.ly"]
# Keywords counted (as substrings) by contains_suspicious_word.
suspicious_keywords = ["secure","account","login","update","free","bonus","ebayisapi",
                       "banking","confirm","signin","verification"]

def is_shortened(url):
    """Return 1 if the URL's host belongs to a known shortening service, else 0.

    Matching is done on the host name rather than on the whole URL string: a plain
    substring test flags e.g. "microsoft.com" and "reddit.com" because both contain
    "t.co".
    """
    # urlparse only fills in the host when a scheme (or a leading "//") is present.
    target = url if re.match(r"[a-zA-Z][a-zA-Z0-9+.\-]*://", url) else "//" + url
    try:
        host = (urlparse(target).hostname or "").lower()
    except ValueError:
        # Malformed network location (e.g. unbalanced IPv6 brackets).
        return 0
    if host.startswith("www."):
        host = host[4:]
    labels = host.split(".")
    for service in shortening_services:
        if "." in service:
            if host == service or host.endswith("." + service):
                return 1
        elif service in labels:
            return 1
    return 0

def has_ip(url):
    """Return 1 if the URL starts with http(s):// followed by a dotted-quad pattern, else 0."""
    return int(bool(re.match(r"http[s]?://\d+\.\d+\.\d+\.\d+", url)))

def contains_suspicious_word(url):
    """Return how many entries of suspicious_keywords occur in the lowercased URL."""
    return sum(word in url.lower() for word in suspicious_keywords)

# -------------------------------
# WHOIS safe wrapper with caching
# -------------------------------
import threading

CACHE_FILE = "whois_cache.db"
# shelve files must not be opened/written concurrently; feature extraction calls
# get_whois_safe from a thread pool, so every cache access goes through this lock.
_WHOIS_CACHE_LOCK = threading.Lock()

def get_whois_safe(domain):
    """Look up WHOIS information for ``domain`` with an on-disk cache.

    Args:
        domain: Domain string used both as WHOIS query and as cache key.

    Returns:
        Tuple ``(has_whois, domain_age_days)``. ``has_whois`` is 1 when a creation
        date was found, else 0; ``domain_age_days`` is the number of days since
        that date, or 0 when unknown. Any lookup failure yields ``(0, 0)``.

    The result (including failures) is stored in the shelve file ``CACHE_FILE`` so
    each domain is queried at most once.
    """
    with _WHOIS_CACHE_LOCK:
        with shelve.open(CACHE_FILE) as cache:
            if domain in cache:
                entry = cache[domain]
                return entry["has_whois"], entry["domain_age_days"]

    # The (slow) network lookup runs outside the lock and with the cache closed.
    try:
        import whois
        w = whois.whois(domain)
        creation_date = getattr(w, "creation_date", None)
        if isinstance(creation_date, (list, tuple)):
            # Some registries return several dates; use the first non-empty one.
            creation_date = next((d for d in creation_date if d), None)
        if not creation_date:
            has_whois, age_days = 0, 0
        else:
            created = pd.Timestamp(creation_date)
            # "now" must match the tz-awareness of the creation date; subtracting a
            # tz-aware timestamp from a naive one raises and would be recorded as
            # "no WHOIS" by the except clause below.
            age_days = (pd.Timestamp.now(tz=created.tz) - created).days
            has_whois = 1
    except Exception:
        # Best effort: missing package, network error or unparsable record.
        has_whois, age_days = 0, 0

    with _WHOIS_CACHE_LOCK:
        with shelve.open(CACHE_FILE) as cache:
            cache[domain] = {"has_whois": has_whois, "domain_age_days": age_days}
    return has_whois, age_days

# -------------------------------
# Feature extraction
# -------------------------------
def extract_features(url):
    """Build the feature record for one URL.

    The URL is normalised with normalize_url, split with urlparse, and described
    by lexical statistics plus the WHOIS pair returned by get_whois_safe.

    Returns:
        dict with the original and normalised URL followed by the numeric features.
    """
    url_norm = normalize_url(url)
    parts = urlparse(url_norm)
    # NOTE(review): netloc keeps any port / userinfo present in the URL, so the
    # TLD- and WHOIS-based features below see e.g. "example.com:8080" - confirm intended.
    host = parts.netloc
    url_path = parts.path or "/"

    # WHOIS features
    whois_flag, age_in_days = get_whois_safe(host)

    # Character statistics of the normalised URL
    url_len = len(url_norm)
    special_total = sum(url_norm.count(ch) for ch in "@?-=.!#$&~*%+^_")
    digit_total = sum(ch.isdigit() for ch in url_norm)
    letter_total = sum(ch.isalpha() for ch in url_norm)
    upper_total = sum(1 for ch in url_norm if ch.isupper())

    # Host structure
    subdomains = max(host.count(".") - 1, 0)
    tld = host.split('.')[-1]

    # Path segments and their distinct 2-/3-segment n-grams
    segments = [seg for seg in url_path.split('/') if seg]
    bigram_set = {"_".join(segments[k:k+2]) for k in range(len(segments) - 1)}
    trigram_set = {"_".join(segments[k:k+3]) for k in range(len(segments) - 2)}

    return {
        "url_original": url,
        "normalized_url": url_norm,
        "url_length": url_len,
        "Shortining_Service": is_shortened(url_norm),
        "having_ip_address": has_ip(url_norm),
        "subdomain_count": subdomains,
        "subdomain_ratio": subdomains / max(1, len(host)),
        "path_depth": url_path.count('/'),
        "path_length": len(url_path),
        "param_count": parts.query.count("="),
        "digit_letter_ratio": digit_total / max(1, letter_total),
        "domain_entropy": shannon_entropy(host),
        "path_entropy": shannon_entropy(url_path),
        "total_special_char": special_total,
        "special_char_ratio": special_total / max(1, url_len),
        "risky_tld": int(tld in ["zip","xyz","top","club","info"]),
        "tld_length": len(tld),
        "suspicious_word_count": contains_suspicious_word(url_norm),
        "url_upper_ratio": upper_total / max(1, url_len),
        "repeated_char_count": sum(url_norm.count(ch*2) for ch in set(url_norm)),
        "path_token_count": len(segments),
        "unique_bigrams": len(bigram_set),
        "unique_trigrams": len(trigram_set),
        "has_whois": whois_flag,
        "domain_age_days": age_in_days
    }


In [None]:
# ===============================
# 4️⃣ Parallel Feature Extraction (Optimized with WHOIS caching & refined features)
# ===============================
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
from pathlib import Path
from urllib.parse import urlparse
import shelve

# -------------------------------
# Load dataset
# -------------------------------
# Reload the raw dataset so this cell does not depend on earlier edits to `df`.
df = pd.read_csv(
    r"D:\QRusaderTrainedModel\zzznewTrainingModel\csvFiles\merged_url_datasets.csv"
).rename(columns={"Label": "label", "URL": "url"})

# -------------------------------
# Pre-cache WHOIS results (so each domain is queried once)
# -------------------------------
def _safe_domain(raw_url):
    """Return the normalised netloc of ``raw_url``, or None when it cannot be parsed."""
    try:
        return urlparse(normalize_url(raw_url)).netloc
    except (AttributeError, TypeError, ValueError):
        # Non-string cells (e.g. NaN) or malformed URLs must not abort the whole cell.
        return None

domains = df["url"].apply(_safe_domain).dropna().unique()

print(f"🔍 Pre-caching WHOIS for {len(domains)} unique domains...")
for i, domain in enumerate(domains, 1):
    # get_whois_safe consults the cache first and stores new results itself, so the
    # cache file is not opened here: a second handle on the same shelve file while
    # get_whois_safe writes to it is unsafe and only duplicated the write.
    get_whois_safe(domain)
    if i % 100 == 0 or i == len(domains):
        print(f"Cached {i}/{len(domains)} domains")

# -------------------------------
# Safe wrapper for feature extraction
# -------------------------------
def safe_extract(url):
    """Run extract_features on ``url`` without letting an exception escape.

    Returns the feature dict, with a missing/None WHOIS or n-gram field replaced
    by 0, or None (after printing a warning) when extraction fails.
    """
    required_numeric = ("has_whois", "domain_age_days", "unique_bigrams", "unique_trigrams")
    try:
        record = extract_features(url)
        # Ensure numeric fields are filled
        record.update({field: 0 for field in required_numeric if record.get(field) is None})
        return record
    except Exception as e:
        print(f"⚠️ Error processing URL {url}: {e}")
        return None

# -------------------------------
# Parallel feature extraction
# -------------------------------
# Each job is keyed by row position so its result can be paired with the label of
# that exact row; as_completed yields futures in completion order, not input order.
urls = df["url"].tolist()
labels = df["label"].tolist()
results = [None] * len(urls)

with ThreadPoolExecutor(max_workers=20) as executor:
    future_to_pos = {executor.submit(safe_extract, url): pos for pos, url in enumerate(urls)}

    for i, future in enumerate(as_completed(future_to_pos), 1):
        results[future_to_pos[future]] = future.result()
        if i % 500 == 0 or i == len(urls):
            print(f"✅ Processed {i}/{len(urls)} URLs")

# -------------------------------
# Attach labels and convert to DataFrame
# -------------------------------
# Labels are attached positionally instead of merging on the URL string: a merge
# multiplies rows whenever a URL occurs more than once in the dataset (k feature
# rows x k label rows) and mixes labels when duplicates disagree.
features = []
for feats, label in zip(results, labels):
    if feats is not None:  # rows whose extraction failed are skipped
        feats["label"] = label
        features.append(feats)

features_df = pd.DataFrame(features)
print(f"✅ Feature extraction completed. Shape: {features_df.shape}")

# -------------------------------
# Ensure numeric columns are numeric
# -------------------------------
# Feature columns that must be numeric before saving / training.
numeric_cols = [
    "url_length",
    "Shortining_Service",
    "having_ip_address",
    "subdomain_count",
    "subdomain_ratio",
    "path_depth",
    "path_length",
    "param_count",
    "digit_letter_ratio",
    "domain_entropy",
    "path_entropy",
    "total_special_char",
    "special_char_ratio",
    "risky_tld",
    "tld_length",
    "suspicious_word_count",
    "url_upper_ratio",
    "repeated_char_count",
    "path_token_count",
    "unique_bigrams",
    "unique_trigrams",
    "has_whois",
    "domain_age_days",
]

# Unparsable values become NaN and are then replaced by 0.
coerced = features_df[numeric_cols].apply(pd.to_numeric, errors='coerce')
features_df[numeric_cols] = coerced.fillna(0)

# -------------------------------
# Save to CSV (fast)
# -------------------------------
# Destination of the feature table; the parent folder is created when missing.
output_file = Path(r"D:\QRusaderTrainedModel\zzznewTrainingModel\csvFiles\url_features_detailed.csv")
output_file.parent.mkdir(parents=True, exist_ok=True)

# Written without the index; the training cell reads this file back.
features_df.to_csv(output_file, index=False, encoding="utf-8")
print(f"✅ Features with labels and refined enhancements saved at: {output_file}")


In [None]:
# Count how many rows have WHOIS information.
# NOTE(review): the percentages divide by total_urls, so this assumes features_df is non-empty.
total_urls = len(features_df)
with_whois = features_df['has_whois'].sum()  # has_whois is 0/1, so the sum is the number of rows with WHOIS
without_whois = total_urls - with_whois

print(f"Total URLs: {total_urls}")
print(f"With WHOIS: {with_whois} ({with_whois/total_urls*100:.2f}%)")
print(f"Without WHOIS: {without_whois} ({without_whois/total_urls*100:.2f}%)")

In [None]:
# ===============================
# 5️⃣ Verify specific URL presence
# ===============================

# The lookup is done on the normalised form because features_df stores it in 'normalized_url'.
url_to_check = "https://www.google.com"
normalized_url = normalize_url(url_to_check)

# Search in the features_df
google_row = features_df[features_df['normalized_url'] == normalized_url]
# Show all columns for this row
# NOTE(review): set_option changes the display settings for the rest of the session.
pd.set_option('display.max_columns', None)  # show all columns
pd.set_option('display.width', 200)         # set display width to avoid wrapping
if not google_row.empty:
    print("✅ Found URL in dataset:")
    print(google_row)
else:
    print("❌ URL not found in dataset.")


In [None]:
# ===============================
# Step 5: Exploratory Data Analysis (Reduced + Normalized Features)
# ===============================

import matplotlib.pyplot as plt
import seaborn as sns
import math

# Ensure features_df exists (this cell relies on the feature-extraction cell having run)
if 'features_df' not in globals():
    raise ValueError("features_df not defined. Run extract_features on your dataset first.")

# -------------------------------
# 1️⃣ Numeric Features - Histograms
# -------------------------------
# Only float64/int64 columns are selected; columns with other dtypes are not plotted.
numeric_features = features_df.select_dtypes(include=['float64', 'int64'])

# Exclude non-informative numeric columns
exclude_numeric = ['has_whois']  # you can keep domain_age_days for analysis
numeric_features = numeric_features.drop(columns=[col for col in exclude_numeric if col in numeric_features.columns], errors='ignore')

# Grid layout: one histogram per column, three per row.
# NOTE(review): assumes at least one numeric column remains; with none, num_rows is 0.
num_cols = len(numeric_features.columns)
num_rows = math.ceil(num_cols / 3)  # 3 plots per row

fig, axes = plt.subplots(num_rows, 3, figsize=(18, num_rows*4))
axes = axes.flatten()

for i, col in enumerate(numeric_features.columns):
    sns.histplot(numeric_features[col], bins=30, kde=False, ax=axes[i], color='skyblue')
    axes[i].set_title(f"Distribution of {col}", fontsize=12)
    axes[i].set_xlabel(col, fontsize=10)
    axes[i].set_ylabel("Count", fontsize=10)

# Hide empty subplots (the grid may have more axes than columns)
for j in range(i+1, len(axes)):
    fig.delaxes(axes[j])

plt.tight_layout()
plt.show()

# -------------------------------
# 2️⃣ Binary / Categorical Features - Countplots
# -------------------------------
# NOTE(review): suspicious_word_count is a count, not a binary flag; it is plotted per distinct value.
binary_features = [
    "Shortining_Service",
    "having_ip_address",
    "suspicious_word_count",
    "risky_tld",
    "has_whois"
]

# Only plot the columns that are actually present in features_df.
binary_features_existing = [col for col in binary_features if col in features_df.columns]

# NOTE(review): passing palette without hue is reported as deprecated by recent seaborn releases - confirm against the installed version.
for col in binary_features_existing:
    plt.figure(figsize=(6,4))
    sns.countplot(x=col, data=features_df, palette='Set2')
    plt.title(f"Countplot of {col}", fontsize=12)
    plt.xlabel(col, fontsize=10)
    plt.ylabel("Count", fontsize=10)
    plt.show()

# -------------------------------
# 3️⃣ Correlation Heatmap - Numeric Features Only
# -------------------------------
# Uses the same numeric_features frame as the histograms (has_whois excluded).
plt.figure(figsize=(18,16))
sns.heatmap(numeric_features.corr(), annot=True, fmt=".2f", cmap="magma", cbar=True)
plt.title("Correlation Heatmap of Numeric Features", fontsize=16)
plt.show()



In [None]:
# ===============================
# Training & Evaluation: Random Forest (Refined Features)
# ===============================

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.metrics import (
    accuracy_score, classification_report, roc_curve, auc,
    confusion_matrix, ConfusionMatrixDisplay
)
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# -------------------------------
# 1️⃣ Load dataset
# -------------------------------
# Read the feature table written by the extraction cell; rows without a label are dropped.
# NOTE(review): `df` is rebound here from the raw dataset to the feature table.
df = pd.read_csv(r"D:\QRusaderTrainedModel\zzznewTrainingModel\csvFiles\url_features_detailed.csv")
df = df.dropna(subset=['label'])

# -------------------------------
# 2️⃣ Define numeric features
# -------------------------------
# Must list the same columns, in the same order, that are later saved as feature_columns.pkl.
numeric_cols = [
    "url_length", "Shortining_Service", "having_ip_address",
    "subdomain_count", "subdomain_ratio", "path_depth", "path_length",
    "param_count", "digit_letter_ratio",
    "domain_entropy", "path_entropy",
    "total_special_char", "special_char_ratio",
    "risky_tld", "tld_length", "suspicious_word_count",
    "url_upper_ratio", "repeated_char_count", "path_token_count",
    "unique_bigrams", "unique_trigrams",
    "has_whois", "domain_age_days"
]

X = df[numeric_cols].fillna(0)
y = df['label']

# Label encoding
le = LabelEncoder()
y_encoded = le.fit_transform(y)
# NOTE(review): y_bin is not used anywhere else in this cell.
y_bin = label_binarize(y_encoded, classes=range(len(le.classes_)))

# -------------------------------
# 3️⃣ Train/test split
# -------------------------------
# 80/20 split, stratified on the encoded label, fixed seed.
# NOTE(review): rows are not de-duplicated first, so identical URLs may land in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)
# One-hot form of the test labels; only the multi-class ROC branch reads it.
y_test_bin = label_binarize(y_test, classes=range(len(le.classes_)))

# -------------------------------
# 4️⃣ Initialize Random Forest
# -------------------------------
# Random Forest configuration; class_weight='balanced' re-weights classes by frequency
# and n_jobs=-1 uses all available cores.
rf = RandomForestClassifier(
    n_estimators=1500,
    max_depth=None,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='log2',
    random_state=42,
    class_weight='balanced',
    n_jobs=-1
)

# -------------------------------
# 5️⃣ Train & Evaluate
# -------------------------------
print("\n" + "="*60)
print(f"### Training & Evaluating: Random Forest")

# Fit model on the training split and predict the held-out split
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc*100:.2f}%")

# Decode labels for reporting (back from integer codes to the original label values)
y_test_labels = le.inverse_transform(y_test)
y_pred_labels = le.inverse_transform(y_pred)

# Classification report
print(classification_report(y_test_labels, y_pred_labels))

# Confusion Matrix (rows/columns ordered as le.classes_)
cm = confusion_matrix(y_test_labels, y_pred_labels, labels=le.classes_)
disp = ConfusionMatrixDisplay(cm, display_labels=le.classes_)
disp.plot(cmap="Blues", values_format="d")
plt.title(f"Confusion Matrix: Random Forest")
plt.show()

# -------------------------------
# 6️⃣ ROC Curve
# -------------------------------
# Class probabilities for the test split; column i corresponds to encoded class i.
y_score = rf.predict_proba(X_test)
fpr, tpr, roc_auc = {}, {}, {}

if len(le.classes_) == 2:
    # Binary case: a single curve for encoded class 1, also stored under "micro" so the
    # plotting code below works for both branches.
    fpr[1], tpr[1], _ = roc_curve(y_test, y_score[:, 1])
    roc_auc[1] = auc(fpr[1], tpr[1])
    fpr["micro"], tpr["micro"], roc_auc["micro"] = fpr[1], tpr[1], roc_auc[1]
else:
    # Multi-class case: one-vs-rest curve per class ...
    for i in range(len(le.classes_)):
        fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    # ... micro-average over all (sample, class) pairs ...
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test_bin.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    # ... and macro-average: mean of the per-class curves interpolated on a common FPR grid.
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(len(le.classes_))]))
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(len(le.classes_)):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    mean_tpr /= len(le.classes_)
    fpr["macro"], tpr["macro"] = all_fpr, mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Only the "micro" curve is drawn; the dashed diagonal is the chance line.
plt.figure(figsize=(8,6))
plt.plot(fpr["micro"], tpr["micro"], label=f"Random Forest (AUC={roc_auc['micro']:.2f})")
plt.plot([0,1],[0,1],'k--', label="Random")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve: Random Forest")
plt.legend()
plt.show()

# -------------------------------
# 7️⃣ Feature Importance
# -------------------------------
# Importance of each training column as reported by the fitted forest, highest first.
feat_imp_df = pd.DataFrame({
    "feature": X.columns,
    "importance": rf.feature_importances_
}).sort_values(by="importance", ascending=False)

# Bar chart of the 15 most important features.
plt.figure(figsize=(10,6))
sns.barplot(x="importance", y="feature", data=feat_imp_df.head(15), palette="viridis")
plt.title("Top 15 Feature Importances: Random Forest")
plt.show()


In [None]:
# ===============================
# Step 6: Check 10 Random URLs
# ===============================
import random

# Sample 10 random rows from the dataset
# Sample up to 10 random rows from the dataset (fixed random_state for reproducibility).
sample_df = features_df.sample(n=min(10, len(features_df)), random_state=50).copy()

# features_df only holds the extracted features and the true label, so the prediction
# columns shown below are computed here with the model trained in the previous cell.
X_sample = sample_df[numeric_cols].fillna(0)
sample_proba = rf.predict_proba(X_sample)
sample_df['predicted_label'] = le.inverse_transform(rf.predict(X_sample))
# Column of predict_proba that belongs to the 'malicious' class.
sample_df['predicted_prob_malicious'] = sample_proba[:, le.transform(['malicious'])[0]]

# Select relevant columns to display
display_cols = ['url_original', 'label', 'predicted_label', 'predicted_prob_malicious']

print("🔹 Random 10 URL Predictions:")
print(sample_df[display_cols].reset_index(drop=True))


In [None]:
# ===============================
# Save Random Forest Model & Metadata
# ===============================

import joblib
import os

# Directory to save the model
# Directory that receives the model and its metadata (created when missing).
model_dir = r"D:\QRusaderTrainedModel\zzznewTrainingModel\saved_models"
os.makedirs(model_dir, exist_ok=True)

# Target files: fitted forest, label encoder, and the ordered training column list.
rf_model_path = os.path.join(model_dir, "random_forest_model.pkl")
le_path = os.path.join(model_dir, "label_encoder.pkl")
features_path = os.path.join(model_dir, "feature_columns.pkl")

for artifact, target in ((rf, rf_model_path), (le, le_path), (numeric_cols, features_path)):
    joblib.dump(artifact, target)

print(f"✅ Random Forest model, encoder, and feature columns saved to {model_dir}")


In [None]:
# ===============================
# Predict Safe URLs Using Saved Random Forest
# ===============================

import pandas as pd
import joblib
from urllib.parse import urlparse, urlunparse
import math
import re

# -------------------------------
# 1️⃣ Load saved model & metadata
# -------------------------------
# Folder written by the "Save Random Forest Model & Metadata" cell.
model_dir = r"D:\QRusaderTrainedModel\zzznewTrainingModel\saved_models"

# NOTE(review): joblib files are pickle-based; only load artifacts from a trusted location.
rf = joblib.load(f"{model_dir}/random_forest_model.pkl")
le = joblib.load(f"{model_dir}/label_encoder.pkl")
# Ordered list of the columns the model was trained on.
feature_columns = joblib.load(f"{model_dir}/feature_columns.pkl")

# -------------------------------
# 2️⃣ Helper functions (same as in feature extraction)
# -------------------------------
def normalize_url(url: str) -> str:
    """Return a canonical form of ``url`` used as input for feature extraction.

    Steps: strip surrounding whitespace, prepend ``http://`` when no http/https
    scheme is present, lowercase the scheme and network location, drop a leading
    ``www.`` and use ``/`` when the path is empty. Params, query and fragment are
    kept unchanged.

    Args:
        url: Raw URL string, with or without a scheme.

    Returns:
        The normalised URL string.

    Raises:
        AttributeError: If ``url`` is not a string.
    """
    url = url.strip()
    # The scheme check must be case-insensitive: "HTTP://x" already has a scheme
    # and must not be turned into "http://HTTP://x".
    if not url.lower().startswith(("http://", "https://")):
        url = "http://" + url
    parsed = urlparse(url)
    domain = parsed.netloc.lower()
    if domain.startswith("www."):
        domain = domain[4:]
    path = parsed.path or "/"
    return urlunparse((parsed.scheme.lower(), domain, path, parsed.params, parsed.query, parsed.fragment))

def shannon_entropy(data):
    """Shannon entropy (base 2) of the symbols in ``data``; 0 for empty/falsy input."""
    if not data:
        return 0
    total = len(data)
    frequencies = [float(data.count(symbol)) / total for symbol in set(data)]
    return -sum(freq * math.log2(freq) for freq in frequencies)

# Known URL-shortener hosts. Entries containing a dot are host names; a bare entry
# such as "tinyurl" is matched against any single dot-separated label of the host.
shortening_services = ["bit.ly","tinyurl","goo.gl","t.co","ow.ly","shorte.st","cutt.ly"]
# Keywords counted (as substrings) by contains_suspicious_word.
suspicious_keywords = ["secure","account","login","update","free","bonus","ebayisapi",
                       "banking","confirm","signin","verification"]

def is_shortened(url):
    """Return 1 if the URL's host belongs to a known shortening service, else 0.

    Matching is done on the host name rather than on the whole URL string: a plain
    substring test flags e.g. "microsoft.com" and "reddit.com" because both contain
    "t.co".
    """
    # urlparse only fills in the host when a scheme (or a leading "//") is present.
    target = url if re.match(r"[a-zA-Z][a-zA-Z0-9+.\-]*://", url) else "//" + url
    try:
        host = (urlparse(target).hostname or "").lower()
    except ValueError:
        # Malformed network location (e.g. unbalanced IPv6 brackets).
        return 0
    if host.startswith("www."):
        host = host[4:]
    labels = host.split(".")
    for service in shortening_services:
        if "." in service:
            if host == service or host.endswith("." + service):
                return 1
        elif service in labels:
            return 1
    return 0

def has_ip(url):
    """Return 1 if the URL starts with http(s):// followed by a dotted-quad pattern, else 0."""
    return int(bool(re.match(r"http[s]?://\d+\.\d+\.\d+\.\d+", url)))

def contains_suspicious_word(url):
    """Return how many entries of suspicious_keywords occur in the lowercased URL."""
    return sum(word in url.lower() for word in suspicious_keywords)

# WHOIS caching function (optional if available)
CACHE_FILE = "whois_cache.db"
def get_whois_safe(domain):
    """Cache-only WHOIS lookup used at prediction time.

    Returns ``(has_whois, domain_age_days)`` from the shelve file ``CACHE_FILE``
    when ``domain`` is a key in it, otherwise ``(0, 0)``. No network query is made.
    """
    import shelve
    with shelve.open(CACHE_FILE) as cache:
        entry = cache[domain] if domain in cache else None
    if entry is None:
        return 0, 0  # default if no WHOIS info
    return entry["has_whois"], entry["domain_age_days"]

# -------------------------------
# 3️⃣ Feature extraction for prediction
# -------------------------------
def extract_features_for_prediction(url):
    """Build the numeric feature record for one URL at prediction time.

    Produces the same feature keys as the training-time extractor, without the
    original/normalised URL fields; WHOIS values come from get_whois_safe.
    """
    url_norm = normalize_url(url)
    parts = urlparse(url_norm)
    host = parts.netloc
    url_path = parts.path or "/"

    whois_flag, age_in_days = get_whois_safe(host)

    # Character statistics of the normalised URL
    url_len = len(url_norm)
    special_total = sum(url_norm.count(ch) for ch in "@?-=.!#$&~*%+^_")
    digit_total = sum(ch.isdigit() for ch in url_norm)
    letter_total = sum(ch.isalpha() for ch in url_norm)
    upper_total = sum(1 for ch in url_norm if ch.isupper())

    # Host structure
    subdomains = max(host.count(".") - 1, 0)
    tld = host.split('.')[-1]

    # Path segments and their distinct 2-/3-segment n-grams
    segments = [seg for seg in url_path.split('/') if seg]
    bigram_set = {"_".join(segments[k:k+2]) for k in range(len(segments) - 1)}
    trigram_set = {"_".join(segments[k:k+3]) for k in range(len(segments) - 2)}

    return {
        "url_length": url_len,
        "Shortining_Service": is_shortened(url_norm),
        "having_ip_address": has_ip(url_norm),
        "subdomain_count": subdomains,
        "subdomain_ratio": subdomains / max(1, len(host)),
        "path_depth": url_path.count('/'),
        "path_length": len(url_path),
        "param_count": parts.query.count("="),
        "digit_letter_ratio": digit_total / max(1, letter_total),
        "domain_entropy": shannon_entropy(host),
        "path_entropy": shannon_entropy(url_path),
        "total_special_char": special_total,
        "special_char_ratio": special_total / max(1, url_len),
        "risky_tld": int(tld in ["zip","xyz","top","club","info"]),
        "tld_length": len(tld),
        "suspicious_word_count": contains_suspicious_word(url_norm),
        "url_upper_ratio": upper_total / max(1, url_len),
        "repeated_char_count": sum(url_norm.count(ch*2) for ch in set(url_norm)),
        "path_token_count": len(segments),
        "unique_bigrams": len(bigram_set),
        "unique_trigrams": len(trigram_set),
        "has_whois": whois_flag,
        "domain_age_days": age_in_days
    }

# -------------------------------
# 4️⃣ Predict a list of URLs
# -------------------------------
# URLs to score; add more entries to check several at once.
urls_to_check = [
    "https://drive.google.com/drive/u/1/my-drive"
]


# Extract features
features_list = [extract_features_for_prediction(u) for u in urls_to_check]
# Select the columns in the order stored in feature_columns.pkl.
X_pred = pd.DataFrame(features_list)[feature_columns].fillna(0)

# Predict labels and probabilities
y_pred_encoded = rf.predict(X_pred)
y_pred_labels = le.inverse_transform(y_pred_encoded)
y_pred_probs = rf.predict_proba(X_pred)

# Prepare results
# le.transform([...])[0] gives the predict_proba column index of the named class;
# this requires 'safe' and 'malicious' to be among the encoder's classes.
results_df = pd.DataFrame({
    "url": urls_to_check,
    "predicted_label": y_pred_labels,
    "prob_safe": y_pred_probs[:, le.transform(['safe'])[0]],
    "prob_malicious": y_pred_probs[:, le.transform(['malicious'])[0]]
})

def risk_level(prob_malicious):
    """Map a malicious-class probability to a risk band.

    Below 0.4 -> "Safe"; from 0.4 up to and including 0.7 -> "Medium"; above 0.7 -> "High".
    """
    if prob_malicious < 0.4:
        return "Safe"
    if prob_malicious <= 0.7:
        return "Medium"
    return "High"

# Attach the risk band derived from the malicious-class probability.
results_df['risk_level'] = results_df['prob_malicious'].apply(risk_level)

# -------------------------------
# 6️⃣ Display Results
# -------------------------------
# Report the real number of URLs scored instead of a hard-coded "10".
print(f"🔹 Predictions for {len(results_df)} URL(s) with Risk Levels:")
print(results_df[['url', 'predicted_label', 'prob_safe', 'prob_malicious', 'risk_level']])


In [None]:
# ===============================
# Batch Predict All URLs and Save CSV (Essential Columns Only)
# ===============================

import pandas as pd
import joblib
from urllib.parse import urlparse, urlunparse
import math
import re

# -------------------------------
# 1️⃣ Load saved Random Forest model & metadata
# -------------------------------
# Folder written by the "Save Random Forest Model & Metadata" cell.
model_dir = r"D:\QRusaderTrainedModel\zzznewTrainingModel\saved_models"

# NOTE(review): joblib files are pickle-based; only load artifacts from a trusted location.
rf = joblib.load(f"{model_dir}/random_forest_model.pkl")
le = joblib.load(f"{model_dir}/label_encoder.pkl")
feature_columns = joblib.load(f"{model_dir}/feature_columns.pkl")

# Load your full dataset (the feature table written by the extraction cell); unlabeled rows are dropped.
# NOTE(review): this rebinds features_df and includes the rows the model was trained on.
features_df = pd.read_csv(r"D:\QRusaderTrainedModel\zzznewTrainingModel\csvFiles\url_features_detailed.csv")
features_df = features_df.dropna(subset=['label'])

# -------------------------------
# 2️⃣ Helper functions
# -------------------------------
def normalize_url(url: str) -> str:
    """Return a canonical form of ``url`` used as input for feature extraction.

    Steps: strip surrounding whitespace, prepend ``http://`` when no http/https
    scheme is present, lowercase the scheme and network location, drop a leading
    ``www.`` and use ``/`` when the path is empty. Params, query and fragment are
    kept unchanged.

    Args:
        url: Raw URL string, with or without a scheme.

    Returns:
        The normalised URL string.

    Raises:
        AttributeError: If ``url`` is not a string.
    """
    url = url.strip()
    # The scheme check must be case-insensitive: "HTTP://x" already has a scheme
    # and must not be turned into "http://HTTP://x".
    if not url.lower().startswith(("http://", "https://")):
        url = "http://" + url
    parsed = urlparse(url)
    domain = parsed.netloc.lower()
    if domain.startswith("www."):
        domain = domain[4:]
    path = parsed.path or "/"
    return urlunparse((parsed.scheme.lower(), domain, path, parsed.params, parsed.query, parsed.fragment))

# WHOIS caching
CACHE_FILE = "whois_cache.db"
def get_whois_safe(domain):
    """Cache-only WHOIS lookup.

    Returns ``(has_whois, domain_age_days)`` from the shelve file ``CACHE_FILE``
    when ``domain`` is a key in it, otherwise ``(0, 0)``. No network query is made.
    """
    import shelve
    with shelve.open(CACHE_FILE) as cache:
        entry = cache[domain] if domain in cache else None
    if entry is None:
        return 0, 0
    return entry["has_whois"], entry["domain_age_days"]

# Other helpers same as before (entropy, shortened, IP, suspicious words)
def shannon_entropy(data):
    """Shannon entropy (base 2) of the symbols in ``data``; 0 for empty/falsy input."""
    if not data:
        return 0
    total = len(data)
    frequencies = [float(data.count(symbol)) / total for symbol in set(data)]
    return -sum(freq * math.log2(freq) for freq in frequencies)

# Known URL-shortener hosts. Entries containing a dot are host names; a bare entry
# such as "tinyurl" is matched against any single dot-separated label of the host.
shortening_services = ["bit.ly","tinyurl","goo.gl","t.co","ow.ly","shorte.st","cutt.ly"]
# Keywords counted (as substrings) by contains_suspicious_word.
suspicious_keywords = ["secure","account","login","update","free","bonus","ebayisapi",
                       "banking","confirm","signin","verification"]

def is_shortened(url):
    """Return 1 if the URL's host belongs to a known shortening service, else 0.

    Matching is done on the host name rather than on the whole URL string: a plain
    substring test flags e.g. "microsoft.com" and "reddit.com" because both contain
    "t.co".
    """
    # urlparse only fills in the host when a scheme (or a leading "//") is present.
    target = url if re.match(r"[a-zA-Z][a-zA-Z0-9+.\-]*://", url) else "//" + url
    try:
        host = (urlparse(target).hostname or "").lower()
    except ValueError:
        # Malformed network location (e.g. unbalanced IPv6 brackets).
        return 0
    if host.startswith("www."):
        host = host[4:]
    labels = host.split(".")
    for service in shortening_services:
        if "." in service:
            if host == service or host.endswith("." + service):
                return 1
        elif service in labels:
            return 1
    return 0

def has_ip(url):
    """Return 1 if the URL starts with http(s):// followed by a dotted-quad pattern, else 0."""
    return int(bool(re.match(r"http[s]?://\d+\.\d+\.\d+\.\d+", url)))

def contains_suspicious_word(url):
    """Return how many entries of suspicious_keywords occur in the lowercased URL."""
    return sum(word in url.lower() for word in suspicious_keywords)

# -------------------------------
# 3️⃣ Feature extraction function
# -------------------------------
def extract_features_for_prediction(url):
    """Build the numeric feature record for one URL at prediction time.

    Produces the same feature keys as the training-time extractor, without the
    original/normalised URL fields; WHOIS values come from get_whois_safe.
    """
    url_norm = normalize_url(url)
    parts = urlparse(url_norm)
    host = parts.netloc
    url_path = parts.path or "/"
    whois_flag, age_in_days = get_whois_safe(host)

    # Character statistics of the normalised URL
    url_len = len(url_norm)
    special_total = sum(url_norm.count(ch) for ch in "@?-=.!#$&~*%+^_")
    digit_total = sum(ch.isdigit() for ch in url_norm)
    letter_total = sum(ch.isalpha() for ch in url_norm)
    upper_total = sum(1 for ch in url_norm if ch.isupper())

    # Host structure
    subdomains = max(host.count(".") - 1, 0)
    tld = host.split('.')[-1]

    # Path segments and their distinct 2-/3-segment n-grams
    segments = [seg for seg in url_path.split('/') if seg]
    bigram_set = {"_".join(segments[k:k+2]) for k in range(len(segments) - 1)}
    trigram_set = {"_".join(segments[k:k+3]) for k in range(len(segments) - 2)}

    return {
        "url_length": url_len,
        "Shortining_Service": is_shortened(url_norm),
        "having_ip_address": has_ip(url_norm),
        "subdomain_count": subdomains,
        "subdomain_ratio": subdomains / max(1, len(host)),
        "path_depth": url_path.count('/'),
        "path_length": len(url_path),
        "param_count": parts.query.count("="),
        "digit_letter_ratio": digit_total / max(1, letter_total),
        "domain_entropy": shannon_entropy(host),
        "path_entropy": shannon_entropy(url_path),
        "total_special_char": special_total,
        "special_char_ratio": special_total / max(1, url_len),
        "risky_tld": int(tld in ["zip","xyz","top","club","info"]),
        "tld_length": len(tld),
        "suspicious_word_count": contains_suspicious_word(url_norm),
        "url_upper_ratio": upper_total / max(1, url_len),
        "repeated_char_count": sum(url_norm.count(ch*2) for ch in set(url_norm)),
        "path_token_count": len(segments),
        "unique_bigrams": len(bigram_set),
        "unique_trigrams": len(trigram_set),
        "has_whois": whois_flag,
        "domain_age_days": age_in_days
    }

# -------------------------------
# 4️⃣ Extract features & predict
# -------------------------------
# Features are re-extracted from the original URL strings.
# NOTE(review): features_df already contains these feature columns from the CSV;
# recomputing here uses the cache-only WHOIS lookup defined in this cell.
features_list = [extract_features_for_prediction(u) for u in features_df['url_original']]
X_pred = pd.DataFrame(features_list)[feature_columns].fillna(0)

y_pred_encoded = rf.predict(X_pred)
y_pred_labels = le.inverse_transform(y_pred_encoded)
y_pred_probs = rf.predict_proba(X_pred)

# -------------------------------
# 5️⃣ Prepare output DataFrame with essential columns
# -------------------------------
# le.transform([...])[0] gives the predict_proba column index of the named class;
# this requires 'safe' and 'malicious' to be among the encoder's classes.
output_df = pd.DataFrame({
    "url_original": features_df['url_original'],
    "normalized_url": features_df['normalized_url'],
    "label": features_df['label'],
    "predicted_label": y_pred_labels,
    "prob_safe": y_pred_probs[:, le.transform(['safe'])[0]],
    "prob_malicious": y_pred_probs[:, le.transform(['malicious'])[0]]
})

# Apply risk levels
def get_risk_level(prob_malicious):
    """Map a malicious-class probability to a risk band.

    Below 0.4 -> "Safe"; from 0.4 up to and including 0.7 -> "Medium"; above 0.7 -> "High".
    The lower boundary is exclusive so that a probability of exactly 0.4 is banded
    the same way as by risk_level in the single-URL prediction cell.
    """
    if prob_malicious < 0.4:
        return "Safe"
    if prob_malicious <= 0.7:
        return "Medium"
    return "High"

# Attach the risk band derived from the malicious-class probability.
output_df['risk_level'] = output_df['prob_malicious'].apply(get_risk_level)

# -------------------------------
# 6️⃣ Save new CSV
# -------------------------------
# Written without the index, UTF-8 encoded.
output_file = r"D:\QRusaderTrainedModel\zzznewTrainingModel\csvFiles\predicted_urls.csv"
output_df.to_csv(output_file, index=False, encoding="utf-8")
print(f"✅ Predictions saved at: {output_file}")
