In [2]:
import pandas as pd
import zipfile
import random
import gc
import io
import re
import math
import tldextract
from urllib.parse import urlparse

In [3]:

N_SAMPLES = 40000

In [None]:

# Add subdomains to ~50% of URLs
def add_subdomain(domain):
    if random.random() < SUBDOMAIN_PROB:
        subdomain = random.choice(BENIGN_SUBDOMAINS)
        return f"{subdomain}.{domain}"
    return domain

# --- Path to datasets ---
tranco_csv_path = '/datasets/phishing-dataset/top-1m.csv'
urlhaus_csv_path = '/datasets/phishing-dataset/urlhaus.csv'
phishtank_csv_path = '/datasets/phishing-dataset/phishtank.csv'
BENIGN_SUBDOMAINS = ['www', 'mail', 'm', 'app', 'api', 'blog', 'news', 'support', 'drive', 'shop', 'login', 'account']
SEED = 42
SUBDOMAIN_PROB = 0.5  # 50% chance to add a subdomain

# --- Load Good URLs from Tranco ---

try:
    # Read CSV directly
    df_tranco = pd.read_csv(tranco_csv_path, names=['rank', 'domain'], header=None)
    
    # Create df_good with required transformations
    df_good = df_tranco[['domain']].copy()

    random.seed(SEED)  
    df_good['domain'] = df_good['domain'].apply(add_subdomain)
    df_good['url'] = 'http://' + df_good['domain']
    
    # Drop null values
    df_good = df_good.dropna()
    # drop duplicates
    df_good = df_good[['url']].drop_duplicates()
    
    # Sample N records with shuffling
    df_good = df_good.sample(n=N_SAMPLES, random_state=42)
    df_good['label'] = 'good'

except FileNotFoundError:
    print(f"Error: Tranco CSV file not found at {tranco_csv_path}")
    raise
except Exception as e:
    print(f"Error reading Tranco file: {str(e)}")
    raise

# --- Load Bad URLs from URLhaus ---

try:
    
    # Read CSV with explicit header
    with open(urlhaus_csv_path, 'r') as f:
        lines = [line for line in f if not line.startswith('#') and line.strip()]
       
    df_urlhaus = pd.read_csv(io.StringIO(''.join(lines)), low_memory=False, header=0)
    
    # Select the 'url' column if it exists, otherwise fall back to index
    if 'url' in df_urlhaus.columns:
        df_urlhaus = df_urlhaus[['url']].dropna()
    else:
        print("Warning: 'url' column not found. Falling back to index-based selection.")
        url_column_index = 2  # Based on previous output
        if url_column_index < len(df_urlhaus.columns):
            df_urlhaus = df_urlhaus.iloc[:, [url_column_index]].rename(columns={df_urlhaus.columns[url_column_index]: 'url'}).dropna()
        else:
            raise ValueError(f"URL column index {url_column_index} is out of range. Available columns: {df_urlhaus.columns.tolist()}")

except FileNotFoundError:
    print(f"Error: URLhaus CSV file not found at {urlhaus_csv_path}")
    raise
except Exception as e:
    print(f"Error reading URLhaus file: {str(e)}")
    raise

# --- Load Bad URLs from PhishTank ---

try:
    df_phishtank = pd.read_csv(phishtank_csv_path)
    if 'url' in df_phishtank.columns:
        df_phishtank = df_phishtank[['url']].dropna()
    else:
        raise ValueError("PhishTank CSV does not have a 'url' column.")

except FileNotFoundError:
    print(f"Error: PhishTank CSV file not found at {phishtank_csv_path}")
    raise
except Exception as e:
    print(f"Error reading PhishTank file: {str(e)}")
    raise

# --- Combine, shuffle, and select N bad URLs ---
try:
    df_bad = pd.concat([df_urlhaus, df_phishtank], ignore_index=True)
    df_bad = df_bad.drop_duplicates().sample(n=N_SAMPLES, random_state=42)
    df_bad['label'] = 'bad'

except Exception as e:
    print(f"Error combining bad URLs: {str(e)}")
    raise

# --- Free up memory ---
del df_tranco, df_urlhaus, df_phishtank
gc.collect()

# --- Merge good and bad DataFrames ---
try:
    df_all = pd.concat([df_good, df_bad], ignore_index=True)
    df_all = df_all.sample(frac=1, random_state=42).reset_index(drop=True)  # Shuffle the merged DataFrame

    # Print results
    print(df_all.head())
    print(df_all['label'].value_counts())

except Exception as e:
    print(f"Error merging DataFrames: {str(e)}")
    raise

                                                 url label
0                     https://pllsadosaod.pages.dev/   bad
1                             https://bit.ly/4cLJHjK   bad
2                       https://nongenqa.weebly.com/   bad
3  https://docs.google.com/presentation/d/e/2PACX...   bad
4                  http://202.4.110.130:35612/Mozi.m   bad
label
bad     40000
good    40000
Name: count, dtype: int64


In [None]:
SUSPICIOUS_KEYWORDS = [
    'login', 'update', 'free', 'verify', 'secure', 'account', 'bank', 'confirm', 'password', 'signin', 'pay', 'payment'
]
URL_SHORTENERS = [
    'bit.ly', 'goo.gl', 'tinyurl.com', 'ow.ly', 't.co', 'is.gd', 'buff.ly', 'adf.ly', 'bit.do', 'cutt.ly', 'shorte.st'
]
PHISHY_TLDS = ['.xyz', '.top', '.ru', '.tk', '.ml', '.ga', '.cf', '.gq']

# --- Helper functions for each feature ---
def url_length(url):
    return len(url)

def count_dots(url):
    return url.count('.')

def count_subdomains(url):
    ext = tldextract.extract(url)
    if ext.subdomain == '':
        return 0
    return len(ext.subdomain.split('.'))

def has_ip_address(url):
    ipv4 = re.search(r'://(\d{1,3}\.){3}\d{1,3}([/:]|$)', url)
    ipv6 = re.search(r'://\[[0-9a-fA-F:]+\]', url)
    return int(bool(ipv4 or ipv6))

def has_suspicious_keywords(url):
    url_lower = url.lower()
    return int(any(word in url_lower for word in SUSPICIOUS_KEYWORDS))

def count_special_chars(url):
    chars = '-@=_/?&%#'
    return sum(url.count(c) for c in chars)

def has_https(url):
    return int(url.lower().startswith('https://'))

def url_entropy(url):
    prob = [float(url.count(c)) / len(url) for c in set(url)]
    entropy = - sum([p * math.log2(p) for p in prob])
    return entropy

def get_tld(url):
    ext = tldextract.extract(url)
    return '.' + ext.suffix if ext.suffix else ''

def tld_is_phishy(url):
    tld = get_tld(url)
    return int(tld in PHISHY_TLDS)

def path_length(url):
    parsed = urlparse(url)
    return len(parsed.path)

def path_level(url):
    parsed = urlparse(url)
    return len([p for p in parsed.path.split('/') if p])

def uses_url_shortener(url):
    netloc = urlparse(url).netloc.lower()
    return int(any(shortener in netloc for shortener in URL_SHORTENERS))

def has_homograph(url):
    try:
        url.encode('ascii')
        return 0
    except UnicodeEncodeError:
        return 1

def has_multiple_slash_after_domain(url):
    match = re.search(r'^[a-z]+://[^/]+//', url)
    return int(bool(match))

def https_in_hostname(url):
    netloc = urlparse(url).netloc.lower()
    return int('https' in netloc)

def count_numeric_chars(url):
    return sum(c.isdigit() for c in url)

def query_length(url):
    parsed = urlparse(url)
    return len(parsed.query)

def query_component_count(url):
    parsed = urlparse(url)
    if parsed.query == '':
        return 0
    return len(parsed.query.split('&'))

def brand_in_subdomain_or_path(url):
    ext = tldextract.extract(url)
    subdomain = ext.subdomain.lower()
    path = urlparse(url).path.lower()
    for brand in SUSPICIOUS_KEYWORDS:
        if brand in subdomain or brand in path:
            return 1
    return 0

def unusual_subdomains(url):
    return int(count_subdomains(url) > 2)

def extract_lexical_features(df, url_col='url'):
    df_feat = df.copy()
    df_feat['url_length'] = df_feat[url_col].apply(url_length)
    df_feat['num_dots'] = df_feat[url_col].apply(count_dots)
    df_feat['num_subdomains'] = df_feat[url_col].apply(count_subdomains)
    df_feat['has_ip'] = df_feat[url_col].apply(has_ip_address)
    df_feat['has_suspicious_keywords'] = df_feat[url_col].apply(has_suspicious_keywords)
    df_feat['special_char_count'] = df_feat[url_col].apply(count_special_chars)
    df_feat['url_entropy'] = df_feat[url_col].apply(url_entropy)
    df_feat['tld'] = df_feat[url_col].apply(get_tld)
    df_feat['tld_is_phishy'] = df_feat[url_col].apply(tld_is_phishy)
    df_feat['path_level'] = df_feat[url_col].apply(path_level)
    df_feat['uses_shortener'] = df_feat[url_col].apply(uses_url_shortener)
    df_feat['has_homograph'] = df_feat[url_col].apply(has_homograph)
    df_feat['multiple_slash_after_domain'] = df_feat[url_col].apply(has_multiple_slash_after_domain)
    df_feat['https_in_hostname'] = df_feat[url_col].apply(https_in_hostname)
    df_feat['numeric_char_count'] = df_feat[url_col].apply(count_numeric_chars)
    df_feat['query_length'] = df_feat[url_col].apply(query_length)
    df_feat['query_component_count'] = df_feat[url_col].apply(query_component_count)
    df_feat['brand_in_subdomain_or_path'] = df_feat[url_col].apply(brand_in_subdomain_or_path)
    df_feat['unusual_subdomains'] = df_feat[url_col].apply(unusual_subdomains)
    return df_feat

In [None]:
df = extract_lexical_features(df_all)

In [None]:
print(df.head())

                                                 url label  url_length  \
0                     https://pllsadosaod.pages.dev/   bad          30   
1                             https://bit.ly/4cLJHjK   bad          22   
2                       https://nongenqa.weebly.com/   bad          28   
3  https://docs.google.com/presentation/d/e/2PACX...   bad         190   
4                  http://202.4.110.130:35612/Mozi.m   bad          33   

   num_dots  num_subdomains  has_ip  has_suspicious_keywords  \
0         2               1       0                        0   
1         1               0       0                        0   
2         2               1       0                        0   
3         3               1       0                        0   
4         4               0       1                        0   

   special_char_count  url_entropy   tld  ...  path_level  uses_shortener  \
0                   3     3.672906  .dev  ...           0               0   
1               

Random Forest Accuracy: 0.968
Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.97      0.96      0.97      7916
           1       0.96      0.97      0.97      8084

    accuracy                           0.97     16000
   macro avg       0.97      0.97      0.97     16000
weighted avg       0.97      0.97      0.97     16000

Confusion Matrix for Random Forest:
[[7610  306]
 [ 206 7878]]
Feature Importances for Random Forest:
special_char_count             0.455936
path_level                     0.166501
url_length                     0.142702
url_entropy                    0.134867
numeric_char_count             0.045470
num_dots                       0.015351
num_subdomains                 0.011961
query_component_count          0.005451
has_ip                         0.005273
tld_is_phishy                  0.004799
has_suspicious_keywords        0.004139
uses_shortener                 0.002886
brand_in_subdomain_o

In [None]:
# # --- Model Building ---

# # Import necessary libraries
# from sklearn.model_selection import train_test_split, GridSearchCV
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# from catboost import CatBoostClassifier
# import pandas as pd

# # Prepare features and labels
# feature_cols = [col for col in df.columns if col not in ['url', 'label', 'tld']]
# X = df[feature_cols]
# y = df['label'].map({'good': 0, 'bad': 1})

# # Split data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # --- Random Forest Model ---
# rf = RandomForestClassifier(n_estimators=100, random_state=42)
# rf.fit(X_train, y_train)
# y_pred_rf = rf.predict(X_test)
# print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
# print("Classification Report for Random Forest:")
# print(classification_report(y_test, y_pred_rf))
# print("Confusion Matrix for Random Forest:")
# print(confusion_matrix(y_test, y_pred_rf))

# # Feature importance for Random Forest
# importances_rf = rf.feature_importances_
# feature_importance_rf = pd.Series(importances_rf, index=feature_cols).sort_values(ascending=False)
# print("Feature Importances for Random Forest:")
# print(feature_importance_rf)

# # --- CatBoost Model ---
# cb = CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=6, random_state=42, verbose=0)
# cb.fit(X_train, y_train)
# y_pred_cb = cb.predict(X_test)
# print("CatBoost Accuracy:", accuracy_score(y_test, y_pred_cb))
# print("Classification Report for CatBoost:")
# print(classification_report(y_test, y_pred_cb))
# print("Confusion Matrix for CatBoost:")
# print(confusion_matrix(y_test, y_pred_cb))

# # Feature importance for CatBoost
# importances_cb = cb.get_feature_importance()
# feature_importance_cb = pd.Series(importances_cb, index=feature_cols).sort_values(ascending=False)
# print("Feature Importances for CatBoost:")
# print(feature_importance_cb)

# # --- Optional: CatBoost with TLD as categorical feature ---
# feature_cols_with_tld = [col for col in df.columns if col not in ['url', 'label']]
# X_with_tld = df[feature_cols_with_tld]
# cat_features = ['tld']
# X_train_tld, X_test_tld, y_train_tld, y_test_tld = train_test_split(X_with_tld, y, test_size=0.2, random_state=42)
# cb_with_tld = CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=6, random_state=42, cat_features=cat_features, verbose=0)
# cb_with_tld.fit(X_train_tld, y_train_tld)
# y_pred_cb_tld = cb_with_tld.predict(X_test_tld)
# print("CatBoost with TLD Accuracy:", accuracy_score(y_test_tld, y_pred_cb_tld))
# print("Classification Report for CatBoost with TLD:")
# print(classification_report(y_test_tld, y_pred_cb_tld))
# print("Confusion Matrix for CatBoost with TLD:")
# print(confusion_matrix(y_test_tld, y_pred_cb_tld))

# # --- Hyperparameter Tuning for Random Forest ---
# param_grid_rf = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [10, 20],
#     'min_samples_split': [2, 5]
# }
# grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=5, scoring='accuracy')
# grid_search_rf.fit(X_train, y_train)
# print("Best Parameters for Random Forest:", grid_search_rf.best_params_)
# print("Best Cross-Validation Accuracy for Random Forest:", grid_search_rf.best_score_)
# best_rf = grid_search_rf.best_estimator_
# y_pred_best_rf = best_rf.predict(X_test)
# print("Tuned Random Forest Accuracy:", accuracy_score(y_test, y_pred_best_rf))
# print("Classification Report for Tuned Random Forest:")
# print(classification_report(y_test, y_pred_best_rf))

# # --- Hyperparameter Tuning for CatBoost ---
# param_grid_cb = {
#     'iterations': [500, 1000],
#     'learning_rate': [0.01, 0.1],
#     'depth': [4, 6]
# }
# grid_search_cb = GridSearchCV(CatBoostClassifier(random_state=42, verbose=0), param_grid_cb, cv=5, scoring='accuracy')
# grid_search_cb.fit(X_train, y_train)
# print("Best Parameters for CatBoost:", grid_search_cb.best_params_)
# print("Best Cross-Validation Accuracy for CatBoost:", grid_search_cb.best_score_)
# best_cb = grid_search_cb.best_estimator_
# y_pred_best_cb = best_cb.predict(X_test)
# print("Tuned CatBoost Accuracy:", accuracy_score(y_test, y_pred_best_cb))
# print("Classification Report for Tuned CatBoost:")
# print(classification_report(y_test, y_pred_best_cb))

# # for performance metrics consider accuracy, preceision, recall training time 

In [None]:
print(X_test)
cc = rf.predict(X_test)
print(cc)
dd = cb.predict(X_test)
print(dd)

       url_length  num_dots  num_subdomains  has_ip  has_suspicious_keywords  \
47044          15         1               0       0                        0   
44295          34         4               0       1                        0   
74783          24         2               1       0                        0   
70975          58         2               1       0                        0   
46645          26         2               1       0                        0   
...           ...       ...             ...     ...                      ...   
67666          38         2               1       0                        0   
51146          24         2               1       0                        0   
42494          13         1               0       0                        0   
52517          37         4               0       1                        0   
7754           23         1               0       0                        0   

       special_char_count  url_entropy 

In [None]:
# --- Model Building ---

# Import necessary libraries
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from catboost import CatBoostClassifier
import pandas as pd
from scipy.stats import randint, uniform

print('a')
# Prepare features and labels
feature_cols = [col for col in df.columns if col not in ['url', 'label', 'tld']]
print('a')
X = df[feature_cols]
print('a')
y = df['label'].map({'good': 0, 'bad': 1})

print('a')
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('a')

# --- Random Forest Model ---
print('a')
rf = RandomForestClassifier(n_estimators=100, random_state=42)
print('a')
rf.fit(X_train, y_train)
print('a')
y_pred_rf = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report for Random Forest:")
print(classification_report(y_test, y_pred_rf))
print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_test, y_pred_rf))

# --- Decision Tree Model ---
from sklearn.tree import DecisionTreeClassifier
print('a')
dt = DecisionTreeClassifier(max_depth=20, min_samples_split=5,random_state=42)
print('a')
dt.fit(X_train, y_train)
print('a')
y_pred_dt = dt.predict(X_test)
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Classification Report for Decision Tree:")
print('a')
print(classification_report(y_test, y_pred_dt))
print('a')
print("Confusion Matrix for Decision Tree:")
print(confusion_matrix(y_test, y_pred_dt))

# Feature importance for Decision Tree
importances_dt = dt.feature_importances_
feature_importance_dt = pd.Series(importances_dt, index=feature_cols).sort_values(ascending=False)
print("Feature Importances for Decision Tree:")
print(feature_importance_dt)


# Feature importance for Random Forest
importances_rf = rf.feature_importances_
feature_importance_rf = pd.Series(importances_rf, index=feature_cols).sort_values(ascending=False)
print("Feature Importances for Random Forest:")
print(feature_importance_rf)

# --- CatBoost Model ---
cb = CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=6, random_state=42, verbose=0)
cb.fit(X_train, y_train)
y_pred_cb = cb.predict(X_test)
print("CatBoost Accuracy:", accuracy_score(y_test, y_pred_cb))
print("Classification Report for CatBoost:")
print(classification_report(y_test, y_pred_cb))
print("Confusion Matrix for CatBoost:")
print(confusion_matrix(y_test, y_pred_cb))

# Feature importance for CatBoost
importances_cb = cb.get_feature_importance()
feature_importance_cb = pd.Series(importances_cb, index=feature_cols).sort_values(ascending=False)
print("Feature Importances for CatBoost:")
print(feature_importance_cb)

# --- Optional: CatBoost with TLD as categorical feature ---
feature_cols_with_tld = [col for col in df.columns if col not in ['url', 'label']]
X_with_tld = df[feature_cols_with_tld]
cat_features = ['tld']
X_train_tld, X_test_tld, y_train_tld, y_test_tld = train_test_split(X_with_tld, y, test_size=0.2, random_state=42)
cb_with_tld = CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=6, random_state=42, cat_features=cat_features, verbose=0)
cb_with_tld.fit(X_train_tld, y_train_tld)
y_pred_cb_tld = cb_with_tld.predict(X_test_tld)
print("CatBoost with TLD Accuracy:", accuracy_score(y_test_tld, y_pred_cb_tld))
print("Classification Report for CatBoost with TLD:")
print(classification_report(y_test_tld, y_pred_cb_tld))
print("Confusion Matrix for CatBoost with TLD:")
print(confusion_matrix(y_test_tld, y_pred_cb_tld))

# --- Hyperparameter Tuning for Random Forest using Randomized Search ---
param_dist_rf = {
    'n_estimators': randint(50, 300),          # Random integers between 50-299
    'max_depth': randint(10, 30),              # Random integers between 10-29  
    'min_samples_split': randint(2, 10),       # Random integers between 2-9
    'min_samples_leaf': randint(1, 5),         # Additional parameter for more exploration
    'max_features': ['sqrt', 'log2', None]     # Categorical choices
}

random_search_rf = RandomizedSearchCV(
    RandomForestClassifier(random_state=42), 
    param_distributions=param_dist_rf, 
    n_iter=5,                    # Number of parameter combinations to try
    cv=5, 
    scoring='accuracy',
    random_state=42,
    n_jobs=-1                     # Use all available cores
)
random_search_rf.fit(X_train, y_train)
print("Best Parameters for Random Forest:", random_search_rf.best_params_)
print("Best Cross-Validation Accuracy for Random Forest:", random_search_rf.best_score_)
best_rf = random_search_rf.best_estimator_
y_pred_best_rf = best_rf.predict(X_test)
print("Tuned Random Forest Accuracy:", accuracy_score(y_test, y_pred_best_rf))
print("Classification Report for Tuned Random Forest:")
print(classification_report(y_test, y_pred_best_rf))

# --- Hyperparameter Tuning for CatBoost using Randomized Search ---
param_dist_cb = {
    'iterations': randint(300, 1500),          # Random integers between 300-1499
    'learning_rate': uniform(0.01, 0.19),      # Uniform distribution between 0.01-0.2
    'depth': randint(3, 10),                   # Random integers between 3-9
    'l2_leaf_reg': uniform(1, 9),              # L2 regularization between 1-10
    'border_count': randint(32, 256)           # Additional parameter for exploration
}

random_search_cb = RandomizedSearchCV(
    CatBoostClassifier(random_state=42, verbose=0), 
    param_distributions=param_dist_cb, 
    n_iter=5,                    # Number of parameter combinations to try
    cv=5, 
    scoring='accuracy',
    random_state=42,
    n_jobs=-1                     # Use all available cores
)
random_search_cb.fit(X_train, y_train)
print("Best Parameters for CatBoost:", random_search_cb.best_params_)
print("Best Cross-Validation Accuracy for CatBoost:", random_search_cb.best_score_)
best_cb = random_search_cb.best_estimator_
y_pred_best_cb = best_cb.predict(X_test)
print("Tuned CatBoost Accuracy:", accuracy_score(y_test, y_pred_best_cb))
print("Classification Report for Tuned CatBoost:")
print(classification_report(y_test, y_pred_best_cb))


# --- Hyperparameter Tuning for Decision Tree using Randomized Search 

param_dist_dt = {
  'max_depth': randint(5, 30),               # Random integers between 5-29
  'min_samples_split': randint(2, 20),       # Random integers between 2-19
  'min_samples_leaf': randint(1, 10),        # Random integers between 1-9
  'max_features': ['sqrt', 'log2', None],    # Categorical choices
  'criterion': ['gini', 'entropy']           # Splitting criteria
}

random_search_dt = RandomizedSearchCV(
  DecisionTreeClassifier(random_state=42),
  param_distributions=param_dist_dt,
  n_iter=50,                    # Number of parameter combinations to try
  cv=5,
  scoring='accuracy',
  random_state=42,
  n_jobs=-1                     # Use all available cores
)
random_search_dt.fit(X_train, y_train)
print("Best Parameters for Decision Tree:",
random_search_dt.best_params_)
print("Best Cross-Validation Accuracy for Decision Tree:",
random_search_dt.best_score_)
best_dt = random_search_dt.best_estimator_
y_pred_best_dt = best_dt.predict(X_test)
print("Tuned Decision Tree Accuracy:", accuracy_score(y_test,
y_pred_best_dt))
print("Classification Report for Tuned Decision Tree:")
print(classification_report(y_test, y_pred_best_dt))

# for performance metrics consider accuracy, precision, recall training time

In [None]:


def predict_url(url, rf_model, cb_model, cb_tld_model):
    """Predict if a URL is malicious using trained Random Forest and CatBoost models, returning probabilities as percentages."""
    try:
        # Add protocol if missing (consistent with Tranco preprocessing)
        if not url.startswith(('http://', 'https://')):
            url = 'http://' + url
        
        # Extract features using existing logic
        df_features = extract_lexical_features(pd.DataFrame({'url': [url]}))
        
        # Feature columns for Random Forest and CatBoost (without TLD)
        feature_cols = [col for col in df_features.columns if col not in ['url', 'tld']]
        X = df_features[feature_cols]
        
        # Feature columns for CatBoost with TLD
        feature_cols_tld = [col for col in df_features.columns if col not in ['url']]
        X_tld = df_features[feature_cols_tld]
        
        # Make predictions
        rf_pred = rf_model.predict(X)[0]
        rf_prob = rf_model.predict_proba(X)[0][1] * 100  # Convert to percentage
        cb_pred = cb_model.predict(X)[0]
        cb_prob = cb_model.predict_proba(X)[0][1] * 100
        cb_tld_pred = cb_tld_model.predict(X_tld)[0]
        cb_tld_prob = cb_tld_model.predict_proba(X_tld)[0][1] * 100
        
        # Map predictions to labels
        label_map = {0: 'good', 1: 'bad'}
        rf_label = label_map[rf_pred]
        cb_label = label_map[cb_pred]
        cb_tld_label = label_map[cb_tld_pred]
        
        # Return results
        results = {
            'url': url,
            'random_forest': {
                'prediction': rf_label,
                'malicious_probability_percent': float(rf_prob)
            },
            'catboost': {
                'prediction': cb_label,
                'malicious_probability_percent': float(cb_prob)
            },
            'catboost_with_tld': {
                'prediction': cb_tld_label,
                'malicious_probability_percent': float(cb_tld_prob)
            }
        }
        return results
    
    except Exception as e:
        print(f"Error predicting URL {url}: {str(e)}")
        raise


# --- we will use this url to test with first ---
urls = [
    'www.google.com',
    'google.com',
    'docs.google.com',
    'https://google.com',
    'http://google.com',
    'http://www.google.com',
    'eportal.sec.gov.ng',
    'mail.sec.gov.ng',
    'sec.gov.ng',
    'www.sec.gov.ng',
    'http://sec.gov.ng',
    'https://sec.gov.ng',
    'https://www.sec.gov.ng',
    'fgacebook.com', 
    'https://fr-relais-lockers.com/check/calcul.php', 
    'http://56b5d4xg6fwn6hfn45hfn56h.dyndns.org', 
    'https://dlkzduilddnduoio6578.cfolks.pl/ai/auth/log.php',
    'tiktokv.com',
    'e2ro.com',
    'westbrookhistoricalsociety.org'
]
for url in urls:
    result = predict_url(url, rf, cb, cb_with_tld)
    print(f"\nPredictions for {result['url']}:")
    print(f"Random Forest: {result['random_forest']['prediction']} (Malicious Probability: {result['random_forest']['malicious_probability_percent']:.2f}%)")
    print(f"CatBoost: {result['catboost']['prediction']} (Malicious Probability: {result['catboost']['malicious_probability_percent']:.2f}%)")
    print(f"CatBoost with TLD: {result['catboost_with_tld']['prediction']} (Malicious Probability: {result['catboost_with_tld']['malicious_probability_percent']:.2f}%)")


Predictions for http://www.google.com:
Random Forest: good (Malicious Probability: 0.00%)
CatBoost: good (Malicious Probability: 0.14%)
CatBoost with TLD: good (Malicious Probability: 0.06%)

Predictions for http://google.com:
Random Forest: good (Malicious Probability: 1.28%)
CatBoost: good (Malicious Probability: 0.60%)
CatBoost with TLD: good (Malicious Probability: 0.96%)

Predictions for http://docs.google.com:
Random Forest: good (Malicious Probability: 4.73%)
CatBoost: good (Malicious Probability: 0.59%)
CatBoost with TLD: good (Malicious Probability: 0.12%)

Predictions for https://google.com:
Random Forest: good (Malicious Probability: 0.00%)
CatBoost: good (Malicious Probability: 0.36%)
CatBoost with TLD: good (Malicious Probability: 0.49%)

Predictions for http://google.com:
Random Forest: good (Malicious Probability: 1.28%)
CatBoost: good (Malicious Probability: 0.60%)
CatBoost with TLD: good (Malicious Probability: 0.96%)

Predictions for http://www.google.com:
Random For

In [None]:
# --- Save Trained Models ---
import joblib

# Save Random Forest model
joblib.dump(rf, 'rf_model.joblib')

# Save CatBoost models
cb.save_model('cb_model.cbm')
cb_with_tld.save_model('cb_tld_model.cbm')

print("Models saved as rf_model.joblib, cb_model.cbm, cb_tld_model.cbm")

Models saved as rf_model.joblib, cb_model.cbm, cb_tld_model.cbm
