In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from urllib.parse import urlparse
import re
import pickle
import os

In [2]:
# --- Load the dataset ---
# Semicolon-separated CSV, read from the notebook's working directory.
# Later cells read the 'url' and 'isHiddenFraudulent' columns from it.
data = pd.read_csv(
    'HiddenFraudulentURLs.csv',
    sep=';',
)

In [3]:
# --- Feature Engineering ---
def extract_hostname(url):
    """Return the text between the first ``://`` and the next ``/`` in ``url``.

    The capture is purely textual: everything after ``://`` up to (but not
    including) the next ``/`` is returned as-is, so a port, userinfo, or --
    when the URL has no path -- a query string stays part of the result.

    Args:
        url: The URL string to inspect.

    Returns:
        The captured host text, or ``""`` when ``url`` is not a ``str`` or
        contains no ``://`` followed by at least one non-slash character.
    """
    # Explicit guards instead of a catch-all ``except:`` so that
    # KeyboardInterrupt/SystemExit and genuine bugs are never swallowed.
    if not isinstance(url, str):
        return ""
    match = re.search(r'://([^/]+)/?', url)
    return match.group(1) if match else ""

def extract_domain(url):
    """Return the host text of ``url`` with a leading ``www.`` removed.

    Works like a textual capture of everything between the first ``://``
    and the next ``/``, except that a ``www.`` prefix directly after
    ``://`` is skipped.  Ports, userinfo and (for path-less URLs) query
    strings are not stripped.

    Args:
        url: The URL string to inspect.

    Returns:
        The captured domain text, or ``""`` when ``url`` is not a ``str`` or
        contains no ``://`` followed by at least one non-slash character.
    """
    # Explicit guards instead of a catch-all ``except:`` so that
    # KeyboardInterrupt/SystemExit and genuine bugs are never swallowed.
    if not isinstance(url, str):
        return ""
    match = re.search(r'://(?:www\.)?([^/]+)/?', url)
    return match.group(1) if match else ""


In [4]:
def preprocess_url(url):
    """Convert one URL string into a single-row numeric feature array.

    Columns, in order:
        0. character length of the whole URL
        1. total occurrences of the listed special characters
        2. 1 if a dotted-quad digit pattern appears anywhere in the URL, else 0
        3. number of dot-separated netloc labels beyond the first two
        4. 1 if the parsed scheme is exactly ``https``, else 0
        5. how many of the listed suspicious keywords occur (case-insensitive)
        6. length of ``extract_hostname(url)``
        7. length of ``extract_domain(url)``
        8. length of the parsed path component

    Args:
        url: The URL string to featurize.

    Returns:
        A NumPy array of shape ``(1, 9)``.
    """
    total_length = len(url)

    # Every occurrence counts, so "a--b" contributes 2 for '-'.
    symbols = ('@', '-', '_', '=', '?', '&', '%', '$', '!', '*', '(', ')', '[', ']', '{', '}', '|', ';', ':', '"', ',', '<', '>', '/', '\\')
    symbol_total = sum(url.count(symbol) for symbol in symbols)

    # Matched against the whole URL, not only the host part.
    contains_ip = 1 if re.search(r'(\d{1,3}\.){3}\d{1,3}', url) else 0

    parts = urlparse(url)
    # "a.b.example.com" -> 4 labels -> 2 extra; two or fewer labels -> 0.
    extra_labels = max(len(parts.netloc.split('.')) - 2, 0)

    is_https = 1 if parts.scheme == 'https' else 0

    # Each keyword contributes at most 1, however often it appears.
    keywords = ('login', 'secure', 'bank', 'account', 'verify', 'update', 'free', 'gift', 'password', 'confirm', 'click', 'urgent', 'limited')
    lowered = url.lower()
    keyword_hits = sum(1 for keyword in keywords if keyword in lowered)

    hostname_length = len(extract_hostname(url))
    domain_length = len(extract_domain(url))
    path_length = len(parts.path)

    row = [
        total_length,
        symbol_total,
        contains_ip,
        extra_labels,
        is_https,
        keyword_hits,
        hostname_length,
        domain_length,
        path_length,
    ]
    # Nested list gives the (1, n_features) shape directly.
    return np.array([row])

In [5]:
# Apply preprocessing to create features
# preprocess_url returns one (1, n_features) row per URL; stacking the rows
# along axis 0 yields the full (n_samples, n_features) matrix.
X = np.concatenate([preprocess_url(url) for url in data['url']], axis=0)

# Target labels as integers (the original note describes this column as
# boolean, so the cast maps it to 0/1).
y = data['isHiddenFraudulent'].astype(int)


In [6]:
# --- Split data into training and testing sets ---
# 80/20 hold-out split; the fixed seed makes the partition repeatable.
# NOTE(review): the split is not stratified on y -- consider whether
# stratify=y is wanted given the class imbalance; left unchanged here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# --- Scale the features ---
# Learn the scaling statistics from the training rows only, then apply that
# same fitted scaler to both splits so no test-set information leaks in.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [8]:
# --- Train the model ---
# Only the seed is pinned; every other hyperparameter is the library default
# and can be adjusted in the constructor call below.
model = RandomForestClassifier(
    random_state=42,
)
model.fit(X_train_scaled, y_train)

In [9]:
# --- Evaluate the model ---
# Predict on the held-out (scaled) test rows, then print the per-class
# report followed by the overall accuracy.
y_pred = model.predict(X_test_scaled)

report_text = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(report_text)
print("Accuracy:", accuracy)

              precision    recall  f1-score   support

           0       0.98      1.00      0.99     35283
           1       0.89      0.61      0.73      1753

    accuracy                           0.98     37036
   macro avg       0.93      0.80      0.86     37036
weighted avg       0.98      0.98      0.98     37036

Accuracy: 0.9780213845987688


In [10]:
# --- Save the model and scaler ---
MODELS_DIR = "models"  # Directory to save models
os.makedirs(MODELS_DIR, exist_ok=True)  # No error if it already exists

# The scaler is stored next to the model because the model was trained on
# scaled features, so both are needed to score new URLs.
for filename, fitted_object in (
    ('fraudulent_url_model.pkl', model),
    ('fraudulent_url_scaler.pkl', scaler),
):
    with open(os.path.join(MODELS_DIR, filename), 'wb') as f:
        pickle.dump(fitted_object, f)

print("Model and scaler saved successfully!")

Model and scaler saved successfully!
