In [1]:
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from cuml.svm import SVC as CumlSVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import urllib.parse
import math 
from collections import Counter
import numpy as np
import re

In [2]:
# Load the two raw URL datasets.
data1 = pd.read_csv('urldata.csv')
data2 = pd.read_csv('malicious_phish.csv')

# Collapse the second dataset's attack categories into a single 'malicious' class.
mapping = {'phishing' : 'malicious', 'defacement' : 'malicious','malware' : 'malicious',}
data2['type'] = data2['type'].replace(mapping)

# Encode the class as 0/1. Series.map performs a plain dictionary lookup, which
# avoids the pandas FutureWarning about implicit downcasting in `replace`.
# A label missing from mapping2 becomes NaN, so astype(int) still fails loudly.
mapping2 = {'malicious': 1, 'benign': 0}
data2['result'] = data2['type'].map(mapping2).astype(int)

# Combine cleanly: drop the columns that are not needed, stack both frames,
# then remove duplicate rows and rows with missing values.
# NOTE(review): drop_duplicates() compares whole rows, so a URL that appears
# with both labels would survive as two rows -- confirm that is acceptable.
data1 = data1.drop(columns=['Unnamed: 0', 'label'])
data2 = data2.drop(columns=['type'])
data = (
    pd.concat([data1, data2], ignore_index=True)
    .drop_duplicates()
    .dropna()
    .reset_index(drop=True)
)

# Undersampling: resample on the raw URL column before any feature is derived,
# then glue the sampled URLs and labels back into one frame.
rus = RandomUnderSampler(random_state=42)
x_url = data[['url']]
y = data['result']
x_resampled, y_resampled = rus.fit_resample(x_url, y)
data = pd.concat([x_resampled, y_resampled], axis=1).reset_index(drop=True)

# Features:

# 1: Length of URL (number of characters)
data['char'] = data['url'].str.len()

# 2: Number of Query Parameters
def count_query_params(url):
    """Count the '&'-separated segments in the query string of ``url``.

    Args:
        url: URL string to inspect.

    Returns:
        int: 0 when the URL has no query string or cannot be parsed;
        otherwise the number of pieces obtained by splitting the query on
        '&'. Empty pieces (e.g. from a trailing '&') are counted as well.
    """
    try:
        query_string = urllib.parse.urlparse(url).query
        if not query_string:
            return 0
        return len(query_string.split('&'))
    except (ValueError, TypeError, AttributeError):
        # ValueError: malformed URL (for example unbalanced IPv6 brackets).
        # TypeError / AttributeError: non-string input such as NaN or bytes.
        # Only these are treated as "no parameters"; a bare ``except`` would
        # also swallow KeyboardInterrupt and make a long .apply() unstoppable.
        return 0
# Evaluate count_query_params on every URL and store the result as an integer feature column.
data['queries'] = data['url'].apply(count_query_params).astype(int)

#3 Number of Subdomains
def count_subdomains(url):
    """Count the dots in the host name of ``url``.

    The host is taken from ``urlparse(...).hostname``, so any
    ``user:password@`` prefix and ``:port`` suffix are excluded from the
    count. URLs that do not start with a ``scheme://`` prefix get
    ``http://`` prepended first, because urlparse only recognises the host
    when a scheme separator is present.

    Args:
        url: URL string, with or without a scheme.

    Returns:
        int: number of '.' characters in the host name; 0 when the URL has
        no host or cannot be parsed.
    """
    try:
        # Accept any scheme in any letter case (HTTP://, ftp://, ...) instead
        # of only lowercase http/https, so no second scheme is prepended.
        if not re.match(r'[a-zA-Z][a-zA-Z0-9+.\-]*://', url):
            url = 'http://' + url
        hostname = urllib.parse.urlparse(url).hostname
    except (ValueError, TypeError, AttributeError):
        # Malformed URL or non-string input: treat as "no host".
        # Narrow on purpose so KeyboardInterrupt is never swallowed.
        return 0
    if not hostname:
        return 0
    return hostname.count('.')
# Evaluate count_subdomains on every URL; the returned counts become the 'num_subdomains' feature.
data['num_subdomains'] = data['url'].apply(count_subdomains)

# 4: Shannon Entropy of the URL
def shannon(url):
    """Return the Shannon entropy (in bits) of the characters in ``url``.

    Args:
        url: string whose character distribution is measured.

    Returns:
        0 for an empty/falsy input; otherwise the negated sum of
        p * log2(p) over the relative frequency p of each distinct character.
    """
    if not url:
        return 0
    total = len(url)
    # Relative frequency of every distinct character.
    shares = [hits / total for hits in Counter(url).values()]
    return -sum(share * math.log2(share) for share in shares)
# Evaluate shannon on every URL and store the result as the 'entropy' feature.
data['entropy'] = data['url'].apply(shannon)

#5 Number of Delimiters in the URL
def count_delimiters(url):
    """Return how many slashes, hyphens and periods occur in ``url``.

    Args:
        url: URL string to scan.

    Returns:
        int: combined number of '/', '-' and '.' characters.
    """
    return sum(map(url.count, ('/', '-', '.')))
# Evaluate count_delimiters on every URL and store the result as an integer feature column.
data['num_delimiters'] = data['url'].apply(count_delimiters).astype(int)


def count_special_chars(url):
    """Return the number of characters in ``url`` outside a-z, A-Z and 0-9.

    Args:
        url: URL string to scan.

    Returns:
        int: count of characters that are not ASCII letters or digits
        (punctuation, whitespace and non-ASCII characters all count).
    """
    matches = re.finditer(r'[^a-zA-Z0-9]', url)
    return sum(1 for _ in matches)
# Evaluate count_special_chars on every URL and store the result as an integer feature column.
data['num_special'] = data['url'].apply(count_special_chars).astype(int)

  data2['result'] = data2['type'].replace(mapping2).astype(int)


In [3]:
# 80/10/10 split: hold out 20% of the rows, then cut that hold-out in half
# to obtain the validation and test partitions.
feature_columns = ['char', 'queries', 'num_subdomains', 'entropy', 'num_delimiters', 'num_special']
x = data[feature_columns]
y = data['result']

# Both splits are stratified on the label and use the same fixed seed.
x_train, x_temp, y_train, y_temp = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)
x_val, x_test, y_val, y_test = train_test_split(
    x_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)
print(f"Train samples: {len(x_train)}, Validation samples: {len(x_val)}, Test samples: {len(x_test)}")

Train samples: 506254, Validation samples: 63282, Test samples: 63282


In [4]:
# --- Scaling (Essential for RBF Kernel) ---
# The scaler learns its statistics from the training partition only; the
# validation and test partitions are transformed with those same statistics.
scaler = StandardScaler()
scaler.fit(x_train)

x_train_scaled, x_val_scaled, x_test_scaled = (
    scaler.transform(partition) for partition in (x_train, x_val, x_test)
)

In [None]:
# --- GPU-Accelerated SVC RBF Kernel Grid Tuning (C and Gamma) ---

# C (regularization) is pinned to a single value; only gamma (kernel width) is swept.
C_val = 1.0  # author's note: 1.0 scored very close to other values and is much faster
gamma_values = [10, 20, 30, 40, 50]

# Best configuration seen so far, judged by macro-averaged F1 on the validation split.
best_macro_f1, best_c, best_gamma = 0, 0, 0
print(f"\n--- SVC RBF Grid Tuning (C: {C_val}, Gamma: {gamma_values}) ---")


for gamma_val in gamma_values:
    # Train one RBF model per candidate gamma on the scaled training data.
    rbf_svm = CumlSVC(
        kernel='rbf',
        C=C_val,
        gamma=gamma_val,
        random_state=42,
        cache_size=4096,
    )
    rbf_svm.fit(X=x_train_scaled, y=y_train)

    # Score the candidate on the validation split.
    val_pred = rbf_svm.predict(x_val_scaled)
    val_accuracy = accuracy_score(y_val, val_pred)
    report = classification_report(y_val, val_pred, output_dict=True, zero_division=0)
    val_macro_f1 = report['macro avg']['f1-score']

    print(f"Results (C={C_val}, Gamma={gamma_val}): Acc={val_accuracy:.4f}, F1={val_macro_f1:.4f}")

    # Strict '>' keeps the earliest candidate when two scores tie.
    if val_macro_f1 > best_macro_f1:
        best_macro_f1, best_c, best_gamma = val_macro_f1, C_val, gamma_val

print(f"\nBest Hyperparameters: C={best_c}, Gamma={best_gamma}. Best Macro F1: {best_macro_f1:.4f}")

In [5]:
# --- Final Evaluation (Using Optimal C and Gamma) ---

# Hyperparameters are fixed here by hand rather than read from the tuning cell,
# so this cell runs even when the sweep was skipped.
final_C_value = 1
final_gamma_value = 10

# Use CumlSVC for the final model, trained on the scaled training partition.
final_rbf_svm = CumlSVC(
    kernel='rbf',
    C=final_C_value,
    gamma=final_gamma_value,
    random_state=42,
    cache_size=4096,
)
final_rbf_svm.fit(X=x_train_scaled, y=y_train)

# Test Set Evaluation: predict on the held-out test partition and report metrics.
test_pred = final_rbf_svm.predict(x_test_scaled)
test_accuracy = accuracy_score(y_test, test_pred)
test_report = classification_report(y_test, test_pred, zero_division=0)

print("\n--- FINAL TEST RESULTS (Optimal C and Gamma) ---")
print("Accuracy:", test_accuracy)
print("Classification Report:\n", test_report)



--- FINAL TEST RESULTS (Optimal C and Gamma) ---
Accuracy: 0.8143389905502355
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.83      0.82     31641
           1       0.83      0.80      0.81     31641

    accuracy                           0.81     63282
   macro avg       0.81      0.81      0.81     63282
weighted avg       0.81      0.81      0.81     63282

