In [8]:
# Data Processing
import pandas as pd
import numpy as np

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold
from statistics import mean, stdev


In [9]:
# Location of the phishing-URL CSV. This is a machine-specific absolute path;
# it is named here so there is a single place to change when the notebook is
# run on another machine.
DATA_PATH = r"C:\Users\Aden\Desktop\Phishing URL RF\URL Phishing\PhiUSIIL_Phishing_URL_Dataset.csv"

# Keep the frame exactly as loaded and do all further work on an independent
# copy, so the original stays available for reference.
URLdataOG = pd.read_csv(DATA_PATH)
URLdata = URLdataOG.copy()

# Preview the first rows. A notebook renders only a cell's final expression,
# so the preview has to be the last statement to actually be displayed.
URLdataOG.head()

In [10]:
# Pre-load features: the columns used as model inputs. Per the original note,
# these can be checked from the URL itself. The strings are used to index the
# data frame, so their spelling (including 'NoOfDegitsInURL') must be kept
# exactly as written.
pre_load_features = [
    # lengths
    'URLLength', 'DomainLength',
    # character-class counts
    'NoOfDegitsInURL', 'NoOfLettersInURL',
    # domain structure
    'NoOfSubDomain', 'TLDLength',
    # counts of '=', '?' and '&'
    'NoOfEqualsInURL', 'NoOfQMarkInURL', 'NoOfAmpersandInURL',
]

In [11]:
# Model inputs are the pre-load feature columns; the target is the 'label' column.
X, y = URLdata[pre_load_features], URLdata['label']

In [12]:
# Re-derive target and inputs straight from URLdata (no extra preprocessing):
# 'label' is the target, the pre-load feature columns are the inputs.
y = URLdata['label']
X = URLdata[pre_load_features]

# (train fraction, test fraction) pairs: 20/80, 50/50 and 80/20.
# The test fraction is the complement of the train fraction, rounded to one
# decimal so it equals the plain literal (e.g. 0.2 rather than 0.1999...).
partition_ratios = [
    (train_fraction, round(1 - train_fraction, 1))
    for train_fraction in (0.2, 0.5, 0.8)
]

# Search space sampled by the randomized hyperparameter search.
# scipy's randint(low, high) draws integers from low up to high - 1.
param_dist = dict(
    n_estimators=randint(50, 200),
    max_depth=randint(5, 20),
    min_samples_split=randint(2, 10),
    min_samples_leaf=randint(1, 5),
)

In [13]:
def run_trial(X, y, train_size, random_state,
              param_distributions=None, n_iter=20, n_splits=5):
    """Run one experiment: split, scale, tune a random forest, and score it.

    Parameters
    ----------
    X, y : features and labels, passed straight to ``train_test_split``.
    train_size : forwarded to ``train_test_split(train_size=...)``.
    random_state : single seed reused for the train/test split, the K-fold
        shuffling, the forest itself and the randomized search.
    param_distributions : search space for ``RandomizedSearchCV``. When left
        as ``None`` the notebook-level ``param_dist`` is used, so existing
        four-argument calls behave as before.
    n_iter : number of parameter settings sampled by the search.
    n_splits : number of cross-validation folds.

    Returns
    -------
    dict with keys
        ``'train_score'``      - score of the refitted best model on the training partition,
        ``'validation_score'`` - mean cross-validated score of the best parameter setting,
        ``'test_score'``       - score of the refitted best model on the held-out partition,
        ``'best_params'``      - the selected hyperparameters.
    """
    # Fall back to the shared search space defined at notebook level.
    if param_distributions is None:
        param_distributions = param_dist

    # Hold out the test partition.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, random_state=random_state)

    # The scaler is fitted on the training partition only and then applied to
    # the test partition, so test data never influences the scaling statistics.
    # NOTE(review): it is fitted before cross-validation, so each validation
    # fold has contributed to the scaling statistics it is evaluated with.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Seeded, shuffled folds: every call to kf.split() yields the same partition.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    rf = RandomForestClassifier(random_state=random_state)
    rf_random = RandomizedSearchCV(
        estimator=rf,
        param_distributions=param_distributions,
        n_iter=n_iter,
        cv=kf,
        random_state=random_state,
        n_jobs=-1
    )
    rf_random.fit(X_train_scaled, y_train)

    return {
        'train_score': rf_random.score(X_train_scaled, y_train),
        # best_score_ is the mean score of the winning setting over the same
        # seeded folds with the same default scorer. A separate
        # cross_val_score pass over best_estimator_ would only refit the same
        # seeded forest on the same folds to reproduce this number.
        'validation_score': rf_random.best_score_,
        'test_score': rf_random.score(X_test_scaled, y_test),
        'best_params': rf_random.best_params_
    }

In [14]:
# Run experiments for each partition ratio
N_TRIALS = 3     # repeated trials per partition ratio
BASE_SEED = 42   # trial i is run with random_state = BASE_SEED + i
SCORE_KEYS = ('train_score', 'validation_score', 'test_score')

results = {}
for train_size, test_size in partition_ratios:
    split_label = f"{train_size*100:.0f}/{test_size*100:.0f}"
    print(f"\nRunning experiments for {split_label} split")

    ratio_results = []
    for trial in range(N_TRIALS):
        print(f"Trial {trial + 1}/{N_TRIALS}...")
        trial_results = run_trial(X, y, train_size, random_state=BASE_SEED + trial)
        ratio_results.append(trial_results)

    # Collect each score across trials, then summarise it as mean and
    # sample standard deviation.
    scores = {key: [r[key] for r in ratio_results] for key in SCORE_KEYS}
    avg_results = {f"avg_{key}": mean(vals) for key, vals in scores.items()}
    avg_results.update(
        # stdev() raises for fewer than two values; report 0.0 spread instead
        # so a single-trial run (N_TRIALS = 1) still completes.
        (f"std_{key}", stdev(vals) if len(vals) > 1 else 0.0)
        for key, vals in scores.items()
    )
    avg_results['individual_trials'] = ratio_results

    results[split_label] = avg_results

# Print results
for split, res in results.items():
    print(f"\nResults for {split} split:")
    for title, key in (('Training', 'train_score'),
                       ('Validation', 'validation_score'),
                       ('Test', 'test_score')):
        print(f"Average {title} Score: {res[f'avg_{key}']:.4f} ± {res[f'std_{key}']:.4f}")


Running experiments for 20/80 split
Trial 1/3...
Trial 2/3...
Trial 3/3...

Running experiments for 50/50 split
Trial 1/3...
Trial 2/3...
Trial 3/3...

Running experiments for 80/20 split
Trial 1/3...
Trial 2/3...
Trial 3/3...

Results for 20/80 split:
Average Training Score: 0.9325 ± 0.0091
Average Validation Score: 0.9304 ± 0.0079
Average Test Score: 0.9293 ± 0.0091

Results for 50/50 split:
Average Training Score: 0.9328 ± 0.0107
Average Validation Score: 0.9301 ± 0.0109
Average Test Score: 0.9311 ± 0.0099

Results for 80/20 split:
Average Training Score: 0.9300 ± 0.0091
Average Validation Score: 0.9306 ± 0.0087
Average Test Score: 0.9298 ± 0.0088
