In [2]:
from statistics import mean, stdev

import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn import metrics
from sklearn import svm
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC



In [3]:
# Location of the phishing-URL CSV.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file exists. Consider a path relative to a
# configurable data directory.
DATA_PATH = r"C:\Users\Aden\Desktop\Phishing URL RF\URL Phishing\PhiUSIIL_Phishing_URL_Dataset.csv"

# Load the data as stored on disk.
URLdataOG = pd.read_csv(DATA_PATH)

# Work on a copy so the frame as loaded stays available unmodified.
URLdata = URLdataOG.copy()

# Preview the first rows. This must be the cell's final expression: a bare
# expression in the middle of a cell is evaluated and silently discarded.
URLdataOG.head()

In [4]:
# Columns used as model inputs: the "pre-load" features. Per the original
# note, these can be checked from the URL itself.
# Spelling (e.g. 'NoOfDegitsInURL') is kept exactly as written; presumably it
# matches the CSV header -- verify against the dataset.
pre_load_features = [
    # lengths and character counts
    "URLLength", "DomainLength",
    "NoOfDegitsInURL", "NoOfLettersInURL",
    # domain structure
    "NoOfSubDomain", "TLDLength",
    # special characters
    "NoOfEqualsInURL", "NoOfQMarkInURL", "NoOfAmpersandInURL",
]

In [5]:
# Feature matrix: only the pre-load columns, in the order listed.
X = URLdata.loc[:, pre_load_features]
# Target vector: the 'label' column.
y = URLdata.loc[:, 'label']

# Train/test fractions to evaluate, written as (train, test) pairs:
# 20/80, 50/50 and 80/20.
partition_ratios = [(0.2, 0.8), (0.5, 0.5), (0.8, 0.2)]

# Hyperparameter grid for the search. Only the regularization parameter C
# varies; max_iter is held at a single value so the search focuses on C.
param_grid = dict(
    C=[0.1, 1, 10, 100],
    max_iter=[1000],
)

In [10]:
def run_trial(X, y, train_size, random_state):
    """Run one split / tune / evaluate trial of a linear SVM.

    The data is split into train and test partitions, a grid search with
    5-fold cross-validation is run on the training partition over the
    module-level ``param_grid``, and the refit best model is scored on both
    partitions.

    Feature scaling is part of the estimator pipeline rather than a separate
    step applied before the search. During cross-validation the scaler is
    therefore fit only on each fold's training portion, so the validation
    score is not influenced by statistics of the held-out fold.

    Parameters
    ----------
    X : feature table passed to ``train_test_split``.
    y : target values aligned with ``X``.
    train_size : forwarded to ``train_test_split`` as ``train_size``.
    random_state : seed used for both the train/test split and the shuffled
        ``KFold`` splitter, so a trial is reproducible.

    Returns
    -------
    dict with keys
        ``'train_score'``      - accuracy of the best model on the training partition,
        ``'validation_score'`` - mean cross-validated accuracy of the best parameters,
        ``'test_score'``       - accuracy of the best model on the test partition,
        ``'best_params'``      - selected parameters, keyed by the same names
                                 used in ``param_grid``.
    """
    # Hold out the test partition; tuning below only sees the training part.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, random_state=random_state)

    # Scaler + classifier as a single estimator so every fit (each CV fold and
    # the final refit) learns its own scaling from the data it is trained on.
    step_prefix = 'clf__'
    model = Pipeline([
        ('scaler', StandardScaler()),
        ('clf', LinearSVC(dual=False)),
    ])

    # param_grid is keyed by LinearSVC parameter names; route each entry to
    # the pipeline's classifier step.
    pipeline_grid = {
        step_prefix + name: values for name, values in param_grid.items()
    }

    # Shuffled 5-fold splitter, seeded for reproducibility.
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)

    grid_search = GridSearchCV(
        model,
        pipeline_grid,
        cv=kf,
        scoring='accuracy',
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)

    # Best pipeline, refit on the full training partition by GridSearchCV.
    best_model = grid_search.best_estimator_

    # Report parameters under their original (un-prefixed) names so the result
    # looks the same to callers as the keys of param_grid.
    best_params = {
        name[len(step_prefix):]: value
        for name, value in grid_search.best_params_.items()
    }

    return {
        'train_score': best_model.score(X_train, y_train),
        # The search already cross-validated the winning parameters on these
        # same folds, so its mean score is reused instead of running a second,
        # identical cross-validation pass.
        'validation_score': float(grid_search.best_score_),
        'test_score': best_model.score(X_test, y_test),
        'best_params': best_params
    }


In [11]:
# For every train/test ratio, repeat the trial with different seeds and
# summarise the scores (mean and sample standard deviation across trials).
n_trials = 3
score_keys = ('train_score', 'validation_score', 'test_score')

results = {}
for train_size, test_size in partition_ratios:
    split_label = f"{train_size*100:.0f}/{test_size*100:.0f}"
    print(f"\nRunning experiments for {split_label} split")

    ratio_results = []
    for trial in range(n_trials):
        print(f"Trial {trial + 1}/{n_trials}...")
        # Seeds 42, 43, 44: each trial gets a different but reproducible split.
        trial_results = run_trial(X, y, train_size, random_state=42 + trial)
        ratio_results.append(trial_results)

    # Collect each metric's values across the trials once, then aggregate.
    per_metric = {key: [r[key] for r in ratio_results] for key in score_keys}

    avg_results = {}
    for key in score_keys:
        avg_results[f'avg_{key}'] = mean(per_metric[key])
    for key in score_keys:
        avg_results[f'std_{key}'] = stdev(per_metric[key])
    avg_results['individual_trials'] = ratio_results

    results[split_label] = avg_results
    
    
    # Print results


# Report, per split, the mean ± standard deviation of each score followed by
# the parameters chosen in every individual trial.
for split_name, summary in results.items():
    print(f"\nResults for {split_name} split:")

    for title, key in (('Training', 'train'), ('Validation', 'validation'), ('Test', 'test')):
        avg_value = summary['avg_' + key + '_score']
        std_value = summary['std_' + key + '_score']
        print(f"Average {title} Score: {avg_value:.4f} ± {std_value:.4f}")

    print("\nBest parameters for each trial:")
    for trial_no, trial in enumerate(summary['individual_trials'], start=1):
        print(f"Trial {trial_no}: {trial['best_params']}")


Running experiments for 20/80 split
Trial 1/3...
Trial 2/3...
Trial 3/3...

Running experiments for 50/50 split
Trial 1/3...
Trial 2/3...
Trial 3/3...

Running experiments for 80/20 split
Trial 1/3...
Trial 2/3...
Trial 3/3...

Results for 20/80 split:
Average Training Score: 0.9416 ± 0.0061
Average Validation Score: 0.9412 ± 0.0057
Average Test Score: 0.9405 ± 0.0061

Best parameters for each trial:
Trial 1: {'C': 100, 'max_iter': 1000}
Trial 2: {'C': 100, 'max_iter': 1000}
Trial 3: {'C': 100, 'max_iter': 1000}

Results for 50/50 split:
Average Training Score: 0.9398 ± 0.0018
Average Validation Score: 0.9393 ± 0.0013
Average Test Score: 0.9399 ± 0.0027

Best parameters for each trial:
Trial 1: {'C': 100, 'max_iter': 1000}
Trial 2: {'C': 100, 'max_iter': 1000}
Trial 3: {'C': 100, 'max_iter': 1000}

Results for 80/20 split:
Average Training Score: 0.9367 ± 0.0006
Average Validation Score: 0.9361 ± 0.0011
Average Test Score: 0.9380 ± 0.0005

Best parameters for each trial:
Trial 1: {'C'