In [2]:
# Data Processing
import pandas as pd
import numpy as np

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
from sklearn.model_selection import RandomizedSearchCV, train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from scipy.stats import randint
from statistics import mean, stdev
# For reading .arff file
from scipy.io import arff

In [3]:
# Load the ARFF file; loadarff returns the records plus the attribute metadata
data, meta = arff.loadarff('PhishingData.arff')

# Convert to pandas DataFrame
df = pd.DataFrame(data)

# Cast every column to int. Object-dtype columns (bytes values) are decoded
# properly first, instead of stringifying them and stripping the b'...' repr
# with str.replace, which would also delete any apostrophe or "b'" sequence
# that is part of a real value. Columns that are already numeric are cast as-is.
for column in df.columns:
    if df[column].dtype == object:
        df[column] = df[column].str.decode('utf-8')
    df[column] = df[column].astype(int)

# Display first few rows and basic information
print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
df.head()



Dataset Shape: (1353, 10)

First few rows:


Unnamed: 0,SFH,popUpWidnow,SSLfinal_State,Request_URL,URL_of_Anchor,web_traffic,URL_Length,age_of_domain,having_IP_Address,Result
0,1,-1,1,-1,-1,1,1,1,0,0
1,-1,-1,-1,-1,-1,0,1,1,1,1
2,1,-1,0,0,-1,0,-1,1,0,1
3,1,0,1,-1,-1,0,1,1,0,0
4,-1,-1,1,-1,0,0,-1,1,0,1


In [4]:
# Show the dtype of every column of df
df.dtypes


SFH                  int64
popUpWidnow          int64
SSLfinal_State       int64
Request_URL          int64
URL_of_Anchor        int64
web_traffic          int64
URL_Length           int64
age_of_domain        int64
having_IP_Address    int64
Result               int64
dtype: object

In [5]:
# Count missing values: first in the target column, then across the whole frame
nan_in_result = df['Result'].isna().sum()
nan_in_dataset = df.isna().sum().sum()

print("Number of NaN values in Result column:", nan_in_result)
print("Total NaN values in dataset:", nan_in_dataset)

Number of NaN values in Result column: 0
Total NaN values in dataset: 0


In [6]:
# 'Result' is used as the target; every other column is a feature.
# The values are already integers, so no label mapping is applied.
y = df['Result']
X = df.drop(columns='Result')

In [7]:
# Train/test partitions to evaluate, as (train fraction, test fraction) pairs
partition_ratios = [(0.2, 0.8), (0.5, 0.5), (0.8, 0.2)]  # 20/80, 50/50, 80/20

# Random-forest hyperparameter distributions sampled by the randomized search
# (scipy's randint draws integers from low up to, but not including, high)
param_dist = dict(
    n_estimators=randint(50, 200),
    max_depth=randint(5, 20),
    min_samples_split=randint(2, 10),
    min_samples_leaf=randint(1, 5),
)

In [8]:
def run_trial(X, y, train_size, random_state, n_iter=20, n_splits=5,
              param_distributions=None):
    """Run one trial: split, scale, tune a random forest, and score it.

    Parameters
    ----------
    X : feature table handed to ``train_test_split``.
    y : target values aligned with ``X``.
    train_size : forwarded to ``train_test_split`` as ``train_size``.
    random_state : seed shared by the train/test split, the K-fold shuffling,
        the random forest, and the randomized search.
    n_iter : number of hyperparameter settings the search samples (default 20).
    n_splits : number of cross-validation folds (default 5).
    param_distributions : search space for ``RandomizedSearchCV``; when None,
        the module-level ``param_dist`` is used.

    Returns
    -------
    dict with keys 'train_score', 'validation_score', 'test_score' (scores of
    the refit best model on the training split, its mean cross-validation
    score, and its score on the held-out split) and 'best_params'.
    """
    if param_distributions is None:
        param_distributions = param_dist

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, random_state=random_state)

    # Scale features; the scaler is fitted on the training split only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Initialize cross-validation
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Perform RandomizedSearchCV
    rf = RandomForestClassifier(random_state=random_state)
    rf_random = RandomizedSearchCV(
        estimator=rf,
        param_distributions=param_distributions,
        n_iter=n_iter,
        cv=kf,
        random_state=random_state,
        n_jobs=-1
    )

    # Fit the model
    rf_random.fit(X_train_scaled, y_train)

    # Get scores (score() delegates to the best estimator refit on the full training split)
    train_score = rf_random.score(X_train_scaled, y_train)
    test_score = rf_random.score(X_test_scaled, y_test)

    # The search already evaluated the winning configuration on these folds,
    # so its per-fold scores are read back from cv_results_ instead of
    # re-fitting the same model n_splits more times with cross_val_score.
    best_index = rf_random.best_index_
    val_scores = [
        rf_random.cv_results_[f'split{fold}_test_score'][best_index]
        for fold in range(n_splits)
    ]

    return {
        'train_score': train_score,
        'validation_score': mean(val_scores),
        'test_score': test_score,
        'best_params': rf_random.best_params_
    }

In [9]:
# For every train/test partition, run three seeded trials and summarise them
results = {}
for train_size, test_size in partition_ratios:
    split_label = f"{train_size*100:.0f}/{test_size*100:.0f}"
    print(f"\nRunning experiments for {split_label} split")

    # Each trial uses its own seed (42, 43, 44)
    ratio_results = []
    for trial in range(3):
        print(f"Trial {trial + 1}/3...")
        ratio_results.append(run_trial(X, y, train_size, random_state=42+trial))

    # Gather each metric across the trials once, then aggregate
    train_scores = [r['train_score'] for r in ratio_results]
    validation_scores = [r['validation_score'] for r in ratio_results]
    test_scores = [r['test_score'] for r in ratio_results]

    results[split_label] = {
        'avg_train_score': mean(train_scores),
        'avg_validation_score': mean(validation_scores),
        'avg_test_score': mean(test_scores),
        'std_train_score': stdev(train_scores),
        'std_validation_score': stdev(validation_scores),
        'std_test_score': stdev(test_scores),
        'individual_trials': ratio_results,
    }

# Report mean ± sample standard deviation for each partition
for split, res in results.items():
    report_lines = [
        f"\nResults for {split} split:",
        f"Average Training Score: {res['avg_train_score']:.4f} ± {res['std_train_score']:.4f}",
        f"Average Validation Score: {res['avg_validation_score']:.4f} ± {res['std_validation_score']:.4f}",
        f"Average Test Score: {res['avg_test_score']:.4f} ± {res['std_test_score']:.4f}",
    ]
    print("\n".join(report_lines))


Running experiments for 20/80 split
Trial 1/3...
Trial 2/3...
Trial 3/3...

Running experiments for 50/50 split
Trial 1/3...
Trial 2/3...
Trial 3/3...

Running experiments for 80/20 split
Trial 1/3...
Trial 2/3...
Trial 3/3...

Results for 20/80 split:
Average Training Score: 0.8975 ± 0.0356
Average Validation Score: 0.8679 ± 0.0154
Average Test Score: 0.8378 ± 0.0217

Results for 50/50 split:
Average Training Score: 0.9596 ± 0.0109
Average Validation Score: 0.8876 ± 0.0116
Average Test Score: 0.8848 ± 0.0156

Results for 80/20 split:
Average Training Score: 0.9559 ± 0.0120
Average Validation Score: 0.9020 ± 0.0072
Average Test Score: 0.8807 ± 0.0119
