In [5]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score, KFold
from statistics import mean, stdev
from scipy.io import arff

In [6]:

# Load and prepare the data
data, meta = arff.loadarff('PhishingData.arff')  # Fixed 'ata' to 'data'
df = pd.DataFrame(data)

for column in df.columns:
    df[column] = df[column].astype(str).str.replace("b'", "").str.replace("'", "").astype(int)
    
# Display first few rows and basic information
print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
print(df.head())

Dataset Shape: (1353, 10)

First few rows:
   SFH  popUpWidnow  SSLfinal_State  Request_URL  URL_of_Anchor  web_traffic  \
0    1           -1               1           -1             -1            1   
1   -1           -1              -1           -1             -1            0   
2    1           -1               0            0             -1            0   
3    1            0               1           -1             -1            0   
4   -1           -1               1           -1              0            0   

   URL_Length  age_of_domain  having_IP_Address  Result  
0           1              1                  0       0  
1           1              1                  1       1  
2          -1              1                  0       1  
3           1              1                  0       0  
4          -1              1                  0       1  


In [7]:
df.dtypes


SFH                  int64
popUpWidnow          int64
SSLfinal_State       int64
Request_URL          int64
URL_of_Anchor        int64
web_traffic          int64
URL_Length           int64
age_of_domain        int64
having_IP_Address    int64
Result               int64
dtype: object

In [8]:
# Cell [3] - Prepare Features and Target
# Prepare features and target
X = df.drop('Result', axis=1)
y = df['Result']

# Define partition ratios
partition_ratios = [
    (0.2, 0.8),  # 20/80
    (0.5, 0.5),  # 50/50
    (0.8, 0.2)   # 80/20
]

# Define hyperparameter search space
param_grid = {
    'n_neighbors': list(range(1, 31, 2))  # Testing odd numbers from 1 to 30
}

In [9]:
# Cell [4] - Define Trial Function
def run_trial(X, y, train_size, random_state):
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, random_state=random_state)
    
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Initialize cross-validation
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    
    # Find best k using GridSearchCV
    knn = KNeighborsClassifier()
    grid_search = GridSearchCV(
        knn,
        param_grid,
        cv=kf,
        scoring='accuracy',
        n_jobs=-1
    )
    
    # Fit the model
    grid_search.fit(X_train_scaled, y_train)
    
    # Get best model
    best_knn = grid_search.best_estimator_
    
    # Get scores
    train_score = best_knn.score(X_train_scaled, y_train)
    test_score = best_knn.score(X_test_scaled, y_test)
    val_scores = cross_val_score(best_knn, X_train_scaled, y_train, cv=kf)
    
    return {
        'train_score': train_score,
        'validation_score': mean(val_scores),
        'test_score': test_score,
        'best_params': grid_search.best_params_
    }


In [10]:
# Cell [5] - Run Experiments
# Run experiments for each partition ratio
results = {}
for train_size, test_size in partition_ratios:
    ratio_results = []
    print(f"\nRunning experiments for {train_size*100:.0f}/{test_size*100:.0f} split")
    
    for trial in range(3):
        print(f"Trial {trial + 1}/3...")
        trial_results = run_trial(X, y, train_size, random_state=42+trial)
        ratio_results.append(trial_results)
    
    # Calculate averages
    avg_results = {
        'avg_train_score': mean([r['train_score'] for r in ratio_results]),
        'avg_validation_score': mean([r['validation_score'] for r in ratio_results]),
        'avg_test_score': mean([r['test_score'] for r in ratio_results]),
        'std_train_score': stdev([r['train_score'] for r in ratio_results]),
        'std_validation_score': stdev([r['validation_score'] for r in ratio_results]),
        'std_test_score': stdev([r['test_score'] for r in ratio_results]),
        'individual_trials': ratio_results
    }
    
    results[f"{train_size*100:.0f}/{test_size*100:.0f}"] = avg_results


Running experiments for 20/80 split
Trial 1/3...
Trial 2/3...
Trial 3/3...

Running experiments for 50/50 split
Trial 1/3...
Trial 2/3...
Trial 3/3...

Running experiments for 80/20 split
Trial 1/3...
Trial 2/3...
Trial 3/3...


In [11]:
# Cell [6] - Print Results
# Print results
for split, res in results.items():
    print(f"\nResults for {split} split:")
    print(f"Average Training Score: {res['avg_train_score']:.4f} ± {res['std_train_score']:.4f}")
    print(f"Average Validation Score: {res['avg_validation_score']:.4f} ± {res['std_validation_score']:.4f}")
    print(f"Average Test Score: {res['avg_test_score']:.4f} ± {res['std_test_score']:.4f}")
    
    # Print best parameters for each trial
    print("\nBest parameters for each trial:")
    for i, trial in enumerate(res['individual_trials']):
        print(f"Trial {i+1}: {trial['best_params']}")


Results for 20/80 split:
Average Training Score: 0.8765 ± 0.0154
Average Validation Score: 0.8333 ± 0.0259
Average Test Score: 0.8101 ± 0.0208

Best parameters for each trial:
Trial 1: {'n_neighbors': 5}
Trial 2: {'n_neighbors': 3}
Trial 3: {'n_neighbors': 3}

Results for 50/50 split:
Average Training Score: 0.8733 ± 0.0104
Average Validation Score: 0.8496 ± 0.0074
Average Test Score: 0.8282 ± 0.0332

Best parameters for each trial:
Trial 1: {'n_neighbors': 5}
Trial 2: {'n_neighbors': 9}
Trial 3: {'n_neighbors': 11}

Results for 80/20 split:
Average Training Score: 0.8891 ± 0.0107
Average Validation Score: 0.8558 ± 0.0037
Average Test Score: 0.8413 ± 0.0224

Best parameters for each trial:
Trial 1: {'n_neighbors': 5}
Trial 2: {'n_neighbors': 9}
Trial 3: {'n_neighbors': 7}
