In [1]:
# Data Processing
import pandas as pd
import numpy as np

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import RandomizedSearchCV, KFold
from statistics import mean, stdev
from scipy.stats import randint


# For reading .arff file
from scipy.io import arff

In [2]:
# Load the ARFF file
data, meta = arff.loadarff('Training Dataset.arff')

# Convert to pandas DataFrame
df = pd.DataFrame(data)

# The loaded values arrive as bytes (e.g. b'-1'). Decode them properly instead
# of stringifying and stripping the "b'...'" repr characters, then cast every
# column to int. Only object columns are decoded, so a column that is already
# numeric is cast directly rather than breaking the string handling.
for column in df.columns:
    if df[column].dtype == object:
        df[column] = df[column].str.decode('utf-8')
    df[column] = df[column].astype(int)

# Display first few rows and basic information
print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
df.head()




Dataset Shape: (11055, 31)

First few rows:


Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,...,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
0,-1,1,1,1,-1,-1,-1,-1,-1,1,...,1,1,-1,-1,-1,-1,1,1,-1,-1
1,1,1,1,1,1,-1,0,1,-1,1,...,1,1,-1,-1,0,-1,1,1,1,-1
2,1,0,1,1,1,-1,-1,-1,-1,1,...,1,1,1,-1,1,-1,1,0,-1,-1
3,1,0,1,1,1,-1,-1,-1,1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
4,1,0,-1,1,1,-1,1,1,-1,1,...,-1,1,-1,-1,0,-1,1,1,1,1


In [3]:
# Inspect the dtype of every column in the loaded frame
df.dtypes


having_IP_Address              int64
URL_Length                     int64
Shortining_Service             int64
having_At_Symbol               int64
double_slash_redirecting       int64
Prefix_Suffix                  int64
having_Sub_Domain              int64
SSLfinal_State                 int64
Domain_registeration_length    int64
Favicon                        int64
port                           int64
HTTPS_token                    int64
Request_URL                    int64
URL_of_Anchor                  int64
Links_in_tags                  int64
SFH                            int64
Submitting_to_email            int64
Abnormal_URL                   int64
Redirect                       int64
on_mouseover                   int64
RightClick                     int64
popUpWidnow                    int64
Iframe                         int64
age_of_domain                  int64
DNSRecord                      int64
web_traffic                    int64
Page_Rank                      int64
G

In [4]:
# Missing-value audit: count NaNs per column once, then report the count for
# the target column and the total over the whole frame.
nan_counts = df.isna().sum()
print("Number of NaN values in Result column:", nan_counts['Result'])
print("Total NaN values in dataset:", nan_counts.sum())


Number of NaN values in Result column: 0
Total NaN values in dataset: 0


In [5]:
# Separate the label from the predictors: 'Result' is used as the target and
# every other column as a feature. The target is used as-is (no label mapping).
y = df['Result']
X = df.drop(columns=['Result'])

In [6]:
# Train/test partition ratios to evaluate, as (train fraction, test fraction):
# 20/80, 50/50 and 80/20.
partition_ratios = [(0.2, 0.8), (0.5, 0.5), (0.8, 0.2)]

# Random-forest hyperparameter search space sampled by the randomized search.
# scipy's randint(low, high) draws integers from the half-open range [low, high).
param_dist = dict(
    n_estimators=randint(50, 200),
    max_depth=randint(5, 20),
    min_samples_split=randint(2, 10),
    min_samples_leaf=randint(1, 5),
)

In [7]:
def run_trial(X, y, train_size, random_state):
    """Tune and evaluate a random forest on one random train/test partition.

    The data is split once, the features are standardized using statistics
    fitted on the training partition, and a ``RandomizedSearchCV`` (20 sampled
    candidates from the module-level ``param_dist``, 5-fold shuffled CV) picks
    the hyperparameters. The refit best model is then scored on both
    partitions.

    Parameters
    ----------
    X : feature table passed to ``train_test_split``.
    y : target labels aligned with ``X``.
    train_size : passed through to ``train_test_split(train_size=...)``.
    random_state : seed shared by the split, the CV folds, the forest and the
        hyperparameter sampling.

    Returns
    -------
    dict with keys
        ``train_score``           score of the tuned model on the training partition,
        ``validation_score``      mean cross-validated score of the selected candidate,
        ``test_score``            accuracy of the tuned model on the held-out partition,
        ``best_params``           the selected hyperparameters,
        ``classification_report`` text report for the held-out predictions.
    """
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, random_state=random_state)

    # Scale features; the scaler is fitted on the training partition only and
    # then applied unchanged to the test partition.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Initialize cross-validation
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)

    # Perform RandomizedSearchCV
    rf = RandomForestClassifier(random_state=random_state)
    rf_random = RandomizedSearchCV(
        estimator=rf,
        param_distributions=param_dist,
        n_iter=20,
        cv=kf,
        random_state=random_state,
        n_jobs=-1
    )

    # Fit the model (the best candidate is refit on the full training partition)
    rf_random.fit(X_train_scaled, y_train)

    # Predict the test partition once and reuse the predictions for both the
    # accuracy and the classification report.
    y_pred = rf_random.predict(X_test_scaled)

    train_score = rf_random.score(X_train_scaled, y_train)
    test_score = accuracy_score(y_test, y_pred)

    # The search already cross-validated the winning candidate on these same
    # seeded folds, so its mean fold score is available as best_score_.
    # Re-running cross_val_score here would refit five forests to reproduce
    # the same number.
    validation_score = float(rf_random.best_score_)

    # Generate classification report
    class_report = classification_report(y_test, y_pred)

    return {
        'train_score': train_score,
        'validation_score': validation_score,
        'test_score': test_score,
        'best_params': rf_random.best_params_,
        'classification_report': class_report
    }

In [8]:
# Run experiments for each partition ratio
N_TRIALS = 3     # independent trials per split ratio
BASE_SEED = 42   # trial k (0-based) uses random_state = BASE_SEED + k
METRICS = ('train_score', 'validation_score', 'test_score')

results = {}
for train_size, test_size in partition_ratios:
    split_label = f"{train_size*100:.0f}/{test_size*100:.0f}"
    print(f"\nRunning experiments for {split_label} split")

    ratio_results = []
    for trial_idx in range(N_TRIALS):
        print(f"Trial {trial_idx + 1}/{N_TRIALS}...")
        ratio_results.append(
            run_trial(X, y, train_size, random_state=BASE_SEED + trial_idx))

    # Mean and sample standard deviation of each score across the trials
    avg_results = {'individual_trials': ratio_results}
    for metric in METRICS:
        scores = [r[metric] for r in ratio_results]
        avg_results[f'avg_{metric}'] = mean(scores)
        # stdev() needs at least two samples; report zero spread for a single trial
        avg_results[f'std_{metric}'] = stdev(scores) if len(scores) > 1 else 0.0

    results[split_label] = avg_results

# Print detailed results
print("\n" + "="*50)
print("RESULTS BY SPLIT RATIO")
print("="*50)

for split, res in results.items():
    print(f"\nResults for {split} split:")
    print(f"Average Training Score: {res['avg_train_score']:.4f} ± {res['std_train_score']:.4f}")
    print(f"Average Validation Score: {res['avg_validation_score']:.4f} ± {res['std_validation_score']:.4f}")
    print(f"Average Test Score: {res['avg_test_score']:.4f} ± {res['std_test_score']:.4f}")

    # Print best parameters for each trial
    print("\nBest parameters for each trial:")
    for i, trial_result in enumerate(res['individual_trials']):
        print(f"Trial {i+1}: {trial_result['best_params']}")

    # Print classification report for the last trial, selected explicitly
    # rather than through whatever the loop variable above was left holding.
    last_trial = res['individual_trials'][-1]
    print("\nDetailed Classification Report (last trial):")
    print(last_trial['classification_report'])

# Print overall averages (the "std" entries are the mean of the per-split
# standard deviations, not a pooled standard deviation)
overall_averages = {
    'train_score': mean([res['avg_train_score'] for res in results.values()]),
    'validation_score': mean([res['avg_validation_score'] for res in results.values()]),
    'test_score': mean([res['avg_test_score'] for res in results.values()]),
    'train_std': mean([res['std_train_score'] for res in results.values()]),
    'validation_std': mean([res['std_validation_score'] for res in results.values()]),
    'test_std': mean([res['std_test_score'] for res in results.values()])
}

print("\n" + "="*50)
print("OVERALL AVERAGES ACROSS ALL SPLITS")
print("="*50)
print(f"Average Training Score: {overall_averages['train_score']:.4f} ± {overall_averages['train_std']:.4f}")
print(f"Average Validation Score: {overall_averages['validation_score']:.4f} ± {overall_averages['validation_std']:.4f}")
print(f"Average Test Score: {overall_averages['test_score']:.4f} ± {overall_averages['test_std']:.4f}")

# Print summary of best parameters
print("\nMost common best parameters across all trials:")
all_params = [trial_result['best_params']
              for res in results.values()
              for trial_result in res['individual_trials']]

# Count parameter frequencies, keyed by the dict's string form so that
# identical parameter sets collapse into one entry
param_counts = {}
for params in all_params:
    param_str = str(params)
    param_counts[param_str] = param_counts.get(param_str, 0) + 1

# Print the most common parameters (ties resolve to the first one encountered)
most_common = max(param_counts.items(), key=lambda x: x[1])
print(f"Most frequent parameters: {most_common[0]}")
print(f"Frequency: {most_common[1]}/{len(all_params)} trials")


Running experiments for 20/80 split
Trial 1/3...
Trial 2/3...
Trial 3/3...

Running experiments for 50/50 split
Trial 1/3...
Trial 2/3...
Trial 3/3...

Running experiments for 80/20 split
Trial 1/3...
Trial 2/3...
Trial 3/3...

RESULTS BY SPLIT RATIO

Results for 20/80 split:
Average Training Score: 0.9843 ± 0.0102
Average Validation Score: 0.9480 ± 0.0067
Average Test Score: 0.9509 ± 0.0015

Best parameters for each trial:
Trial 1: {'max_depth': 11, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 67}
Trial 2: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 123}
Trial 3: {'max_depth': 14, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 58}

Detailed Classification Report (last trial):
              precision    recall  f1-score   support

          -1       0.96      0.93      0.95      3947
           1       0.94      0.97      0.96      4897

    accuracy                           0.95      8844
   macro avg       0.95 