# Dataset Preparation


### Setup

In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings 

warnings.filterwarnings('ignore')

seed = 1855

## Data Collection


In [61]:
splits = {'train': 'data/train.parquet', 'test': 'data/test.parquet'}
df = pd.read_parquet("hf://datasets/pirocheto/phishing-url/" + splits["train"])

df.head()

Unnamed: 0,url,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,https://www.todayshomeowner.com/how-to-make-ho...,82,23,0,2,7,0,0,0,0,...,1,1,0,240,8892,67860,0,1,4,legitimate
1,http://thapthan.ac.th/information/confirmation...,93,14,1,2,0,0,0,0,0,...,1,0,1,0,2996,4189860,0,1,2,phishing
2,http://app.dialoginsight.com/T/OFC4/L2S/3888/B...,121,21,1,3,0,0,0,0,0,...,1,1,0,30,2527,346022,0,1,3,phishing
3,https://www.bedslide.com,24,16,0,2,0,0,0,0,0,...,0,0,0,139,7531,1059151,0,0,4,legitimate
4,https://tabs.ultimate-guitar.com/s/sex_pistols...,73,24,0,3,1,0,0,0,0,...,0,0,0,3002,7590,635,0,1,5,legitimate


## Handling missing values


In [62]:
# Checking for missing values

df.value_counts().isnull().sum()
# dropping the url as all it's info is in the other columns

df = df.drop(columns=['url'])

## Encoding categorical features with One-Hot Encoding


In [63]:
# the only categorical column is the label
categorical_columns = df.select_dtypes(include=['object']).columns
df[categorical_columns].value_counts()

status    
legitimate    3829
phishing      3829
Name: count, dtype: int64

## Encoding binary class label


In [64]:
df.status = df.status.map(lambda x: 1 if x == 'phishing' else 0)


df.status.value_counts()


status
0    3829
1    3829
Name: count, dtype: int64

# Model Building


In [65]:
from sklearn.model_selection import *
from sklearn.ensemble import *
from sklearn.metrics import *
from sklearn.linear_model import *
from sklearn.svm import *
from sklearn.neighbors import *
from sklearn.tree import *
from sklearn.ensemble import *

## Splitting the dataset into training and testing sets


In [66]:
x = df.drop(columns=['status'])
y = df.status

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=seed)

## Evaluation function


In [67]:
def evaluate_classification(expected, predicted):
  scores = {
    'Accuracy': accuracy_score(expected, predicted),
    'Precision': precision_score(expected, predicted),
    'Recall': recall_score(expected, predicted),
    'F1': f1_score(expected, predicted),
    'ROC AUC': roc_auc_score(expected, predicted)
  }
  
  for name, score in scores.items():
    print(f'{name}: {score:.2f}')

## Model Building


In [68]:
model = KNeighborsClassifier()

## Model Evaluation


### Cross Validation

In [69]:
cv = 10
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

results = cross_validate(model, x_train, y_train, cv=cv, scoring=scoring)

for metric in scoring:
  print(f'{metric.capitalize()}: {results[f"test_{metric}"].mean():.2f} +- {results[f"test_{metric}"].std():.2f}')


Accuracy: 0.82 +- 0.01
Precision: 0.81 +- 0.02
Recall: 0.83 +- 0.03
F1: 0.82 +- 0.01
Roc_auc: 0.89 +- 0.01


### Cross Validation with KFold

In [70]:
cv = KFold(n_splits=10, shuffle=True, random_state=seed)
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']


results = cross_validate(model, x_train, y_train, cv=cv, scoring=scoring)

for metric in scoring:
  print(f'{metric.capitalize()}: {results[f"test_{metric}"].mean():.2f} +- {results[f"test_{metric}"].std():.2f}')

Accuracy: 0.82 +- 0.02
Precision: 0.81 +- 0.03
Recall: 0.83 +- 0.02
F1: 0.82 +- 0.02
Roc_auc: 0.89 +- 0.02


### Cross Validation with StratifiedKFold

In [71]:
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

results = cross_validate(model, x_train, y_train, cv=cv, scoring=scoring)

for metric in scoring:
  print(f'{metric.capitalize()}: {results[f"test_{metric}"].mean():.2f} +- {results[f"test_{metric}"].std():.2f}')

Accuracy: 0.82 +- 0.02
Precision: 0.80 +- 0.01
Recall: 0.83 +- 0.02
F1: 0.82 +- 0.02
Roc_auc: 0.89 +- 0.01


## Model Building with Hyperparameter Tuning


In [72]:
model = KNeighborsClassifier()
params = {
  'n_neighbors': [3, 5, 7, 9, 11],
  'weights': ['uniform', 'distance'],
  'metric' : ['euclidean', 'manhattan', 'minkowski']
}

grid_search = GridSearchCV(model, param_grid=params, cv=cv, scoring='f1')

grid_search.fit(x_train, y_train)

print(grid_search.best_params_)


## Model Evaluation


In [None]:
best_model = grid_search.best_estimator_

best_model.fit(x_train, y_train)

y_pred = best_model.predict(x_test)

evaluate_classification(y_test, y_pred)

Accuracy: 0.86
Precision: 0.85
Recall: 0.88
F1: 0.86
ROC AUC: 0.86


# Comparing Different Models Performance

## Model Building

In [None]:
models = {
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'SVC': SVC(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'KNeighborsClassifier': KNeighborsClassifier(),
}

## Model Comparison

In [None]:
results = {}
k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
for name, model in models.items():
    results[name] = cross_val_score(model, x_train, y_train, cv=k_fold, scoring='accuracy')


result_df = pd.DataFrame(results).transpose()

result_df['mean'] = result_df.mean(axis=1)
result_df['std'] = result_df.std(axis=1)
result_df = result_df.sort_values(by=['mean', 'std'], ascending=False)
result_df


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,mean,std
RandomForestClassifier,0.977162,0.960848,0.952692,0.965742,0.957586,0.959217,0.95915,0.95915,0.972222,0.960784,0.962455,0.006923
GradientBoostingClassifier,0.959217,0.952692,0.954323,0.965742,0.952692,0.947798,0.965686,0.952614,0.960784,0.949346,0.956089,0.00608
AdaBoostClassifier,0.955954,0.933116,0.941272,0.946166,0.942904,0.939641,0.946078,0.934641,0.95098,0.934641,0.942539,0.007075
DecisionTreeClassifier,0.936378,0.933116,0.923328,0.928222,0.931485,0.928222,0.933007,0.933007,0.924837,0.918301,0.92899,0.005243
KNeighborsClassifier,0.830343,0.769984,0.823817,0.823817,0.820555,0.812398,0.820261,0.813725,0.818627,0.816993,0.815052,0.015816
SVC,0.616639,0.603589,0.636215,0.595432,0.611746,0.593801,0.598039,0.593137,0.611111,0.581699,0.604141,0.014616
