In [1]:
!pip install pandas numpy tldextract scikit-learn
!pip install onnxruntime



In [2]:
import ipaddress
import re
from urllib.parse import urlparse

import numpy as np
import pandas as pd
import tldextract

from sklearn.metrics import  confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from skl2onnx import to_onnx
from skl2onnx.common.data_types import FloatTensorType


In [3]:
# Read the two source CSV files; later cells use their 'url' and 'Type' columns
url_1, url_2 = (
    pd.read_csv(csv_name) for csv_name in ('Phishing URLs.csv', 'URL dataset.csv')
)

# Stack the rows of both frames; ignore_index=True renumbers the rows so the
# combined frame gets a fresh 0..n-1 index
url_df = pd.concat([url_1, url_2], ignore_index=True)
print(url_df.head())

                                                 url      Type
0  https://docs.google.com/presentation/d/e/2PACX...  Phishing
1    https://btttelecommunniccatiion.weeblysite.com/  Phishing
2                        https://kq0hgp.webwave.dev/  Phishing
3  https://brittishtele1bt-69836.getresponsesite....  Phishing
4         https://bt-internet-105056.weeblysite.com/  Phishing


In [4]:
# Feature: number of characters in each URL (vectorised string accessor)
url_df = url_df.assign(URL_len=url_df['url'].str.len())

print(url_df.head())

                                                 url      Type  URL_len
0  https://docs.google.com/presentation/d/e/2PACX...  Phishing      178
1    https://btttelecommunniccatiion.weeblysite.com/  Phishing       47
2                        https://kq0hgp.webwave.dev/  Phishing       27
3  https://brittishtele1bt-69836.getresponsesite....  Phishing       50
4         https://bt-internet-105056.weeblysite.com/  Phishing       42


In [5]:
# Feature: length of the network-location part (urlparse's `netloc`) of each URL
url_df['Hostname_len'] = url_df['url'].map(lambda link: len(urlparse(link).netloc))

print(url_df.head())


                                                 url      Type  URL_len  \
0  https://docs.google.com/presentation/d/e/2PACX...  Phishing      178   
1    https://btttelecommunniccatiion.weeblysite.com/  Phishing       47   
2                        https://kq0hgp.webwave.dev/  Phishing       27   
3  https://brittishtele1bt-69836.getresponsesite....  Phishing       50   
4         https://bt-internet-105056.weeblysite.com/  Phishing       42   

   Hostname_len  
0            15  
1            38  
2            18  
3            41  
4            33  


In [6]:
# Keep the network-location part of each URL; the sub-domain count is derived from it
url_df['Hostname'] = url_df['url'].map(lambda link: urlparse(link).netloc)

print(url_df.head())

# Feature: (number of dots in the hostname) - 1, i.e. the dot in front of the
# main domain's suffix is not counted.
# NOTE(review): multi-label suffixes ("co.uk") and dotted IPv4 hosts raise this
# count, and a URL with an empty netloc yields -1 - confirm this is acceptable.
url_df['num_subdomain'] = url_df['Hostname'].map(lambda host: host.count('.') - 1)
print(url_df.head())


                                                 url      Type  URL_len  \
0  https://docs.google.com/presentation/d/e/2PACX...  Phishing      178   
1    https://btttelecommunniccatiion.weeblysite.com/  Phishing       47   
2                        https://kq0hgp.webwave.dev/  Phishing       27   
3  https://brittishtele1bt-69836.getresponsesite....  Phishing       50   
4         https://bt-internet-105056.weeblysite.com/  Phishing       42   

   Hostname_len                                   Hostname  
0            15                            docs.google.com  
1            38     btttelecommunniccatiion.weeblysite.com  
2            18                         kq0hgp.webwave.dev  
3            41  brittishtele1bt-69836.getresponsesite.com  
4            33          bt-internet-105056.weeblysite.com  
                                                 url      Type  URL_len  \
0  https://docs.google.com/presentation/d/e/2PACX...  Phishing      178   
1    https://btttelecommunniccati

In [7]:
#Number of special characters

def extract_alpha(string):
    """Return how many characters of `string` satisfy str.isalpha()."""
    return sum(1 for symbol in string if symbol.isalpha())


def extract_nums(string):
    """Return how many characters of `string` satisfy str.isnumeric()."""
    return len([symbol for symbol in string if symbol.isnumeric()])

# Feature: number of characters that are neither letters nor numeric characters.
# str.isalnum() is true exactly when isalpha() or isnumeric() is true, so each
# character is classified once. Subtracting the alpha count and the numeric
# count from the length counts characters that are both alphabetic and numeric
# (e.g. CJK numerals) twice, which can push the result below zero.
url_df['num_special_char'] = url_df['url'].apply(
    lambda x: sum(not char.isalnum() for char in x)
)

print(url_df.head())

                                                 url      Type  URL_len  \
0  https://docs.google.com/presentation/d/e/2PACX...  Phishing      178   
1    https://btttelecommunniccatiion.weeblysite.com/  Phishing       47   
2                        https://kq0hgp.webwave.dev/  Phishing       27   
3  https://brittishtele1bt-69836.getresponsesite....  Phishing       50   
4         https://bt-internet-105056.weeblysite.com/  Phishing       42   

   Hostname_len                                   Hostname  num_subdomain  \
0            15                            docs.google.com              1   
1            38     btttelecommunniccatiion.weeblysite.com              1   
2            18                         kq0hgp.webwave.dev              1   
3            41  brittishtele1bt-69836.getresponsesite.com              1   
4            33          bt-internet-105056.weeblysite.com              1   

   num_special_char  
0                23  
1                 6  
2                 6 

In [8]:
#Number of digits
def extract_nums(string):
    """Count the characters of `string` for which str.isnumeric() is true.

    Same behaviour as the helper of the same name defined in the previous cell.
    """
    total = 0
    for symbol in string:
        total += 1 if symbol.isnumeric() else 0
    return total

# Feature: number of numeric characters in each URL
url_df['num_digit'] = url_df['url'].apply(extract_nums)

print(url_df.head())


                                                 url      Type  URL_len  \
0  https://docs.google.com/presentation/d/e/2PACX...  Phishing      178   
1    https://btttelecommunniccatiion.weeblysite.com/  Phishing       47   
2                        https://kq0hgp.webwave.dev/  Phishing       27   
3  https://brittishtele1bt-69836.getresponsesite....  Phishing       50   
4         https://bt-internet-105056.weeblysite.com/  Phishing       42   

   Hostname_len                                   Hostname  num_subdomain  \
0            15                            docs.google.com              1   
1            38     btttelecommunniccatiion.weeblysite.com              1   
2            18                         kq0hgp.webwave.dev              1   
3            41  brittishtele1bt-69836.getresponsesite.com              1   
4            33          bt-internet-105056.weeblysite.com              1   

   num_special_char  num_digit  
0                23         20  
1                 6 

In [9]:
# Feature: 1 when the parsed URL scheme is 'https', otherwise 0
url_df['https'] = url_df['url'].apply(lambda x: int(urlparse(x).scheme == 'https'))

print(url_df.head())

                                                 url      Type  URL_len  \
0  https://docs.google.com/presentation/d/e/2PACX...  Phishing      178   
1    https://btttelecommunniccatiion.weeblysite.com/  Phishing       47   
2                        https://kq0hgp.webwave.dev/  Phishing       27   
3  https://brittishtele1bt-69836.getresponsesite....  Phishing       50   
4         https://bt-internet-105056.weeblysite.com/  Phishing       42   

   Hostname_len                                   Hostname  num_subdomain  \
0            15                            docs.google.com              1   
1            38     btttelecommunniccatiion.weeblysite.com              1   
2            18                         kq0hgp.webwave.dev              1   
3            41  brittishtele1bt-69836.getresponsesite.com              1   
4            33          bt-internet-105056.weeblysite.com              1   

   num_special_char  num_digit  https  
0                23         20      1  
1     

In [10]:
#Check for url with IP address
# Dotted-quad IPv4 literal: four groups of 1-3 digits separated by dots
ipv4_pattern = r'^\d{1,3}(\.\d{1,3}){3}$'


def check_ip(string):
    """Return 1 if the host of the URL is an IP-address literal, otherwise 0.

    Args:
        string: the URL to inspect.

    Returns:
        1 when the host matches the dotted-quad IPv4 pattern or parses as an
        IPv6 address; 0 otherwise, including when the URL has no host.

    NOTE(review): this changes the feature for URLs with a port, userinfo,
    an IPv6 host or an all-hex hostname - any feature extraction used at
    inference time must apply the same rule.
    """
    # .hostname (unlike .netloc) drops userinfo, the port and the square
    # brackets around IPv6 literals, so "1.2.3.4:8080" and "[::1]" are
    # recognised as addresses.
    hostname = urlparse(string).hostname
    if not hostname:
        return 0

    if re.match(ipv4_pattern, hostname):
        return 1

    # Parse IPv6 properly: a character-class regex also accepts plain
    # hostnames made only of hex letters (e.g. "cafe").
    try:
        ipaddress.IPv6Address(hostname)
    except ValueError:
        return 0
    return 1

# Feature: 1 when check_ip flags the URL's host as an IP address, otherwise 0
url_df['ip_address'] = url_df['url'].apply(check_ip)

print(url_df.head())



                                                 url      Type  URL_len  \
0  https://docs.google.com/presentation/d/e/2PACX...  Phishing      178   
1    https://btttelecommunniccatiion.weeblysite.com/  Phishing       47   
2                        https://kq0hgp.webwave.dev/  Phishing       27   
3  https://brittishtele1bt-69836.getresponsesite....  Phishing       50   
4         https://bt-internet-105056.weeblysite.com/  Phishing       42   

   Hostname_len                                   Hostname  num_subdomain  \
0            15                            docs.google.com              1   
1            38     btttelecommunniccatiion.weeblysite.com              1   
2            18                         kq0hgp.webwave.dev              1   
3            41  brittishtele1bt-69836.getresponsesite.com              1   
4            33          bt-internet-105056.weeblysite.com              1   

   num_special_char  num_digit  https  ip_address  
0                23         20    

In [11]:
# obtain tld
def check_tld(string):
    """Return the `suffix` field that tldextract reports for the given URL."""
    return tldextract.extract(string).suffix

# Store the extracted suffix of every URL in the 'tld' column
url_df['tld'] = url_df['url'].apply(check_tld)
print(url_df.head())


                                                 url      Type  URL_len  \
0  https://docs.google.com/presentation/d/e/2PACX...  Phishing      178   
1    https://btttelecommunniccatiion.weeblysite.com/  Phishing       47   
2                        https://kq0hgp.webwave.dev/  Phishing       27   
3  https://brittishtele1bt-69836.getresponsesite....  Phishing       50   
4         https://bt-internet-105056.weeblysite.com/  Phishing       42   

   Hostname_len                                   Hostname  num_subdomain  \
0            15                            docs.google.com              1   
1            38     btttelecommunniccatiion.weeblysite.com              1   
2            18                         kq0hgp.webwave.dev              1   
3            41  brittishtele1bt-69836.getresponsesite.com              1   
4            33          bt-internet-105056.weeblysite.com              1   

   num_special_char  num_digit  https  ip_address  tld  
0                23         2

In [12]:
#Common tld
def check_legit_tld(string):
    """Return 1 if the URL's top-level domain is in the common-TLD list, else 0.

    Args:
        string: the URL to inspect.

    Returns:
        1 when the last label of the suffix reported by tldextract is one of
        the listed TLDs; 0 otherwise, including when no suffix is found.

    NOTE(review): this changes the feature for multi-label suffixes - any
    feature extraction used at inference time must apply the same rule.
    """
    legit_tlds = {'com', 'org', 'edu', 'gov', 'net', 'int', 'mil', 'uk', 'dev'}

    extracted_tld = tldextract.extract(string).suffix

    # The suffix can span several labels (e.g. "co.uk"); compare only the
    # final label so that such domains can match the single-label entries.
    top_level = extracted_tld.rsplit('.', 1)[-1]

    return 1 if top_level in legit_tlds else 0
    

# Feature: 1 when check_legit_tld accepts the URL's TLD, otherwise 0
url_df['common_tld'] = url_df['url'].apply(check_legit_tld)
print(url_df.head())



                                                 url      Type  URL_len  \
0  https://docs.google.com/presentation/d/e/2PACX...  Phishing      178   
1    https://btttelecommunniccatiion.weeblysite.com/  Phishing       47   
2                        https://kq0hgp.webwave.dev/  Phishing       27   
3  https://brittishtele1bt-69836.getresponsesite....  Phishing       50   
4         https://bt-internet-105056.weeblysite.com/  Phishing       42   

   Hostname_len                                   Hostname  num_subdomain  \
0            15                            docs.google.com              1   
1            38     btttelecommunniccatiion.weeblysite.com              1   
2            18                         kq0hgp.webwave.dev              1   
3            41  brittishtele1bt-69836.getresponsesite.com              1   
4            33          bt-internet-105056.weeblysite.com              1   

   num_special_char  num_digit  https  ip_address  tld  common_tld  
0                

In [13]:
# Binary label encoding - legitimate url (1) and phishing (0)
def check_url_real_or_fake(string):
    """Map a 'Type' label to the binary target.

    Args:
        string: the raw label value.

    Returns:
        1 when the label is "legitimate", ignoring letter case and surrounding
        whitespace; 0 for every other value, including non-string values such
        as NaN.
    """
    # Missing labels are not strings; treat them like any non-matching label
    if not isinstance(string, str):
        return 0

    # Normalise so that variants such as "Legitimate" or "legitimate " are not
    # silently encoded as phishing
    return 1 if string.strip().lower() == "legitimate" else 0


# Encode the 'Type' label of every row as the binary 'legit_url' column
url_df['legit_url'] = url_df['Type'].apply(check_url_real_or_fake)
print(url_df.head())

# Separate input and output: keep the label as the target `y` and remove it
# from the feature frame
y = url_df['legit_url']
print(y)
url_df = url_df.drop(columns='legit_url')
print(url_df.head())

                                                 url      Type  URL_len  \
0  https://docs.google.com/presentation/d/e/2PACX...  Phishing      178   
1    https://btttelecommunniccatiion.weeblysite.com/  Phishing       47   
2                        https://kq0hgp.webwave.dev/  Phishing       27   
3  https://brittishtele1bt-69836.getresponsesite....  Phishing       50   
4         https://bt-internet-105056.weeblysite.com/  Phishing       42   

   Hostname_len                                   Hostname  num_subdomain  \
0            15                            docs.google.com              1   
1            38     btttelecommunniccatiion.weeblysite.com              1   
2            18                         kq0hgp.webwave.dev              1   
3            41  brittishtele1bt-69836.getresponsesite.com              1   
4            33          bt-internet-105056.weeblysite.com              1   

   num_special_char  num_digit  https  ip_address  tld  common_tld  legit_url  
0     

In [14]:
# Remove the raw text and label columns in one call
url_df = url_df.drop(columns=['tld', 'url', 'Type', 'Hostname'])

# DataFrame.size is the total number of cells (rows x columns)
print(url_df.size)

# Persist the remaining columns, without the index column
url_df.to_csv("processed_dataset.csv", index=False)

4039864


In [15]:
# First split: test_size=0.6 holds out 60 % of the rows, leaving 40 % for training
x_train, x_test, y_train_labels, y_test_labels = train_test_split(
    url_df, y, test_size=0.6, random_state=42
)
# Second split: the held-out rows are halved into a validation and a test set
x_val, x_test, y_val_labels, y_test_labels = train_test_split(
    x_test, y_test_labels, test_size=0.5, random_state=42
)

# Preview every split (the test-set previews are printed twice)
for preview in (
    x_train, x_test, y_train_labels, y_test_labels,
    x_val, x_test, y_val_labels, y_test_labels,
):
    print(preview.head())

print(x_train.size)


        URL_len  Hostname_len  num_subdomain  num_special_char  num_digit  \
172743       52            26              2                 7          1   
225865       31            14              1                 7          0   
316939       72            17              1                13          0   
197159       57            16              1                12         10   
286257       96            14              1                15          9   

        https  ip_address  common_tld  
172743      1           0           1  
225865      1           0           1  
316939      1           0           1  
197159      1           0           1  
286257      1           0           1  
        URL_len  Hostname_len  num_subdomain  num_special_char  num_digit  \
422094       62            14              0                13         14   
284577       49            21              1                 8          8   
400482       29            20              1                 6    

In [16]:
# Baseline: decision tree with default hyper-parameters, trained on the training split
dt = DecisionTreeClassifier(random_state=42).fit(x_train, y_train_labels)

# Predict on the test split and show that predictions and labels line up
y_pred = dt.predict(x_test)
print(y_pred.shape)
print(y_test_labels.shape)

accuracy = accuracy_score(y_test_labels, y_pred)
precision = precision_score(y_test_labels, y_pred)
recall = recall_score(y_test_labels, y_pred)
f1 = f1_score(y_test_labels, y_pred)

# Report every metric rounded to two decimals
for label, score in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(label, round(score, 2))

(151495,)
(151495,)
Accuracy: 0.96
Precision: 0.97
Recall: 0.98
F1 Score: 0.97


In [17]:


# Hyper-parameter values to combine (4 * 3 * 3 * 2 * 3 = 216 candidates)
param_grid = dict(
    max_depth=[None, 5, 10, 20],            # maximum depth of the tree
    min_samples_split=[2, 5, 10],           # minimum samples required to split a node
    min_samples_leaf=[1, 5, 10],            # minimum samples required at a leaf node
    criterion=['gini', 'entropy'],          # function measuring the quality of a split
    max_features=[None, 'sqrt', 'log2'],    # number of features considered per split
)

dt = DecisionTreeClassifier(random_state=42)

# Exhaustive grid search over param_grid with 3-fold cross-validation
grid_search = GridSearchCV(dt, param_grid=param_grid, cv=3)

# Fit the grid search on the validation split
grid_search.fit(x_val, y_val_labels)

In [18]:
# Keep the best estimator found by the grid search. The searched model is a
# decision tree, so it is named and reported as "DT".
best_dt = grid_search.best_estimator_
best_rf = best_dt  # previous name, kept so existing references keep working

# Print the best hyperparameters
print('Best hyperparameters for DT:', grid_search.best_params_)

Best hyperparameters for RF: {'criterion': 'gini', 'max_depth': 20, 'max_features': None, 'min_samples_leaf': 5, 'min_samples_split': 2}


In [19]:
# Retrain on the training split with the hyper-parameters selected by the grid
# search. They are read from grid_search.best_params_ instead of being copied
# by hand, so a re-run that selects different values cannot leave stale ones here.
dt = DecisionTreeClassifier(random_state=42, **grid_search.best_params_)
dt.fit(x_train, y_train_labels)

y_pred = dt.predict(x_test)

accuracy = accuracy_score(y_test_labels, y_pred)
precision = precision_score(y_test_labels, y_pred)
recall = recall_score(y_test_labels, y_pred)
f1 = f1_score(y_test_labels, y_pred)
print("Accuracy:", round(accuracy,2))
print("Precision:", round(precision,2))
print("Recall:", round(recall,2))
print("F1 Score:", round(f1,2))

# Confusion matrix: rows are the true labels, columns the predicted labels
con_matrix = confusion_matrix(y_test_labels, y_pred)
print(con_matrix)

Accuracy: 0.97
Precision: 0.97
Recall: 0.98
F1 Score: 0.98
[[ 44674   3427]
 [  1634 101760]]


In [20]:
print(url_df[:1])

# Declare the model input as a float32 tensor with one column per feature.
# The width is taken from the feature frame rather than hard-coded, so adding
# or removing a feature cannot leave a stale input shape in the exported model.
n_features = url_df.shape[1]
initial_type = [('float_input', FloatTensorType([None, n_features]))]

# Converter options for this model instance (keyed by the object's id);
# 'zipmap': False turns off the ZipMap post-processing of the output
options = {id(dt): {'zipmap': False}}

# One row is enough as the sample input; the whole frame does not need to be
# cast to float32 because the input type is given explicitly above
sample = url_df[:1].values.astype(np.float32)

# Convert model to ONNX
onx = to_onnx(dt, sample, initial_types=initial_type, options=options)

# Save to file
with open("dt_phishing_model.onnx", "wb") as f:
    f.write(onx.SerializeToString())

   URL_len  Hostname_len  num_subdomain  num_special_char  num_digit  https  \
0      178            15              1                23         20      1   

   ip_address  common_tld  
0           0           1  
