<a href="https://colab.research.google.com/github/BaoNguyen151654/Phising-URL-detection-model/blob/main/phising_link_detection_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install tldextract



In [2]:
!pip install scikit-optimize



In [3]:
import pandas as pd
import numpy as np
import tldextract
import re
from urllib.parse import urlparse, parse_qs
from sklearn.metrics import classification_report

df=pd.read_csv('/content/phishing_site_urls.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 549846 entries, 0 to 549845
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   URL     549846 non-null  object
 1   Label   549846 non-null  object
dtypes: object(2)
memory usage: 8.4+ MB


In [4]:
df['URL length'] = df['URL'].str.len()
df['URL length'].head()

Unnamed: 0,URL length
0,225
1,81
2,177
3,60
4,116


In [5]:
def count_subdomains(url):
    try:
        extracted = tldextract.extract(url)
        if extracted.subdomain:
            return len(extracted.subdomain.split('.'))
        else:
            return 0
    except:
        return 0

df['Number of Subdomain'] = df['URL'].apply(count_subdomains)
df['Number of Subdomain'].head()

Unnamed: 0,Number of Subdomain
0,0
1,1
2,0
3,1
4,0


In [6]:
def count_dots_in_url(url):
    return url.count('.')
df['Number of dots in the URL'] = df['URL'].apply(count_dots_in_url)
df['Number of dots in the URL'].head()

Unnamed: 0,Number of dots in the URL
0,6
1,5
2,7
3,6
4,1


In [7]:
def count_subdirectories(url):
    parsed_url = urlparse(url)
    path = parsed_url.path.strip('/')
    if not path:
        return 0
    return path.count('/') + 1
df['Number of sub directories'] = df['URL'].apply(count_subdirectories)
df['Number of sub directories'].head()

Unnamed: 0,Number of sub directories
0,9
1,5
2,11
3,3
4,7


In [8]:
def count_url_arguments(url):
    parsed_url = urlparse(url)
    query_params = parse_qs(parsed_url.query)
    return len(query_params)
df['Number of argument'] = df['URL'].apply(count_url_arguments)
df['Number of argument'].head()

Unnamed: 0,Number of argument
0,4
1,0
2,0
3,0
4,0


In [9]:
def count_symbols(url):
    return url.count('@') + url.count('-')

df['Number of Phishing Symbol'] = df['URL'].apply(count_symbols)
df['Number of Phishing Symbol'].head()

Unnamed: 0,Number of Phishing Symbol
0,4
1,2
2,1
3,0
4,1


In [10]:
def domain_length(url):
    ext = tldextract.extract(url)
    domain = ext.domain
    return len(domain)

df['Domain length'] = df['URL'].apply(domain_length)
df['Domain length'].head()

Unnamed: 0,Domain length
0,6
1,7
2,12
3,9
4,15


In [11]:
def count_delimiters(url):
    delimiters = r'[/:?#\[\]@!$&\'()*+,;=]'
    return len(re.findall(delimiters, url))
df['Total delimeter in URL'] = df['URL'].apply(count_delimiters)
df[['URL', 'Total delimeter in URL']].head()

Unnamed: 0,URL,Total delimeter in URL
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,18
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,7
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,11
3,mail.printakid.com/www.online.americanexpress....,2
4,thewhiskeydregs.com/wp-content/themes/widescre...,11


In [12]:
df.head()

Unnamed: 0,URL,Label,URL length,Number of Subdomain,Number of dots in the URL,Number of sub directories,Number of argument,Number of Phishing Symbol,Domain length,Total delimeter in URL
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad,225,0,6,9,4,4,6,18
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad,81,1,5,5,0,2,7,7
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad,177,0,7,11,0,1,12,11
3,mail.printakid.com/www.online.americanexpress....,bad,60,1,6,3,0,0,9,2
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad,116,0,1,7,0,1,15,11


In [13]:
df['Label'].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
good,393424
bad,156422


In [14]:
df_good = df[df['Label'] == 'good'].sample(n=156422, random_state=42)
df_bad = df[df['Label'] == 'bad'].sample(n=156422, random_state=42)
df = pd.concat([df_good, df_bad])
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df['Label'].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
bad,156422
good,156422


In [None]:
bad_samples = df[df['Label'] == 'bad']
good_samples = df[df['Label'] == 'good']

num_bad_samples = int(50000 * 0.6)
num_good_samples = int(50000 * 0.4)

bad_sampled = bad_samples.sample(n=num_bad_samples, random_state=42)
good_sampled = good_samples.sample(n=num_good_samples, random_state=42)

df = pd.concat([bad_sampled, good_sampled])
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

df

Unnamed: 0,URL,Label,URL length,Number of Subdomain,Number of dots in the URL,Number of sub directories,Number of argument,Number of Phishing Symbol,Domain length,Total delimeter in URL
0,jamaicaobserver.com/news/Jamaican-restaurant-a...,good,64,0,1,3,0,4,15,2
1,76.74.242.140/~nonni416/19o5f,bad,29,0,3,3,0,0,13,2
2,216.254.231.11/img2212.png,bad,26,0,4,2,0,0,14,1
3,www.mariefrancepochna.com/qqa6v,bad,31,1,2,2,0,0,17,1
4,legacy.com/obituaries/startribune/obituary.asp...,good,77,0,2,4,2,2,6,7
...,...,...,...,...,...,...,...,...,...,...
49995,pzhgp.freehost.pl/fp3vf6b,bad,25,1,2,2,0,0,8,1
49996,songkick.com/artists/429451-terence-blanchard,good,45,0,1,3,0,2,8,2
49997,people.mills.edu/,good,17,1,2,1,0,0,5,1
49998,arvindudyog.com/bright/bright/drake/bright/690...,bad,76,0,1,6,0,0,11,6


In [41]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from skopt import BayesSearchCV
from skopt.space import Integer
from sklearn.preprocessing import LabelEncoder

In [17]:
scaler = StandardScaler()
x=df.drop(['Label','URL'], axis=1).values
x= scaler.fit_transform(x)
y=df['Label'].values

In [18]:
x_trainset, x_testset, y_trainset, y_testset = train_test_split(x, y, test_size=0.3, random_state=42)

In [42]:
label_encoder = LabelEncoder()

y_trainset = label_encoder.fit_transform(y_trainset)
y_testset = label_encoder.transform(y_testset)

In [43]:
search_spaces = {'max_depth': Integer(1, 20)}
model = xgb.XGBClassifier(use_label_encoder=True, eval_metric='logloss')
bayes_search = BayesSearchCV(model, search_spaces, n_iter=50, cv=2, n_jobs=-1, verbose=0)
bayes_search.fit(x_trainset, y_trainset)
best_max_depth= bayes_search.best_params_['max_depth']
print(f"Best max_depth: {best_max_depth}")

Parameters: { "use_label_encoder" } are not used.



Best max_depth: 15


In [44]:
#model = RandomForestClassifier(n_estimators=100, max_depth=best_max_depth, random_state=42)
xgb_model = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    n_estimators=100,
    learning_rate=0.05,
    max_depth=best_max_depth,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.1,
    scale_pos_weight=1
)
model.fit(x_trainset, y_trainset)
y_pred = model.predict(x_testset)

Parameters: { "use_label_encoder" } are not used.



In [45]:
model

In [26]:
model_metrics=classification_report(y_testset, y_pred)
print(model_metrics)

              precision    recall  f1-score   support

         bad       0.86      0.85      0.86     46797
        good       0.86      0.86      0.86     47057

    accuracy                           0.86     93854
   macro avg       0.86      0.86      0.86     93854
weighted avg       0.86      0.86      0.86     93854



In [36]:
def get_url_length(url):
    return len(url)

def get_number_of_subdomains(url):
    extract = tldextract.extract(url)
    return len(extract.subdomain.split('.')) if extract.subdomain else 0

def get_number_of_dots(url):
    return url.count('.')

def get_number_of_subdirectories(url):
    path = urlparse(url).path
    return len(path.split('/')) - 1 if path else 0

def get_number_of_arguments(url):
    return len(urlparse(url).query.split('&')) if urlparse(url).query else 0

def get_number_of_phishing_symbols(url):
    return url.count('@') + url.count('-')

def get_domain_length(url):
    extract = tldextract.extract(url)
    return len(extract.domain) if extract.domain else 0

def count_delimiters(url):
    return url.count('/') + url.count('?') + url.count('&') + url.count('=')

def extract_features(url):
    features = {
        'URL length': get_url_length(url),
        'Number of Subdomains': get_number_of_subdomains(url),
        'Number of dots in the URL': get_number_of_dots(url),
        'Number of sub directories': get_number_of_subdirectories(url),
        'Number of arguments': get_number_of_arguments(url),
        'Number of Phishing Symbol': get_number_of_phishing_symbols(url),
        'Domain Length': get_domain_length(url),
        'Total delimiter in URL': count_delimiters(url)
    }
    return features

url = input("Enter URL: ")
features = extract_features(url)
features_df = pd.DataFrame([features])
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features_df)
prediction = model.predict(features_scaled)
print("Prediction: ", "bad" if prediction[0] == 1 else "good")

Enter URL: https://experience.elluciancloud.com/rmup/
Prediction:  good
