In [64]:
#1-2. Web URL input and Feature Selection
print("Running web url input and Feature Selection")

import pandas as pd
import requests
from bs4 import BeautifulSoup
import whois
from datetime import datetime
import socket
import ssl
from urllib.parse import urlparse
import requests
from urllib3.exceptions import NewConnectionError, MaxRetryError
from requests.exceptions import ConnectionError
import time
import re

# Define headers to mimic a real browser request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Referer': 'https://google.com',  # Optional

}

# Fetch URL with retries
def fetch_url(url, retries=2):
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()  # Raise an HTTPError for bad responses (4xx, 5xx)
            return response.content
        except (requests.exceptions.RequestException, NewConnectionError, MaxRetryError, ConnectionError):
            # Suppress error details and retry
            time.sleep(2)  # Delay between retries
    # If all retries fail, return None
    return None

                
#------------------------------------------------------------------------------------------------------------------
                                             #Domain-based Feature
# Function to check SSL certificate
def check_ssl(domain):
    try:
        context = ssl.create_default_context()
        with socket.create_connection((domain, 443)) as sock:
            with context.wrap_socket(sock, server_hostname=domain) as secure_sock:
                return True
    except:
        return False
#------------------------------------------------------------------------------------------------------------------
                                           

                                           
#-------------------------------------------------------------------------------------------------------------------
                                               #URL-based Feature
# Check for IP address in URL
def contains_ip(url):
    ip_pattern = re.compile(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b')
    return bool(ip_pattern.search(url))

#--------------------------------------------------------------------------------------------------------------------
                                              #URL-based feature
# Compile the shortening services regex pattern
shortening_services_pattern = re.compile(r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|"
                      r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|"
                      r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|"
                      r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|"
                      r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|"
                      r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|"
                      r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|"
                      r"tr\.im|link\.zip\.net")

# Count special characters in URL
def count_special_chars(url):
    return sum(not c.isalnum() and c not in ['.', '-', '_', ':', '/', '?', '&', '=', '%'] for c in url)

# Check for URL shortening services using the provided regex pattern
def shortening_services(url):
    return bool(re.search(shortening_services_pattern, url))
#----------------------------------------------------------------------------------------------------------------------

# Extract features from URL
def extract_features(url):
    features = {}
    domain = urlparse(url).netloc
    #-----------------------------------------------------------------------------------------------------------------
                                              #URL-based features
    features['url_length'] = len(url)
    features['contains_ip'] = int(contains_ip(url))
    features['shortening_services'] = int(shortening_services(url))
    features['special_chars'] = count_special_chars(url)
    #-----------------------------------------------------------------------------------------------------------------
    content = fetch_url(url)
    if content:
        soup = BeautifulSoup(content, 'html.parser')
        #-------------------------------------------------------------------------------------------------------------
                                              #Content-based features
        features['html_length'] = len(content)
        features['js_length'] = sum(len(s.string) for s in soup.find_all('script') if s.string)
        features['num_links'] = len(soup.find_all('a'))
        features['num_forms'] = len(soup.find_all('form'))
        #-------------------------------------------------------------------------------------------------------------
    
        try:
            #---------------------------------------------------------------------------------------------------------
                                              #Domain-based features
            domain_info = whois.whois(domain)
            creation_date = domain_info.creation_date
            updated_date = domain_info.updated_date
            expiration_date = domain_info.expiration_date
            
            if isinstance(creation_date, list):
                creation_date = creation_date[0]
            if isinstance(updated_date, list):
                updated_date = updated_date[0]
            if isinstance(expiration_date, list):
                expiration_date = expiration_date[0]
            
            features['domain_age'] = (datetime.now() - creation_date).days if creation_date else None
        except Exception as e:
            print(f"Error fetching domain info for {domain}: {e}")
            features['domain_age'] = None

        features['has_ssl'] = 1 if check_ssl(domain) else 0
        #------------------------------------------------------------------------------------------------------------------

        features.update(features)
    else:
        features.update({
            'html_length': 0,
            'js_length': 0,
            'num_links': 0,
            'num_forms': 0,
            'contains_ip': 0,
            'shortening_service': 0,
            'url_length': len(url),
            'special_chars': sum(not c.isalnum() and c not in ['.', '-', '_'] for c in url),
            'num_subdomains': 0,
            'domain_age': None,
            'has_ssl': 0,
        })

    return features

Running web url input and Feature Extraction


In [66]:
                                              #3. Feature Vector
print("Extract features running")
# Function to process a single URL
def process_url(url):
    try:
        features = extract_features(url)
        return {'URL': url, **features}
    except Exception as e:
        print(f"Error processing URL: {url}. Error: {e}")
        return None

Extract features running


In [7]:
                             #5. Data Preprocessing - Randomly process 200 URLs for each label
import pandas as pd
import random
from concurrent.futures import ThreadPoolExecutor, as_completed

print("Data preprocessing running")
def preprocess_data(df, processed_df, n=200):
    # Separate URLs by label (0 for legitimate, 1 for phishing)
    legitimate_urls = df[df['Label'] == 0]['URL'].tolist()
    phishing_urls = df[df['Label'] == 1]['URL'].tolist()
    
    # Filter out URLs that have already been processed
    already_processed_urls = set(processed_df['URL'].tolist()) if not processed_df.empty else set()
    legitimate_urls = [url for url in legitimate_urls if url not in already_processed_urls]
    phishing_urls = [url for url in phishing_urls if url not in already_processed_urls]

    # Randomly sample 200 URLs from each group (or fewer if not enough remain)
    legitimate_sample = random.sample(legitimate_urls, min(n, len(legitimate_urls)))
    phishing_sample = random.sample(phishing_urls, min(n, len(phishing_urls)))

    all_samples = legitimate_sample + phishing_sample
    results = []
    
    # Multithreading to process URLs faster
    with ThreadPoolExecutor(max_workers=50) as executor:
        futures = {executor.submit(process_url, url): url for url in all_samples}
        for future in as_completed(futures):
            url = futures[future]
            try:
                result = future.result()
                if result:  # Only append if the URL is successfully processed
                    results.append(result)
                    print(f"Processed URL: {url}")
            except Exception as e:
                print(f"Error processing URL: {url}. Error: {e}")

    # Convert results to DataFrame
    X = pd.DataFrame(results)
    
    # Ensure the corresponding labels match the processed URLs
    y = df.loc[df['URL'].isin(X['URL']), 'Label'].values

    return X, y

Data preprocessing running


In [9]:
                                            #4. Historical Dataset
print("Loading Historical Dataset")
import pandas as pd

# Load datasets
legitimate_df = pd.read_csv('legitimate_urls.csv')
phishing_df = pd.read_csv('phishing_urls.csv')

# Add labels
legitimate_df['Label'] = 0  # 0 for legitimate
phishing_df['Label'] = 1    # 1 for phishing

# Prepend 'http://' to the domain names
legitimate_df['URL'] = 'http://' + legitimate_df['Domain']
phishing_df['URL'] = 'http://' + phishing_df['Domain']

# Combine datasets
df = pd.concat([legitimate_df, phishing_df], ignore_index=True)
df = df.drop_duplicates(subset='URL')  # Remove duplicates

# Assuming we have a dataframe 'processed_df' of already processed URLs, or create an empty one
processed_df = pd.DataFrame(columns=['URL'])

# Preprocess the data (sample 200 legitimate and 200 phishing URLs for feature extraction)
X, y = preprocess_data(df, processed_df, n=200)

# At this point, `X` contains the feature set and `y` contains the corresponding labels for training
print(f"Feature matrix shape: {X.shape}")
print(f"Labels shape: {len(y)}")
# X and y can now be used for model training

Loading Dataset
Error fetching content from http://sprint.com: 403 Client Error: Forbidden for url: http://sprint.com/
Retrying... (1/2)
Error fetching content from http://akhbarelyom.com: 403 Client Error: Forbidden for url: https://akhbarelyom.com/
Retrying... (1/2)
Error fetching content from http://fazenda.gov.br: HTTPConnectionPool(host='fazenda.gov.br', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x700347960dd0>: Failed to resolve 'fazenda.gov.br' ([Errno -5] No address associated with hostname)"))
Retrying... (1/2)
Error fetching content from http://fazenda.gov.br: HTTPConnectionPool(host='fazenda.gov.br', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x700347960920>: Failed to resolve 'fazenda.gov.br' ([Errno -5] No address associated with hostname)"))
Processed URL: http://fazenda.gov.br
Error fetching content from http://sprint.

2024-09-07 17:13:44,383 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno 111] Connection refused


Error fetching content from http://motthegioi.vn: HTTPConnectionPool(host='motthegioi.vn', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7003247936b0>: Failed to resolve 'motthegioi.vn' ([Errno -2] Name or service not known)"))
Retrying... (1/2)
Processed URL: http://gawker.com
Error fetching content from http://motthegioi.vn: HTTPConnectionPool(host='motthegioi.vn', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x700324792f30>: Failed to resolve 'motthegioi.vn' ([Errno -2] Name or service not known)"))
Processed URL: http://bdnews24.com
Processed URL: http://motthegioi.vn
Processed URL: http://diply.com
Processed URL: http://ifttt.com
Processed URL: http://conservativetribune.com
Processed URL: http://jalopnik.com
Processed URL: http://mirtesen.ru


2024-09-07 17:13:45,060 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno -2] Name or service not known


Processed URL: http://myegy.to
Processed URL: http://qz.com
Processed URL: http://kotaku.com
Processed URL: http://aljazeera.net
Processed URL: http://himado.in
Processed URL: http://getpocket.com
Processed URL: http://nguyentandung.org
Processed URL: http://motherless.com
Error fetching content from http://emgn.com: HTTPConnectionPool(host='emgn.com', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x700347952600>: Failed to resolve 'emgn.com' ([Errno -3] Temporary failure in name resolution)"))
Retrying... (1/2)
Processed URL: http://depositphotos.com
Processed URL: http://avxhome.se
Error fetching content from http://olx.ua: 403 Client Error: Forbidden for url: https://olx.ua/
Retrying... (1/2)
Error fetching content from http://caixa.gov.br: HTTPSConnectionPool(host='caixa.gov.br', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certi

2024-09-07 17:13:50,767 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno 111] Connection refused


Error processing URL: http://buzzfil.net. Error: HTTPConnectionPool(host='buzzfil.net', port=80): Read timed out. (read timeout=10)
Error fetching content from http://oneplus.net: HTTPConnectionPool(host='oneplus.net', port=80): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x700348601bb0>, 'Connection to oneplus.net timed out. (connect timeout=10)'))
Retrying... (1/2)
Error fetching content from http://codepen.io: 403 Client Error: Forbidden for url: https://codepen.io/
Processed URL: http://codepen.io
Error processing URL: http://grantland.com. Error: HTTPConnectionPool(host='grantland.com', port=80): Read timed out. (read timeout=10)


2024-09-07 17:13:51,052 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno -2] Name or service not known


Error fetching content from http://emgn.com: HTTPConnectionPool(host='emgn.com', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7002d8608140>: Failed to resolve 'emgn.com' ([Errno -3] Temporary failure in name resolution)"))
Processed URL: http://emgn.com
Error fetching content from http://filehippo.com: 406 Client Error: Not Acceptable for url: https://filehippo.com/
Processed URL: http://filehippo.com
Error fetching content from http://suumo.jp: HTTPConnectionPool(host='suumo.jp', port=80): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x700347980830>, 'Connection to suumo.jp timed out. (connect timeout=10)'))
Retrying... (1/2)
Error fetching domain info for telegraf.com.ua: unsupported operand type(s) for -: 'datetime.datetime' and 'str'
Processed URL: http://watch-series-tv.to
Error fetching domain info for seasonvar.ru: No entries found for the 

2024-09-07 17:13:53,670 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno 111] Connection refused


Processed URL: http://themeforest.net
Processed URL: http://1337x.to
Processed URL: http://udn.com
Error fetching content from http://irecommend.ru: 521 Server Error:  for url: https://irecommend.ru
Retrying... (1/2)
Error processing URL: http://europa.eu. Error: HTTPSConnectionPool(host='european-union.europa.eu', port=443): Read timed out. (read timeout=10)
Processed URL: http://h2porn.com
Error fetching content from http://censor.net.ua: 403 Client Error: Forbidden for url: https://censor.net/
Retrying... (1/2)
Processed URL: http://ink361.com
Error fetching content from http://extratorrent.cc: HTTPConnectionPool(host='extratorrent.cc', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7002d18dd6a0>: Failed to resolve 'extratorrent.cc' ([Errno -5] No address associated with hostname)"))
Retrying... (1/2)
Error fetching content from http://extratorrent.cc: HTTPConnectionPool(host='extratorrent.cc', port=80): Max 

2024-09-07 17:13:56,535 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno 111] Connection refused


Processed URL: http://torcache.net
Error fetching content from http://irecommend.ru: 521 Server Error:  for url: https://irecommend.ru
Processed URL: http://irecommend.ru
Processed URL: http://ck101.com
Error fetching content from http://likemag.com: HTTPSConnectionPool(host='frauenseite.net', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self-signed certificate (_ssl.c:1000)')))
Processed URL: http://likemag.com
Processed URL: http://serverfault.com
Error fetching content from http://perezhilton.com: 403 Client Error: Forbidden for url: https://perezhilton.com/
Retrying... (1/2)
Processed URL: http://tobogo.net
Processed URL: http://yourlust.com
Processed URL: http://prezi.com
Processed URL: http://teespring.com
Processed URL: http://livetv.sx
Processed URL: http://twitter.com
Processed URL: http://anysex.com
Error fetching content from http://censor.net.ua: 403 Client Error: For

2024-09-07 17:13:58,847 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno -2] Name or service not known


Error fetching content from http://sourceforge.net: 403 Client Error: Forbidden for url: http://sourceforge.net/
Processed URL: http://sourceforge.net
Processed URL: http://tebyan.net
Processed URL: http://soundcloud.com
Error fetching content from http://putlocker.is: 439 Client Error:  for url: http://ww16.putlocker.is/?sub1=20240908-0213-57d0-b2b7-ccacef2b8bf3
Retrying... (1/2)
Error fetching content from http://sfglobe.com: 404 Client Error: Not Found for url: https://sfglobe.com:443/
Retrying... (1/2)
Processed URL: http://nesn.com
Error fetching content from http://creativemarket.com: 403 Client Error: Forbidden for url: https://creativemarket.com/
Processed URL: http://creativemarket.com
Processed URL: http://fanpage.gr
Error processing URL: http://iconosquare.com. Error: HTTPConnectionPool(host='iconosquare.com', port=80): Read timed out. (read timeout=10)
Processed URL: http://momoshop.com.tw
Processed URL: http://ap.org
Processed URL: http://elcomercio.pe
Error fetching conte

2024-09-07 17:14:08,992 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno 111] Connection refused


Processed URL: http://bombayelectricstore.com
Error fetching content from http://bbsignage.com: HTTPConnectionPool(host='bbsignage.com', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7002d6fbfad0>: Failed to resolve 'bbsignage.com' ([Errno -2] Name or service not known)"))
Retrying... (1/2)
Error fetching content from http://bbsignage.com: HTTPConnectionPool(host='bbsignage.com', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7002d1b52f90>: Failed to resolve 'bbsignage.com' ([Errno -2] Name or service not known)"))
Processed URL: http://bbsignage.com
Error fetching content from http://googlefoundation.somee.com: 404 Client Error: Not Found for url: http://googlefoundation.somee.com/
Processed URL: http://googlefoundation.somee.com
Processed URL: http://medexsc-my.sharepoint.com
Error fetching content from http://123456bet.com: HTTPConnec

2024-09-07 17:14:14,090 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno -2] Name or service not known


Processed URL: http://envirodrilling.com
Error fetching content from http://dukhovnist.in.ua: 403 Client Error: Forbidden for url: http://dukhovnist.in.ua/
Retrying... (1/2)
Error fetching content from http://ox-aruba.blogspot.com: 404 Client Error: Not Found for url: http://ox-aruba.blogspot.com/
Retrying... (1/2)
Processed URL: http://link.do
Processed URL: http://icipedudu-my.sharepoint.com
Processed URL: http://starliker.net
Error fetching content from http://storm-friendly-property.glitch.me: 403 Client Error: Forbidden for url: http://storm-friendly-property.glitch.me/
Retrying... (1/2)
Processed URL: http://qfreeaccountssjc1.az1.qualtrics.com
Processed URL: http://diathermiki.gr
Processed URL: http://ilardo.com
Error fetching content from http://btiesolutions.com: HTTPConnectionPool(host='btiesolutions.com', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7002d48fb620>: Failed to resolve 'btiesolutions.com

In [84]:
                                       #5. Data Preprocessing
print("Running Model training and evaluation")

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Preprocess features
processed_df = pd.DataFrame(X)
processed_df['Label'] = y  # Add the labels to the new dataframe   
processed_df.head()

Running Model training and evaluation


Unnamed: 0,url_length,contains_ip,shortening_services,special_chars,html_length,js_length,num_links,num_forms,shortening_service,num_subdomains,domain_age,has_ssl,Label
0,21,0,0,3,0,0,0,0,0.0,0.0,0.0,0,0
1,17,0,1,3,0,0,0,0,0.0,0.0,0.0,0,0
2,18,0,0,0,7375,6492,0,0,0.0,0.0,5888.0,0,0
3,22,0,0,3,0,0,0,0,0.0,0.0,0.0,0,0
4,22,0,0,0,149764,30200,165,0,0.0,0.0,5637.0,1,0


In [86]:
                                        #5. Data Preprocessing
# Fill NaN values with 0 in X
processed_df = processed_df.fillna(0)

# Verify if any NaN values remain
print(f"Remaining NaN values: {processed_df.isna().sum().sum()}")  # This should print 0 if no NaNs are left
processed_df.head()

Remaining NaN values: 0


Unnamed: 0,url_length,contains_ip,shortening_services,special_chars,html_length,js_length,num_links,num_forms,shortening_service,num_subdomains,domain_age,has_ssl,Label
0,21,0,0,3,0,0,0,0,0.0,0.0,0.0,0,0
1,17,0,1,3,0,0,0,0,0.0,0.0,0.0,0,0
2,18,0,0,0,7375,6492,0,0,0.0,0.0,5888.0,0,0
3,22,0,0,3,0,0,0,0,0.0,0.0,0.0,0,0
4,22,0,0,0,149764,30200,165,0,0.0,0.0,5637.0,1,0


In [92]:
                                 #6-7. Feature Scaling and Feature Selection
try:
    # Split data
    X = processed_df.drop(columns=['URL', 'Label'])
    y = processed_df['Label']
except: 
    print('URL and Label have been dropped and not found in the dataframe')
finally:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

URL and Label have been dropped and not found in the dataframe


In [94]:
                #8. Machine Learning Models such as RandomForest, Support Vector, and GradentBoosting

#Define classifiers
classifiers = {
    'RandomForest': RandomForestClassifier(),
    'SVM': SVC(),
    'GradientBoosting': GradientBoostingClassifier()
}

In [96]:
                                            #9. Hyperparameter Tuning
param_grids = {
    'RandomForest': {'n_estimators': [100, 200], 'max_depth': [10, 20]},
    'SVM': {'C': [0.1, 1, 10], 'kernel': ['rbf', 'linear']},
    'GradientBoosting': {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1]}
}

In [98]:
                              #10. Model selection, training and evaluation metrics
best_model = None
best_score = 0

# Model selection, training and evaluation
for name, clf in classifiers.items():
    print(f"Training {name}...")
    
    grid_search = GridSearchCV(clf, param_grids[name], cv=5, scoring='accuracy')
    grid_search.fit(X_train_scaled, y_train)
    
    model = grid_search.best_estimator_
    y_pred = model.predict(X_test_scaled)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    print(f"{name} - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}")
    
    if accuracy > best_score:
        best_score = accuracy
        best_model = model

Training RandomForest...
RandomForest - Accuracy: 0.717948717948718, Precision: 0.6774193548387096, Recall: 0.6363636363636364, F1 Score: 0.65625
Training SVM...
SVM - Accuracy: 0.7435897435897436, Precision: 0.6666666666666666, Recall: 0.7878787878787878, F1 Score: 0.7222222222222222
Training GradientBoosting...
GradientBoosting - Accuracy: 0.6794871794871795, Precision: 0.6333333333333333, Recall: 0.5757575757575758, F1 Score: 0.6031746031746031


In [102]:
                                       #11. Final Model Selection
print(f"Best model: {best_model}")

Best model: SVC(C=10, kernel='linear')


In [104]:
                                       #12. Classification
def classify_url(url, model, scaler, feature_columns_order):
    features = extract_features(url)
    features_df = pd.DataFrame([features])
    
    # Ensure all required columns are present
    for col in feature_columns_order:
        if col not in features_df.columns:
            features_df[col] = 0
    
    features_df = features_df[feature_columns_order]
    
    # Remove any NaN values
    features_df = features_df.fillna(0)
    
    # Scale and predict
    features_scaled = scaler.transform(features_df)
    prediction = model.predict(features_scaled)
    
    return "Legitimate" if prediction == 0 else "Phishing"

# Enter new url
new_url = "http://gembrite.co.uk"
classification_result = classify_url(new_url, best_model, scaler, X.columns)
print(f"The URL '{new_url}' is classified as: {classification_result}")

The URL 'http://gembrite.co.uk' is classified as: Phishing


  features_df = features_df.fillna(0)
