In [2]:
#1-2. Web URL input and Feature Selection
print("Running web url input and Feature Selection")

import pandas as pd
import requests
from bs4 import BeautifulSoup
import whois
from datetime import datetime
import socket
import ssl
from urllib.parse import urlparse
import requests
from urllib3.exceptions import NewConnectionError, MaxRetryError
from requests.exceptions import ConnectionError
import time
import re

# Define headers to mimic a real browser request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Referer': 'https://google.com',  # Optional

}

# Fetch URL with retries
def fetch_url(url, retries=2):
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()  # Raise an HTTPError for bad responses (4xx, 5xx)
            return response.content
        except (requests.exceptions.RequestException, NewConnectionError, MaxRetryError, ConnectionError):
            # Suppress error details and retry
            time.sleep(2)  # Delay between retries
    # If all retries fail, return None
    return None

                
#------------------------------------------------------------------------------------------------------------------
                                             #Domain-based Feature
# Function to check SSL certificate
def check_ssl(domain):
    try:
        context = ssl.create_default_context()
        with socket.create_connection((domain, 443)) as sock:
            with context.wrap_socket(sock, server_hostname=domain) as secure_sock:
                return True
    except:
        return False
#------------------------------------------------------------------------------------------------------------------
                                           

                                           
#-------------------------------------------------------------------------------------------------------------------
                                               #URL-based Feature
# Check for IP address in URL
def contains_ip(url):
    ip_pattern = re.compile(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b')
    return bool(ip_pattern.search(url))

#--------------------------------------------------------------------------------------------------------------------
                                              #URL-based feature
# Compile the shortening services regex pattern
shortening_services_pattern = re.compile(r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|"
                      r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|"
                      r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|"
                      r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|"
                      r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|"
                      r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|"
                      r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|"
                      r"tr\.im|link\.zip\.net")

# Count special characters in URL
def count_special_chars(url):
    return int(sum(not c.isalnum() and c not in ['.', '-', '_', ':', '/', '?', '&', '=', '%'] for c in url))

# Check for URL shortening services using the provided regex pattern
def shortening_services(url):
    return bool(re.search(shortening_services_pattern, url))

#----------------------------------------------------------------------------------------------------------------------

# Extract features from URL
def extract_features(url):
    features = {}
    domain = urlparse(url).netloc
    #-----------------------------------------------------------------------------------------------------------------
                                              #URL-based features
    features['url_length'] = int(len(url))
    features['contains_ip'] = int(contains_ip(url))
    features['shortening_services'] = int(shortening_services(url))
    features['special_chars'] = int(count_special_chars(url))
    #-----------------------------------------------------------------------------------------------------------------
    content = fetch_url(url)
    if content:
        soup = BeautifulSoup(content, 'html.parser')
        #-------------------------------------------------------------------------------------------------------------
                                              #Content-based features
        features['html_length'] = int(len(content))
        features['js_length'] = sum(len(s.string) for s in soup.find_all('script') if s.string)
        features['num_links'] = len(soup.find_all('a'))
        features['num_forms'] = len(soup.find_all('form'))
        
       
        #-------------------------------------------------------------------------------------------------------------
    
        try:
            #---------------------------------------------------------------------------------------------------------
                                              #Domain-based features
            domain_info = whois.whois(domain)
            creation_date = domain_info.creation_date
            updated_date = domain_info.updated_date
            expiration_date = domain_info.expiration_date
            
            if isinstance(creation_date, list):
                creation_date = creation_date[0]
            if isinstance(updated_date, list):
                updated_date = updated_date[0]
            if isinstance(expiration_date, list):
                expiration_date = expiration_date[0]
            
            features['domain_age'] = (datetime.now() - creation_date).days if creation_date else None
        except Exception as e:
            print(f"Error fetching domain info for {domain}: {e}")
            features['domain_age'] = None

        features['has_ssl'] = 1 if int(check_ssl(domain)) else 0
        #------------------------------------------------------------------------------------------------------------------

        features.update(features)
    else:
        features.update({
            'html_length': 0,
            'js_length': 0,
            'num_links': 0,
            'num_forms': 0,
            'contains_ip': 0,
            'shortening_services': 0,
            'url_length': len(url),
            'special_chars': sum(not c.isalnum() and c not in ['.', '-', '_'] for c in url),
            'domain_age': None,
            'has_ssl': 0,
        })
    return features

Running web url input and Feature Selection


In [3]:
                                              #3. Feature Vector
print("Extract features running")
# Function to process a single URL
def process_url(url):
    try:
        features = extract_features(url)
        return {'url': url, **features}
    except Exception as e:
        print(f"Error processing URL: {url}. Error: {e}")
        return None


Extract features running


In [111]:
                             #5. Data Preprocessing - Randomly process 200 URLs for each label
import pandas as pd
import random
from concurrent.futures import ThreadPoolExecutor, as_completed

print("Data preprocessing running")
def preprocess_data(df, processed_df, n=100):
    # Separate URLs by label (0 for legitimate, 1 for phishing)
    legitimate_urls = df[df['Label'] == 0]['url'].tolist()
    phishing_urls = df[df['Label'] == 1]['url'].tolist()
    
    # Filter out URLs that have already been processed
    already_processed_urls = set(processed_df['url'].tolist()) if not processed_df.empty else set()
    legitimate_urls = [url for url in legitimate_urls if url not in already_processed_urls]
    phishing_urls = [url for url in phishing_urls if url not in already_processed_urls]

    # Randomly sample 1000 URLs from each group (or fewer if not enough remain)
    legitimate_sample = random.sample(legitimate_urls, min(n, len(legitimate_urls)))
    phishing_sample = random.sample(phishing_urls, min(n, len(phishing_urls)))

    all_samples = legitimate_sample + phishing_sample
    results = []
    
    # Multithreading to process URLs faster
    with ThreadPoolExecutor(max_workers=50) as executor:
        futures = {executor.submit(process_url, url): url for url in all_samples}
        for future in as_completed(futures):
            url = futures[future]
            try:
                result = future.result()
                if result:  # Only append if the URL is successfully processed
                    results.append(result)
                    print(f"Processed URL: {url}")
            except Exception as e:
                print(f"Error processing URL: {url}. Error: {e}")

    # Convert results to DataFrame
    X = pd.DataFrame(results)
    
    # Ensure the corresponding labels match the processed URLs
    y = df.loc[df['url'].isin(X['url']), 'Label'].values
    print(X)

    return X, y

Data preprocessing running


In [10]:
print("Loading Historical Dataset")
import pandas as pd
import numpy as np

# Load datasets
df1 = pd.read_csv('legitimate_urls.csv')
df2 = pd.read_csv('phishing_urls.csv')

# "legitimate" is identified as  0 in df1 and"phishing" is identified as 1 in df2

# Concatenate the datasets
df_combined = pd.concat([df1, df2], ignore_index=True)

#add http:// to the domain and create a column url
# Prepend 'http://' to the 'Domain' column
df_combined['url'] = df_combined['Domain'].apply(lambda x: f'http://{x}' if not x.startswith('https://') else x)

#remove duplicates
df = df_combined.drop_duplicates(subset='url')  # Remove duplicates
# Assuming we have a dataframe 'processed_df' of already processed URLs, or create an empty one

processed_df = pd.DataFrame(columns=['url'])

# Preprocess the data (sample 1000 legitimate and 200 phishing URLs for feature extraction)
X, y = preprocess_data(df, processed_df, n=100)

# At this point, `X` contains the feature set and `y` contains the corresponding labels for training
print(f"Feature matrix shape: {X.shape}")
print(f"Labels shape: {len(y)}")
# X and y can now be used for model training

Loading Historical Dataset
Processed URL: http://mashable.com
Error fetching domain info for telegraf.com.ua: unsupported operand type(s) for -: 'datetime.datetime' and 'str'


2024-09-16 14:48:46,478 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno 111] Connection refused


Processed URL: http://elitedaily.com
Processed URL: http://telegraf.com.ua
Processed URL: http://twitter.com
Processed URL: http://teespring.com
Processed URL: http://patch.com
Processed URL: http://manager.co.th
Processed URL: http://metro.co.uk
Processed URL: http://ameblo.jp
Processed URL: http://myegy.to
Error fetching domain info for mirtesen.ru: No entries found for the selected source(s).

>>> Last update of WHOIS database: 2024.09.16T16:48:39Z <<<

Processed URL: http://akhbarelyom.com
Processed URL: http://motthegioi.vn
Processed URL: http://europa.eu
Processed URL: http://mainichi.jp
Processed URL: http://askubuntu.com
Processed URL: http://extratorrent.cc
Processed URL: http://avxhome.se
Processed URL: http://indianexpress.com
Processed URL: http://livetv.sx
Processed URL: http://nymag.com
Processed URL: http://nguyentandung.org
Processed URL: http://tobogo.net
Processed URL: http://mirtesen.ru
Processed URL: http://paytm.com
Processed URL: http://doodle.com
Processed URL: h

2024-09-16 14:48:53,264 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno 111] Connection refused
2024-09-16 14:48:53,762 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno -2] Name or service not known


Processed URL: http://fortune.com
Processed URL: http://iconosquare.com
Processed URL: http://xueqiu.com
Processed URL: http://moviepilot.com
Processed URL: http://mentalfloss.com
Processed URL: http://kickass.to
Processed URL: http://css-tricks.com
Processed URL: http://katproxy.com
Processed URL: http://pantip.com
Processed URL: http://wikimapia.org
Processed URL: http://subscene.com
Error fetching domain info for seasonvar.ru: No entries found for the selected source(s).

>>> Last update of WHOIS database: 2024.09.16T16:48:46Z <<<

Processed URL: http://kenh14.vn
Processed URL: http://filehippo.com
Error fetching domain info for privatbank.ua: unsupported operand type(s) for -: 'datetime.datetime' and 'str'
Processed URL: http://9gag.tv
Processed URL: http://anysex.com
Processed URL: http://bongda88.info
Processed URL: http://tsite.jp
Processed URL: http://stackoverflow.com
Processed URL: http://mediaset.it
Processed URL: http://shareba.com
Processed URL: http://myspace.com
Processe

2024-09-16 14:48:57,298 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno -2] Name or service not known


Processed URL: http://xvideo-jp.com
Processed URL: http://bluegape.com
Processed URL: http://sourceforge.net


2024-09-16 14:48:57,598 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno -2] Name or service not known


Error fetching domain info for proxyx.ru: No entries found for the selected source(s).

>>> Last update of WHOIS database: 2024.09.16T16:48:48Z <<<

Processed URL: http://bdnews24.com
Processed URL: http://bitbucket.org
Processed URL: http://clien.net
Processed URL: http://amoory.com
Processed URL: http://themeforest.net
Processed URL: http://fanpage.gr
Processed URL: http://olx.pl
Processed URL: http://kienthuc.net.vn
Processed URL: http://proxyx.ru
Processed URL: http://dealnews.com
Processed URL: http://xhamster.com
Processed URL: http://truckcalling.com
Processed URL: http://khabaronline.ir
Processed URL: http://kakaku.com
Error fetching domain info for serverinfo1policy.blogspot.com: No match for "SERVERINFO1POLICY.BLOGSPOT.COM".
>>> Last update of whois database: 2024-09-16T13:48:39Z <<<

NOTICE: The expiration date displayed in this record is the date the
registrar's sponsorship of the domain name registration in the registry is
currently set to expire. This date does not necess

In [101]:
                                       #5. Data Preprocessing
print("Running Data Preprocessing")

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

# Preprocess features
processed_df = pd.DataFrame(X)
processed_df['Label']= y
processed_df.head()

Running Data Preprocessing


Unnamed: 0,url_length,contains_ip,shortening_services,special_chars,html_length,js_length,num_links,num_forms,domain_age,has_ssl,Label
0,19,0,0,0,215572,68466,359,1,6998.0,1,0
1,21,0,0,0,339507,53337,158,3,4765.0,1,0
3,18,0,0,0,2610,136,0,0,9004.0,1,0
4,20,0,0,0,43273,20758,55,0,4967.0,1,0
5,16,0,0,0,302163,131333,216,1,11157.0,1,0


In [97]:
                                        #5. Data Preprocessing
# Remove rows with any NaN values in the dataframe
processed_df = processed_df.dropna()

# Check if there are any NaN values left (should print 0 for all columns)
print(processed_df.isnull().sum())

# Verify if any NaN values remain
print(f"Remaining NaN values: {processed_df.isna().sum().sum()}")  # This should print 0 if no NaNs are left
#save to csv file

processed_df.head()

url_length             0
contains_ip            0
shortening_services    0
special_chars          0
html_length            0
js_length              0
num_links              0
num_forms              0
domain_age             0
has_ssl                0
Label                  0
dtype: int64
Remaining NaN values: 0


Unnamed: 0,url_length,contains_ip,shortening_services,special_chars,html_length,js_length,num_links,num_forms,domain_age,has_ssl,Label
0,19,0,0,0,215572,68466,359,1,6998.0,1,0
1,21,0,0,0,339507,53337,158,3,4765.0,1,0
3,18,0,0,0,2610,136,0,0,9004.0,1,0
4,20,0,0,0,43273,20758,55,0,4967.0,1,0
5,16,0,0,0,302163,131333,216,1,11157.0,1,0


In [14]:
                                              #5. Data Preprocessing
'''
#Seperate the legitimate represented as 0 and phishing representd as 1 dataframes
legit_df = processed_df[processed_df['Label'] == 0].reset_index(drop=True)  # Rows with status == 0
phish_df = processed_df[processed_df['status'] == 1].reset_index(drop=True)    # Rows with status == 1
#concat the legitimate and phishing
# Concatenate legitimate_df and phishing_df
processed_df = pd.concat([legit_df, phish_df], ignore_index=True)
'''


"\n#Seperate the legitimate represented as 0 and phishing representd as 1 dataframes\nlegit_df = processed_df[processed_df['Label'] == 0].reset_index(drop=True)  # Rows with status == 0\nphish_df = processed_df[processed_df['status'] == 1].reset_index(drop=True)    # Rows with status == 1\n#concat the legitimate and phishing\n# Concatenate legitimate_df and phishing_df\nprocessed_df = pd.concat([legit_df, phish_df], ignore_index=True)\n"

In [103]:
                              #6-7. Feature Scaling and Feature Selection

try:
    # Split data
    X = processed_df.drop(columns=['url', 'Label'])
    y = processed_df['Label']
except: 
    print('url and Label have been dropped and not found in the dataframe')
finally:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

url and Label have been dropped and not found in the dataframe


In [60]:
                #8. Machine Learning Models such as RandomForest, Support Vector, and GradentBoosting

#Define classifiers
classifiers = {
    'RandomForest': RandomForestClassifier(),
    'SVM': SVC(),
    'GradientBoosting': GradientBoostingClassifier()
}

In [62]:
                                            #9. Hyperparameter Tuning
# Define parameter grids for GridSearchCV
param_grids = {
    'RandomForest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, 30],
        'min_samples_split': [2, 5, 10]
    },
    'SVM': {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto']
    },
    'GradientBoosting': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    }
}

In [64]:
                              #10. Model selection, training and evaluation metrics
best_model = None
best_score = 0

# Model selection, training and evaluation
for name, clf in classifiers.items():
    print(f"Training {name}...")
    
    grid_search = GridSearchCV(clf, param_grids[name], cv=5, scoring='accuracy')
    grid_search.fit(X_train_scaled, y_train)
    
    model = grid_search.best_estimator_
    y_pred = model.predict(X_test_scaled)

    # Cross-validation score
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='accuracy')
    mean_cv_score = cv_scores.mean()
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

     # Evaluate on test set
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='binary')  # Use 'binary' for binary classification
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')

    print(f"{name} - Mean Cross-Validation Accuracy: {mean_cv_score:.4f}") 
    print(f"{name} - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}")
    
    if accuracy > best_score:
        best_score = accuracy
        best_model = model

Training RandomForest...
RandomForest - Mean Cross-Validation Accuracy: 0.6487
RandomForest - Accuracy: 0.8125, Precision: 0.5, Recall: 0.6666666666666666, F1 Score: 0.5714285714285714
Training SVM...
SVM - Mean Cross-Validation Accuracy: 0.7295
SVM - Accuracy: 0.875, Precision: 1.0, Recall: 0.3333333333333333, F1 Score: 0.5
Training GradientBoosting...
GradientBoosting - Mean Cross-Validation Accuracy: 0.6795
GradientBoosting - Accuracy: 0.8125, Precision: 0.5, Recall: 0.3333333333333333, F1 Score: 0.4


In [67]:
                                       #11. Final Model Selection
print(f"Best model: {best_model}")

Best model: SVC(C=0.1, kernel='linear')


In [71]:
                                       #12. Classification
def classify_url(url, model, scaler, feature_columns_order):
    features = extract_features(url)
    features_df = pd.DataFrame([features])
    # Ensure all required columns are present
    for col in feature_columns_order:
        if col not in features_df.columns:
            features_df[col] = 0
    
    features_df = features_df[feature_columns_order]
    
    # Remove any NaN values
    features_df = features_df.fillna(0)
    
    # Scale and predict
    features_scaled = scaler.transform(features_df)
    prediction = model.predict(features_scaled)
    print(prediction)
    return "Legitimate" if prediction == 0 else "Phishing"

In [149]:
# Enter new url
new_url = 'http://345666'

In [151]:
classification_result = classify_url(new_url, best_model, scaler, X.columns)
print(f"The URL '{new_url}' is classified as: {classification_result}")

[1]
The URL 'http://345666' is classified as: Phishing


  features_df = features_df.fillna(0)


In [155]:
#Import the model to use to create a web interface
import pickle

# Save the model
with open('model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

In [157]:
# Save the scaler
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [159]:
X.to_csv('processed_data.csv', index=False)