# Malicious URL Detection with XGBoost
This notebook demonstrates feature extraction from URLs and classification of malicious URLs using XGBoost. It covers data loading, feature engineering, model training, and prediction steps.

## 1. Import Required Libraries
Import pandas, scikit-learn, xgboost, urllib, re, pickle, and other necessary libraries.

In [14]:
import pandas as pd
from urllib.parse import urlparse as url_parse
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import re
from googlesearch import search
import pickle
import xgboost as xgb
from tld import get_tld
import os.path
import math

## 2. Define URL Feature Extraction Functions
Implement functions to extract features from URLs, such as having_ip_address, abnormal_url, count_dot, shortening_service, suspicious_words, digit_count, letter_count, fd_length, tld_length, etc.

In [15]:
# URL Feature Extraction Functions
def having_ip_address(url):
    match = re.search(r"(([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5]))|((0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2}))(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}", url)
    return 1 if match else 0

def abnormal_url(url):
    parsed_url = url_parse(url)
    hostname = str(parsed_url.hostname)
    match = re.search(hostname, url)
    return 1 if match else 0

def search_google(url):
    site = search(url, 5)
    return 1 if site else 0

def count_dot(url):
    return url.count(".")

def count_www(url):
    return url.count("www")

def count_atrate(url):
    return url.count("@")

def no_of_dir(url):
    urldir = url_parse(url).path
    return urldir.count("/")

def no_of_embed(url):
    urldir = url_parse(url).path
    return urldir.count("//")

def shortening_service(url):
    match = re.search(r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|qr\.ae|adataset\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|tr\.im|link\.zip\.net", url)
    return 1 if match else 0

def count_https(url):
    return url.count("https")

def count_http(url):
    return url.count("http")

def count_per(url):
    return url.count("%")

def count_ques(url):
    return url.count("?")

def count_hyphen(url):
    return url.count("-")

def count_equal(url):
    return url.count("=")

def url_length(url):
    return len(str(url))

def hostname_length(url):
    return len(url_parse(url).netloc)

def suspicious_words(url):
    match = re.search(r"PayPal|login|signin|bank|account|update|free|lucky|service|bonus|ebayisapi|webscr", url)
    return 1 if match else 0

def digit_count(url):
    return sum(1 for i in url if i.isnumeric())

def letter_count(url):
    return sum(1 for i in url if i.isalpha())

def fd_length(url):
    try:
        path = url_parse(url).path
        first_dir = path.split('/')[1] if len(path.split('/')) > 1 else ''
        return len(first_dir)
    except Exception:
        return 0

def tld_length(tld):
    try:
        return len(str(tld))
    except Exception:
        return 0

## 3. Load and Explore Dataset
Load the 'malicious_phish.csv' dataset and display basic statistics and sample rows.

In [16]:
# Load the dataset
DATA_PATH = r'C:\Users\ajayk\OneDrive\Documents\url\malicious_phish.csv'
dataset = pd.read_csv(DATA_PATH)
print('Dataset shape:', dataset.shape)
dataset.head()
dataset.describe()
dataset['type'].value_counts()

Dataset shape: (651191, 2)


type
benign        428103
defacement     96457
phishing       94111
malware        32520
Name: count, dtype: int64

## 4. Feature Engineering on URLs
Apply feature extraction functions to the dataset to create new columns for each URL feature.

In [17]:
# Apply feature extraction functions to dataset
dataset['use_of_ip'] = dataset['url'].apply(having_ip_address)
dataset['abnormal_url'] = dataset['url'].apply(abnormal_url)
dataset['google_index'] = dataset['url'].apply(search_google)
dataset['count.'] = dataset['url'].apply(count_dot)
dataset['count-www'] = dataset['url'].apply(count_www)
dataset['count@'] = dataset['url'].apply(count_atrate)
dataset['count_dir'] = dataset['url'].apply(no_of_dir)
dataset['count_embed_domian'] = dataset['url'].apply(no_of_embed)
dataset['short_url'] = dataset['url'].apply(shortening_service)
dataset['count-https'] = dataset['url'].apply(count_https)
dataset['count-http'] = dataset['url'].apply(count_http)
dataset['count%'] = dataset['url'].apply(count_per)
dataset['count?'] = dataset['url'].apply(count_ques)
dataset['count-'] = dataset['url'].apply(count_hyphen)
dataset['count='] = dataset['url'].apply(count_equal)
dataset['url_length'] = dataset['url'].apply(url_length)
dataset['hostname_length'] = dataset['url'].apply(hostname_length)
dataset['sus_url'] = dataset['url'].apply(suspicious_words)
dataset['count-digits'] = dataset['url'].apply(digit_count)
dataset['count-letters'] = dataset['url'].apply(letter_count)
dataset['fd_length'] = dataset['url'].apply(fd_length)
dataset['tld'] = dataset['url'].apply(lambda i: get_tld(i, fail_silently=True))
dataset['tld_length'] = dataset['tld'].apply(tld_length)
dataset.head()

Unnamed: 0,url,type,use_of_ip,abnormal_url,google_index,count.,count-www,count@,count_dir,count_embed_domian,...,count-,count=,url_length,hostname_length,sus_url,count-digits,count-letters,fd_length,tld,tld_length
0,br-icloud.com.br,phishing,0,0,1,2,0,0,0,0,...,1,0,16,0,0,0,13,0,,4
1,mp3raid.com/music/krizz_kaliko.html,benign,0,0,1,2,0,0,2,0,...,0,0,35,0,0,1,29,5,,4
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,0,1,2,0,0,3,0,...,0,0,31,0,0,1,25,7,,4
3,http://www.garage-pirenne.be/index.php?option=...,defacement,0,1,1,3,1,0,1,0,...,1,4,88,21,0,7,63,9,be,2
4,http://adventure-nicaragua.net/index.php?optio...,defacement,0,1,1,2,0,0,1,0,...,1,3,235,23,0,22,199,9,net,3


## 5. Train/Test Split and Data Preprocessing
Split the dataset into training and test sets, and apply scaling and label encoding.

In [18]:
# Prepare features and labels
feature_columns = ['use_of_ip', 'abnormal_url', 'count.', 'count-www', 'count@',
    'count_dir', 'count_embed_domian', 'short_url', 'count-https', 'count-http',
    'count%', 'count?', 'count-', 'count=', 'url_length', 'hostname_length',
    'sus_url', 'fd_length', 'tld_length', 'count-digits', 'count-letters']
X = dataset[feature_columns]
y = dataset['type']

# Drop rows with missing values
X = X.dropna()
y = y[X.index]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, shuffle=True, random_state=5)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Encode labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

## 6. Train XGBoost Model
Train an XGBoost classifier on the processed features and labels.

In [20]:
# Train XGBoost model
xgb_model = xgb.XGBClassifier(n_estimators=100)
xgb_model.fit(X_train_scaled, y_train_encoded)

# Evaluate model
score = xgb_model.score(X_test_scaled, y_test_encoded)
print(f"Test Accuracy: {score:.4f}")

Test Accuracy: 0.9615


## 6a. Hyperparameter Tuning for XGBoost
Use GridSearchCV to find the best hyperparameters for the XGBoost model.

In [21]:
# Hyperparameter tuning for XGBoost using GridSearchCV
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0]
}
xgb_clf = xgb.XGBClassifier()
grid_search = GridSearchCV(xgb_clf, param_grid, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train_encoded)
print('Best parameters:', grid_search.best_params_)
print('Best cross-validation accuracy:', grid_search.best_score_)
best_xgb_model = grid_search.best_estimator_
score = best_xgb_model.score(X_test_scaled, y_test_encoded)
print(f'Test Accuracy with best parameters: {score:.4f}')

Fitting 3 folds for each of 54 candidates, totalling 162 fits
Best parameters: {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}
Best cross-validation accuracy: 0.9657665191884384
Test Accuracy with best parameters: 0.9655


In [22]:
# Train XGBoost with best parameters from GridSearchCV
best_params = grid_search.best_params_
xgb_best = xgb.XGBClassifier(**best_params)
xgb_best.fit(X_train_scaled, y_train_encoded)
score = xgb_best.score(X_test_scaled, y_test_encoded)
print(f'Test Accuracy (best params): {score:.4f}')

Test Accuracy (best params): 0.9655


In [None]:
# Predict for a new URL using the best XGBoost model
new_url = 'http://example.com/login?user=test'
features = preprocess_url(new_url)
features_scaled = scaler.transform([features])
prediction = xgb_best.predict(features_scaled)
predicted_label = label_encoder.inverse_transform(prediction)
print(f'Prediction for URL: {new_url} => {predicted_label[0]}')

## 7. Save Model and Preprocessing Objects
Save the trained model, scaler, and label encoder using pickle for later use.

In [23]:
# Save model and preprocessing objects
with open('model.pkl', 'wb') as model_file:
    pickle.dump(xgb_model, model_file)
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)
with open('label_encoder.pkl', 'wb') as label_encoder_file:
    pickle.dump(label_encoder, label_encoder_file)

## 8. Preprocess New URL for Prediction
Demonstrate how to preprocess a new URL and prepare its features for model prediction.

In [26]:
# Preprocess a new URL for prediction
def preprocess_url(url):
    features = [
        having_ip_address(url),
        abnormal_url(url),
        count_dot(url),
        count_www(url),
        count_atrate(url),
        no_of_dir(url),
        no_of_embed(url),
        shortening_service(url),
        count_https(url),
        count_http(url),
        count_per(url),
        count_ques(url),
        count_hyphen(url),
        count_equal(url),
        url_length(url),
        hostname_length(url),
        suspicious_words(url),
        fd_length(url),
        tld_length(get_tld(url, fail_silently=True)),
        digit_count(url),
        letter_count(url)
    ]
    return features

# Example usage
new_url = 'http://secure-bank-account-update.com'
features = preprocess_url(new_url)
features_scaled = scaler.transform([features])
prediction = xgb_model.predict(features_scaled)
predicted_label = label_encoder.inverse_transform(prediction)
print(f"Prediction for URL: {new_url} => {predicted_label[0]}")

Prediction for URL: http://secure-bank-account-update.com => phishing


