<a href="https://colab.research.google.com/github/Ahsa21/colab-project/blob/main/project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Project
# Malicious URL Classification
# Feature-Based Detection of Benign, Defacement, Phishing, and Malware URLs

Jeet Purohit, ahmad saloukha
DVAMI22h
jepu20@student.bth.se, ahsa22@student.bth.se

In [None]:
#imports

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from collections import Counter
from scipy.stats import entropy
import time
import seaborn as sns

# evaluation measures
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


# scalers
from sklearn.preprocessing import StandardScaler

# Algorithms

from sklearn.svm import LinearSVC # SVC impractial with over 10000 samples
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# Load the URL feature table. The file is comma-separated, which is the
# pandas default, and the first five rows are shown as a quick preview.
dataset = pd.read_csv("malicious_URLS.csv")
dataset.head()

# The class names and the total samples of each class

In [None]:
# Overall dimensions of the full dataset.
n_rows, n_cols = dataset.shape
print(f"Total instances: {n_rows}, Total Attributes: {n_cols}\n")

# Number of samples per label code (0..3).
labels = dataset["label"]
class0, class1, class2, class3 = ((labels == code).sum() for code in range(4))

# Share of each class, in percent of all rows.
class0_percentage, class1_percentage, class2_percentage, class3_percentage = (
    count / n_rows * 100 for count in (class0, class1, class2, class3)
)

# One line per class; the blank line at the end comes from the trailing "\n".
summary_lines = [
    f"Class 0 (benign URLs) samples: {class0}      | Class 0 %: {class0_percentage:.3}%",
    f"Class 1 (defacement URLs) samples: {class1}   | Class 1 %: {class1_percentage:.3}%",
    f"Class 2 (phishing URLs) samples: {class2}     | Class 2 %: {class2_percentage:.3}%",
    f"Class 3 (malware URLs) samples: {class3}      | Class 3 %: {class3_percentage:.3}%",
]
print("\n".join(summary_lines) + "\n")

# Remove duplicate features
- In the raw dataset there were duplicate features. After checking that they had exactly the same values for every sample, we removed them.


In [None]:
# Remove duplicated feature columns.
#
# pandas renames a repeated CSV header to "<name>.1", so a ".1" suffix marks a
# *candidate* duplicate. A column is only dropped when its un-suffixed
# counterpart exists and holds exactly the same values; anything else is kept
# and reported, so a genuine feature whose name merely ends in ".1" is never
# silently lost.
suffix = ".1"
candidate_cols = [col for col in dataset.columns if col.endswith(suffix)]

cols_to_remove = []
cols_kept = []
for col in candidate_cols:
    base_col = col[: -len(suffix)]
    is_true_duplicate = base_col in dataset.columns and dataset[col].equals(dataset[base_col])
    (cols_to_remove if is_true_duplicate else cols_kept).append(col)

if cols_to_remove:
    print(f"Found {len(cols_to_remove)} columns with '.1' suffix:")
    for col in cols_to_remove:
        print(f"  - {col}")

    dataset = dataset.drop(columns=cols_to_remove)
    print("\nRemoved columns with '.1' suffix")
    print(f"Dataset shape after cleanup: {dataset.shape}")
elif not candidate_cols:
    print("No columns with '.1' suffix found")

if cols_kept:
    # These look like duplicates by name but differ from (or lack) a base column.
    print(f"Kept {len(cols_kept)} columns with '.1' suffix that are not exact duplicates:")
    for col in cols_kept:
        print(f"  - {col}")

dataset.head(5)

In [None]:
# (rows, columns) of the dataset at this point in the notebook.
dataset.shape

# Reduced the dataset
- We will use only 25% of the samples of every class, which retains the overall class distribution.


In [None]:
# Stratified 25% subsample: train_test_split is used purely as a sampler, so
# the remaining 75% (the second return value) is discarded.
dataset_reduced, _ = train_test_split(
    dataset,
    train_size=0.25,
    stratify=dataset["label"],
    random_state=42,
)

# Report the size of the subsample and how many rows each label kept.
reduced_label_counts = dataset_reduced["label"].value_counts().sort_index()
print(f"Reduced dataset size: {dataset_reduced.shape[0]}")
print("\nClass distribution:")
print(reduced_label_counts)

In [None]:
# Same class summary as for the full dataset, now on the 25% subsample.
reduced_rows, reduced_cols = dataset_reduced.shape
print(f"Total instances: {reduced_rows}, Total Attributes: {reduced_cols}\n")

# Number of samples per label code (0..3) in the subsample.
reduced_labels = dataset_reduced["label"]
class0, class1, class2, class3 = [(reduced_labels == code).sum() for code in (0, 1, 2, 3)]

# Share of each class, in percent of the subsample.
class0_percentage = class0 / reduced_rows * 100
class1_percentage = class1 / reduced_rows * 100
class2_percentage = class2 / reduced_rows * 100
class3_percentage = class3 / reduced_rows * 100

report = (
    f"Class 0 (benign URLs) samples: {class0}      | Class 0 %: {class0_percentage:.3}%\n"
    f"Class 1 (defacement URLs) samples: {class1}   | Class 1 %: {class1_percentage:.3}%\n"
    f"Class 2 (phishing URLs) samples: {class2}     | Class 2 %: {class2_percentage:.3}%\n"
    f"Class 3 (malware URLs) samples: {class3}      | Class 3 %: {class3_percentage:.3}%\n"
)
print(report)

In [None]:
# Total number of missing cells in the reduced dataset (all columns together).
nulls = dataset_reduced.isna().sum().sum()

# The count is followed by two empty lines.
print("number of null", nulls, end="\n\n\n")



In [None]:
# Count outliers with the 1.5 * IQR rule, one feature at a time.
# TODO: change this and check how many malicious samples we have and their
# average length; look at more features as well.


def _iqr_outlier_rows(frame, column):
    """Return the rows of `frame` whose `column` value lies outside Q1 - 1.5*IQR .. Q3 + 1.5*IQR."""
    first_quartile = frame[column].quantile(0.25)
    third_quartile = frame[column].quantile(0.75)
    spread = third_quartile - first_quartile
    too_low = frame[column] < (first_quartile - 1.5 * spread)
    too_high = frame[column] > (third_quartile + 1.5 * spread)
    return frame[too_low | too_high]


outlier_rows = {}
for col in ("url_len", "letters"):
    outlier_rows[col] = _iqr_outlier_rows(dataset_reduced, col)
    print(f"Outliers in '{col}': {outlier_rows[col].shape[0]}")

outliers_url_len = outlier_rows["url_len"]
outliers_letters = outlier_rows["letters"]

In [None]:
# Analyze which classes the IQR outliers of the reduced dataset belong to.
#
# All masks and percentages are computed against `dataset_reduced`, the frame
# the outliers are drawn from.

# Label code -> human-readable class name (defined once, used by every loop).
class_names_dict = {0: 'Benign', 1: 'Defacement', 2: 'Phishing', 3: 'Malware'}


def _iqr_outliers(frame, column):
    """Return the rows of `frame` whose `column` value lies outside Q1 - 1.5*IQR .. Q3 + 1.5*IQR."""
    values = frame[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    return frame[(values < (q1 - 1.5 * iqr)) | (values > (q3 + 1.5 * iqr))]


def _percent(part, whole):
    """Return `part` as a percentage of `whole`; 0.0 when `whole` is zero (e.g. no outliers)."""
    return part / whole * 100 if whole else 0.0


def _report_outlier_classes(frame, column):
    """Print the outlier count, its share of `frame` and the per-class split; return the outlier rows."""
    outliers = _iqr_outliers(frame, column)

    print(f"\n{column.upper()} Outliers Analysis:")
    print(f"Total outliers: {outliers.shape[0]}")
    print(f"Percentage of dataset: {_percent(outliers.shape[0], frame.shape[0]):.2f}%\n")

    # Class distribution in outliers
    for class_num, class_name in class_names_dict.items():
        class_count = (outliers['label'] == class_num).sum()
        class_percentage = _percent(class_count, outliers.shape[0])
        print(f"Class {class_num} ({class_name}): {class_count:6d} samples | {class_percentage:5.2f}% of outliers")

    return outliers


print("="*60)
print("OUTLIER CLASS DISTRIBUTION ANALYSIS")
print("="*60)

outliers_url_len = _report_outlier_classes(dataset_reduced, 'url_len')
outliers_letters = _report_outlier_classes(dataset_reduced, 'letters')

# Compare with overall distribution
print("\n" + "="*60)
print("COMPARISON WITH OVERALL DATASET DISTRIBUTION")
print("="*60)
for class_num, class_name in class_names_dict.items():
    overall_pct = _percent((dataset_reduced['label'] == class_num).sum(), dataset_reduced.shape[0])
    outlier_pct = _percent((outliers_url_len['label'] == class_num).sum(), outliers_url_len.shape[0])

    print(f"{class_name:12s}: Overall={overall_pct:5.2f}% | In url_len outliers={outlier_pct:5.2f}%")

In [None]:
# Drop every row that is an IQR outlier in url_len or in letters.


def _iqr_fences(values):
    """Return (lower, upper) = (Q1 - 1.5*IQR, Q3 + 1.5*IQR) for a numeric Series."""
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    return q1 - 1.5 * iqr, q3 + 1.5 * iqr


lower_url, upper_url = _iqr_fences(dataset_reduced["url_len"])
lower_let, upper_let = _iqr_fences(dataset_reduced["letters"])

# A row is kept only if it lies inside the fences (bounds included) for both features.
mask_url_ok = dataset_reduced["url_len"].between(lower_url, upper_url)
mask_let_ok = dataset_reduced["letters"].between(lower_let, upper_let)

final_dataset = dataset_reduced.loc[mask_url_ok & mask_let_ok].copy()

rows_before = len(dataset_reduced)
rows_after = len(final_dataset)
print("Original reduced size:", rows_before)
print("After removing outliers:", rows_after)
print("Rows removed:", rows_before - rows_after)


In [None]:
# Lift the column display limit (stays in effect for the rest of the session)
# so the preview below shows every feature.
pd.options.display.max_columns = None
display(final_dataset.head(5))

# Data Preprocessing

In [None]:
# Separate the feature matrix from the class label.
X = final_dataset.drop(columns=["label"])
y = final_dataset["label"]

# Hold out 20% of the rows for testing.
# - random_state=3 is used for consistency throughout the assignment.
# - stratify=y keeps the class proportions the same in the train and test
#   parts, in line with the stratified subsampling used to build the reduced
#   dataset, so the per-class metrics are computed on a representative test set.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=3, stratify=y
)

In [None]:
# Standardise the features. The scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler().fit(x_train)

x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Shape check: scaled train / test features, then test / train labels.
for part in (x_train_scaled, x_test_scaled, y_test, y_train):
    print(part.shape)

In [None]:
# Per-feature mean and standard deviation of the scaled training data.
feature_means = x_train_scaled.mean(axis=0)
feature_stds = x_train_scaled.std(axis=0)

print(feature_means)
print("\n")
print(feature_stds)

# Evaluation of classification methods
- In this section we compare three classifiers: Random Forest, Support Vector Machine, and Naive Bayes.
- We collect all relevant measures from the above classifiers. For each classifier we train a final model and compare the models with each other.
- Keeping `dual=False` (for the linear SVM) is important because we have more samples than features.

In [None]:
# Classifiers under comparison, keyed by display name. Insertion order is the
# order in which they are trained and plotted later on.
models = {}
models["Random Forest"] = RandomForestClassifier(
    n_estimators=50, max_depth=30, n_jobs=2, random_state=42
)
models["Linear SVM"] = LinearSVC(dual=False, max_iter=10000, random_state=42)
models["Gaussian Naive Bayes"] = GaussianNB()

# Result containers, all keyed by the same display names as `models`.
f1_scores = {}
confusion_predictions = {}  # test-set predictions, kept for the confusion matrices
confusion_matrices = {}
training_times = {}

In [None]:
# Train every model on the scaled training data and evaluate it on the test split.
# For each model we store: fit time, test predictions, confusion matrix and
# weighted F1 score.

for name, model in models.items():
    print(f"\n{'='*50}")
    print(f"Training {name}...")
    print(f"{'='*50}")

    # Time the fit only: prediction happens after the timer stops, so the
    # reported number really is the training time. perf_counter is a
    # monotonic clock intended for measuring short durations.
    start_time = time.perf_counter()
    model.fit(x_train_scaled, y_train)
    training_time = time.perf_counter() - start_time
    training_times[name] = training_time

    print(f"✓ {name} trained in {training_time:.2f} seconds")

    y_pred = model.predict(x_test_scaled)
    confusion_predictions[name] = y_pred

    # Calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    confusion_matrices[name] = cm

    # Weighted F1: per-class F1 averaged with class support as weights.
    f1 = f1_score(y_test, y_pred, average='weighted')
    f1_scores[name] = f1
    print(f"✓ F1 Score: {f1:.4f}")

# Visualize Results

In [None]:
# Bar chart of the weighted F1 score per algorithm, best model first.

f1_df = (
    pd.DataFrame(list(f1_scores.items()), columns=["Algorithm", "F1_Score"])
    .assign(**{"F1_Score (%)": lambda frame: frame["F1_Score"] * 100})
    .sort_values(by="F1_Score", ascending=False)
    .reset_index(drop=True)
)

fig, ax = plt.subplots(figsize=(9, 6))
sns.barplot(data=f1_df, x="Algorithm", y="F1_Score (%)", palette="magma", ax=ax)
ax.set_title("Predictive Performance Comparison (F1-score)")
ax.set_xlabel("Algorithm")
ax.set_ylabel("F1-score (%)")
# Start the y-axis just below the weakest model so the differences stay visible.
ax.set_ylim(f1_df["F1_Score (%)"].min() - 1, 100)

plt.show()

In [None]:
# Visualize confusion matrices: one panel per evaluated model.

class_names = ['Benign', 'Defacement', 'Phishing', 'Malware']

# Size the grid from the number of stored matrices instead of a fixed 3, so
# adding or removing a model does not cause an IndexError or leave empty axes.
# With three models this yields the same 18x5 inch figure as before.
n_panels = max(len(confusion_matrices), 1)
# squeeze=False always returns a 2-D axes array, even for a single panel.
fig, axes = plt.subplots(1, n_panels, figsize=(6 * n_panels, 5), squeeze=False)

for ax, (name, cm) in zip(axes[0], confusion_matrices.items()):
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    disp.plot(ax=ax, cmap='Blues', values_format='d')
    ax.set_title(f'{name}\nConfusion Matrix')

plt.tight_layout()
plt.show()

In [None]:
# Per-class precision, recall and F1, derived directly from each stored
# confusion matrix.
for name, cm in confusion_matrices.items():
    rule = '=' * 60
    print(f"\n{rule}")
    print(name)
    print(rule)

    correct_per_class = cm.diagonal()      # tp for every class
    column_totals = cm.sum(axis=0)         # tp + fp for every class
    row_totals = cm.sum(axis=1)            # tp + fn for every class

    for i, class_name in enumerate(class_names):
        tp = correct_per_class[i]
        tp_plus_fp = column_totals[i]
        tp_plus_fn = row_totals[i]

        # Fall back to 0 whenever a denominator would be zero.
        precision = tp / tp_plus_fp if tp_plus_fp > 0 else 0
        recall = tp / tp_plus_fn if tp_plus_fn > 0 else 0
        if (precision + recall) > 0:
            f1 = 2 * (precision * recall) / (precision + recall)
        else:
            f1 = 0

        print(f"{class_name:12s}: Precision={precision*100:5.2f}% | Recall={recall*100:5.2f}% | F1={f1*100:5.2f}%")

# Train on total dataset using Random Forest
- We have evaluated all the classifiers using only 25% of the total dataset. We will now create the final model with Random Forest using the whole dataset, performing the same data-cleaning steps as before.

In [None]:
# Outlier handling for the FULL dataset, using the same 1.5 * IQR rule as for
# the reduced dataset: report the outlier counts per feature, then drop every
# row that is an outlier in url_len or in letters.


def _iqr_fences(values):
    """Return (lower, upper) = (Q1 - 1.5*IQR, Q3 + 1.5*IQR) for a numeric Series."""
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    return q1 - 1.5 * iqr, q3 + 1.5 * iqr


# The fences are computed once per feature and reused for both the report and the filter.
lower_url, upper_url = _iqr_fences(dataset["url_len"])
lower_let, upper_let = _iqr_fences(dataset["letters"])

# Report how many rows fall strictly outside the fences of each feature.
outliers_url_len = dataset[(dataset["url_len"] < lower_url) | (dataset["url_len"] > upper_url)]
print(f"Outliers in 'url_len': {outliers_url_len.shape[0]}")

outliers_letters = dataset[(dataset["letters"] < lower_let) | (dataset["letters"] > upper_let)]
print(f"Outliers in 'letters': {outliers_letters.shape[0]}")

# Mask of non-outliers in both columns (fence values themselves are kept).
mask_url_ok = (dataset["url_len"] >= lower_url) & (dataset["url_len"] <= upper_url)
mask_let_ok = (dataset["letters"] >= lower_let) & (dataset["letters"] <= upper_let)

total_cleaned_dataset = dataset[mask_url_ok & mask_let_ok].copy()

# This cell works on the full dataset, so the label says so (it previously
# said "reduced", copied from the earlier cell).
print("Original full dataset size:", len(dataset))
print("After removing outliers:", len(total_cleaned_dataset))
print("Rows removed:", len(dataset) - len(total_cleaned_dataset))


In [None]:
# Separate the feature matrix from the class label for the cleaned full dataset.
X = total_cleaned_dataset.drop(columns=["label"])
y = total_cleaned_dataset["label"]

# Hold out 20% of the rows for testing.
# - random_state=3 is used for consistency throughout the assignment.
# - stratify=y keeps the class proportions the same in the train and test
#   parts, so the final per-class metrics are computed on a representative
#   test set.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=3, stratify=y
)

In [None]:
# Standardise the features of the full-data split. The scaler is fitted on
# the training part only and then applied unchanged to the test part.
scaler = StandardScaler().fit(x_train)

x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)


In [None]:
# Final Random Forest, with the same hyper-parameters as in the comparison
# above, fitted on the training split of the cleaned full dataset.
rfc = RandomForestClassifier(
    n_estimators=50,
    max_depth=30,
    n_jobs=2,
    random_state=42,
)

# fit() returns the estimator itself, so prediction can be chained onto it.
y_pred = rfc.fit(x_train_scaled, y_train).predict(x_test_scaled)

# Weighted F1 on the held-out test split.
f1 = f1_score(y_true=y_test, y_pred=y_pred, average="weighted")
print(f"Final models F1 Score: {f1:.4f}")


In [None]:
# Per-class report and confusion-matrix plot for the final Random Forest.
cm_final = confusion_matrix(y_test, y_pred)

print("\n" + "="*60)
print("FINAL MODEL - Per-Class Performance")
print("="*60)

class_names = ['Benign', 'Defacement', 'Phishing', 'Malware']

for i, class_name in enumerate(class_names):
    tp = cm_final[i, i]
    fp = cm_final[:, i].sum() - tp  # predicted as class i, actually another class
    fn = cm_final[i, :].sum() - tp  # actually class i, predicted as another class

    # Fall back to 0 whenever a denominator would be zero.
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1_class = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print(f"{class_name:12s}: Precision={precision*100:5.2f}% | Recall={recall*100:5.2f}% | F1={f1_class*100:5.2f}%")

# Visualize confusion matrix for final model
print("\n" + "="*60)
print("Confusion Matrix Visualization")
print("="*60)

# ConfusionMatrixDisplay.plot() creates its own figure unless it is given an
# Axes. Creating the figure here and passing `ax` makes the 8x6 size apply to
# the actual plot and avoids leaving an empty extra figure in the output.
fig, ax = plt.subplots(figsize=(8, 6))
disp = ConfusionMatrixDisplay(confusion_matrix=cm_final, display_labels=class_names)
disp.plot(ax=ax, cmap='Blues', values_format='d')
ax.set_title('Random Forest - Final Model\nConfusion Matrix')
fig.tight_layout()
plt.show()

References
https://www.kaggle.com/datasets/moutasmtamimi/malicious-url-detection-dataset-enhanced-2026
https://www.kaggle.com/datasets/nhutrinhanna/malicious-and-benign-urls-datasets