<a href="https://colab.research.google.com/github/APruner-23/Malicious_URL_Detection/blob/main/RandomForest_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Random Forest for classifiying malicious URLs**


In [1]:
#imports

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
# Load the dataset csv file located in the Github Repository
# https://github.com/APruner-23/Malicious_URL_Detection.git

dataset_file_link = 'https://raw.githubusercontent.com/APruner-23/Malicious_URL_Detection/main/updated_malicious_phish_with_tld.csv'
urldata = pd.read_csv(dataset_file_link)

In [3]:
class_dictionary = {
    'phishing': 0,
    'benign': 1,
    'defacement': 2,
    'malware': 3,
}

In [4]:
# After dataset split, remember to also split those 2 arrays
# Creation of y array
y = urldata['type'].map(class_dictionary).values
urls = [url for url in urldata['url']]

#Creation of a second y array for later
urldata2 = urldata.copy()
y2 = urldata2['type'].map(class_dictionary).values

#drop url and type
urldata = urldata.drop(columns=['type', 'url'])
urldata2 = urldata2.drop(columns=['type', 'url'])

# Create x matrix
x = urldata.values

In [5]:
# Splitting into separate test and validation sets
x_train, x_test, y_train, y_test, url_train, url_test = train_test_split(
    x, y, urls, test_size=0.2)

# Print the shapes to verify the splits
print("Training set:", x_train.shape, y_train.shape, len(url_train))
print("Test set:", x_test.shape, y_test.shape, len(url_test))


Training set: (520952, 19) (520952,) 520952
Test set: (130239, 19) (130239,) 130239


In [6]:
# Initialise RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100)

# Train the model
rf_model.fit(x_train, y_train)

# Prediction
y_test_pred = rf_model.predict(x_test)


# Evaluation of the prediction
print("Classification report for the testing set :")
print(classification_report(y_test, y_test_pred, target_names=class_dictionary.keys()))

print("Confusion matrix for the testing set :")
print(confusion_matrix(y_test, y_test_pred))



Classification report for the testing set :
              precision    recall  f1-score   support

    phishing       0.90      0.86      0.88     18630
      benign       0.97      0.98      0.98     85724
  defacement       0.98      0.99      0.99     19302
     malware       0.99      0.94      0.96      6583

    accuracy                           0.97    130239
   macro avg       0.96      0.95      0.95    130239
weighted avg       0.97      0.97      0.97    130239

Confusion matrix for the testing set :
[[16097  2154   310    69]
 [ 1431 84285     4     4]
 [   94    17 19174    17]
 [  302    42    53  6186]]


In [29]:
urldata.head()

Unnamed: 0,url_length,hostname_length,path_length,fd_length,tld_dangerosity,count-,count@,count?,count%,count.,count=,count-http,count-https,count-www,count-digits,count-letters,count_dir,use_of_ip,short_url
0,16,0,16,0,3.0,1,0,0,0,2,0,0,0,0,0,13,0,1,1
1,35,0,35,5,3.0,0,0,0,0,2,0,0,0,0,1,29,2,1,1
2,31,0,31,7,3.0,0,0,0,0,2,0,0,0,0,1,25,3,1,1
3,88,21,10,9,2.0,1,0,1,0,3,4,1,0,1,7,63,1,1,1
4,235,23,10,9,3.0,1,0,1,0,2,3,1,0,0,22,199,1,1,1


# Prediction using only the top 6 features

In [7]:
# Cancel every features exepct the top 6 so except count-http, hostname_length, count-www, tld_dangerosity, count= and use_of_ip

urldata2 = urldata2.drop(columns=['count?', 'path_length','count-letters','count-https','url_length','count.', 'count-','count_dir','count-digits','count%','fd_length', 'short_url','count@'])



# Create x matrix
x = urldata2.values

# Then, split the combined test/validation set into separate test and validation sets
x_train, x_test, y_train, y_test, url_train, url_test = train_test_split(
    x, y2, urls, test_size=0.2)

# Print the shapes to verify the splits
print("Training set:", x_train.shape, y_train.shape, len(url_train))
print("Test set:", x_test.shape, y_test.shape, len(url_test))


# Initialise RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100)

# Train the model
rf_model.fit(x_train, y_train)

# Prediction
y_test_pred = rf_model.predict(x_test)


# Evaluation of the prediction
print("Classification report for the testing set :")
print(classification_report(y_test, y_test_pred, target_names=class_dictionary.keys()))

print("Confusion matrix for the testing set :")
print(confusion_matrix(y_test, y_test_pred))



Training set: (520952, 6) (520952,) 520952
Test set: (130239, 6) (130239,) 130239
Classification report for the testing set :
              precision    recall  f1-score   support

    phishing       0.75      0.41      0.53     18871
      benign       0.88      0.96      0.92     85584
  defacement       0.87      0.88      0.87     19301
     malware       0.71      0.79      0.75      6483

    accuracy                           0.86    130239
   macro avg       0.81      0.76      0.77    130239
weighted avg       0.85      0.86      0.85    130239

Confusion matrix for the testing set :
[[ 7772  8832  1703   564]
 [ 1984 82390   351   859]
 [  351  1368 16934   648]
 [  189   732   433  5129]]
