<a href="https://colab.research.google.com/github/APruner-23/Malicious_URL_Detection/blob/main/Deep_Neural_Network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Deep Neural Network for detecting Malicious URLs

In this Colab notebook we build a model with TensorFlow and evaluate its performance at detecting malicious URLs.


In [None]:
# Imports
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the dataset CSV directly from the project's GitHub repository
# (https://github.com/APruner-23/Malicious_URL_Detection.git)
dataset_file_link = 'https://raw.githubusercontent.com/APruner-23/Malicious_URL_Detection/main/updated_malicious_phish_with_tld.csv'

urldata = pd.read_csv(filepath_or_buffer=dataset_file_link)

In [None]:
# Preview the first five rows of the loaded dataset
urldata.head(n=5)

Unnamed: 0,url,type,url_length,hostname_length,path_length,fd_length,tld_dangerosity,count-,count@,count?,...,count.,count=,count-http,count-https,count-www,count-digits,count-letters,count_dir,use_of_ip,short_url
0,br-icloud.com.br,phishing,16,0,16,0,3.0,1,0,0,...,2,0,0,0,0,0,13,0,1,1
1,mp3raid.com/music/krizz_kaliko.html,benign,35,0,35,5,3.0,0,0,0,...,2,0,0,0,0,1,29,2,1,1
2,bopsecrets.org/rexroth/cr/1.htm,benign,31,0,31,7,3.0,0,0,0,...,2,0,0,0,0,1,25,3,1,1
3,http://www.garage-pirenne.be/index.php?option=...,defacement,88,21,10,9,2.0,1,0,1,...,3,4,1,0,1,7,63,1,1,1
4,http://adventure-nicaragua.net/index.php?optio...,defacement,235,23,10,9,3.0,1,0,1,...,2,3,1,0,0,22,199,1,1,1


In [None]:
# List the distinct class labels found in the 'type' column, in order of first appearance
print(pd.unique(urldata['type']))

['phishing' 'benign' 'defacement' 'malware']


In [None]:
# Summarise the frame: column names, non-null counts and dtypes
urldata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 651191 entries, 0 to 651190
Data columns (total 21 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   url              651191 non-null  object 
 1   type             651191 non-null  object 
 2   url_length       651191 non-null  int64  
 3   hostname_length  651191 non-null  int64  
 4   path_length      651191 non-null  int64  
 5   fd_length        651191 non-null  int64  
 6   tld_dangerosity  651191 non-null  float64
 7   count-           651191 non-null  int64  
 8   count@           651191 non-null  int64  
 9   count?           651191 non-null  int64  
 10  count%           651191 non-null  int64  
 11  count.           651191 non-null  int64  
 12  count=           651191 non-null  int64  
 13  count-http       651191 non-null  int64  
 14  count-https      651191 non-null  int64  
 15  count-www        651191 non-null  int64  
 16  count-digits     651191 non-null  int6

In [None]:
# Integer id assigned to each URL class; ids follow the listing order below (0, 1, 2, 3)
class_dictionary = {
    class_name: class_id
    for class_id, class_name in enumerate(('phishing', 'benign', 'defacement', 'malware'))
}

In [None]:
# Build the label vector `y`, the raw URL list `urls` and the feature matrix `x`.
# All three are row-aligned with `urldata`, so they have to be split together later.

# Map every class name to its integer id. Series.map leaves NaN for a label that is
# missing from the mapping (which would silently turn `y` into floats), so fail loudly.
labels = urldata['type'].map(class_dictionary)
if labels.isna().any():
    unknown_types = sorted(urldata.loc[labels.isna(), 'type'].astype(str).unique())
    raise ValueError(f"Unmapped URL types in dataset: {unknown_types}")
y = labels.to_numpy(dtype=np.int64)
urls = urldata['url'].tolist()

# Check % of each class: count every class once instead of filtering the frame per class
total_rows = len(urldata)
class_counts = urldata['type'].value_counts()

for class_name in class_dictionary:
    class_count = int(class_counts.get(class_name, 0))
    class_percentage = (class_count / total_rows) * 100
    print(f"Number of {class_name} URLs: {class_count}")
    print(f"Percentage of {class_name} URLs: {class_percentage:.2f}%")

# Derive the feature table under a new name and leave `urldata` intact, so this cell
# can be re-run without failing on columns that were already dropped.
feature_frame = urldata.drop(columns=['type', 'url'])

# Create x matrix
x = feature_frame.to_numpy()

Number of phishing URLs: 94111
Percentage of phishing URLs: 14.45%
Number of benign URLs: 428103
Percentage of benign URLs: 65.74%
Number of defacement URLs: 96457
Percentage of defacement URLs: 14.81%
Number of malware URLs: 32520
Percentage of malware URLs: 4.99%


In [None]:
# Sanity-check the arrays built from the dataset before splitting them.
# The prints are kept for eyeballing; the assertions make the checks enforceable.

# x and y should be numpy.ndarray, urls a plain Python list
print("X type: ", type(x))
print('Y type: ', type(y))
print("Urls list type: ", type(urls))
assert isinstance(x, np.ndarray) and isinstance(y, np.ndarray), "x and y must be numpy arrays"
assert isinstance(urls, list), "urls must be a list"

# On the original dataset the shapes are (651191, 19) and (651191,)
print("X Shape: ", x.shape)
print("Y Shape: ", y.shape)
print("Urls length: ", len(urls))
assert x.ndim == 2 and y.ndim == 1, "x must be 2-D and y 1-D"
assert x.shape[0] == y.shape[0] == len(urls), "x, y and urls are not row-aligned"
# Every label must be one of the ids in class_dictionary (a NaN here means an unmapped type)
assert set(np.unique(y).tolist()) <= set(class_dictionary.values()), "y contains an unknown class id"

# Compare these by eye with the rows shown by urldata.head()
print("Y: ", y[0:5])
print("X: ", x[0:5])
print("Urls: ", urls[0:5])

X type:  <class 'numpy.ndarray'>
Y type:  <class 'numpy.ndarray'>
Urls list type:  <class 'list'>
X Shape:  (651191, 19)
Y Shape:  (651191,)
Urls length:  651191
Y:  [0 1 1 2 2]
X:  [[ 16.   0.  16.   0.   3.   1.   0.   0.   0.   2.   0.   0.   0.   0.
    0.  13.   0.   1.   1.]
 [ 35.   0.  35.   5.   3.   0.   0.   0.   0.   2.   0.   0.   0.   0.
    1.  29.   2.   1.   1.]
 [ 31.   0.  31.   7.   3.   0.   0.   0.   0.   2.   0.   0.   0.   0.
    1.  25.   3.   1.   1.]
 [ 88.  21.  10.   9.   2.   1.   0.   1.   0.   3.   4.   1.   0.   1.
    7.  63.   1.   1.   1.]
 [235.  23.  10.   9.   3.   1.   0.   1.   0.   2.   3.   1.   0.   0.
   22. 199.   1.   1.   1.]]
Urls:  ['br-icloud.com.br', 'mp3raid.com/music/krizz_kaliko.html', 'bopsecrets.org/rexroth/cr/1.htm', 'http://www.garage-pirenne.be/index.php?option=com_content&view=article&id=70&vsig70_0=15', 'http://adventure-nicaragua.net/index.php?option=com_mailto&tmpl=component&link=aHR0cDovL2FkdmVudHVyZS1uaWNhcmFndWEubmV0L2l

# Dataset Split

Split the dataset into training, test and validation sets.
Split percentages: 70% training, 15% test, 15% validation.

In [None]:
# Fixed seed so the three partitions are identical on every run
SPLIT_SEED = 42

# First, split the data into training (70%) and a combined test/validation set (30%).
# Stratify on the labels so each partition keeps the dataset's class proportions
# (the classes are imbalanced: malware is only about 5% of the rows).
x_train, x_temp, y_train, y_temp, url_train, url_temp = train_test_split(
    x, y, urls, test_size=0.3, random_state=SPLIT_SEED, stratify=y)

# Then, split the temporary set in half into separate test and validation sets
x_test, x_val, y_test, y_val, url_test, url_val = train_test_split(
    x_temp, y_temp, url_temp, test_size=0.5, random_state=SPLIT_SEED, stratify=y_temp)

# Print the shapes to verify the splits
print("Training set:", x_train.shape, y_train.shape, len(url_train))
print("Test set:", x_test.shape, y_test.shape, len(url_test))
print("Validation set:", x_val.shape, y_val.shape, len(url_val))

Training set: (455833, 19) (455833,) 455833
Test set: (97679, 19) (97679,) 97679
Validation set: (97679, 19) (97679,) 97679


# Training of the NN

In [None]:
# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and batch
# shuffling are repeatable between runs (GPU kernels may still add small differences).
tf.keras.utils.set_random_seed(42)

NUM_CLASSES = len(class_dictionary)
L2_PENALTY = 0.001
DROPOUT_RATE = 0.5


def _one_hot(labels):
    """Return `labels` one-hot encoded; labels that are already 2-D are returned unchanged."""
    labels = np.asarray(labels)
    if labels.ndim == 2:
        return labels
    return to_categorical(labels, num_classes=NUM_CLASSES)


def _hidden_block(units):
    """Return one hidden stage: L2-regularised ReLU Dense -> BatchNormalization -> Dropout."""
    return [
        Dense(units, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(L2_PENALTY)),
        BatchNormalization(),
        Dropout(DROPOUT_RATE),
    ]


# Convert the labels to categorical. The names are reused because the evaluation cell
# expects one-hot `y_test`; the ndim guard keeps a re-run of this cell from encoding twice.
y_train = _one_hot(y_train)
y_val = _one_hot(y_val)
y_test = _one_hot(y_test)

# Final model. Two earlier experiments were superseded by it:
#   - Dense 64/64/32 with Dropout(0.5)
#   - Dense 128/128/64/32 with BatchNormalization + Dropout(0.5)
# The input shape is declared with an explicit Input layer rather than `input_shape=`
# on the first Dense layer.
model_layers = [tf.keras.Input(shape=(x_train.shape[1],))]
for hidden_units in (256, 128, 128, 64):
    model_layers.extend(_hidden_block(hidden_units))
model_layers.append(Dense(NUM_CLASSES, activation='softmax'))

model = Sequential(model_layers)

optimizer = Adam(learning_rate=0.001)

# Compile the model
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Define callbacks: stop after 15 epochs without val_loss improvement (restoring the best
# weights) and multiply the learning rate by 0.2 after 5 such epochs, down to 1e-5.
early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.00001)

# Train the model
history = model.fit(
    x_train, y_train,
    epochs=100,
    batch_size=64,
    validation_data=(x_val, y_val),
    callbacks=[early_stopping, reduce_lr],
)

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(x_test, y_test)
print(f'Test accuracy: {test_accuracy}')

model.summary()

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
# Class ids and their display names in id order, so that report rows and matrix axes
# line up with the model's output units regardless of the dictionary's insertion order.
class_ids = sorted(class_dictionary.values())
id_to_name = {class_id: class_name for class_name, class_id in class_dictionary.items()}
class_names = [id_to_name[class_id] for class_id in class_ids]

# Generate classification report
y_pred = model.predict(x_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true = np.argmax(y_test, axis=1)  # y_test is one-hot encoded at this point
# Passing `labels` keeps one row per class even if a class is missing from the test split
print(classification_report(y_true, y_pred_classes, labels=class_ids, target_names=class_names))

# Plot training & validation accuracy / loss curves side by side
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 6))

ax_acc.plot(history.history['accuracy'])
ax_acc.plot(history.history['val_accuracy'])
ax_acc.set(title='Model accuracy', xlabel='Epoch', ylabel='Accuracy')
ax_acc.legend(['Train', 'Validation'], loc='upper left')

ax_loss.plot(history.history['loss'])
ax_loss.plot(history.history['val_loss'])
ax_loss.set(title='Model loss', xlabel='Epoch', ylabel='Loss')
ax_loss.legend(['Train', 'Validation'], loc='upper left')

plt.show()

# Plot confusion matrix as a heatmap (rows: true class, columns: predicted class)
conf_matrix = confusion_matrix(y_true, y_pred_classes, labels=class_ids)
fig_cm, ax_cm = plt.subplots(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax_cm)
ax_cm.set(title='Confusion Matrix', xlabel='Predicted Label', ylabel='True Label')
plt.show()

