# **Malicious URL Detect**

#### malicious-url-detection-with-ML

malicious url detection with machine learning

In [1]:
import os
import json
from pathlib import Path
import pandas as pd
import numpy as np

from string import printable
from sklearn import model_selection
from sklearn.metrics import classification_report

# from keraspreprocessing import sequence
from keras.models import model_from_json

In [7]:
pip install keras-preprocessing

Collecting keras-preprocessing
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
     ---------------------------------------- 42.6/42.6 kB 2.0 MB/s eta 0:00:00
Installing collected packages: keras-preprocessing
Successfully installed keras-preprocessing-1.1.2
Note: you may need to restart the kernel to use updated packages.


In [10]:
from keras_preprocessing.sequence import pad_sequences

In [2]:
def print_layers_dims(model):
    l_layers = model.layers
    # Note None is ALWAYS batch_size
    for i in range(len(l_layers)):
        print(l_layers[i])
        print('Input Shape: ', l_layers[i].input_shape,
              'Output Shape: ', l_layers[i].output_shape)

In [3]:
def save_model(model, fileModelJSON, fileWeights):
    # have h5py installed
    if Path(fileModelJSON).is_file():
        os.remove(fileModelJSON)
    json_string = model.to_json()
    with open(fileModelJSON, 'w') as f:
        json.dump(json_string, f)

    if Path(fileWeights).is_file():
        os.remove(fileWeights)
    model.save_weights(fileWeights)


def load_model(fileModelJSON, fileWeights):
    with open(fileModelJSON, 'r') as f:
        model_json = json.load(f)
        model = model_from_json(model_json)

    model.load_weights(fileWeights)
    return model

## read data

In [22]:
def read_data():
    data_file_path = r"C:\MyCode\Project\NLP+cyber_sec\NLP4CyberSecurity-main\cache\malicious_url\urldata.csv\urldata.csv"
    df = pd.read_csv(data_file_path)

    # Step 1: Convert raw URL string in list of lists where characters that are contained in "printable" are stored encoded as integer
    url_int_tokens = [
        [printable.index(x) + 1 for x in url if x in printable] for url in df.url]

    # Step 2: Cut URL string at max_len or pad with zeros if shorter
    max_len = 75
    X = pad_sequences(url_int_tokens, maxlen=max_len)

    # Step 3: Extract labels form df to numpy array
    target = np.array(df.result)

    print('Matrix dimensions of X: ', X.shape,
          'Vector dimension of target: ', target.shape)

    X_train, X_test, target_train, target_test = model_selection.train_test_split(
        X, target, test_size=0.25, random_state=33)

    return X_train, X_test, target_train, target_test

In [14]:
data_file_path = r"C:\MyCode\Project\NLP+cyber_sec\NLP4CyberSecurity-main\cache\malicious_url\urldata.csv\urldata.csv"
df = pd.read_csv(data_file_path)
df

Unnamed: 0.1,Unnamed: 0,url,label,result
0,0,https://www.google.com,benign,0
1,1,https://www.youtube.com,benign,0
2,2,https://www.facebook.com,benign,0
3,3,https://www.baidu.com,benign,0
4,4,https://www.wikipedia.org,benign,0
...,...,...,...,...
450171,450171,http://ecct-it.com/docmmmnn/aptgd/index.php,malicious,1
450172,450172,http://faboleena.com/js/infortis/jquery/plugin...,malicious,1
450173,450173,http://faboleena.com/js/infortis/jquery/plugin...,malicious,1
450174,450174,http://atualizapj.com/,malicious,1


In [17]:
df.columns.str.match("Unnamed")
df.loc[:,~df.columns.str.match("Unnamed")]

Unnamed: 0,url,label,result
0,https://www.google.com,benign,0
1,https://www.youtube.com,benign,0
2,https://www.facebook.com,benign,0
3,https://www.baidu.com,benign,0
4,https://www.wikipedia.org,benign,0
...,...,...,...
450171,http://ecct-it.com/docmmmnn/aptgd/index.php,malicious,1
450172,http://faboleena.com/js/infortis/jquery/plugin...,malicious,1
450173,http://faboleena.com/js/infortis/jquery/plugin...,malicious,1
450174,http://atualizapj.com/,malicious,1


In [21]:
print(df.label.value_counts())


benign       345738
malicious    104438
Name: label, dtype: int64


In [23]:
X_train, X_test, target_train, target_test = read_data()
X_train

Matrix dimensions of X:  (450176, 75) Vector dimension of target:  (450176,)


array([[ 0,  0,  0, ..., 78, 77, 81],
       [ 0,  0,  0, ..., 76, 14, 25],
       [ 0,  0,  0, ..., 25, 34, 77],
       ...,
       [ 0,  0,  0, ..., 26, 24, 17],
       [ 0,  0,  0, ..., 25, 34, 77],
       [ 0,  0,  0, ...,  3,  8,  2]])

In [24]:
from models.cnn import ConvFully
from models.rnn import SimpleLSTM
from models.cnn_rnn import ConvLSTM
epochs_num = 10
batch_size = 32 #8

## **RNN**

In [None]:
model_name = "simple_lstm"
model = SimpleLSTM().build_model()
model.fit(X_train, target_train,
            epochs=epochs_num, batch_size=batch_size)
loss, accuracy = model.evaluate(X_test, target_test, verbose=1)

print('\nFinal Cross-Validation Accuracy', accuracy, '\n')

# print_layers_dims(model)
# save model
json_file_path = os.path.join("cache/malicious_url", model_name + ".json")
weight_file_path = os.path.join("cache/malicious_url", model_name + ".h5")
save_model(model, json_file_path, weight_file_path)


In [8]:
from utils import eval_utils
model_name = "simple_lstm"
json_file_path = os.path.join("cache/malicious_url", model_name + ".json")
weight_file_path = os.path.join("cache/malicious_url", model_name + ".h5")
model = load_model(json_file_path, weight_file_path)
y_pred = model.predict(X_test)
# print(y_pred)
pred = eval_utils.to_y(y_pred)
# print(pred)
# print(classification_report(target_test, pred, digits=5))
eval_utils.evaluate_result(target_test, pred)

Accuracy Score is:  0.8655441478439425
Precision Score is : 0.8579050828418984
Recall Score is : 0.8767578205075642
F1 Score:  0.8672290036092299
AUC Score:  0.8655252346603806


## **CNN**

In [25]:
model_name = "conv_fully"

model = ConvFully().build_model()
model.fit(X_train, target_train,
            epochs=epochs_num, batch_size=batch_size)
loss, accuracy = model.evaluate(X_test, target_test, verbose=1)

print('\nFinal Cross-Validation Accuracy', accuracy, '\n')

pred = model.predict(X_test)
print(classification_report(target_test, pred, digits=5))

# print_layers_dims(model)
# save model
json_file_path = os.path.join("cache/malicious_url", model_name + ".json")
weight_file_path = os.path.join("cache/malicious_url", model_name + ".h5")
save_model(model, json_file_path, weight_file_path)


  super().__init__(name, **kwargs)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Final Cross-Validation Accuracy 0.987382709980011 



ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [9]:
from utils import eval_utils
model_name = "conv_fully"
json_file_path = os.path.join("cache/malicious_url", model_name + ".json")
weight_file_path = os.path.join("cache/malicious_url", model_name + ".h5")
model = load_model(json_file_path, weight_file_path)
y_pred = model.predict(X_test)
# print(y_pred)
pred = eval_utils.to_y(y_pred)
# print(pred)
# print(classification_report(target_test, pred, digits=5))
eval_utils.evaluate_result(target_test, pred)



Accuracy Score is:  0.8379671457905544
Precision Score is : 0.8431494883953082
Recall Score is : 0.831085236357673
F1 Score:  0.8370738958974254
AUC Score:  0.8379787529437384


## **Conv_LSTM**

In [None]:
model_name = "conv_lstm"

model = ConvLSTM().build_model()
model.fit(X_train, target_train,
            epochs=epochs_num, batch_size=batch_size)
loss, accuracy = model.evaluate(X_test, target_test, verbose=1)

print('\nFinal Cross-Validation Accuracy', accuracy, '\n')

pred = model.predict(X_test)
print(classification_report(target_test, pred, digits=5))

# print_layers_dims(model)
# save model
json_file_path = os.path.join("cache/malicious_url", model_name + ".json")
weight_file_path = os.path.join("cache/malicious_url", model_name + ".h5")
save_model(model, json_file_path, weight_file_path)

In [10]:
from utils import eval_utils
model_name = "conv_lstm"
json_file_path = os.path.join("cache/malicious_url", model_name + ".json")
weight_file_path = os.path.join("cache/malicious_url", model_name + ".h5")
model = load_model(json_file_path, weight_file_path)
y_pred = model.predict(X_test)
# print(y_pred)
pred = eval_utils.to_y(y_pred)
# print(pred)
# print(classification_report(target_test, pred, digits=5))
eval_utils.evaluate_result(target_test, pred)

Accuracy Score is:  0.9242505133470226
Precision Score is : 0.9288969917958068
Recall Score is : 0.9191095076052642
F1 Score:  0.92397733127254
AUC Score:  0.9242591842604873
