In [7]:
import numpy as np
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import copy
import cv2
import keras
from string import printable
from keras.layers import Input, Dense, Conv2D, MaxPooling2D, UpSampling2D, Flatten, LSTM, SimpleRNN, Embedding, Lambda
from keras.models import Model, Sequential
from keras.datasets import fashion_mnist
from keras import backend as K
from keras.layers.core import Dense, Dropout, Activation, Flatten, Reshape
from keras.optimizers import SGD, RMSprop
from keras.utils import np_utils
from keras.regularizers import l2
from keras.layers.convolutional import Conv1D, Conv2D, MaxPooling2D, ZeroPadding2D, AveragePooling2D
from keras.callbacks import EarlyStopping
from keras.preprocessing.image import ImageDataGenerator
from keras.layers.normalization import BatchNormalization
from PIL import Image
from keras.applications import VGG16
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras.optimizers import RMSprop
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from keras import callbacks
from keras.callbacks import TensorBoard
from keras.preprocessing.sequence import pad_sequences

In [8]:
# Read the benign and malicious URL lists (one URL per line).
def _read_urls(path):
    """Return the URLs stored in the text file ``path``.

    Each line is stripped of surrounding whitespace. Blank lines and lines
    starting with ``#`` are skipped so they do not end up as empty or bogus
    samples in the data set.
    """
    with open(path) as handle:
        stripped_lines = (line.strip() for line in handle)
        return [url for url in stripped_lines if url and not url.startswith("#")]


benign_list = _read_urls("benign-urls.txt")
malicious_list = _read_urls("malicious-urls.txt")

In [9]:
# Combine both URL lists, encode every character and split into train/test sets.
# Benign URLs come first and malicious URLs second; the labels built below
# must follow exactly this order.
total = benign_list + malicious_list

# Encode each URL as the list of its characters' positions in string.printable.
# printable.index raises ValueError for a character outside string.printable.
url_result = [[printable.index(c) for c in url] for url in total]
max_len = max((len(codes) for codes in url_result), default=0)

# Vocabulary: the distinct characters that actually occur in the data,
# with a character -> column mapping and its inverse.
char_set = set("".join(total))
chars = sorted(char_set)
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

# Pad every encoded URL to the length of the longest one.
X = pad_sequences(url_result, maxlen=max_len)

# Labels: 0 = benign, 1 = malicious, in the same order as `total`.
# (Building the ones-block first would attach the labels to the wrong rows.)
y = np.concatenate([np.zeros(len(benign_list)), np.ones(len(malicious_list))])
assert len(X) == len(y), "every encoded URL needs exactly one label"

# Fixed random_state so the split (and every downstream metric) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=42
)

In [10]:
# 3.1 - Recurrent model (SimpleRNN) on one-hot encoded URL characters

In [11]:
def _one_hot_encode(sequences, vocab_index, seq_len):
    """One-hot encode padded code sequences for the RNN.

    Parameters
    ----------
    sequences : 2-D integer array whose entries are positions in
        ``string.printable`` (as produced when the URLs were encoded).
    vocab_index : dict mapping a character to its one-hot column.
    seq_len : number of time steps of the output.

    Returns
    -------
    uint8 array of shape ``(len(sequences), seq_len, len(vocab_index))``.
    Steps whose code has no vocabulary entry are left as all-zero vectors.
    """
    # vocab_index is keyed by characters while the sequences hold integer
    # codes, so translate code -> column once through a lookup table
    # (-1 marks codes that have no column).
    code_to_column = np.full(len(printable), -1, dtype=np.int64)
    for ch, column in vocab_index.items():
        code_to_column[printable.index(ch)] = column

    columns = code_to_column[np.asarray(sequences)]
    encoded = np.zeros((len(sequences), seq_len, len(vocab_index)), dtype=np.uint8)
    rows, steps = np.nonzero(columns >= 0)
    encoded[rows, steps, columns[rows, steps]] = 1
    return encoded


# NOTE(review): the sequences were padded with pad_sequences, whose default
# padding value 0 is also printable.index('0'); padded steps are therefore
# encoded like the character '0' whenever '0' is in the vocabulary.
X_train_rnn = _one_hot_encode(X_train, char_indices, max_len)
X_test_rnn = _one_hot_encode(X_test, char_indices, max_len)

In [12]:
# Character-level RNN: one SimpleRNN layer followed by a single output unit.
model_rnn = Sequential()
# Input is (time steps, vocabulary size); max_len keeps the model in sync with
# the padded data instead of hard-coding the sequence length.
model_rnn.add(SimpleRNN(32, input_shape=(max_len, len(chars))))
# Sigmoid, not softmax: softmax over a single unit always outputs 1.0, so the
# model could never predict the negative class.
model_rnn.add(Dense(1, activation="sigmoid"))
model_rnn.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])
model_rnn.summary()

In [None]:
# Train the RNN on the one-hot encoded training URLs
# (10 epochs, mini-batches of 128 samples).
model_rnn.fit(
    x=X_train_rnn,
    y=y_train,
    batch_size=128,
    epochs=10,
)

In [None]:
# 3.2 - Convolutional model (Embedding + Conv1D) on integer-encoded URLs

In [None]:
# Character-level CNN: embedding -> 1-D convolution -> single sigmoid unit.
model_cnn = Sequential()
# One embedding row per vocabulary entry; max_len keeps the input length in
# sync with the padded data instead of hard-coding it.
model_cnn.add(Embedding(len(chars), 100, input_length=max_len))
model_cnn.add(Dropout(0.3))
# `padding` is the Keras 2 name of the former `border_mode` argument.
model_cnn.add(Conv1D(128, kernel_size=2, activation='relu', padding='same'))
model_cnn.add(Dropout(0.5))
# Collapse the (steps, filters) feature map to a vector so the Dense layer
# emits one score per URL rather than one score per time step.
model_cnn.add(Flatten())
model_cnn.add(Dense(1, activation='sigmoid'))
# Track accuracy as well, matching the metrics reported for the RNN.
model_cnn.compile(loss='binary_crossentropy', optimizer=RMSprop(lr=0.01), metrics=['acc'])
model_cnn.summary()

In [None]:
def _to_vocab_indices(sequences, vocab_index, fallback=0):
    """Convert padded code sequences into vocabulary indices for the Embedding.

    Parameters
    ----------
    sequences : 2-D integer array whose entries are positions in
        ``string.printable`` (as produced when the URLs were encoded).
    vocab_index : dict mapping a character to its vocabulary index.
    fallback : index used for codes that have no vocabulary entry.

    Returns
    -------
    uint8 array with the same shape as ``sequences`` whose values are
    vocabulary indices.
    """
    # vocab_index is keyed by characters while the sequences hold integer
    # codes, so translate code -> index once through a lookup table.
    # uint8 is sufficient: the vocabulary is a subset of string.printable.
    code_to_index = np.full(len(printable), fallback, dtype=np.uint8)
    for ch, index in vocab_index.items():
        code_to_index[printable.index(ch)] = index
    return code_to_index[np.asarray(sequences)]


# The Embedding layer consumes a 2-D (samples, steps) array of indices, not
# one-hot vectors.
# NOTE(review): the sequences were padded with pad_sequences, whose default
# padding value 0 is also printable.index('0'); padded steps therefore share
# the index of the character '0' (or `fallback` if '0' never occurs).
X_train_cnn = _to_vocab_indices(X_train, char_indices)
X_test_cnn = _to_vocab_indices(X_test, char_indices)

In [None]:
# Train the CNN on the integer-encoded training URLs
# (20 epochs, mini-batches of 128 samples).
model_cnn.fit(
    x=X_train_cnn,
    y=y_train,
    batch_size=128,
    epochs=20,
)

In [None]:
# 3.3 - Evaluation: ROC curves and AUC of the RNN and the CNN on the test set

In [6]:
# Score the held-out URLs with both models and compare their ROC curves.
rnn_result = model_rnn.predict(X_test_rnn)
rnn_fp, rnn_tp, rnn_threshold = roc_curve(y_test, rnn_result)
rnn_auc = auc(rnn_fp, rnn_tp)

# The CNN has to be scored on the encoding it was trained on (X_test_cnn),
# not on the raw padded codes in X_test.
cnn_result = model_cnn.predict(X_test_cnn)
cnn_fp, cnn_tp, cnn_threshold = roc_curve(y_test, cnn_result)
cnn_auc = auc(cnn_fp, cnn_tp)

plt.figure()
plt.title('ROC')
plt.xlabel('FP')
plt.ylabel('TP')
# Diagonal = performance of a random classifier, for reference.
plt.plot([0, 1], [0, 1], linestyle='dashed')
plt.plot(rnn_fp, rnn_tp, label='RNN ROC area = %0.4f' % rnn_auc)
plt.plot(cnn_fp, cnn_tp, label='CNN ROC area = %0.4f' % cnn_auc)
plt.legend(loc="best")
plt.show()