In [23]:
# Import the dataset saved on the google drive
from google.colab import drive

# Data management
import pandas as pd
import numpy as np

# For S-10-fold CV
from sklearn.model_selection import StratifiedKFold

# Random Forest model
from sklearn.ensemble import RandomForestClassifier

# Metrics
from sklearn.metrics import recall_score, precision_score
from keras.metrics import BinaryAccuracy, CategoricalAccuracy, Precision, Recall

# Keras DNN Model
from keras.models import Sequential
from keras.layers import BatchNormalization, Dense, Dropout
from keras.regularizers import l2
from keras.utils import to_categorical, normalize

# Fast.ai DNN Model
from fastai.tabular import *

# For timing of the models across different runtimes (CPU, GPU, TPU)
from time import time

# Uncomment when using GPU
"""# Set up for GPU Usage
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))"""

%tensorflow_version 2.x
import tensorflow as tf
print("Tensorflow version " + tf.__version__)

try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
  print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except ValueError:
  raise BaseException('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!')

tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

print('Imports complete.')

Tensorflow version 2.3.0
Running on TPU  ['10.20.57.18:8470']




INFO:tensorflow:Initializing the TPU system: grpc://10.20.57.18:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.20.57.18:8470


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


Imports complete.


In [24]:
# Uncomment when using GPU
"""# Test the GPU
%tensorflow_version 2.x
import tensorflow as tf
import timeit

device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  print(
      '\n\nThis error most likely means that this notebook is not '
      'configured to use a GPU.  Change this in Notebook Settings via the '
      'command palette (cmd/ctrl-shift-P) or the Edit menu.\n\n')
  raise SystemError('GPU device not found')

def cpu():
  with tf.device('/cpu:0'):
    random_image_cpu = tf.random.normal((100, 100, 100, 3))
    net_cpu = tf.keras.layers.Conv2D(32, 7)(random_image_cpu)
    return tf.math.reduce_sum(net_cpu)

def gpu():
  with tf.device('/device:GPU:0'):
    random_image_gpu = tf.random.normal((100, 100, 100, 3))
    net_gpu = tf.keras.layers.Conv2D(32, 7)(random_image_gpu)
    return tf.math.reduce_sum(net_gpu)
  
# We run each op once to warm up; see: https://stackoverflow.com/a/45067900
cpu()
gpu()

# Run the op several times.
print('Time (s) to convolve 32x7x7x3 filter over random 100x100x100x3 images '
      '(batch x height x width x channel). Sum of ten runs.')
print('CPU (s):')
cpu_time = timeit.timeit('cpu()', number=10, setup="from __main__ import cpu")
print(cpu_time)
print('GPU (s):')
gpu_time = timeit.timeit('gpu()', number=10, setup="from __main__ import gpu")
print(gpu_time)
print('GPU speedup over CPU: {}x'.format(int(cpu_time/gpu_time)))"""

'# Test the GPU\n%tensorflow_version 2.x\nimport tensorflow as tf\nimport timeit\n\ndevice_name = tf.test.gpu_device_name()\nif device_name != \'/device:GPU:0\':\n  print(\n      \'\n\nThis error most likely means that this notebook is not \'\n      \'configured to use a GPU.  Change this in Notebook Settings via the \'\n      \'command palette (cmd/ctrl-shift-P) or the Edit menu.\n\n\')\n  raise SystemError(\'GPU device not found\')\n\ndef cpu():\n  with tf.device(\'/cpu:0\'):\n    random_image_cpu = tf.random.normal((100, 100, 100, 3))\n    net_cpu = tf.keras.layers.Conv2D(32, 7)(random_image_cpu)\n    return tf.math.reduce_sum(net_cpu)\n\ndef gpu():\n  with tf.device(\'/device:GPU:0\'):\n    random_image_gpu = tf.random.normal((100, 100, 100, 3))\n    net_gpu = tf.keras.layers.Conv2D(32, 7)(random_image_gpu)\n    return tf.math.reduce_sum(net_gpu)\n  \n# We run each op once to warm up; see: https://stackoverflow.com/a/45067900\ncpu()\ngpu()\n\n# Run the op several times.\nprint(\'Ti

In [25]:
# Set up google drive access
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [26]:
# Import the data
path = '/content/gdrive/My Drive/FinalDataset/'
fille = 'All.csv'
df = pd.read_csv(path + fille)
print('Data Read:')
print(df.head())

Data Read:
   Querylength  domain_token_count  ...  Entropy_Afterpath  URL_Type_obf_Type
0            0                   4  ...               -1.0         Defacement
1            0                   4  ...               -1.0         Defacement
2            0                   4  ...               -1.0         Defacement
3            0                   4  ...               -1.0         Defacement
4            0                   4  ...               -1.0         Defacement

[5 rows x 80 columns]


In [27]:
dep_var = 'URL_Type_obf_Type'

print('There are {} columns and {} rows in the provided data.'.format(len(df.columns), len(df)))

There are 80 columns and 36697 rows in the provided data.


In [28]:
print('Below is the dataset\'s composition')
print(df[dep_var].value_counts())

Below is the dataset's composition
Defacement    7930
benign        7781
phishing      7577
malware       6711
spam          6698
Name: URL_Type_obf_Type, dtype: int64


In [29]:
nans = 0

# This is a kind of stupid way of counting how many NaNs show up within the data
for index, row in df.iterrows():
  for col in df.columns:
    if col == dep_var:
      continue

    #print(row[col], end=' ')
    # If the value is NaN, then mark add one to our counter
    if np.isnan(row[col]):
     nans += 1
  #print('')
print('NaNs detected: {}'.format(nans))

NaNs detected: 19153


In [30]:
# Removes all rows if they contain NaN values
df.dropna(axis='index', inplace=True)

In [31]:
print('There are {} columns and {} rows in the provided data.'.format(len(df.columns), len(df)))

print('Below is the dataset\'s composition')
print(df[dep_var].value_counts())

nans = 0

# This is a kind of stupid way of counting how many NaNs show up within the data
for index, row in df.iterrows():
  for col in df.columns:
    if col == dep_var:
      continue

    #print(row[col], end=' ')
    # If the value is NaN, then mark add one to our counter
    if np.isnan(row[col]):
     nans += 1
  #print('')
print('NaNs detected: {}'.format(nans))

There are 80 columns and 18982 rows in the provided data.
Below is the dataset's composition
spam          5342
malware       4440
phishing      4014
benign        2709
Defacement    2477
Name: URL_Type_obf_Type, dtype: int64
NaNs detected: 0


In [32]:
# Create the X (data) and y (labels)
X = normalize( df.loc[:, df.columns != dep_var] )
y = df[dep_var]

In [33]:
print(X.head())

    Querylength  domain_token_count  ...  Entropy_Extension  Entropy_Afterpath
35     0.000000            0.045198  ...           0.011299          -0.011299
37     0.117281            0.021324  ...           0.004037           0.003994
38     0.121354            0.021105  ...           0.003990           0.003948
39     0.117281            0.021324  ...           0.004000           0.003953
40     0.121354            0.021105  ...           0.003990           0.003948

[5 rows x 79 columns]


In [34]:
print(y.head())

35    Defacement
37    Defacement
38    Defacement
39    Defacement
40    Defacement
Name: URL_Type_obf_Type, dtype: object


In [35]:
# Create the stratified cross validation object
random_state = 0
sss = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)
print(sss)

StratifiedKFold(n_splits=10, random_state=0, shuffle=True)


## Multi-Classification Experiments


In [36]:
fold = 0
training_times = {'rf': [], 
                  'keras': [],
                  'fastai': []}

for train_idx, test_idx in sss.split(X, y):
  # Update which fold we are on (this is just for output/usability reasons)
  fold += 1

  # Split the data into the train and testing sets
  X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
  y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

  # Initialize the models (not fast.ai since it needs a databunch object)
  rf = RandomForestClassifier(random_state=random_state)
  dnn_keras = Sequential(layers=[
                                 Dense(128, kernel_regularizer=l2(0.001), activation='relu',input_shape=(len(X_train.columns),)),
                                 BatchNormalization(),
                                 Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
                                 BatchNormalization(),
                                 Dense(y_train.nunique(), activation='softmax')
  ])
  dnn_keras.compile(
      optimizer='adam', 
      loss='categorical_crossentropy', 
      metrics=['accuracy', 'Recall', 'Precision'])
  
  print('Training RandomForest model with Fold {}...'.format(fold), end='')
  t0 = time()
  rf.fit(X_train, y_train)
  t1 = time()
  training_times['rf'].append(t1-t0)
  print('done')

  print('Training Keras-TensorFlow DNN model with Fold {}...'.format(fold), end='')
  t0 = time()
  dnn_keras.fit(X_train, pd.get_dummies(y_train), epochs=100, verbose=0, batch_size=1024)
  t1 = time()
  training_times['keras'].append(t1-t0)
  print('done')

  # Initialize and run fast.ai model
  print('Training Fast.ai Fold {}...'.format(fold))
  data_fold = (TabularList.from_df(df, path=path, cont_names=X_train.columns, procs=[Categorify, Normalize])
                     .split_by_idxs(train_idx, test_idx)
                     .label_from_df(cols=dep_var)
                     .databunch())
  dnn_fastai = tabular_learner(data_fold, layers=[200, 100], metrics=accuracy)

  t0 = time()
  dnn_fastai.fit_one_cycle(cyc_len=10, callbacks=None)
  t1 = time()
  training_times['fastai'].append(t1-t0)
  print('Fast ai done')
  break


Training RandomForest model with Fold 1...done
Training Keras-TensorFlow DNN model with Fold 1...done
Training Fast.ai Fold 1...


epoch,train_loss,valid_loss,accuracy,time
0,0.448715,0.342088,0.876777,00:03
1,0.298274,0.271093,0.894155,00:02
2,0.258733,0.233258,0.918378,00:03
3,0.212302,0.173841,0.935756,00:03
4,0.177737,0.144453,0.9505,00:03
5,0.136807,0.122004,0.959452,00:03
6,0.115325,0.106509,0.965245,00:03
7,0.101376,0.090834,0.972091,00:03
8,0.077024,0.086472,0.973144,00:03
9,0.071149,0.086438,0.973144,00:03


Fast ai done


In [37]:
print('model\tfold runtime\ttotal runtime')
print('-'*40)
for model in training_times.keys():
  mean = sum(training_times[model]) / len(training_times[model])
  std = np.std(training_times[model])

  print('{}\t{:.2f}\u00B1{:.2f}s\t{:.2f}s'.format(model, mean, std, sum(training_times[model])))

model	fold runtime	total runtime
----------------------------------------
rf	8.70±0.00s	8.70s
keras	12.16±0.00s	12.16s
fastai	30.96±0.00s	30.96s


In [38]:
# The models currently hold the last fold's values. We will use these models to predict
rf_times = []
keras_times = []
fastai_times = []

for index, row in X_test.iterrows():
  t0 = time()
  dnn_fastai.predict(row)
  t1 = time()
  fastai_times.append(t1-t0)

  row = ( row.to_numpy() ).reshape(1, -1)
  #row = row.reshape(1, -1)
  t0 = time()
  rf.predict(row)
  t1 = time()
  rf_times.append(t1-t0)

  t0 = time()
  dnn_keras.predict(row)
  t1 = time()
  keras_times.append(t1-t0)

rf_avg = np.average(rf_times)
rf_std = np.std(rf_times)

keras_avg = np.average(keras_times)
keras_std = np.std(keras_times)

fastai_avg = np.average(fastai_times)
fastai_std = np.std(fastai_times)

print('Model\tMean Time per Prediction')
print('RF\t{:.2f}\u00B1{:.2f}ms'.format(rf_avg*1000, rf_std*1000))
print('Keras\t{:.2f}\u00B1{:.2f}ms'.format(keras_avg*1000, keras_std*1000))
print('Fastai\t{:.2f}\u00B1{:.2f}ms'.format(fastai_avg*1000, fastai_std*1000))

Model	Mean Time per Prediction
RF	8.02±0.66ms
Keras	41.49±4.32ms
Fastai	54.53±11.05ms


## Binary Classification
This is mostly the same code for the execution of the models, however we have to change the data...

In [39]:
print('There are {} columns and {} rows in the provided data.'.format(len(X.columns)+1, len(X)))

print('Below is the dataset\'s composition')
print(y.value_counts())

There are 80 columns and 18982 rows in the provided data.
Below is the dataset's composition
spam          5342
malware       4440
phishing      4014
benign        2709
Defacement    2477
Name: URL_Type_obf_Type, dtype: int64


In [40]:
# Convert the multiclass problem into a binary classification problem
y = y.map(lambda label : label if label == 'benign' else 'malicious')

In [41]:
print('There are {} columns and {} rows in the provided data.'.format(len(X.columns)+1, len(X)))

print('Below is the dataset\'s composition')
print(y.value_counts())

There are 80 columns and 18982 rows in the provided data.
Below is the dataset's composition
malicious    16273
benign        2709
Name: URL_Type_obf_Type, dtype: int64


In [42]:
fold = 0
training_times = {'rf': [], 
                  'keras': [],
                  'fastai': []}

for train_idx, test_idx in sss.split(X, y):
  # Update which fold we are on (this is just for output/usability reasons)
  fold += 1

  # Split the data into the train and testing sets
  X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
  y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

  # Initialize the models (not fast.ai since it needs a databunch object)
  rf = RandomForestClassifier(random_state=random_state)
  dnn_keras = Sequential(layers=[
                                 Dense(128, kernel_regularizer=l2(0.001), activation='relu',input_shape=(len(X_train.columns),)),
                                 BatchNormalization(),
                                 Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
                                 BatchNormalization(),
                                 Dense(y_train.nunique(), activation='softmax')
  ])
  dnn_keras.compile(
      optimizer='adam', 
      loss='categorical_crossentropy', 
      metrics=['accuracy', 'Recall', 'Precision'])
  
  print('Training RandomForest model with Fold {}...'.format(fold), end='')
  t0 = time()
  rf.fit(X_train, y_train)
  t1 = time()
  training_times['rf'].append(t1-t0)
  print('done')

  print('Training Keras-TensorFlow DNN model with Fold {}...'.format(fold), end='')
  t0 = time()
  dnn_keras.fit(X_train, pd.get_dummies(y_train), epochs=100, verbose=0, batch_size=1024)
  t1 = time()
  training_times['keras'].append(t1-t0)
  print('done')

  # Initialize and run fast.ai model
  print('Training Fast.ai Fold {}...'.format(fold))
  data_fold = (TabularList.from_df(df, path=path, cont_names=X_train.columns, procs=[Categorify, Normalize])
                     .split_by_idxs(train_idx, test_idx)
                     .label_from_df(cols=dep_var)
                     .databunch())
  dnn_fastai = tabular_learner(data_fold, layers=[200, 100], metrics=accuracy)

  t0 = time()
  dnn_fastai.fit_one_cycle(cyc_len=10, callbacks=None)
  t1 = time()
  training_times['fastai'].append(t1-t0)
  print('Fast ai done')
  break

Training RandomForest model with Fold 1...done
Training Keras-TensorFlow DNN model with Fold 1...done
Training Fast.ai Fold 1...


epoch,train_loss,valid_loss,accuracy,time
0,0.458214,0.336439,0.890469,00:03
1,0.298138,0.251246,0.910479,00:03
2,0.252527,0.246379,0.909953,00:03
3,0.214199,0.212769,0.931016,00:02
4,0.170784,0.182835,0.939968,00:03
5,0.144108,0.154774,0.95366,00:03
6,0.116634,0.133819,0.958399,00:03
7,0.097174,0.122703,0.962085,00:03
8,0.080235,0.116013,0.962085,00:02
9,0.072025,0.112126,0.964192,00:03


Fast ai done


In [43]:
print('model\tfold runtime\ttotal runtime')
print('-'*40)
for model in training_times.keys():
  mean = sum(training_times[model]) / len(training_times[model])
  std = np.std(training_times[model])

  print('{}\t{:.2f}\u00B1{:.2f}s\t{:.2f}s'.format(model, mean, std, sum(training_times[model])))

model	fold runtime	total runtime
----------------------------------------
rf	7.58±0.00s	7.58s
keras	11.96±0.00s	11.96s
fastai	30.74±0.00s	30.74s


In [44]:
# The models currently hold the last fold's values. We will use these models to predict
rf_times = []
keras_times = []
fastai_times = []

for index, row in X_test.iterrows():
  t0 = time()
  dnn_fastai.predict(row)
  t1 = time()
  fastai_times.append(t1-t0)

  row = ( row.to_numpy() ).reshape(1, -1)
  #row = row.reshape(1, -1)
  t0 = time()
  rf.predict(row)
  t1 = time()
  rf_times.append(t1-t0)

  t0 = time()
  dnn_keras.predict(row)
  t1 = time()
  keras_times.append(t1-t0)

rf_avg = np.average(rf_times)
rf_std = np.std(rf_times)

keras_avg = np.average(keras_times)
keras_std = np.std(keras_times)

fastai_avg = np.average(fastai_times)
fastai_std = np.std(fastai_times)

print('Model\tMean Time per Prediction')
print('RF\t{:.2f}\u00B1{:.2f}ms'.format(rf_avg*1000, rf_std*1000))
print('Keras\t{:.2f}\u00B1{:.2f}ms'.format(keras_avg*1000, keras_std*1000))
print('Fastai\t{:.2f}\u00B1{:.2f}ms'.format(fastai_avg*1000, fastai_std*1000))

Model	Mean Time per Prediction
RF	7.99±0.74ms
Keras	41.52±4.30ms
Fastai	54.59±8.88ms
