In [1]:
# Import the dataset saved on the google drive
from google.colab import drive

# Data management
import pandas as pd
import numpy as np

# For S-10-fold CV
from sklearn.model_selection import StratifiedKFold

# Random Forest model
from sklearn.ensemble import RandomForestClassifier

# Metrics
from sklearn.metrics import recall_score, precision_score
from keras.metrics import BinaryAccuracy, CategoricalAccuracy, Precision, Recall

# Keras DNN Model
from keras.models import Sequential
from keras.layers import BatchNormalization, Dense, Dropout
from keras.regularizers import l2
from keras.utils import to_categorical, normalize

# Fast.ai DNN Model
from fastai.tabular import *

# For timing of the models across different runtimes (CPU, GPU, TPU)
from time import time

# Uncomment when using GPU
"""# Set up for GPU Usage
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))"""

%tensorflow_version 2.x
import tensorflow as tf
print("Tensorflow version " + tf.__version__)

try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
  print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except ValueError:
  raise BaseException('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!')

tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

print('Imports complete.')

Tensorflow version 2.3.0
Running on TPU  ['10.69.29.178:8470']
INFO:tensorflow:Initializing the TPU system: grpc://10.69.29.178:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.69.29.178:8470


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


Imports complete.


In [2]:
# Uncomment when using GPU
"""# Test the GPU
%tensorflow_version 2.x
import tensorflow as tf
import timeit

device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  print(
      '\n\nThis error most likely means that this notebook is not '
      'configured to use a GPU.  Change this in Notebook Settings via the '
      'command palette (cmd/ctrl-shift-P) or the Edit menu.\n\n')
  raise SystemError('GPU device not found')

def cpu():
  with tf.device('/cpu:0'):
    random_image_cpu = tf.random.normal((100, 100, 100, 3))
    net_cpu = tf.keras.layers.Conv2D(32, 7)(random_image_cpu)
    return tf.math.reduce_sum(net_cpu)

def gpu():
  with tf.device('/device:GPU:0'):
    random_image_gpu = tf.random.normal((100, 100, 100, 3))
    net_gpu = tf.keras.layers.Conv2D(32, 7)(random_image_gpu)
    return tf.math.reduce_sum(net_gpu)
  
# We run each op once to warm up; see: https://stackoverflow.com/a/45067900
cpu()
gpu()

# Run the op several times.
print('Time (s) to convolve 32x7x7x3 filter over random 100x100x100x3 images '
      '(batch x height x width x channel). Sum of ten runs.')
print('CPU (s):')
cpu_time = timeit.timeit('cpu()', number=10, setup="from __main__ import cpu")
print(cpu_time)
print('GPU (s):')
gpu_time = timeit.timeit('gpu()', number=10, setup="from __main__ import gpu")
print(gpu_time)
print('GPU speedup over CPU: {}x'.format(int(cpu_time/gpu_time)))"""

'# Test the GPU\n%tensorflow_version 2.x\nimport tensorflow as tf\nimport timeit\n\ndevice_name = tf.test.gpu_device_name()\nif device_name != \'/device:GPU:0\':\n  print(\n      \'\n\nThis error most likely means that this notebook is not \'\n      \'configured to use a GPU.  Change this in Notebook Settings via the \'\n      \'command palette (cmd/ctrl-shift-P) or the Edit menu.\n\n\')\n  raise SystemError(\'GPU device not found\')\n\ndef cpu():\n  with tf.device(\'/cpu:0\'):\n    random_image_cpu = tf.random.normal((100, 100, 100, 3))\n    net_cpu = tf.keras.layers.Conv2D(32, 7)(random_image_cpu)\n    return tf.math.reduce_sum(net_cpu)\n\ndef gpu():\n  with tf.device(\'/device:GPU:0\'):\n    random_image_gpu = tf.random.normal((100, 100, 100, 3))\n    net_gpu = tf.keras.layers.Conv2D(32, 7)(random_image_gpu)\n    return tf.math.reduce_sum(net_gpu)\n  \n# We run each op once to warm up; see: https://stackoverflow.com/a/45067900\ncpu()\ngpu()\n\n# Run the op several times.\nprint(\'Ti

In [3]:
# Set up google drive access
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
# Import the data
path = '/content/gdrive/My Drive/FinalDataset/'
fille = 'All.csv'
df = pd.read_csv(path + fille)
print('Data Read:')
print(df.head())

Data Read:
   Querylength  domain_token_count  ...  Entropy_Afterpath  URL_Type_obf_Type
0            0                   4  ...               -1.0         Defacement
1            0                   4  ...               -1.0         Defacement
2            0                   4  ...               -1.0         Defacement
3            0                   4  ...               -1.0         Defacement
4            0                   4  ...               -1.0         Defacement

[5 rows x 80 columns]


In [5]:
dep_var = 'URL_Type_obf_Type'

print('There are {} columns and {} rows in the provided data.'.format(len(df.columns), len(df)))

There are 80 columns and 36697 rows in the provided data.


In [6]:
print('Below is the dataset\'s composition')
print(df[dep_var].value_counts())

Below is the dataset's composition
Defacement    7930
benign        7781
phishing      7577
malware       6711
spam          6698
Name: URL_Type_obf_Type, dtype: int64


In [7]:
nans = 0

# This is a kind of stupid way of counting how many NaNs show up within the data
for index, row in df.iterrows():
  for col in df.columns:
    if col == dep_var:
      continue

    #print(row[col], end=' ')
    # If the value is NaN, then mark add one to our counter
    if np.isnan(row[col]):
     nans += 1
  #print('')
print('NaNs detected: {}'.format(nans))

NaNs detected: 19153


In [8]:
# Removes all rows if they contain NaN values
df.dropna(axis='index', inplace=True)

In [9]:
print('There are {} columns and {} rows in the provided data.'.format(len(df.columns), len(df)))

print('Below is the dataset\'s composition')
print(df[dep_var].value_counts())

nans = 0

# This is a kind of stupid way of counting how many NaNs show up within the data
for index, row in df.iterrows():
  for col in df.columns:
    if col == dep_var:
      continue

    #print(row[col], end=' ')
    # If the value is NaN, then mark add one to our counter
    if np.isnan(row[col]):
     nans += 1
  #print('')
print('NaNs detected: {}'.format(nans))

There are 80 columns and 18982 rows in the provided data.
Below is the dataset's composition
spam          5342
malware       4440
phishing      4014
benign        2709
Defacement    2477
Name: URL_Type_obf_Type, dtype: int64
NaNs detected: 0


In [10]:
# Create the X (data) and y (labels)
X = normalize( df.loc[:, df.columns != dep_var] )
y = df[dep_var]

In [11]:
print(X.head())

    Querylength  domain_token_count  ...  Entropy_Extension  Entropy_Afterpath
35     0.000000            0.045198  ...           0.011299          -0.011299
37     0.117281            0.021324  ...           0.004037           0.003994
38     0.121354            0.021105  ...           0.003990           0.003948
39     0.117281            0.021324  ...           0.004000           0.003953
40     0.121354            0.021105  ...           0.003990           0.003948

[5 rows x 79 columns]


In [12]:
print(y.head())

35    Defacement
37    Defacement
38    Defacement
39    Defacement
40    Defacement
Name: URL_Type_obf_Type, dtype: object


In [13]:
# Create the stratified cross validation object
random_state = 0
sss = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)
print(sss)

StratifiedKFold(n_splits=10, random_state=0, shuffle=True)


## Multi-Classification Experiments


In [14]:
fold = 0
training_times = {'rf': [], 
                  'keras': [],
                  'fastai': []}

for train_idx, test_idx in sss.split(X, y):
  # Update which fold we are on (this is just for output/usability reasons)
  fold += 1

  # Split the data into the train and testing sets
  X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
  y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

  # Initialize the models (not fast.ai since it needs a databunch object)
  rf = RandomForestClassifier(random_state=random_state)
  dnn_keras = Sequential(layers=[
                                 Dense(128, kernel_regularizer=l2(0.001), activation='relu',input_shape=(len(X_train.columns),)),
                                 BatchNormalization(),
                                 Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
                                 BatchNormalization(),
                                 Dense(y_train.nunique(), activation='softmax')
  ])
  dnn_keras.compile(
      optimizer='adam', 
      loss='categorical_crossentropy', 
      metrics=['accuracy', 'Recall', 'Precision'])
  
  print('Training RandomForest model with Fold {}...'.format(fold), end='')
  t0 = time()
  rf.fit(X_train, y_train)
  t1 = time()
  training_times['rf'].append(t1-t0)
  print('done')

  print('Training Keras-TensorFlow DNN model with Fold {}...'.format(fold), end='')
  t0 = time()
  dnn_keras.fit(X_train, pd.get_dummies(y_train), epochs=100, verbose=0, batch_size=1024)
  t1 = time()
  training_times['keras'].append(t1-t0)
  print('done')

  # Initialize and run fast.ai model
  print('Training Fast.ai Fold {}...'.format(fold))
  data_fold = (TabularList.from_df(df, path=path, cont_names=X_train.columns, procs=[Categorify, Normalize])
                     .split_by_idxs(train_idx, test_idx)
                     .label_from_df(cols=dep_var)
                     .databunch())
  dnn_fastai = tabular_learner(data_fold, layers=[200, 100], metrics=accuracy)

  t0 = time()
  dnn_fastai.fit_one_cycle(cyc_len=10, callbacks=None)
  t1 = time()
  training_times['fastai'].append(t1-t0)
  print('Fast ai done')


Training RandomForest model with Fold 1...done
Training Keras-TensorFlow DNN model with Fold 1...done
Training Fast.ai Fold 1...


epoch,train_loss,valid_loss,accuracy,time
0,0.443972,0.394688,0.890469,00:03
1,0.324754,0.276525,0.903633,00:03
2,0.25845,0.232733,0.921011,00:03
3,0.214259,0.194339,0.939442,00:03
4,0.17284,0.153454,0.949974,00:03
5,0.152054,0.136953,0.947867,00:03
6,0.114871,0.098491,0.965771,00:03
7,0.097962,0.088316,0.968931,00:03
8,0.06781,0.083455,0.971564,00:04
9,0.067592,0.08132,0.971037,00:03


Fast ai done
Training RandomForest model with Fold 2...done
Training Keras-TensorFlow DNN model with Fold 2...done
Training Fast.ai Fold 2...


epoch,train_loss,valid_loss,accuracy,time
0,0.452407,0.32443,0.888362,00:03
1,0.309429,0.303231,0.897841,00:03
2,0.247215,0.24804,0.924697,00:03
3,0.21276,0.18665,0.940495,00:03
4,0.189193,0.183047,0.944181,00:03
5,0.143658,0.174258,0.955766,00:03
6,0.115553,0.117023,0.966298,00:03
7,0.090676,0.115941,0.968931,00:03
8,0.083242,0.110134,0.971564,00:03
9,0.068761,0.106405,0.971564,00:03


Fast ai done
Training RandomForest model with Fold 3...done
Training Keras-TensorFlow DNN model with Fold 3...done
Training Fast.ai Fold 3...


epoch,train_loss,valid_loss,accuracy,time
0,0.430835,0.327063,0.902002,00:03
1,0.302534,0.274705,0.906217,00:03
2,0.248354,0.236248,0.935195,00:03
3,0.229113,0.166951,0.946259,00:03
4,0.175157,0.153201,0.952582,00:03
5,0.152273,0.139416,0.958377,00:03
6,0.118886,0.117966,0.962592,00:03
7,0.091238,0.104937,0.967861,00:03
8,0.07688,0.100582,0.969442,00:03
9,0.062725,0.103795,0.96628,00:03


Fast ai done
Training RandomForest model with Fold 4...done
Training Keras-TensorFlow DNN model with Fold 4...done
Training Fast.ai Fold 4...


epoch,train_loss,valid_loss,accuracy,time
0,0.439759,0.375142,0.884089,00:03
1,0.313754,0.253433,0.916228,00:03
2,0.257929,0.250575,0.903056,00:03
3,0.210218,0.184989,0.934668,00:03
4,0.185125,0.159393,0.944152,00:03
5,0.146486,0.147829,0.949947,00:03
6,0.101248,0.120899,0.959431,00:03
7,0.1003,0.106148,0.959431,00:03
8,0.077153,0.094396,0.965227,00:03
9,0.067245,0.093589,0.965227,00:03


Fast ai done
Training RandomForest model with Fold 5...done
Training Keras-TensorFlow DNN model with Fold 5...done
Training Fast.ai Fold 5...


epoch,train_loss,valid_loss,accuracy,time
0,0.452079,0.32293,0.903583,00:03
1,0.29187,0.23614,0.917281,00:03
2,0.257851,0.229999,0.920969,00:03
3,0.205805,0.193918,0.936776,00:03
4,0.175795,0.168093,0.945732,00:03
5,0.145493,0.139279,0.955743,00:03
6,0.114853,0.114802,0.967861,00:03
7,0.089131,0.102392,0.970495,00:03
8,0.07278,0.096721,0.974183,00:03
9,0.066628,0.09682,0.975764,00:03


Fast ai done
Training RandomForest model with Fold 6...done
Training Keras-TensorFlow DNN model with Fold 6...done
Training Fast.ai Fold 6...


epoch,train_loss,valid_loss,accuracy,time
0,0.440017,0.371497,0.866702,00:03
1,0.306717,0.282406,0.901475,00:03
2,0.255411,0.192243,0.935195,00:03
3,0.214453,0.184053,0.937302,00:04
4,0.170478,0.153608,0.948367,00:04
5,0.150448,0.127107,0.955216,00:04
6,0.121577,0.118459,0.960485,00:04
7,0.084513,0.105783,0.966807,00:04
8,0.074029,0.101684,0.96628,00:03
9,0.067374,0.09983,0.967861,00:03


Fast ai done
Training RandomForest model with Fold 7...done
Training Keras-TensorFlow DNN model with Fold 7...done
Training Fast.ai Fold 7...


epoch,train_loss,valid_loss,accuracy,time
0,0.437113,0.354657,0.877766,00:03
1,0.304028,0.281872,0.905163,00:03
2,0.247046,0.229857,0.920443,00:03
3,0.210727,0.236753,0.912013,00:03
4,0.177423,0.165965,0.938356,00:03
5,0.147257,0.140057,0.955743,00:03
6,0.114634,0.129947,0.958904,00:03
7,0.093546,0.114386,0.959431,00:03
8,0.071813,0.103466,0.963646,00:03
9,0.071631,0.102617,0.965227,00:03


Fast ai done
Training RandomForest model with Fold 8...done
Training Keras-TensorFlow DNN model with Fold 8...done
Training Fast.ai Fold 8...


epoch,train_loss,valid_loss,accuracy,time
0,0.46105,0.352711,0.885142,00:03
1,0.310784,0.29574,0.913593,00:03
2,0.260749,0.190132,0.943625,00:03
3,0.227721,0.168463,0.942044,00:03
4,0.169278,0.187211,0.928346,00:03
5,0.139274,0.125485,0.95785,00:03
6,0.12481,0.113532,0.964173,00:03
7,0.088069,0.096613,0.969442,00:03
8,0.073829,0.088053,0.972603,00:03
9,0.072569,0.086115,0.975764,00:03


Fast ai done
Training RandomForest model with Fold 9...done
Training Keras-TensorFlow DNN model with Fold 9...done
Training Fast.ai Fold 9...


epoch,train_loss,valid_loss,accuracy,time
0,0.443847,0.360754,0.874605,00:03
1,0.282064,0.3211,0.890411,00:03
2,0.258792,0.225205,0.918335,00:03
3,0.21589,0.224564,0.927292,00:03
4,0.172108,0.184924,0.938356,00:03
5,0.152762,0.159161,0.949947,00:03
6,0.120494,0.153358,0.953109,00:03
7,0.092321,0.135817,0.959958,00:03
8,0.07948,0.125993,0.959431,00:03
9,0.06956,0.126576,0.963119,00:03


Fast ai done
Training RandomForest model with Fold 10...done
Training Keras-TensorFlow DNN model with Fold 10...done
Training Fast.ai Fold 10...


epoch,train_loss,valid_loss,accuracy,time
0,0.444413,0.305464,0.903056,00:03
1,0.308096,0.260561,0.907798,00:03
2,0.261442,0.216753,0.924131,00:03
3,0.212239,0.178532,0.935722,00:03
4,0.170041,0.144967,0.946786,00:03
5,0.148248,0.119032,0.965753,00:03
6,0.118745,0.102319,0.968915,00:03
7,0.085876,0.084891,0.971549,00:03
8,0.079471,0.078259,0.97313,00:03
9,0.070928,0.0782,0.97313,00:03


Fast ai done


In [15]:
print('model\tfold runtime\ttotal runtime')
print('-'*40)
for model in training_times.keys():
  mean = sum(training_times[model]) / len(training_times[model])
  std = np.std(training_times[model])

  print('{}\t{:.2f}\u00B1{:.2f}s\t{:.2f}s'.format(model, mean, std, sum(training_times[model])))

model	fold runtime	total runtime
----------------------------------------
rf	10.68±0.15s	106.83s
keras	12.41±0.14s	124.09s
fastai	36.23±1.79s	362.29s


## Binary Classification
This is mostly the same code for the execution of the models, however we have to change the data...

In [16]:
print('There are {} columns and {} rows in the provided data.'.format(len(X.columns)+1, len(X)))

print('Below is the dataset\'s composition')
print(y.value_counts())

There are 80 columns and 18982 rows in the provided data.
Below is the dataset's composition
spam          5342
malware       4440
phishing      4014
benign        2709
Defacement    2477
Name: URL_Type_obf_Type, dtype: int64


In [17]:
# Convert the multiclass problem into a binary classification problem
y = y.map(lambda label : label if label == 'benign' else 'malicious')

In [18]:
print('There are {} columns and {} rows in the provided data.'.format(len(X.columns)+1, len(X)))

print('Below is the dataset\'s composition')
print(y.value_counts())

There are 80 columns and 18982 rows in the provided data.
Below is the dataset's composition
malicious    16273
benign        2709
Name: URL_Type_obf_Type, dtype: int64


In [19]:
fold = 0
training_times = {'rf': [], 
                  'keras': [],
                  'fastai': []}

for train_idx, test_idx in sss.split(X, y):
  # Update which fold we are on (this is just for output/usability reasons)
  fold += 1

  # Split the data into the train and testing sets
  X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
  y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

  # Initialize the models (not fast.ai since it needs a databunch object)
  rf = RandomForestClassifier(random_state=random_state)
  dnn_keras = Sequential(layers=[
                                 Dense(128, kernel_regularizer=l2(0.001), activation='relu',input_shape=(len(X_train.columns),)),
                                 BatchNormalization(),
                                 Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
                                 BatchNormalization(),
                                 Dense(y_train.nunique(), activation='softmax')
  ])
  dnn_keras.compile(
      optimizer='adam', 
      loss='categorical_crossentropy', 
      metrics=['accuracy', 'Recall', 'Precision'])
  
  print('Training RandomForest model with Fold {}...'.format(fold), end='')
  t0 = time()
  rf.fit(X_train, y_train)
  t1 = time()
  training_times['rf'].append(t1-t0)
  print('done')

  print('Training Keras-TensorFlow DNN model with Fold {}...'.format(fold), end='')
  t0 = time()
  dnn_keras.fit(X_train, pd.get_dummies(y_train), epochs=100, verbose=0, batch_size=1024)
  t1 = time()
  training_times['keras'].append(t1-t0)
  print('done')

  # Initialize and run fast.ai model
  print('Training Fast.ai Fold {}...'.format(fold))
  data_fold = (TabularList.from_df(df, path=path, cont_names=X_train.columns, procs=[Categorify, Normalize])
                     .split_by_idxs(train_idx, test_idx)
                     .label_from_df(cols=dep_var)
                     .databunch())
  dnn_fastai = tabular_learner(data_fold, layers=[200, 100], metrics=accuracy)

  t0 = time()
  dnn_fastai.fit_one_cycle(cyc_len=10, callbacks=None)
  t1 = time()
  training_times['fastai'].append(t1-t0)
  print('Fast ai done')


Training RandomForest model with Fold 1...done
Training Keras-TensorFlow DNN model with Fold 1...done
Training Fast.ai Fold 1...


epoch,train_loss,valid_loss,accuracy,time
0,0.439352,0.367398,0.882043,00:03
1,0.2948,0.249237,0.923117,00:03
2,0.249121,0.277552,0.904687,00:03
3,0.206389,0.229741,0.919431,00:03
4,0.170854,0.188425,0.933123,00:03
5,0.150878,0.148725,0.951553,00:03
6,0.114326,0.128837,0.961032,00:03
7,0.095658,0.108668,0.966825,00:03
8,0.075021,0.105058,0.967878,00:03
9,0.069442,0.105801,0.966825,00:03


Fast ai done
Training RandomForest model with Fold 2...done
Training Keras-TensorFlow DNN model with Fold 2...done
Training Fast.ai Fold 2...


epoch,train_loss,valid_loss,accuracy,time
0,0.443808,0.349298,0.890469,00:03
1,0.328726,0.267032,0.899421,00:03
2,0.245205,0.249358,0.917325,00:03
3,0.206149,0.179349,0.941022,00:03
4,0.167178,0.162772,0.944181,00:03
5,0.141314,0.139901,0.951027,00:03
6,0.114148,0.1296,0.957346,00:03
7,0.095606,0.126646,0.964192,00:03
8,0.078518,0.103575,0.965245,00:03
9,0.070145,0.105629,0.964718,00:03


Fast ai done
Training RandomForest model with Fold 3...done
Training Keras-TensorFlow DNN model with Fold 3...done
Training Fast.ai Fold 3...


epoch,train_loss,valid_loss,accuracy,time
0,0.458496,0.326224,0.896733,00:03
1,0.304211,0.262323,0.913593,00:03
2,0.254271,0.185952,0.938356,00:03
3,0.198441,0.207786,0.927819,00:03
4,0.171139,0.162103,0.944679,00:03
5,0.144013,0.126602,0.957323,00:03
6,0.115642,0.113167,0.960485,00:03
7,0.091403,0.101213,0.9647,00:03
8,0.07626,0.094612,0.971022,00:03
9,0.067631,0.094815,0.971022,00:03


Fast ai done
Training RandomForest model with Fold 4...done
Training Keras-TensorFlow DNN model with Fold 4...done
Training Fast.ai Fold 4...


epoch,train_loss,valid_loss,accuracy,time
0,0.436839,0.333472,0.891465,00:03
1,0.275361,0.271815,0.91412,00:03
2,0.255936,0.203281,0.929926,00:03
3,0.217399,0.181168,0.943098,00:03
4,0.17353,0.150587,0.944152,00:03
5,0.145079,0.132953,0.957323,00:03
6,0.111988,0.118181,0.961012,00:03
7,0.083108,0.101193,0.96628,00:03
8,0.075591,0.09639,0.967861,00:03
9,0.068413,0.093429,0.967861,00:03


Fast ai done
Training RandomForest model with Fold 5...done
Training Keras-TensorFlow DNN model with Fold 5...done
Training Fast.ai Fold 5...


epoch,train_loss,valid_loss,accuracy,time
0,0.446506,0.355343,0.885142,00:03
1,0.295786,0.258414,0.915174,00:03
2,0.262102,0.2213,0.928873,00:03
3,0.217036,0.182156,0.942044,00:03
4,0.175051,0.148221,0.947313,00:03
5,0.147664,0.119237,0.961012,00:03
6,0.11191,0.109635,0.95627,00:04
7,0.09754,0.097229,0.962065,00:03
8,0.075473,0.093643,0.964173,00:04
9,0.072959,0.088237,0.965753,00:03


Fast ai done
Training RandomForest model with Fold 6...done
Training Keras-TensorFlow DNN model with Fold 6...done
Training Fast.ai Fold 6...


epoch,train_loss,valid_loss,accuracy,time
0,0.434391,0.337141,0.894099,00:03
1,0.30056,0.254543,0.92255,00:03
2,0.26932,0.214971,0.929399,00:03
3,0.211888,0.194563,0.940991,00:03
4,0.173421,0.153448,0.946259,00:03
5,0.138925,0.14805,0.955743,00:03
6,0.125605,0.131256,0.959958,00:03
7,0.09286,0.351419,0.964173,00:03
8,0.078291,0.108741,0.969968,00:03
9,0.069403,0.108765,0.970495,00:03


Fast ai done
Training RandomForest model with Fold 7...done
Training Keras-TensorFlow DNN model with Fold 7...done
Training Fast.ai Fold 7...


epoch,train_loss,valid_loss,accuracy,time
0,0.452998,0.348238,0.89568,00:03
1,0.295934,0.280981,0.892518,00:03
2,0.259328,0.220409,0.930453,00:03
3,0.219278,0.208406,0.931507,00:03
4,0.172232,0.172515,0.93941,00:03
5,0.150215,0.139417,0.952055,00:03
6,0.130436,0.113223,0.959431,00:03
7,0.090894,0.10344,0.96628,00:03
8,0.078662,0.095052,0.971022,00:03
9,0.070259,0.094234,0.969442,00:03


Fast ai done
Training RandomForest model with Fold 8...done
Training Keras-TensorFlow DNN model with Fold 8...done
Training Fast.ai Fold 8...


epoch,train_loss,valid_loss,accuracy,time
0,0.439793,0.331649,0.894626,00:03
1,0.287083,0.26594,0.910959,00:03
2,0.252122,0.245074,0.91254,00:03
3,0.213176,0.191481,0.932561,00:03
4,0.17699,0.14616,0.954689,00:03
5,0.150042,0.14062,0.953109,00:03
6,0.117693,0.10998,0.965227,00:03
7,0.084849,0.103989,0.967334,00:03
8,0.084888,0.097854,0.969442,00:03
9,0.069786,0.095053,0.97313,00:03


Fast ai done
Training RandomForest model with Fold 9...done
Training Keras-TensorFlow DNN model with Fold 9...done
Training Fast.ai Fold 9...


epoch,train_loss,valid_loss,accuracy,time
0,0.446044,0.368125,0.892518,00:03
1,0.326512,0.264725,0.905163,00:03
2,0.279321,0.203007,0.92255,00:03
3,0.215577,0.143774,0.953109,00:03
4,0.172724,0.157965,0.943625,00:03
5,0.154221,0.117418,0.95627,00:03
6,0.124522,0.093709,0.968388,00:03
7,0.096618,0.08981,0.968388,00:03
8,0.074996,0.083972,0.974183,00:03
9,0.069911,0.086572,0.971022,00:03


Fast ai done
Training RandomForest model with Fold 10...done
Training Keras-TensorFlow DNN model with Fold 10...done
Training Fast.ai Fold 10...


epoch,train_loss,valid_loss,accuracy,time
0,0.452522,0.363489,0.874605,00:03
1,0.319623,0.253176,0.919389,00:03
2,0.246337,0.204261,0.935195,00:03
3,0.213069,0.196967,0.936249,00:03
4,0.186057,0.165316,0.940991,00:03
5,0.142172,0.129587,0.961538,00:03
6,0.108421,0.116111,0.963646,00:03
7,0.092285,0.101054,0.969968,00:03
8,0.073467,0.098425,0.971022,00:03
9,0.076218,0.100551,0.971549,00:03


Fast ai done


In [20]:
print('model\tfold runtime\ttotal runtime')
print('-'*40)
for model in training_times.keys():
  mean = sum(training_times[model]) / len(training_times[model])
  std = np.std(training_times[model])

  print('{}\t{:.2f}\u00B1{:.2f}s\t{:.2f}s'.format(model, mean, std, sum(training_times[model])))

model	fold runtime	total runtime
----------------------------------------
rf	9.56±0.14s	95.55s
keras	12.40±0.39s	124.02s
fastai	36.05±1.02s	360.49s
