# Keras-Tensorflow Experiments

In [1]:
import keras
import pandas as pd
import numpy as np
import os
import sys
import glob
import time
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation
from keras.utils.np_utils import to_categorical, normalize
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from tensorflow.keras.callbacks import TensorBoard

Using TensorFlow backend.


## Download datasets
- if zip dataset files already exists; no download is done
- force unzip all the .zip files

In [2]:
%%bash
URL=https://iscxdownloads.cs.unb.ca/iscxdownloads/ISCX-URL-2016/
FILES=(ISCXURL2016.zip) 
for FILE in ${FILES[*]}; do
    if [ ! -f "$FILE" ]; then
        printf "downloading %s\n" $FILE
        curl -O $URL$FILE
        # unzip files
        echo 'unzipping ' $FILE
        unzip -o $FILE #overwrite exiting files/folders if exists
    fi
done

## Check CSV files inside FinalDataset folder

In [3]:
! ls FinalDataset

All_BestFirst.csv	      Malware_Infogain.csv
All_BestFirst_test.csv	      Malware_Infogain_test.csv
All.csv			      Phishing_BestFirst.csv
All.csv.pickle		      Phishing.csv
All_Infogain.csv	      Phishing_Infogain.csv
All_Infogain_test.csv	      Phishing_Infogain_test.csv
Defacement_BestFirst.csv      Spam_BestFirst.csv
Defacement.csv		      Spam_BestFirst_test.csv
Defacement_Infogain.csv       Spam.csv
Defacement_Infogain_test.csv  Spam_Infogain.csv
Malware_BestFirst.csv	      Spam_Infogain_test.csv
Malware.csv		      URL


## Analyze FinalDataset/All.csv file

In [30]:
df = pd.read_csv('FinalDataset/All.csv', low_memory=False)
resultPath = 'results_keras_tensorflow'
if not os.path.exists(resultPath):
    print('result path {} created.'.format(resultPath))
    os.mkdir(resultPath)

In [5]:
df.shape

(36707, 80)

In [6]:
df['argPathRatio'].astype('float')

0        0.076923
1        0.058824
2        0.060606
3        0.025974
4        0.040816
5        0.033898
6        0.046512
7        0.040000
8        0.045455
9        0.090909
10       0.043478
11       0.039216
12       0.095238
13       0.105263
14       0.080000
15       0.086957
16       0.038462
17       0.083333
18       0.017241
19       0.016949
20       0.020408
21       0.012579
22       0.014815
23       0.014085
24       0.012500
25       0.018182
26       0.050000
27       0.037037
28       0.039216
29       0.036364
           ...   
36677    0.792593
36678    0.763636
36679    0.794118
36680    0.015625
36681    0.057143
36682    0.085106
36683    0.662651
36684    0.100000
36685    0.407407
36686    0.036364
36687    0.578125
36688    0.129412
36689    0.712644
36690    0.785047
36691    0.086957
36692    0.040000
36693    0.068966
36694    0.742574
36695    0.015267
36696    0.071429
36697    0.985386
36698    0.817308
36699    0.275000
36700    0.052632
36701    0

In [7]:
df.columns

Index(['Querylength', 'domain_token_count', 'path_token_count',
       'avgdomaintokenlen', 'longdomaintokenlen', 'avgpathtokenlen', 'tld',
       'charcompvowels', 'charcompace', 'ldl_url', 'ldl_domain', 'ldl_path',
       'ldl_filename', 'ldl_getArg', 'dld_url', 'dld_domain', 'dld_path',
       'dld_filename', 'dld_getArg', 'urlLen', 'domainlength', 'pathLength',
       'subDirLen', 'fileNameLen', 'this.fileExtLen', 'ArgLen', 'pathurlRatio',
       'ArgUrlRatio', 'argDomanRatio', 'domainUrlRatio', 'pathDomainRatio',
       'argPathRatio', 'executable', 'isPortEighty', 'NumberofDotsinURL',
       'ISIpAddressInDomainName', 'CharacterContinuityRate',
       'LongestVariableValue', 'URL_DigitCount', 'host_DigitCount',
       'Directory_DigitCount', 'File_name_DigitCount', 'Extension_DigitCount',
       'Query_DigitCount', 'URL_Letter_Count', 'host_letter_count',
       'Directory_LetterCount', 'Filename_LetterCount',
       'Extension_LetterCount', 'Query_LetterCount', 'LongestPathToken

In [8]:
df.shape

(36707, 80)

In [9]:
df.head()

Unnamed: 0,Querylength,domain_token_count,path_token_count,avgdomaintokenlen,longdomaintokenlen,avgpathtokenlen,tld,charcompvowels,charcompace,ldl_url,...,SymbolCount_FileName,SymbolCount_Extension,SymbolCount_Afterpath,Entropy_URL,Entropy_Domain,Entropy_DirectoryName,Entropy_Filename,Entropy_Extension,Entropy_Afterpath,URL_Type_obf_Type
0,0,4,5,5.5,14,4.4,4,8,3,0,...,1,0,-1,0.726298,0.784493,0.894886,0.850608,,-1.0,Defacement
1,0,4,5,5.5,14,6.0,4,12,4,0,...,0,0,-1,0.688635,0.784493,0.814725,0.859793,0.0,-1.0,Defacement
2,0,4,5,5.5,14,5.8,4,12,5,0,...,0,0,-1,0.695049,0.784493,0.814725,0.80188,0.0,-1.0,Defacement
3,0,4,12,5.5,14,5.5,4,32,16,0,...,0,0,-1,0.64013,0.784493,0.814725,0.66321,0.0,-1.0,Defacement
4,0,4,6,5.5,14,7.333334,4,18,11,0,...,0,0,-1,0.681307,0.784493,0.814725,0.804526,0.0,-1.0,Defacement


In [10]:
df.tail()

Unnamed: 0,Querylength,domain_token_count,path_token_count,avgdomaintokenlen,longdomaintokenlen,avgpathtokenlen,tld,charcompvowels,charcompace,ldl_url,...,SymbolCount_FileName,SymbolCount_Extension,SymbolCount_Afterpath,Entropy_URL,Entropy_Domain,Entropy_DirectoryName,Entropy_Filename,Entropy_Extension,Entropy_Afterpath,URL_Type_obf_Type
36702,29,4,14,5.75,12,3.666667,4,20,24,3,...,3,2,7,0.690555,0.791265,0.777498,0.690227,0.656684,0.796205,spam
36703,0,4,13,3.75,8,8.461538,4,24,23,0,...,16,15,-1,0.665492,0.82001,0.879588,0.6744,0.674671,-1.0,spam
36704,58,3,27,6.666666,16,3.375,3,41,34,20,...,8,7,9,0.656807,0.801139,0.684777,0.713622,0.717187,0.705245,spam
36705,35,3,13,4.333334,9,3.6,3,15,13,7,...,9,8,3,0.725963,0.897617,0.871049,0.745932,0.758824,0.790772,spam
36706,40,3,25,6.666666,16,3.25,3,35,31,19,...,7,6,7,0.674351,0.801139,0.697282,0.730563,0.731481,0.769238,spam


## Clean Data
- dropped samples with Infinity values
- dropped samples with NaN values

In [11]:
def loadData(csvFile):
    pickleDump = '{}.pickle'.format(csvFile)
    if os.path.exists(pickleDump):
        df = pd.read_pickle(pickleDump)
    else:
        df = pd.read_csv(csvFile, low_memory=False)
        # clean data
        # strip the whitspaces from column names
        df = df.rename(str.strip, axis='columns')
        #df.drop(columns=[], inplace=True)
        # drop missing values/NaN etc.
        #df.dropna(inplace=True)
        # drop Infinity rows and NaN string from each column
        for col in df.columns:
            indexNames = df[df[col]=='Infinity'].index
            if not indexNames.empty:
                print('deleting {} rows with Infinity in column {}'.format(len(indexNames), col))
                df.drop(indexNames, inplace=True)
            indexNames = df[df[col]=='NaN'].index
            if not indexNames.empty:
                print('deleting {} rows with NaN in column {}'.format(len(indexNames), col))
                df.drop(indexNames, inplace=True)
        
        df.to_pickle(pickleDump)
    
    return df

## Test loadData function

In [12]:
dataFile = 'FinalDataset/All.csv'
df = loadData(dataFile)

In [13]:
df.columns

Index(['Querylength', 'domain_token_count', 'path_token_count',
       'avgdomaintokenlen', 'longdomaintokenlen', 'avgpathtokenlen', 'tld',
       'charcompvowels', 'charcompace', 'ldl_url', 'ldl_domain', 'ldl_path',
       'ldl_filename', 'ldl_getArg', 'dld_url', 'dld_domain', 'dld_path',
       'dld_filename', 'dld_getArg', 'urlLen', 'domainlength', 'pathLength',
       'subDirLen', 'fileNameLen', 'this.fileExtLen', 'ArgLen', 'pathurlRatio',
       'ArgUrlRatio', 'argDomanRatio', 'domainUrlRatio', 'pathDomainRatio',
       'argPathRatio', 'executable', 'isPortEighty', 'NumberofDotsinURL',
       'ISIpAddressInDomainName', 'CharacterContinuityRate',
       'LongestVariableValue', 'URL_DigitCount', 'host_DigitCount',
       'Directory_DigitCount', 'File_name_DigitCount', 'Extension_DigitCount',
       'Query_DigitCount', 'URL_Letter_Count', 'host_letter_count',
       'Directory_LetterCount', 'Filename_LetterCount',
       'Extension_LetterCount', 'Query_LetterCount', 'LongestPathToken

In [14]:
df.shape

(36697, 80)

In [15]:
df['NumberRate_Extension'][:10]

0    1.0
1    NaN
2    NaN
3    NaN
4    NaN
5    NaN
6    NaN
7    NaN
8    NaN
9    1.0
Name: NumberRate_Extension, dtype: float64

In [16]:
df.shape

(36697, 80)

In [17]:
df.head()

Unnamed: 0,Querylength,domain_token_count,path_token_count,avgdomaintokenlen,longdomaintokenlen,avgpathtokenlen,tld,charcompvowels,charcompace,ldl_url,...,SymbolCount_FileName,SymbolCount_Extension,SymbolCount_Afterpath,Entropy_URL,Entropy_Domain,Entropy_DirectoryName,Entropy_Filename,Entropy_Extension,Entropy_Afterpath,URL_Type_obf_Type
0,0,4,5,5.5,14,4.4,4,8,3,0,...,1,0,-1,0.726298,0.784493,0.894886,0.850608,,-1.0,Defacement
1,0,4,5,5.5,14,6.0,4,12,4,0,...,0,0,-1,0.688635,0.784493,0.814725,0.859793,0.0,-1.0,Defacement
2,0,4,5,5.5,14,5.8,4,12,5,0,...,0,0,-1,0.695049,0.784493,0.814725,0.80188,0.0,-1.0,Defacement
3,0,4,12,5.5,14,5.5,4,32,16,0,...,0,0,-1,0.64013,0.784493,0.814725,0.66321,0.0,-1.0,Defacement
4,0,4,6,5.5,14,7.333334,4,18,11,0,...,0,0,-1,0.681307,0.784493,0.814725,0.804526,0.0,-1.0,Defacement


## Experimenting with FinalDataset/All.csv

## Total samples for each type

In [18]:
# total samples
label = 'URL_Type_obf_Type'
lblTypes = set(df[label])
for lbl in lblTypes:
    print('| {} | {} |'.format(lbl, len(df[df[label] == lbl].index)))

| malware | 6711 |
| benign | 7781 |
| spam | 6698 |
| phishing | 7577 |
| Defacement | 7930 |


In [19]:
dataPath = 'FinalDataset'
dep_var = label
cat_names = []
cont_names = list(set(df.columns) - set(cat_names) - set([dep_var]))

In [20]:
cont_names

['NumberRate_Extension',
 'LongestPathTokenLength',
 'NumberofDotsinURL',
 'longdomaintokenlen',
 'Extension_LetterCount',
 'sub-Directory_LongestWordLength',
 'ISIpAddressInDomainName',
 'delimeter_path',
 'ArgLen',
 'domain_token_count',
 'argPathRatio',
 'delimeter_Count',
 'pathDomainRatio',
 'Entropy_Filename',
 'NumberRate_FileName',
 'path_token_count',
 'avgdomaintokenlen',
 'host_DigitCount',
 'dld_domain',
 'SymbolCount_Domain',
 'File_name_DigitCount',
 'pathLength',
 'pathurlRatio',
 'SymbolCount_Afterpath',
 'dld_getArg',
 'URL_Letter_Count',
 'ArgUrlRatio',
 'SymbolCount_URL',
 'Directory_DigitCount',
 'Entropy_Domain',
 'tld',
 'Query_LetterCount',
 'SymbolCount_Directoryname',
 'CharacterContinuityRate',
 'Query_DigitCount',
 'Filename_LetterCount',
 'isPortEighty',
 'ldl_path',
 'URL_sensitiveWord',
 'Entropy_URL',
 'domainlength',
 'URLQueries_variable',
 'charcompvowels',
 'ldl_filename',
 'NumberRate_DirectoryName',
 'spcharUrl',
 'dld_url',
 'ldl_url',
 'Entropy_Ex

In [21]:
df['argPathRatio'] # was reading as object

0         0.07692308
1         0.05882353
2        0.060606062
3        0.025974026
4        0.040816326
5        0.033898305
6        0.046511628
7               0.04
8        0.045454547
9         0.09090909
10        0.04347826
11       0.039215688
12         0.0952381
13        0.10526316
14              0.08
15        0.08695652
16        0.03846154
17       0.083333336
18        0.01724138
19       0.016949153
20       0.020408163
21       0.012578616
22       0.014814815
23       0.014084507
24            0.0125
25       0.018181818
26              0.05
27       0.037037037
28       0.039215688
29       0.036363635
            ...     
36677      0.7925926
36678     0.76363635
36679      0.7941176
36680       0.015625
36681    0.057142857
36682     0.08510638
36683      0.6626506
36684            0.1
36685      0.4074074
36686    0.036363635
36687       0.578125
36688     0.12941177
36689      0.7126437
36690     0.78504676
36691     0.08695652
36692           0.04
36693     0.0

In [22]:
df.argPathRatio = df['argPathRatio'].astype('float')

In [23]:
def baseline_model(inputDim=-1, out_shape=(-1,)):
    global model_name
    model = Sequential()
    if inputDim > 0 and out_shape[1] > 0:
        model.add(Dense(79, activation='relu', input_shape=(inputDim,)))
        print(f"out_shape[1]:{out_shape[1]}")
        model.add(Dense(128, activation='relu'))
        
        model.add(Dense(out_shape[1], activation='softmax')) #This is the output layer
        
        if out_shape[1] > 2:
            print('Categorical Cross-Entropy Loss Function')
            model_name += "_categorical"
            model.compile(optimizer='adam',
                     loss='categorical_crossentropy',
                     metrics=['accuracy'])
        else:
            model_name += "_binary"
            print('Binary Cross-Entropy Loss Function')
            model.compile(optimizer='adam',
                    loss='binary_crossentropy',
                    metrics=['accuracy'])
    return model

In [24]:
optimizer='adam'
epochs=10 
batch_size=10
    
#Creating data for analysis
time_gen = int(time.time())
global model_name
model_name = f"{dataFile}_{time_gen}"
#$ tensorboard --logdir=logs/
tensorboard = TensorBoard(log_dir='logs/{}'.format(model_name))

seed = 7
np.random.seed(seed)
cvscores = []
print('optimizer: {} epochs: {} batch_size: {}'.format(
    optimizer, epochs, batch_size))

data = df
data_y = data.pop(str(dep_var))

#transform named labels into numerical values
encoder = LabelEncoder()
encoder.fit(data_y)
data_y = encoder.transform(data_y)
dummy_y = to_categorical(data_y)
data_x = normalize(data.values)

#define 5-fold cross validation test harness
inputDim = len(data_x[0])
print('inputdim = ', inputDim)


#Separate out data
X_train, X_test, y_train, y_test = train_test_split(data_x, dummy_y, test_size=0.2)
#     num=0
#     sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
#     for train_index, test_index in sss.split(X=np.zeros(data_x.shape[0]), y=dummy_y):
#         X_train, X_test = data_x[train_index], data_x[test_index]
#         y_train, y_test = dummy_y[train_index], dummy_y[test_index]

#create model
model = baseline_model(inputDim, y_train.shape)

#train
#print("Training " + dataFile + " on split " + str(num))
model.fit(x=X_train, y=y_train, epochs=epochs, batch_size=batch_size, verbose=2, callbacks=[tensorboard], validation_data=(X_test, y_test))

#save model
model.save('{}.model'.format(os.path.basename(dataPath)))

#         num+=1


scores = model.evaluate(X_test, y_test, verbose=1)
print(model.metrics_names)
acc, loss = scores[1]*100, scores[0]*100
print('Baseline: accuracy: {:.2f}%: loss: {:.2f}'.format(acc, loss))

resultFile = os.path.join(resultPath, 'All.csv')
with open('{}.result'.format(resultFile), 'a') as fout:
    fout.write('{} results...'.format(model_name))
    fout.write('\taccuracy: {:.2f} loss: {:.2f}\n'.format(acc, loss))

W0709 15:53:28.966247 140560386332288 deprecation_wrapper.py:119] From /usr/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0709 15:53:28.979408 140560386332288 deprecation_wrapper.py:119] From /usr/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0709 15:53:28.982769 140560386332288 deprecation_wrapper.py:119] From /usr/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0709 15:53:29.031055 140560386332288 deprecation_wrapper.py:119] From /usr/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0709 15:53:29.058817 140560386332288 deprecation_wrapper.py:119] From /usr/lib/

optimizer: adam epochs: 10 batch_size: 10
inputdim =  79
out_shape[1]:5
Categorical Cross-Entropy Loss Function


W0709 15:53:29.245154 140560386332288 deprecation.py:323] From /usr/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W0709 15:53:29.314332 140560386332288 deprecation_wrapper.py:119] From /usr/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.



Train on 29357 samples, validate on 7340 samples
Epoch 1/10
 - 4s - loss: nan - acc: 0.2170 - val_loss: nan - val_acc: 0.2124
Epoch 2/10
 - 3s - loss: nan - acc: 0.2170 - val_loss: nan - val_acc: 0.2124
Epoch 3/10
 - 3s - loss: nan - acc: 0.2170 - val_loss: nan - val_acc: 0.2124
Epoch 4/10
 - 3s - loss: nan - acc: 0.2170 - val_loss: nan - val_acc: 0.2124
Epoch 5/10
 - 3s - loss: nan - acc: 0.2170 - val_loss: nan - val_acc: 0.2124
Epoch 6/10
 - 3s - loss: nan - acc: 0.2170 - val_loss: nan - val_acc: 0.2124
Epoch 7/10
 - 3s - loss: nan - acc: 0.2170 - val_loss: nan - val_acc: 0.2124
Epoch 8/10
 - 3s - loss: nan - acc: 0.2170 - val_loss: nan - val_acc: 0.2124
Epoch 9/10
 - 3s - loss: nan - acc: 0.2170 - val_loss: nan - val_acc: 0.2124
Epoch 10/10
 - 3s - loss: nan - acc: 0.2170 - val_loss: nan - val_acc: 0.2124
['loss', 'acc']
Baseline: accuracy: 21.24%: loss: nan


In [26]:
preds, y, losses = model.get_preds(with_loss=True)
interp = ClassificationInterpretation(model, preds, y, losses)

AttributeError: 'Sequential' object has no attribute 'get_preds'

In [None]:
interp.plot_confusion_matrix(slice_size=10)

In [None]:
print(interp.confusion_matrix())

In [None]:
interp.most_confused()