<a href="https://colab.research.google.com/github/rambasnet/DeepLearningMaliciousURLs/blob/master/Keras_Tensorflow_Experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Include needed files. 

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

import csv
import os
import sys
import glob
import operator
import time

from keras.models import Sequential, load_model
from keras.layers import Dense, Activation
from keras.utils.np_utils import to_categorical, normalize

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils import shuffle

from tensorflow import keras
from tensorflow import feature_column
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Activation, BatchNormalization, Dropout
from tensorflow.keras.callbacks import TensorBoard

# Include Dataset


In [None]:
%%bash
# Base URL of the ISCX-URL-2016 download directory. The trailing slash is
# required because the archive name is appended directly.
URL=https://iscxdownloads.cs.unb.ca/iscxdownloads/ISCX-URL-2016/
FILES=(ISCXURL2016.zip)
for FILE in "${FILES[@]}"; do
    # Skip archives that are already present so re-running the cell is cheap.
    if [ ! -f "$FILE" ]; then
        printf "downloading %s\n" "$FILE"
        # -f makes curl fail on HTTP errors instead of saving the error page
        # under the archive name. A failed or partial download is removed so
        # that the next run retries it instead of skipping it.
        if ! curl -f -O "$URL$FILE"; then
            echo "download failed: $FILE" >&2
            rm -f "$FILE"
            continue
        fi
        # unzip files
        echo 'unzipping ' "$FILE"
        unzip -o "$FILE" # overwrite existing files/folders if they exist
    fi
done

### Check Dataset

In [None]:
# Shell escape: list the FinalDataset directory to confirm the CSV files exist.
! ls FinalDataset

# Set some data
> Some constants need to be set up front. They are important, but they will not be used until later in the notebook.

In [None]:
# Directory that collects the result files written at the end of the notebook.
resultPath = 'results_keras_tensorflow'
if not os.path.exists(resultPath):
    # exist_ok=True tolerates the directory appearing between the check and
    # the call; the message is printed only after the directory really exists.
    os.makedirs(resultPath, exist_ok=True)
    print('result path {} created.'.format(resultPath))

In [None]:
# Name of the label (target) column. It is reassigned further down once the
# label column of the dataset has been chosen.
dep_var = 'Label'
# Run identifier. The training cell overwrites it and baseline_model() appends
# a loss-function suffix to it.
model_name="init"

In [None]:
# Categorical feature column names (left empty here).
cat_names = []
# Continuous feature column names; recomputed after the dataset is loaded.
cont_names = []

## Analyze FinalDataset/All.csv file
> Let's make sure that the files were properly added; this should look similar to the FastAI experiments.

In [None]:
# Read the raw combined dataset. low_memory=False makes pandas infer each
# column's dtype from the whole file rather than chunk by chunk.
df = pd.read_csv('FinalDataset/All.csv', low_memory=False)

In [None]:
# (rows, columns) of the raw dataset.
df.shape

# Show all dataset column names

In [None]:
# Column labels as read from the CSV header (whitespace not yet stripped).
df.columns

# Show the first rows of the dataset

In [None]:
# First rows of the raw frame (head() shows five by default).
df.head()

# Show the last rows of the dataset

In [None]:
# Last rows of the raw frame (tail() shows five by default).
df.tail()

# Functions for Testing
> Now that our data has been collected, it is time to create the functions that will be used in later tests.

In [None]:
def loadData(csvFile):
    """Load a CSV dataset into a DataFrame, caching the cleaned result.

    The cleaned frame is stored next to the CSV as ``<csvFile>.pickle``. When
    that file already exists it is returned as-is and the CSV is not read.

    Cleaning applied when no cache file exists:
      * surrounding whitespace is stripped from every column name;
      * for each column in turn, rows whose value equals the string
        ``'Infinity'`` are dropped, then rows whose value equals the string
        ``'NaN'``; each deletion is reported with ``print``.

    Args:
        csvFile: path of the CSV file to load.

    Returns:
        The cleaned ``pandas.DataFrame``.
    """
    cachePath = '{}.pickle'.format(csvFile)
    # NOTE(review): the cache is never invalidated, so edits to the CSV are
    # ignored until the pickle file is deleted. Only unpickle files that this
    # notebook wrote itself.
    if os.path.exists(cachePath):
        return pd.read_pickle(cachePath)

    frame = pd.read_csv(csvFile, low_memory=False, na_values='NaN')
    # strip the whitespace from column names
    frame = frame.rename(str.strip, axis='columns')

    # Marker strings to purge, checked in this order for every column.
    # NOTE(review): read_csv is told to parse 'NaN' as a missing value
    # (na_values='NaN'), so the 'NaN' string comparison presumably never
    # matches -- confirm whether a row-wise dropna was intended instead.
    markers = (
        ('Infinity', 'deleting {} rows with Infinity in column {}'),
        ('NaN', 'deleting {} rows with NaN in column {}'),
    )
    for column in frame.columns:
        for marker, message in markers:
            badRows = frame[frame[column] == marker].index
            if badRows.empty:
                continue
            print(message.format(len(badRows), column))
            frame.drop(badRows, inplace=True)

    frame.to_pickle(cachePath)
    return frame


In [None]:
def baseline_model(inputDim=-1, batch_size=32, num_classes=5):
    """Build and compile the fully connected baseline classifier.

    Architecture: Dense(128, relu) -> BatchNormalization -> Dropout(0.5) ->
    Dense(batch_size, relu) -> BatchNormalization -> Dropout(0.5) ->
    Dense(num_classes, sigmoid). The model is compiled with the Adam
    optimizer, categorical cross-entropy loss and the accuracy metric.

    Side effect: appends "_categorical" to the global ``model_name`` unless
    the name already ends with it, so repeated calls (one per cross-validation
    fold) do not keep lengthening the name.

    Args:
        inputDim: number of input features; must be positive. The default of
            -1 is only a placeholder and is rejected.
        batch_size: despite its name, the number of units in the second
            hidden layer (name kept for backward compatibility).
        num_classes: number of output units (default 5, the previously
            hard-coded value).

    Returns:
        The compiled ``tf.keras.Sequential`` model.

    Raises:
        ValueError: if ``inputDim`` is not positive.
    """
    global model_name

    if inputDim is None or inputDim <= 0:
        raise ValueError(
            'inputDim must be a positive number of features, got {!r}'.format(inputDim))

    # NOTE(review): the output layer uses sigmoid together with categorical
    # cross-entropy; softmax is the conventional pairing for single-label
    # multi-class targets. Left unchanged to keep results comparable -- confirm.
    model = tf.keras.Sequential([
        Dense(128, activation='relu', input_shape=(inputDim,)),
        BatchNormalization(),
        Dropout(.5),
        Dense(batch_size, activation='relu'),
        BatchNormalization(),
        Dropout(.5),
        # Output layer: one unit per class.
        Dense(num_classes, activation='sigmoid'),
    ])

    print('Categorical Cross-Entropy Loss Function')
    suffix = "_categorical"
    if not model_name.endswith(suffix):
        model_name += suffix
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

In [None]:
def encode_labels(dataframe):
    """Return the label column of *dataframe* as a one-hot encoded array.

    The column named by the global ``dep_var`` is mapped to integer class ids
    with ``LabelEncoder`` and then expanded with ``to_categorical``. The input
    frame is not modified.

    Args:
        dataframe: frame that contains the ``dep_var`` column.

    Returns:
        The array produced by ``to_categorical``: one row per input row.
    """
    # The column is only read, so no copy of the whole frame is needed.
    class_ids = LabelEncoder().fit_transform(dataframe[dep_var])
    return to_categorical(class_ids)

In [None]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    """Convert a DataFrame into a batched ``tf.data.Dataset``.

    Each dataset element is a ``(features, labels)`` pair: ``features`` is a
    dict keyed by column name (the label column excluded) and ``labels`` is
    the one-hot encoding returned by ``encode_labels``.

    Args:
        dataframe: frame holding the feature columns and the ``dep_var``
            label column. It is not modified.
        shuffle: when true, shuffle with a buffer as large as the frame.
            (Inside this function the name shadows ``sklearn.utils.shuffle``.)
        batch_size: number of rows per batch.

    Returns:
        The batched ``tf.data.Dataset``.
    """
    # Encode the labels as one-hot vectors.
    labels = encode_labels(dataframe)

    # encode_labels() leaves its argument untouched, so the label column has
    # to be removed here; otherwise the target leaks into the feature dict.
    features = dataframe.drop(columns=[dep_var])

    ds = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(features))
    ds = ds.batch(batch_size)
    return ds

# Test LoadData Function
> This will look just like the FastAI test, but we are using TensorFlow, so let's make sure it works.

In [None]:
# Load the cleaned dataset (read from the pickle cache when one exists).
df1 = loadData('FinalDataset/All.csv')
# axis=1 drops every *column* that contains a missing value; rows are kept.
# NOTE(review): confirm that dropping columns rather than rows is intended.
df1=df1.dropna(axis=1)
print(df1)

In [None]:
# Columns remaining after cleaning and dropna(axis=1).
df1.columns


In [None]:
# (rows, columns) of the cleaned frame.
df1.shape

In [None]:
# NOTE(review): same expression as the previous cell.
df1.shape

In [None]:
# First rows of the cleaned frame.
df1.head()

  # Experimenting with FinalDataset/All.csv
  
  #### Total Samples for each Type

In [None]:
# Column holding the URL class.
label = 'URL_Type_obf_Type'
# value_counts() tallies every class in a single pass and yields a stable,
# most-frequent-first order. dropna=False keeps missing labels visible with
# their real count.
labelCounts = df[label].value_counts(dropna=False)
lblTypes = set(labelCounts.index)
# Print one markdown table row per class: | class | sample count |
for lbl, count in labelCounts.items():
    print('| {} | {} |'.format(lbl, count))

In [None]:
# Directory that contains the dataset CSV files.
dataPath = 'FinalDataset'
# From here on the URL class column is the dependent variable.
dep_var = label
# Every column that is neither categorical nor the label is continuous.
# Iterating df.columns keeps the file's column order; a list built from a set
# difference has an arbitrary order that can change between runs.
excluded = set(cat_names) | {dep_var}
cont_names = [col for col in df.columns if col not in excluded]

In [None]:
# Show the continuous feature names computed in the previous cell.
cont_names

# Cast column values to float

In [None]:
# Use item assignment: attribute-style assignment (df1.argPathRatio = ...)
# only updates a column that already exists and otherwise creates a plain
# Python attribute, so bracket notation is the reliable form.
df1['argPathRatio'] = df1['argPathRatio'].astype('float')

# Experimenting with Tensorflow Keras

#### Globals for Testing


In [None]:
# File name used to label the run and to name the result file.
dataFile = 'All.csv'
# NOTE(review): not referenced by the visible code; baseline_model()
# hard-codes 'adam' in compile().
optimizer='adam'
# Number of epochs passed to model.fit() for every cross-validation fold.
epochs=10
# Mini-batch size passed to model.fit().
batch_size=64
# Filled by the next cell with tf.feature_column definitions.
feature_columns = []

#### Numeric Columns setup

In [None]:
#feature columns to classify malicious URLs
for header in ['dld_getArg']:
  feature_columns.append(feature_column.numeric_column(header))

feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

In [None]:
# Label column of the cleaned frame.
df1[dep_var]

#### Training Setup


In [None]:
# Timestamp that makes the run name (and its log directory) unique.
time_gen = int(time.time())

# Seed NumPy and TensorFlow. Seeding NumPy alone does not make Keras weight
# initialisation or dropout reproducible.
seed = 7
np.random.seed(seed)
tf.random.set_seed(seed)

model_name = f"{dataFile}_{time_gen}"
# Frozen copy for log paths, because baseline_model() appends to model_name.
run_name = model_name

# 10-fold stratified cross-validation with a fixed shuffle.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

# One-hot targets for training; integer class ids for stratification.
encoded_y = encode_labels(df1)
y = LabelEncoder().fit_transform(df1[dep_var].values)
# NOTE(review): the scaler is fitted on all rows before splitting, so the
# validation folds influence the scaling statistics -- confirm this is
# acceptable for the reported numbers.
X = StandardScaler().fit_transform(df1.drop(dep_var, axis=1))

for index, (train_indices, val_indices) in enumerate(kfold.split(X, y)):
    xtrain, xval = X[train_indices], X[val_indices]
    ytrain, yval = encoded_y[train_indices], encoded_y[val_indices]

    inputDim = xtrain.shape[1]
    print(inputDim)

    # One log directory per fold so the TensorBoard curves of different folds
    # do not end up mixed in the same directory.
    tensorboard = TensorBoard(log_dir='logs/{}/fold_{}'.format(run_name, index))

    # A fresh model is trained for every fold; after the loop, `model` refers
    # to the model of the last fold only.
    model = baseline_model(inputDim)
    model.fit(xtrain, ytrain,
              epochs=epochs,
              validation_data=(xval, yval),
              callbacks=[tensorboard],
              batch_size=batch_size)

# train, test = train_test_split(df1, test_size=0.2)
# train, val = train_test_split(train, test_size=0.2)
# val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
# test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

#### Create Data for Analysis

#### Define dimension of input values


#### Model Creation

In [None]:
# Save the model left bound to `model` by the training loop (the last fold)
# as '<basename of dataPath>.model' in the working directory.
model.save('{}.model'.format(os.path.basename(dataPath)))

#### Setup Final Results

In [None]:
# NOTE(review): X includes the rows the model was trained on, so this is not
# a held-out estimate -- confirm this is the intended "baseline" figure.
scores = model.evaluate(X, encoded_y, verbose=1)
print(model.metrics_names)
# evaluate() returns the loss first, followed by the compiled metrics.
# Accuracy is reported as a percentage; the loss is not a percentage and is
# therefore reported unscaled.
loss, acc = scores[0], scores[1] * 100
print('Baseline: accuracy: {:.2f}%: loss: {:.4f}'.format(acc, loss))

# Append (never overwrite) so results from earlier runs are kept.
os.makedirs(resultPath, exist_ok=True)
resultFile = os.path.join(resultPath, dataFile)
with open('{}.result'.format(resultFile), 'a') as fout:
    fout.write('{} results...'.format(model_name))
    fout.write('\taccuracy: {:.2f} loss: {:.4f}\n'.format(acc, loss))