In [1]:
# Import the dataset saved on the google drive
from google.colab import drive

# Graphing capabilities
import matplotlib.pyplot as plt

# Data management
import pandas as pd
import numpy as np

# For stratified 10-fold cross validation
from sklearn.model_selection import StratifiedKFold

# Scikit-Learn ML Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

# Keras-TensorFlow DNN Model
from keras.models import Sequential
from keras.layers import BatchNormalization, Dense, Dropout
from keras.regularizers import l2

# Fast.ai DNN Model
from fastai.tabular import *

# Normalization
from keras.utils import normalize, to_categorical

print('Imports complete.')

Imports complete.


## Functions Used


In [2]:
def train_and_eval_on(X, y, feature_set, models=('rf', 'keras', 'fastai')):
    """
    train_and_eval_on function
        Description: This function trains the selected models on the given feature set of X (data)
                     for predicting y (target) with stratified, shuffled 10-fold cross validation and
                     records each model's accuracy on the held-out fold.

        Args:
            X => pd.DataFrame object containing the data
            y => pd.Series object containing the target classifications (row-aligned with X)
            feature_set => list of features in X to use for training
            models => iterable of model keys (see Keys below) to train; a single key may be given
                      as a string. Defaults to ('rf', 'keras', 'fastai').

        Returns:
            metrics => dictionary where the model names are the key and a list of accuracies across all folds is the value.
                       Models that were not selected keep an empty list.
                    Keys:
                        Random Forest => rf
                        Decision Tree => dt
                        k-Nearest Neighbors => knn
                        Support Vector Machine => svm
                        Logistic Regression => lr
                        Linear Discriminant Analysis => lda
                        AdaBoost => ab
                        Naive Bayes => nb
                        Keras-TensorFlow => keras
                        Fast.ai => fastai

        Raises:
            ValueError => if `models` contains a key that is not listed above

        Notes:
            The Fast.ai branch reads the module-level `path` variable (passed to TabularList.from_df).
    """
    metrics = {'rf':[],
                'dt':[],
                'knn':[],
                'svm':[],
                'lr':[],
                'lda':[],
                'ab':[],
                'nb':[],
                'keras':[],
                'fastai':[]}

    # Normalise the model selection and reject typos early instead of silently training nothing
    if isinstance(models, str):
        models = [models]
    selected = set(models)
    unknown = selected - set(metrics)
    if unknown:
        raise ValueError('Unknown model key(s): {}'.format(sorted(unknown)))

    # Select the given features within the data
    X = X[feature_set]

    print('Training with {} features'.format(len(X.columns)))

    # Create stratified, 10-fold cross validation object
    random_state = 0
    sss = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)

    # Scikit-Learn models share the same fit/score protocol, so build them from a table of
    # factories (a fresh, unfitted estimator is created for every fold)
    sklearn_factories = {
        'rf': lambda: RandomForestClassifier(random_state=random_state),
        'dt': lambda: DecisionTreeClassifier(random_state=random_state),
        'knn': lambda: KNeighborsClassifier(),
        'svm': lambda: SVC(random_state=random_state),
        'lr': lambda: LogisticRegression(random_state=random_state),
        'lda': lambda: LinearDiscriminantAnalysis(),
        'ab': lambda: AdaBoostClassifier(random_state=random_state),
        'nb': lambda: GaussianNB(),
    }

    if 'keras' in selected:
        # One-hot encode once on the full target so that the train and test folds always share
        # the same class columns (encoding each fold separately can yield mismatched columns)
        y_onehot = pd.get_dummies(y, dtype='float32')

    if 'fastai' in selected:
        # Build the Fast.ai frame from the X and y passed in (not from notebook globals) so the
        # learner sees the same rows, features and labels as every other model -- in particular
        # a relabelled (e.g. binary) y. The index is reset so that the positional fold indices
        # produced by StratifiedKFold address the intended rows.
        label_col = 'target' if y.name is None else str(y.name)
        while label_col in X.columns:
            label_col += '_'
        fastai_df = X.reset_index(drop=True)
        fastai_df[label_col] = y.values

    # Experiment with 10-fold cross validation
    for train_idx, test_idx in sss.split(X, y):
        # Split the data into the training and testing sets
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Scikit-Learn models
        for name, make_model in sklearn_factories.items():
            if name not in selected:
                continue
            model = make_model()
            model.fit(X_train, y_train)
            metrics[name].append(model.score(X_test, y_test))

        # Keras-TensorFlow DNN Model
        if 'keras' in selected:
            dnn_keras = Sequential(layers=[
                                     Dense(128, kernel_regularizer=l2(0.001), activation='relu',input_shape=(len(X_train.columns),)),
                                     BatchNormalization(),
                                     Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
                                     BatchNormalization(),
                                     Dense(y_onehot.shape[1], activation='softmax')
            ])
            dnn_keras.compile(
                optimizer='adam',
                loss='categorical_crossentropy',
                metrics=['accuracy'])
            dnn_keras.fit(X_train, y_onehot.iloc[train_idx], epochs=100, verbose=0, batch_size=512)
            _, score = dnn_keras.evaluate(X_test, y_onehot.iloc[test_idx], verbose=0)
            metrics['keras'].append(score)

        # Fast.ai DNN Model
        if 'fastai' in selected:
            data_fold = (TabularList.from_df(fastai_df, path=path, cont_names=list(X.columns), procs=[Categorify, Normalize])
                         .split_by_idxs(train_idx, test_idx)
                         .label_from_df(cols=label_col)
                         .databunch(num_workers=0))
            dnn_fastai = tabular_learner(data_fold, layers=[200, 100], metrics=accuracy)
            dnn_fastai.fit_one_cycle(cyc_len=10, callbacks=None)
            _, score = dnn_fastai.validate()
            # validate() yields the metric as a tensor; store a plain float like the other models
            metrics['fastai'].append(float(score))

    return metrics

In [3]:
def show_graph(figure, feature_count, metrics_dict, exp_type=''):
  """
  show_graph function

    Description: This function takes the metrics dictionary provided and plots, for every model that has
                 results, the mean accuracy (with the standard deviation across folds as error bars)
                 against the number of features used.

    Args:
      figure => matplotlib.pyplot.figure object; not used by this function (drawing goes through the
                pyplot interface), kept so existing callers keep working
      feature_count => not used by this function, kept so existing callers keep working
      metrics_dict => dictionary mapping a feature count to the metrics dictionary returned by the
                      `train_and_eval_on` function for that feature count
      exp_type => string indicating the type of experiment to change the title of the graph
                  ('multi' or 'binary'; any other value leaves the graph untitled)

    Returns:
      nothing
  """
  # Reorganize the data so each model maps to {feature count: (mean accuracy, std of accuracy)}
  summary_by_model = {}

  for feature_vals, per_model in metrics_dict.items():
    for model, accuracies in per_model.items():
      by_feature_count = summary_by_model.setdefault(model, {})

      # Models that were not trained have an empty accuracy list and are skipped
      if len(accuracies) > 0:
        by_feature_count[feature_vals] = (np.mean(accuracies), np.std(accuracies))

  # Every feature count that ends up on the graph, across all models, so the x ticks do not
  # depend on which model happens to be iterated last
  plotted_feature_counts = set()

  for model, by_feature_count in summary_by_model.items():
    if not by_feature_count:
      continue

    # The x-axis has the feature count, the y-axis the mean accuracy for that feature count,
    # and the error bars the std of the accuracies accumulated over the folds
    xs = list(by_feature_count.keys())
    ys = [by_feature_count[x][0] for x in xs]
    yerrs = [by_feature_count[x][1] for x in xs]

    plt.errorbar(x=xs, y=ys, yerr=yerrs, label=model)
    plotted_feature_counts.update(xs)

  if exp_type == 'multi':
    plt.title('Multi-class Classification Model Accuracies with Increasing Features')
  elif exp_type == 'binary':
    plt.title('Binary Classification Model Accuracies with Increasing Features')
  plt.ylabel('Accuracy')
  plt.xlabel('Number of Features')

  # Only set ticks / legend when something was drawn (an empty legend triggers a warning and
  # an empty tick list would blank the axis)
  if plotted_feature_counts:
    plt.xticks(sorted(plotted_feature_counts))
    plt.legend()

  plt.show()


## Selected Features (Multi-class)

In [4]:
# Feature ranking for the multi-class experiment, provided by the feature selection jupyter notebook.
# The multi-class experiment trains on the first 1..20 entries of this list (prefix slices), so the
# order matters -- presumably best feature first, as stated for the binary list; confirm against
# the feature selection notebook.
best_features_multiclass = ['Entropy_Afterpath', 'argPathRatio', 'NumberRate_AfterPath', 'NumberRate_Domain', 'ArgUrlRatio', 'Extension_DigitCount', 'dld_getArg', 'ldl_getArg', 'Query_DigitCount', 'LongestVariableValue', 'Querylength', 'Query_LetterCount', 'ldl_path', 'ArgLen', 'ldl_url', 'dld_path', 'Extension_LetterCount', 'argDomanRatio', 'dld_url', 'URL_DigitCount', 'LongestPathTokenLength', 'URLQueries_variable', 'fileNameLen', 'delimeter_Count', 'NumberRate_URL', 'SymbolCount_Domain', 'domain_token_count', 'tld', 'SymbolCount_Extension', 'this.fileExtLen', 'pathLength', 'subDirLen', 'urlLen', 'charcompace', 'host_DigitCount', 'SymbolCount_Afterpath', 'URL_Letter_Count', 'pathDomainRatio', 'SymbolCount_FileName', 'domainUrlRatio', 'NumberRate_Extension', 'NumberRate_FileName', 'SymbolCount_URL', 'Entropy_Filename', 'Entropy_DirectoryName', 'delimeter_path', 'Arguments_LongestWordLength', 'charcompvowels', 'CharacterContinuityRate', 'spcharUrl', 'executable', 'pathurlRatio', 'Filename_LetterCount', 'Entropy_Extension', 'dld_filename', 'ldl_filename', 'SymbolCount_Directoryname', 'avgdomaintokenlen', 'path_token_count', 'File_name_DigitCount', 'NumberRate_DirectoryName', 'delimeter_Domain', 'Domain_LongestWordLength', 'NumberofDotsinURL', 'Directory_DigitCount', 'Directory_LetterCount', 'URL_sensitiveWord', 'longdomaintokenlen', 'ldl_domain', 'domainlength', 'Entropy_Domain', 'host_letter_count', 'avgpathtokenlen', 'isPortEighty', 'sub-Directory_LongestWordLength', 'dld_domain', 'Path_LongestWordLength', 'Entropy_URL']

## Data Preparation

In [5]:
# Mount Google Drive inside the Colab runtime at /content/gdrive so the dataset can be read
# with ordinary file paths. This step is interactive (it asks for an authorization code).
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [6]:
# Import the data from the mounted Google Drive folder.
# NOTE: `path` is not only used here -- train_and_eval_on also reads it as a global and passes it
# to the Fast.ai TabularList, so it must stay defined before any experiment cell is run.
path = '/content/gdrive/My Drive/FinalDataset/'
fille = 'All.csv'
df = pd.read_csv(path + fille)
print('Data Read:')
print(df.head())

Data Read:
   Querylength  domain_token_count  ...  Entropy_Afterpath  URL_Type_obf_Type
0            0                   4  ...               -1.0         Defacement
1            0                   4  ...               -1.0         Defacement
2            0                   4  ...               -1.0         Defacement
3            0                   4  ...               -1.0         Defacement
4            0                   4  ...               -1.0         Defacement

[5 rows x 80 columns]


In [7]:
# Name of the label (dependent variable) column; every other column is treated as a feature
dep_var = 'URL_Type_obf_Type'

# Size of the raw dataset before any cleaning
print('There are {} columns and {} rows in the provided data.'.format(len(df.columns), len(df)))

There are 80 columns and 36697 rows in the provided data.


In [8]:
# Number of rows per class in the raw (uncleaned) data
print('Below is the dataset\'s composition')
print(df[dep_var].value_counts())

Below is the dataset's composition
Defacement    7930
benign        7781
phishing      7577
malware       6711
spam          6698
Name: URL_Type_obf_Type, dtype: int64


In [9]:
# Removes every row that contains at least one NaN value. This mutates `df` in place, and the
# index is not reset afterwards, so index labels are no longer contiguous.
# NOTE(review): in the recorded run this shrinks the data from 36697 to 18982 rows and shifts the
# class balance noticeably -- confirm that dropping (rather than imputing) is intended.
df.dropna(axis='index', inplace=True)

In [10]:
# Re-check the dataset size and the per-class row counts after the NaN rows were removed
print('There are {} columns and {} rows in the provided data.'.format(len(df.columns), len(df)))

print('Below is the dataset\'s composition')
print(df[dep_var].value_counts())

There are 80 columns and 18982 rows in the provided data.
Below is the dataset's composition
spam          5342
malware       4440
phishing      4014
benign        2709
Defacement    2477
Name: URL_Type_obf_Type, dtype: int64


In [11]:
# Separate the labels (y) from the feature columns (X); X is every column except the label,
# passed through Keras' normalize utility.
# NOTE(review): keras.utils.normalize is called with its default arguments -- presumably an L2
# normalisation along the last axis (i.e. per row, not per feature); confirm this is intended.
y = df[dep_var]
X = normalize( df[[column for column in df.columns if column != dep_var]] )

## Multi-class Experiments

In [None]:
# Run the multi-class experiment with the top 1, 2, ..., 20 ranked features.
# Each call performs a full 10-fold cross validation, so this cell is slow.
# NOTE: show_graph never reads its `figure` argument, so `fig` is only passed along.
fig = plt.figure()
# Maps number of features used -> metrics dictionary returned by train_and_eval_on
multi_performance_metrics = {}
for i in range(1, 21):
    # `features` stays bound to the last (20-feature) slice after the loop and is read by the
    # plotting cell that follows
    features = best_features_multiclass[:i]
    multi_performance_metrics[i] = train_and_eval_on(X=X, y=y, feature_set=features)

Training with 1 features


In [None]:
# Plot mean accuracy (with std error bars) per model against the number of features used.
# Relies on `fig`, `features` and `multi_performance_metrics` from the experiment cell above.
show_graph(figure=fig, feature_count=len(features), metrics_dict=multi_performance_metrics, exp_type='multi')

## Binary Classification


In [None]:
# Collapse the labels into a two-class problem: 'benign' is kept, every other label becomes
# 'malicious'.
# NOTE: this rebinds `y`, so the multi-class labels are gone from here on; re-create `y` from
# `df[dep_var]` before re-running any multi-class cell.
print('Before conversion:')
print(y.value_counts())

y = y.where(y == 'benign', 'malicious')

print('After conversion:')
print(y.value_counts())

In [None]:
# Feature ranking for the binary classification problem, ordered from best to worst.
# The binary experiment trains on the first 1..20 entries of this list (prefix slices).
best_features_binclass = ['fileNameLen', 'domain_token_count', 'tld', 'SymbolCount_Domain', 'Entropy_Afterpath', 'delimeter_path', 'argPathRatio', 'Entropy_Filename', 'Entropy_DirectoryName', 'Filename_LetterCount', 'NumberRate_AfterPath', 'NumberRate_Extension', 'Entropy_Extension', 'ArgUrlRatio', 'NumberRate_FileName', 'dld_getArg', 'CharacterContinuityRate', 'NumberRate_DirectoryName', 'dld_filename', 'Extension_DigitCount', 'NumberRate_Domain', 'domainUrlRatio', 'dld_path', 'ldl_getArg', 'SymbolCount_URL', 'Query_DigitCount', 'dld_url', 'ldl_url', 'ldl_path', 'SymbolCount_Directoryname', 'ArgLen', 'this.fileExtLen', 'Querylength', 'File_name_DigitCount', 'Query_LetterCount', 'LongestVariableValue', 'Extension_LetterCount', 'domainlength', 'NumberofDotsinURL', 'Entropy_Domain', 'path_token_count', 'Directory_DigitCount', 'ldl_filename', 'URLQueries_variable', 'delimeter_Count', 'argDomanRatio', 'SymbolCount_FileName', 'host_DigitCount', 'SymbolCount_Extension', 'pathurlRatio', 'host_letter_count', 'URL_DigitCount', 'delimeter_Domain', 'longdomaintokenlen', 'URL_Letter_Count', 'urlLen', 'LongestPathTokenLength', 'URL_sensitiveWord', 'SymbolCount_Afterpath', 'Domain_LongestWordLength', 'pathLength', 'subDirLen', 'executable', 'spcharUrl', 'Arguments_LongestWordLength', 'avgpathtokenlen', 'charcompace', 'ldl_domain', 'Entropy_URL', 'isPortEighty', 'Directory_LetterCount', 'pathDomainRatio', 'dld_domain', 'NumberRate_URL', 'sub-Directory_LongestWordLength', 'Path_LongestWordLength', 'charcompvowels', 'avgdomaintokenlen']

In [None]:
# Run the binary experiment with the top 1, 2, ..., 20 ranked features.
# Each call performs a full 10-fold cross validation, so this cell is slow.
# Expects `y` to already hold the binary ('benign' / 'malicious') labels.
# NOTE: show_graph never reads its `figure` argument, so `fig` is only passed along.
fig = plt.figure()
# Maps number of features used -> metrics dictionary returned by train_and_eval_on
bin_performance_metrics = {}
for i in range(1, 21):
    # `features` stays bound to the last (20-feature) slice after the loop and is read by the
    # plotting cell that follows
    features = best_features_binclass[:i]
    bin_performance_metrics[i] = train_and_eval_on(X=X, y=y, feature_set=features)

In [None]:
# Plot mean accuracy (with std error bars) per model against the number of features used.
# Relies on `fig`, `features` and `bin_performance_metrics` from the experiment cell above.
show_graph(figure=fig, feature_count=len(features), metrics_dict=bin_performance_metrics, exp_type='binary')