# 1. Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import os

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

In [3]:
from keras.models import Sequential
from keras.layers import BatchNormalization, Dense, Dropout
from keras.regularizers import l2

# 2. Data Loading

In [4]:
# Read the feature table from <parent of the working directory>/dataset/features.csv.
parent_dir, _ = os.path.split(os.getcwd())
features_path = os.path.join(parent_dir, 'dataset', 'features.csv')
df_with_features = pd.read_csv(features_path, index_col=0)

# Sanity checks: overall shape and how many rows carry each `type` label.
print(df_with_features.shape)
print(df_with_features['type'].value_counts())
df_with_features.head()

(651191, 22)
benign        428103
defacement     96457
phishing       94111
malware        32520
Name: type, dtype: int64


Unnamed: 0,url,type,use_of_ip,abnormal_url,count_.,count_www,count_@,count_dir,count_embed_domain,short_url,...,count_%,count_?,count_,count_=,url_length,hostname_length,sus_url,fd_length,tld_length,type_code
0,br-icloud.com.br,phishing,0,0,2,0,0,0,0,0,...,0,0,1,0,16,0,0,0,-1,3
1,mp3raid.com/music/krizz_kaliko.html,benign,0,0,2,0,0,2,0,0,...,0,0,0,0,35,0,0,5,-1,0
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,0,2,0,0,3,0,0,...,0,0,0,0,31,0,0,7,-1,0
3,http://www.garage-pirenne.be/index.php?option=...,defacement,0,1,3,1,0,1,0,0,...,0,1,1,4,88,21,0,9,2,1
4,http://adventure-nicaragua.net/index.php?optio...,defacement,0,1,2,0,0,1,0,0,...,0,1,1,3,235,23,0,9,3,1


In [5]:
# Feature matrix: everything except the raw URL and the two label columns.
# DataFrame.drop returns a new frame, so df_with_features itself is not modified.
X = df_with_features.drop(columns=['url', 'type', 'type_code'])
X.head()

Unnamed: 0,use_of_ip,abnormal_url,count_.,count_www,count_@,count_dir,count_embed_domain,short_url,count_https,count_http,count_%,count_?,count_,count_=,url_length,hostname_length,sus_url,fd_length,tld_length
0,0,0,2,0,0,0,0,0,0,0,0,0,1,0,16,0,0,0,-1
1,0,0,2,0,0,2,0,0,0,0,0,0,0,0,35,0,0,5,-1
2,0,0,2,0,0,3,0,0,0,0,0,0,0,0,31,0,0,7,-1
3,0,1,3,1,0,1,0,0,0,1,0,1,1,4,88,21,0,9,2
4,0,1,2,0,0,1,0,0,0,1,0,1,1,3,235,23,0,9,3


In [10]:
# Names of every feature column in X, in column order.
complete_feature_set = X.columns.tolist()
print(complete_feature_set)

['use_of_ip', 'abnormal_url', 'count_.', 'count_www', 'count_@', 'count_dir', 'count_embed_domain', 'short_url', 'count_https', 'count_http', 'count_%', 'count_?', 'count_', 'count_=', 'url_length', 'hostname_length', 'sus_url', 'fd_length', 'tld_length']


In [7]:
# Target vector: the `type_code` column, copied so it is independent of df_with_features.
y = df_with_features['type_code'].copy(deep=True)
y.head()

0    3
1    0
2    0
3    1
4    1
Name: type_code, dtype: int64

In [8]:
# Set aside 2% of the rows as a hold-out set for final testing;
# the remaining rows (X_tune / y_tune) are used for model selection.
X_tune, X_holdout, y_tune, y_holdout = train_test_split(
    X, y, test_size=0.02, random_state=99
)
for split_part in (X_tune, X_holdout, y_tune, y_holdout):
    print(split_part.shape)

(638167, 19)
(13024, 19)
(638167,)
(13024,)


In [27]:
def train_and_eval(X, y, feature_set=None, models_to_use=None, folds=10):
    """
    Cross-validate the selected models on features X and targets y.

    :param X: the features (a pandas DataFrame; rows are selected with .iloc)
    :param y: the targets (a pandas Series of class labels aligned with X)
    :param feature_set: the columns of X to train on; when empty or None every
        column of X is used
    :param models_to_use: the model keys to train. Recognised keys are 'rf', 'dt',
        'knn', 'svm', 'lr', 'lda', 'ab', 'nb' and 'dnn' ('keras' is accepted as an
        alias of 'dnn'). Unrecognised keys are skipped with a warning.
    :param folds: how many folds to use for StratifiedKFold
    :return: a dictionary with the keys 'rf', 'dt', 'knn', 'svm', 'lr', 'lda',
        'ab', 'nb' and 'keras'; each value is the list of accuracies across all
        folds (empty for models that were not selected). The DNN's accuracies
        are stored under 'keras'.
    """
    import warnings  # local import: only used to report unrecognised model keys

    # None stands in for the former mutable [] defaults; both mean "nothing given".
    feature_set = [] if feature_set is None else list(feature_set)
    models_to_use = [] if models_to_use is None else models_to_use

    if len(feature_set) != 0:
        X = X[feature_set]

    random_state = 100
    metrics = {
            'rf':[],
            'dt':[],
            'knn':[],
            'svm':[],
            'lr':[],
            'lda':[],
            'ab':[],
            'nb':[],
            'keras':[],
            }

    # The DNN is requested as 'dnn' but reported under 'keras'; accept either
    # spelling so that a caller using the result key does not silently train nothing.
    use_dnn = 'dnn' in models_to_use or 'keras' in models_to_use

    unknown = [name for name in models_to_use if name not in set(metrics) | {'dnn'}]
    if unknown:
        warnings.warn(
            "Ignoring unrecognised model keys: {}".format(unknown), stacklevel=2)

    # Factories are lambdas so each fold gets a fresh, unfitted estimator and
    # only the estimators that are actually selected get constructed.
    sklearn_factories = {
        'rf': lambda: RandomForestClassifier(random_state=random_state, verbose=2),
        'dt': lambda: DecisionTreeClassifier(random_state=random_state),
        'knn': lambda: KNeighborsClassifier(),
        'svm': lambda: SVC(random_state=random_state),
        'lr': lambda: LogisticRegression(random_state=random_state, verbose=2),
        'lda': lambda: LinearDiscriminantAnalysis(),
        'ab': lambda: AdaBoostClassifier(random_state=random_state),
        'nb': lambda: GaussianNB(),
    }

    # Class list taken from the full target so that every fold is one-hot
    # encoded with the same columns, even if a fold happens to miss a class.
    classes = sorted(y.unique()) if use_dnn else []

    def one_hot(labels):
        encoded = pd.get_dummies(labels).reindex(columns=classes, fill_value=0)
        return encoded.astype('float32')

    print('Training with {} features'.format(len(X.columns)))
    print('Using {} fold cross validation'.format(folds))
    skf = StratifiedKFold(n_splits=folds, shuffle=False, random_state=None)

    print("Using models:", models_to_use)

    for fold_no, (train_idx, test_idx) in enumerate(skf.split(X, y), start=1):
        print("On fold {}".format(fold_no))
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # scikit-learn models: fit on the training part, record accuracy on the test part
        for key, make_model in sklearn_factories.items():
            if key in models_to_use:
                model = make_model()
                model.fit(X_train, y_train)
                metrics[key].append(model.score(X_test, y_test))

        if use_dnn:
            # Keras-TensorFlow DNN Model
            dnn_keras = Sequential(layers=[
                    Dense(64, kernel_regularizer=l2(0.001), activation='relu',input_shape=(len(X_train.columns),)),
                    BatchNormalization(),
                    Dense(32, activation='relu', kernel_regularizer=l2(0.001)),
                    BatchNormalization(),
                    Dense(len(classes), activation='softmax')
            ])
            # summary() prints the table itself and returns None, so it is not wrapped in print()
            dnn_keras.summary()
            dnn_keras.compile(
                optimizer='adam',
                loss='categorical_crossentropy',
                metrics=['accuracy'])
            dnn_keras.fit(X_train, one_hot(y_train), epochs=10, verbose=1, batch_size=256)
            _, score = dnn_keras.evaluate(X_test, one_hot(y_test), verbose=1)
            metrics['keras'].append(score)

    return metrics

In [30]:
# Candidate subset made up of the count-based features only.
features_to_use = [
    'count_.',
    'count_www',
    'count_@',
    'count_dir',
    'count_https',
    'count_http',
    'count_%',
    'count_?',
    'count_',
    'count_=',
]

# An empty feature_set makes train_and_eval train on every column of X_tune.
metrics = train_and_eval(
    X=X_tune,
    y=y_tune,
    feature_set=[],
    models_to_use=["dt"],
)

Training with 19 features
Using 10 fold cross validation
Using models: ['dt']
On fold 1
On fold 2
On fold 3
On fold 4
On fold 5
On fold 6
On fold 7
On fold 8
On fold 9
On fold 10


In [31]:
# Accuracy of the decision tree on each cross-validation fold.
dt_fold_scores = metrics['dt']
print(dt_fold_scores)

[0.9580675995424417, 0.959446542457339, 0.9578325524546751, 0.9576445147844618, 0.9588197502232947, 0.9586317125530814, 0.9570960715796731, 0.9585997242070954, 0.9597122978563369, 0.9585840541556976]
