In [1]:
import pandas as pd
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Activation
from nltk.util import ngrams
import os
import sys
import random
from datetime import datetime
import baker
from sklearn.metrics import roc_curve, auc
import matplotlib
matplotlib.use('pdf')
import matplotlib.pyplot as plt
import logging
import json
from sklearn.model_selection import train_test_split
import mmh3
%matplotlib inline

Using TensorFlow backend.


In [None]:
def data_clean(file,target,col_1,col_2,col_3):
    """Load a CSV, build hashed features, and return a 70/30 train/test split.

    Parameters
    ----------
    file : path (or buffer) handed to ``pd.read_csv``.
    target : name of the label column; values are cast to ``int``.
    col_1 : column whose values are split on "/" via ``extract_tokens``.
    col_2 : column whose values are split on "." via ``extract_host`` and
        then reversed.
    col_3 : name of the column that receives the concatenation of the
        ``col_1`` and ``col_2`` token lists.

    Returns
    -------
    (X_train, X_test, y_train, y_test) where the X arrays are 2-D
    (rows, hashed-feature dimension) and the y arrays are integer labels.
    """
    ## seed numpy's global RNG so the train/test split below is reproducible
    np.random.seed(3)

    ## import data as a dataframe
    data = pd.read_csv(file)

    ## filter out null targets FIRST, so the labels and the features are
    ## derived from exactly the same rows (and the int cast never sees NaN)
    data = data[pd.notnull(data[target])].copy()
    y_label = np.array(data[target].values.astype(int))

    ## tokenize both columns; the host parts are reversed before being
    ## appended to the path tokens
    data[col_1] = [extract_tokens(i) for i in data[col_1]]
    data[col_2] = [extract_host(i)[::-1] for i in data[col_2]]
    data[col_3] = data[col_1] + data[col_2]

    ## 70/30 training and testing split on data
    X = np.array(eng_hash(data[col_3].values))
    X_train, X_test, y_train, y_test = train_test_split(X, y_label, test_size=0.3)

    ## eng_hash wraps every row vector in a one-element list, so X is
    ## (rows, 1, dim); drop the middle axis to get (rows, dim)
    X_train = X_train.reshape(X_train.shape[0], X_train.shape[2])
    X_test = X_test.reshape(X_test.shape[0], X_test.shape[2])

    return (X_train, X_test, y_train, y_test)

In [None]:
def extract_tokens(element):
    """Return the "/"-separated pieces of ``str(element)`` as a list."""
    text = str(element)
    return text.rsplit("/")

In [None]:
def extract_host(element):
    """Return the "."-separated pieces of ``str(element)`` as a list."""
    text = str(element)
    return text.rsplit(".")

In [None]:
def eng_hash(data, vdim=1000):
    """Feature-hash every item of ``data`` into a count vector of length ``vdim``.

    For each item, consecutive 3-grams are taken with ``ngrams``, the three
    members of each gram are joined into one string, and that string is
    hashed with ``mmh3.hash`` modulo ``vdim`` to pick the bucket to increment.

    Returns a list with one entry per item; each entry is a one-element list
    holding the numpy count vector.
    """
    hashed_rows = []
    for url in data:
        buckets = [0] * vdim
        for gram in ngrams(url, 3):
            slot = mmh3.hash(''.join(gram)) % vdim
            buckets[slot] += 1
        hashed_rows.append([np.array(buckets)])
    return hashed_rows

In [None]:
class LossHistory(keras.callbacks.Callback):
    """Keras callback that records the training loss reported after each batch.

    After ``fit`` the per-batch values are available in ``self.losses``.
    """

    def on_train_begin(self, logs=None):
        # start from an empty history every time training begins
        self.losses = []

    def on_batch_end(self, batch, logs=None):
        # `logs` may be omitted or None; treat that like an empty dict so the
        # entry recorded for that batch is simply None
        logs = logs or {}
        self.losses.append(logs.get('loss'))

In [None]:
def construct_model(input_dim=1000, dropout_rate=0.15):
    """Build and compile the feed-forward binary classifier.

    Parameters
    ----------
    input_dim : length of each input feature vector. Must match the
        ``vdim`` used by ``eng_hash`` (both default to 1000).
    dropout_rate : dropout rate applied after every hidden layer.

    Returns
    -------
    A compiled ``Sequential`` model: four hidden Dense layers
    (128, 64, 64, 32 units), each followed by batch normalization, ReLU and
    dropout, then a single sigmoid output unit.
    """
    model = Sequential()

    ## hidden layers with dropout and batchnormalization to prevent overfitting issues
    for layer_idx, units in enumerate((128, 64, 64, 32)):
        if layer_idx == 0:
            # only the first layer declares the input size
            model.add(Dense(units, input_dim=input_dim))
        else:
            model.add(Dense(units))
        model.add(BatchNormalization())
        model.add(Activation('relu'))
        model.add(Dropout(dropout_rate))

    ## final output layer for binary classification problem
    model.add(Dense(1, activation='sigmoid'))

    ## loss function: crossentropy, optimization_procedure: stochastic gradient descent, metrics: accuracy
    model.compile(loss='binary_crossentropy',
                  optimizer='SGD',
                  metrics=['accuracy'])
    return model

In [None]:
def train_model(X_train, y_train, model, epochs=20, batch_size=128, verbose=1):
    """Fit ``model`` on the training data while recording the per-batch loss.

    Parameters
    ----------
    X_train, y_train : training features and labels, passed straight to
        ``model.fit``.
    model : object exposing ``fit(X, y, epochs=..., batch_size=...,
        verbose=..., callbacks=...)``.
    epochs, batch_size, verbose : forwarded to ``model.fit``; the defaults
        are the values that were previously hard-coded.

    Returns
    -------
    (model, loss) where ``loss`` is the ``LossHistory`` callback that was
    attached to the fit call.
    """
    # No `log` object is defined in the notebook, so obtain a logger from the
    # already-imported logging module instead of relying on a hidden global.
    logging.getLogger(__name__).info("Beginning training model")
    loss = LossHistory()
    model.fit(X_train, y_train,
              epochs=epochs,
              batch_size=batch_size, verbose=verbose, callbacks=[loss])
    return model, loss

In [2]:
def find_nearest(array,value):
    """Return the index of the element of ``array`` closest to ``value``.

    ``array`` may be a numpy array or any sequence numpy can convert (list,
    tuple). When several elements are equally close, the first one wins
    (``argmin`` semantics). For multi-dimensional input the index refers to
    the flattened array.
    """
    ## coerce first so plain lists/tuples work as well as ndarrays
    distances = np.abs(np.asarray(array) - value)
    return distances.argmin()

In [None]:
def counts(actual, preds):
    """Tally the confusion-matrix cells for paired binary labels and predictions.

    Pairs are read with ``zip``, so extra items in the longer input are
    ignored. Only values equal to 0 or 1 are counted; anything else is
    skipped.

    Returns ``[[tp, fp, tn, fn]]`` (a single row inside a nested list).
    """
    tally = {"tp": 0, "fp": 0, "tn": 0, "fn": 0}
    for truth, guess in zip(actual, preds):
        if guess == 1 and truth == 1:
            tally["tp"] += 1
        elif guess == 0 and truth == 1:
            tally["fn"] += 1
        elif guess == 1 and truth == 0:
            tally["fp"] += 1
        elif guess == 0 and truth == 0:
            tally["tn"] += 1
    return [[tally["tp"], tally["fp"], tally["tn"], tally["fn"]]]

In [None]:
def results(X_train, y_train, X_test, y_test, model=None):
    """Score the test set with ``model`` and plot/save its ROC curve.

    Parameters
    ----------
    X_train, y_train : not used; kept so existing calls keep working.
    X_test : features passed to ``model.predict``.
    y_test : true binary labels; the positive class is 1.
    model : object exposing ``predict(X, batch_size=...)``. When omitted,
        the notebook-level global named ``model`` is used, which is what
        this function always did before the parameter existed.

    Side effects
    ------------
    Writes ``ROC_fig.png`` (300 dpi) to the working directory, shows the
    figure, then closes it. Returns ``None``.

    Raises
    ------
    NameError : no ``model`` argument was given and no global ``model`` exists.
    """
    if model is None:
        # backward compatibility with callers that rely on the global
        model = globals().get('model')
        if model is None:
            raise NameError("name 'model' is not defined; pass model=... to results()")

    preds = np.asarray(model.predict(X_test, batch_size=64), dtype=float)

    ## ensure format of preds is able to be handled by sklearn:
    ## always end up with one column per class, positive class in column 1
    if preds.ndim == 1:
        preds = preds.reshape(-1, 1)
    if preds.shape[1] == 1:
        # single sigmoid output -> [P(negative), P(positive)]
        preds_ = np.hstack((1.0 - preds, preds))
    else:
        # already one column per class; column 1 is taken as the positive score
        preds_ = preds

    ## get roc curve using sklearn (pos_label must be passed by keyword)
    fpr, tpr, thresh = roc_curve(y_test, preds_[:, 1], pos_label=1.0)
    curr_auc = float(auc(fpr, tpr))

    labels = ['Deep Learning w/time Split']
    fig, ax = plt.subplots()
    ## chance diagonal, sampled logarithmically because the x axis is log-scaled
    ax.plot(np.logspace(-10, 0, 1000), np.logspace(-10, 0, 1000), 'k--')
    ax.set_ylim([0, 1.0])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC')
    ax.step(fpr, tpr, linestyle='-', color='blue',
            label='Model {}  (AUC = {:0.4f}), '.format(labels[0], curr_auc))
    ax.legend(loc='lower right', prop={'size': 8})
    ## log x axis: the lower limit has to be strictly positive
    ax.set_xlim([1e-6, 1])
    ax.set_xscale('log')
    fig.savefig('ROC_fig.png', dpi=300)
    plt.show()
    plt.close(fig)