# Dataset Surtur

In [2]:
import pandas as pd 
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500


import numpy as np
from glob import glob as globlin
from bs4 import BeautifulSoup
import re
from urlextract import URLExtract
import statistics
import os
import csv
import onemillion

from tqdm import tqdm
tqdm.pandas()

import signal
from contextlib import contextmanager

from xgboost import XGBClassifier

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

import matplotlib.pyplot as plt

import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import plot, iplot
import plotly.express as px


@contextmanager
def timeout(time):
    # Register a function to raise a TimeoutError on the signal.
    signal.signal(signal.SIGALRM, raise_timeout)
    # Schedule the signal to be sent after ``time``.
    signal.alarm(time)

    try:
        yield
    except TimeoutError:
        return 'timeout error'
    finally:
        # Unregister the signal so it won't be triggered
        # if the timeout is not reached.
        signal.signal(signal.SIGALRM, signal.SIG_IGN)


def raise_timeout(signum, frame):
    raise TimeoutError

## Reading New Data

In [3]:
def read_data(main_path):
    """reading the benign samples"""
    dataframe_array = []
    benign_data_paths = globlin(main_path)
    for path in benign_data_paths:
        dataframe_array.append(pd.read_csv(path))
        #print(pd.read_csv(path).columns)
    return pd.concat(dataframe_array)

In [5]:
dataset_surtur = read_data('./data_construction/4 - final_data/*.csv').drop(columns=['Unnamed: 0.1'])
df_mal_sup = read_data('./data_construction/supp_malicious_1.csv')


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





In [None]:
df_mal_sup = df_mal_sup.drop(columns=['Label'])

In [None]:
dataset_surtur = dataset_surtur.drop(columns=['status', 'https.1'])

In [None]:
df_mal_sup.columns = ['url', 'has_IP_in_url', 'number_subdomains', 'hostname',
       'length_hostname', 'ratio_digits_url', 'having_@_in_url',
       'ratio_digits_hostname', 'number_underscores', 'tld', 'url_len',
       'https', 'who_is', 'content', 'label']

df_mal_sup.columns

In [None]:
dataset_surtur = pd.concat([dataset_surtur, df_mal_sup], axis=0, sort=False).drop_duplicates()
dataset_surtur.head()

In [None]:
dataset_surtur['label'].value_counts()

## Extracting JS

In [None]:
def get_js(content, supp=False):
    tags_of_interest = [
        '<script type="text/javascript">', 
        '<script>'
    ]
    
    soup = BeautifulSoup(content, 'html.parser')
    js = soup.find_all('script')
    complete_js = ''
    for tag in js:
        for tag_int in tags_of_interest:
            if tag_int in str(tag):
                complete_js += str(tag).replace(tag_int, '').replace(
                    '</script>', '')
    return complete_js.encode('utf-8')

In [None]:
dataset_surtur['js'] = dataset_surtur['content'].progress_apply(lambda content: get_js(str(content)))

## JS Length

In [None]:
dataset_surtur['js_len'] = dataset_surtur['js'].progress_apply(lambda js: len(js))

In [None]:
ex_string_thing = dataset_surtur[dataset_surtur['js_len']==0].iloc[1]['content']

## Counting Script tag references in html page 

In [None]:
def script_references(js):
    return js.count('<script')/2

In [None]:
dataset_surtur['js_ref'] = dataset_surtur['content'].progress_apply(
    lambda content: script_references(str(content)))

## Getting array lengths from JS 

In [None]:
def get_max_array_length(js):
    array_lengths = re.findall('\(([^\)]+)\)', js)
    if array_lengths == []:
        return 0
    return max([len(i) for i in array_lengths])

def get_avg_array_length(js):
    array_lengths = re.findall('\(([^\)]+)\)', js)
    if array_lengths == []:
        return 0
    return statistics.mean([len(i) for i in array_lengths])

In [None]:
dataset_surtur['js_array_len_avg'] = dataset_surtur['js'].progress_apply(
    lambda js: get_avg_array_length(str(js)))
dataset_surtur['js_array_len_max'] = dataset_surtur['js'].progress_apply(
    lambda js: get_max_array_length(str(js)))

## Content Length

In [None]:
dataset_surtur['content_len'] = dataset_surtur['content'].progress_apply(
    lambda x: len(str(x)))

## Number of Function calls

In [None]:
dataset_surtur['js'][1]

In [None]:
def get_func_calls(js):
    full_paren = len(re.findall("\(([^\)]+)\)", js))
    empty_paren = len(js.split('()'))
    return full_paren + empty_paren

In [None]:
dataset_surtur['num_js_func_calls'] = dataset_surtur['js'].progress_apply(
    lambda x: get_func_calls(str(x)))

## Suspicious Function Count

In [None]:
def get_sus_js_function_count(js):
    function_list = [
        'setcookie', 'getcookie', 'createxmlhttprequest', 'unescape',
        'document.write', 'element.appendchild', 'dateobject.togmtstring',
        'new activexobject', 'document.createelement', 'getappname',
        'getuseragent', 'window.setinterval', 'window.settimeout',
        'location.assign', 'location.replace', 'eval()', 'string.indexof',
        'string.fromcharcode', 'charat', 'split',
        'string.charcodeat', 'document.writeln', 'document.appendchild',
        'innerhtml', 'insertAdjacentHTML', 'outerhtml', 
    ]

    split_js = js.replace('\n',' ').split(' ')
    counter = 0
    for element in split_js:
        if any(m_function in element.lower() for m_function in function_list):
            counter += 1

    return counter

In [None]:
dataset_surtur['malicious_func_count'] = dataset_surtur['js'].progress_apply(
    lambda js: get_sus_js_function_count(str(js)))

## Get total and external URL count in content

In [None]:
# def find_urls(string, ext_count):
#     # with timeout(1):
#     regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
#     url = re.findall(regex, string)
#     if ext_count:
#         return len(set(url))
#     return len(url)


# # Test Code
# string = 'My Profile: https://auth.geeksforgeeks.org/user/Chinmoy%20Lenka/articles in the portal of http://www.geeksforgeeks.org/'
# print("Urls: ", find_urls(string, False))

In [None]:
def find_urls(string, ext_count):
    try:
        extractor = URLExtract()
        urls = extractor.find_urls(string)
        if ext_count: 
            return len(set(urls))
        return len(urls)
    except:
        return 10000

string_ex = 'My Profile: https://auth.geeksforgeeks.org/user/Chinmoy%20Lenka/articles in the portal of http://www.geeksforgeeks.org/'
    
print(find_urls(string_ex, False))

In [None]:
dataset_surtur['total_url_count'] = dataset_surtur['content'].progress_apply(
    lambda js: find_urls(str(js), False)
)

In [None]:
dataset_surtur['ext_url_count'] = dataset_surtur['content'].progress_apply(
    lambda js: find_urls(str(js), True)
)

In [None]:
dataset_surtur.columns

## More Lexical Features

In [None]:
dataset_surtur['num_semicolons'] = dataset_surtur['url'].progress_apply(lambda url: str(url).count(';'))
dataset_surtur['num_zeros'] = dataset_surtur['url'].progress_apply(lambda url: str(url).count('0'))
dataset_surtur['num_spaces'] = dataset_surtur['url'].progress_apply(lambda url: str(url).count('%20'))
dataset_surtur['num_hyphens'] = dataset_surtur['url'].progress_apply(lambda url: str(url).count('-'))
dataset_surtur['num_@s'] = dataset_surtur['url'].progress_apply(lambda url: str(url).count('@'))
dataset_surtur['num_queries'] = dataset_surtur['url'].progress_apply(lambda url: str(url).count('?'))
dataset_surtur['num_ampersands'] = dataset_surtur['url'].progress_apply(lambda url: str(url).count('&'))
dataset_surtur['num_equals'] = dataset_surtur['url'].progress_apply(lambda url: str(url).count('='))

## Safe Browsing API Judgement

In [11]:
with open('./safebrowsingkey.txt') as f:
    api_key = f.read()

In [12]:
import requests

key = 'your key here'
URL = "https://sb-ssl.google.com/safebrowsing/api/lookup?client=api&apikey={key}&appver=1.0&pver=3.0&url={url}"


def is_safe(url):
    try:
        response = requests.get(URL.format(key=api_key, url=url))
        return response.text != 'malware'
    except:
        return 'unknown'


print(is_safe('http://addonrock.ru/Debugger.js/'))  # prints False
print(is_safe('http://google.com'))  # prints True


dataset_surtur['google_is_safe'] = dataset_surtur['url'].progress_apply(lambda url: is_safe(url))

  0%|          | 0/105492 [00:00<?, ?it/s]

True
True


100%|██████████| 105492/105492 [4:26:58<00:00,  6.59it/s]   


## Retrieving Domain names and their lengths

In [None]:
dataset_surtur['domain'] = dataset_surtur['hostname'].progress_apply(
    lambda hostname: '.'.join(str(hostname).split('.')[-2:]))

In [None]:
dataset_surtur['domain_len'] = dataset_surtur['domain'].progress_apply(lambda domain: len(domain))

## Presence in Alexa domains

In [None]:
CONFIG = {
    'domain_lists': [
        {
            'name': "alexa",
            'output_file_path': "alexa.csv",
            'url': "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip"
        }, {
            'name': "cisco umbrella",
            'output_file_path': "cisco.csv",
            'url': "http://s3-us-west-1.amazonaws.com/umbrella-static/" +
                   "top-1m.csv.zip"
        }
    ]
}

# Cache our top 1 million known domains
DEFAULT_CACHE_LOCATION = '~/.onemillion'
cache_location = os.path.expanduser(DEFAULT_CACHE_LOCATION)

# O(n) to read Csv file rows into set 
def read_onemillion_data():
    """Check if the given domain is in a top on million list."""
    # TODO: parse the registered domain out of the domain parameter

    # keep track of the highest (nearest to 1) rank for the given domain
    highest_rank = None

    # see if the given domain is in the up-to-date domain lists
    domains = set()
    for domain_list in CONFIG['domain_lists']:
        # open the domain list as a CSV
        with open(os.path.join(cache_location, domain_list['output_file_path']), 'r') as domain_csv:
            domain_reader = csv.reader(domain_csv)
            for row in domain_reader:
                domains.add(row[1])
    return domains


domains_onemillion = read_onemillion_data()

# O(1) Function Run time
def domain_checker(domain):
    domain = domain.lower()
    if domain in domains:
        return True
    return False

In [None]:

o = onemillion.OneMillion()

def check_if_in_onemillion(domain):
    if domain in domains_onemillion:
        return 1
    else:
        return 0
    
    
dataset_surtur['is_in_alexa'] = dataset_surtur['domain'].progress_apply(
    lambda domain: check_if_in_onemillion(domain))    

## Get IP address

In [226]:
dataset_surtur.iloc[0]['url']

'http://islandvolleyballclub.com/venues.html'

In [245]:
dataset_surtur.iloc[2]['url'].split('/')[2]

'naturalfilters.bizland.com'

In [266]:
socket.gethostbyname(dataset_surtur['url'].iloc[28].split('/')[2])

'188.165.128.87'

In [264]:
dataset_surtur['url'].iloc[27].split('/')[2]

'www.rgbfms.co.uk'

In [None]:
import socket

def get_ip_url(url):
    try:
        domain = url.split('/')[2]
        return socket.gethostbyname(domain)
    except:
        return 'unknown'

dataset_surtur['ip_address'] = dataset_surtur['url'].progress_apply(
    lambda url: get_ip_url(url))

  1%|          | 606/105492 [01:53<6:06:40,  4.77it/s] 

In [None]:
dataset_surtur['ip_address'].value_counts()

## One hot encoding

In [20]:
def column_adjustor(dataset_column):
    unique_values = dataset_column.unique()
    return dataset_column.progress_apply(lambda x: np.where(unique_values == x)[0][0])

In [21]:
dataset_surtur = dataset_surtur.dropna()

In [23]:
dataset_surtur['who_is'] = column_adjustor(dataset_surtur['who_is'])
dataset_surtur['https'] = column_adjustor(dataset_surtur['https'])
dataset_surtur['tld'] = column_adjustor(dataset_surtur['tld'])
dataset_surtur['google_is_safe'] = column_adjustor(dataset_surtur['google_is_safe'])


dataset_surtur['label'] = dataset_surtur['label'].progress_apply(
    lambda label: 1 if 'bad' in label else 0)

100%|██████████| 105492/105492 [00:00<00:00, 334291.22it/s]
100%|██████████| 105492/105492 [00:00<00:00, 338570.18it/s]
100%|██████████| 105492/105492 [00:00<00:00, 281936.69it/s]
100%|██████████| 105492/105492 [00:00<00:00, 337503.07it/s]


In [24]:
dataset_surtur

Unnamed: 0,content,has_IP_in_url,having_@_in_url,hostname,https,label,length_hostname,number_subdomains,number_underscores,ratio_digits_hostname,ratio_digits_url,tld,url,url_len,who_is,js,js_len,js_ref,js_array_len_avg,js_array_len_max,content_len,num_js_func_calls,malicious_func_count,total_url_count,ext_url_count,num_semicolons,num_zeros,num_spaces,num_hyphens,num_@s,num_queries,num_ampersands,num_equals,domain,domain_len,google_is_safe
0,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",0.0,0.0,islandvolleyballclub.com,0,0,24.0,0.0,0.0,0.000000,0.000000,0,http://islandvolleyballclub.com/venues.html,43.0,0,b'',0,0.0,0.000000,0,7813,1,0,15,14,0,0,0,0,0,0,0,0,islandvolleyballclub.com,24,0
1,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.01//...",0.0,0.0,www.indiosoftware.com,0,0,21.0,1.0,0.0,0.000000,0.000000,0,http://www.indiosoftware.com/,29.0,0,b'\nfunction getCookie(c_name) { // Local func...,1103,0.5,17.200000,69,1556,23,4,2,2,0,0,0,0,0,0,0,0,indiosoftware.com,17,0
2,"<html>\n<head>\n<meta content=""text/html; char...",0.0,0.0,naturalfilters.bizland.com,0,0,26.0,1.0,0.0,0.000000,0.000000,0,http://naturalfilters.bizland.com/aquarain/,43.0,0,"b'\nfunction myopen(filename,windowname,proper...",133,0.5,30.000000,30,26533,4,0,40,28,0,0,0,0,0,0,0,0,bizland.com,11,0
3,<!DOCTYPE HTML>\n\n<html><!-- InstanceBegin te...,0.0,0.0,www.rosepath.com,0,0,16.0,1.0,0.0,0.000000,0.000000,0,http://www.rosepath.com,23.0,0,"b'qm_create(0,false,0,0,false,false,false,fals...",53,1.5,41.000000,41,6465,2,0,5,5,0,0,0,0,0,0,0,0,rosepath.com,12,0
4,could not fetch content,0.0,0.0,www.plannedparrothood.com,0,0,25.0,1.0,0.0,0.000000,0.000000,0,http://www.plannedparrothood.com/,33.0,0,b'',0,0.0,0.000000,0,23,1,0,0,0,0,0,0,0,0,0,0,0,plannedparrothood.com,21,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105487,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...",0.0,0.0,sites.google.com,0,1,16.0,1.0,0.0,0.000000,0.000000,0,http://sites.google.com/site/habbotuttogratis,45.0,0,b'/* Copyright 2008 Google. */ (function() { /...,6486,11.0,15.379310,89,22452,73,0,68,38,0,0,0,0,0,0,0,0,google.com,10,0
105488,"<!DOCTYPE html>\n\n<html dir=""ltr"">\n<head>\n<...",0.0,0.0,www.habbocreditosparati.blogspot.com,0,1,36.0,2.0,0.0,0.000000,0.000000,8,http://www.habbocreditosparati.blogspot.com/,44.0,1,"b'\n function setAttributeOnload(object, at...",7263,5.5,263.869565,4641,26841,30,1,81,59,0,0,0,0,0,0,0,0,blogspot.com,12,0
105489,"<!DOCTYPE html>\n\n<html class=""v2"" dir=""ltr"" ...",1.0,0.0,creditiperhabbogratissicuro100.blogspot.com,0,1,43.0,1.0,0.0,0.069767,0.126316,8,http://creditiperhabbogratissicuro100.blogspot...,95.0,1,"b'\n function setAttributeOnload(object, at...",11025,8.5,228.605263,5709,45192,53,2,74,45,0,6,0,4,0,0,0,0,blogspot.com,12,0
105490,"<!DOCTYPE html>\n\n<html dir=""ltr"">\n<head>\n<...",0.0,0.0,mundovirtualhabbo.blogspot.com,0,1,30.0,1.0,3.0,0.000000,0.131148,8,http://mundovirtualhabbo.blogspot.com/2009_01_...,61.0,1,"b'\n function setAttributeOnload(object, at...",12635,16.0,122.558824,4565,41364,104,3,120,91,0,4,0,0,0,0,0,0,blogspot.com,12,0


# Checkpoint 

In [104]:
# dataset_surtur.to_csv('./dataset_surtur.csv', index=False)

In [105]:
dataset_surtur = pd.read_csv('./dataset_surtur.csv')
dataset_surtur.head()

Unnamed: 0,content,has_IP_in_url,having_@_in_url,hostname,https,label,length_hostname,number_subdomains,number_underscores,ratio_digits_hostname,ratio_digits_url,tld,url,url_len,who_is,js,js_len,js_ref,js_array_len_avg,js_array_len_max,content_len,num_js_func_calls,malicious_func_count,total_url_count,ext_url_count,num_semicolons,num_zeros,num_spaces,num_hyphens,num_@s,num_queries,num_ampersands,num_equals,domain,domain_len,google_is_safe
0,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",0.0,0.0,islandvolleyballclub.com,0,0,24.0,0.0,0.0,0.0,0.0,0,http://islandvolleyballclub.com/venues.html,43.0,0,b'',0,0.0,0.0,0,7813,1,0,15,14,0,0,0,0,0,0,0,0,islandvolleyballclub.com,24,0
1,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.01//...",0.0,0.0,www.indiosoftware.com,0,0,21.0,1.0,0.0,0.0,0.0,0,http://www.indiosoftware.com/,29.0,0,b'\nfunction getCookie(c_name) { // Local func...,1103,0.5,17.2,69,1556,23,4,2,2,0,0,0,0,0,0,0,0,indiosoftware.com,17,0
2,"<html>\n<head>\n<meta content=""text/html; char...",0.0,0.0,naturalfilters.bizland.com,0,0,26.0,1.0,0.0,0.0,0.0,0,http://naturalfilters.bizland.com/aquarain/,43.0,0,"b'\nfunction myopen(filename,windowname,proper...",133,0.5,30.0,30,26533,4,0,40,28,0,0,0,0,0,0,0,0,bizland.com,11,0
3,<!DOCTYPE HTML>\n\n<html><!-- InstanceBegin te...,0.0,0.0,www.rosepath.com,0,0,16.0,1.0,0.0,0.0,0.0,0,http://www.rosepath.com,23.0,0,"b'qm_create(0,false,0,0,false,false,false,fals...",53,1.5,41.0,41,6465,2,0,5,5,0,0,0,0,0,0,0,0,rosepath.com,12,0
4,could not fetch content,0.0,0.0,www.plannedparrothood.com,0,0,25.0,1.0,0.0,0.0,0.0,0,http://www.plannedparrothood.com/,33.0,0,b'',0,0.0,0.0,0,23,1,0,0,0,0,0,0,0,0,0,0,0,plannedparrothood.com,21,0


In [106]:
# dataset_surtur = dataset_surtur[dataset_surtur['content']!='could not fetch content']

In [107]:
dataset_surtur['who_is'].unique()

array([0, 1])

In [108]:
dataset_surtur['label'].value_counts()

0    61083
1    44409
Name: label, dtype: int64

## Normalizing 

In [109]:
to_keep = dataset_surtur.drop(
    columns=['url', 'content', 'hostname', 'js', 'label']).columns[::-1]

In [110]:
to_keep = dataset_surtur.drop(
    columns=['url', 'content', 'hostname', 'js', 'label', 'domain']).columns[::-1]
# 'total_url_count', 'ext_url_count'

x = dataset_surtur[dataset_surtur['label'] == 1][to_keep].copy()  #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x.values)
df_to_plot_malicious = pd.DataFrame(x_scaled)
df_to_plot_malicious.columns = dataset_surtur[to_keep].columns
df_to_plot_malicious['label'] = 1.0

x = dataset_surtur[dataset_surtur['label'] == 0][to_keep].copy()  #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x.values)
df_to_plot_benign = pd.DataFrame(x_scaled)
df_to_plot_benign.columns = dataset_surtur[to_keep].columns
df_to_plot_benign['label'] = 0.0

normalized_per_class_dataset = pd.concat([df_to_plot_malicious, df_to_plot_benign], axis=0)

x = dataset_surtur[to_keep].copy()  #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x.values)
dataset_surt_norm = pd.DataFrame(x_scaled)
dataset_surt_norm.columns = dataset_surtur[to_keep].columns
dataset_surt_norm['label'] = dataset_surtur['label'].values

## Outlier removal 

In [111]:
# from scipy import stats

# num_df_surt = dataset_surt_norm.select_dtypes(include=["number"])
# cat_df_surt = dataset_surt_norm.select_dtypes(exclude=["number"])

# # idx = np.all(stats.zscore(num_df_surt) < 3, axis=1)

# # dataset_surt_norm_cleaned = pd.concat([num_df_surt.loc[idx], 
# #                                    cat_df_surt.loc[idx]], axis=1)

# Q1 = num_df_surt.quantile(0.10)
# Q3 = num_df_surt.quantile(0.80)
# IQR = Q3 - Q1
# idx = ~((num_df_surt < (Q1 - 1.5 * IQR)) | (num_df_surt > (Q3 + 1.5 * IQR))).any(axis=1)
# dataset_surt_norm_cleaned = pd.concat([num_df_surt.loc[idx], 
#                                     cat_df_surt.loc[idx]], axis=1)

In [112]:
# len(dataset_surt_norm_cleaned)

In [113]:
# indexes_to_drop = dataset_surtur.nlargest(15000, 'js_len')['js_len'].index
# dataset_surtur = dataset_surtur.drop(indexes_to_drop, axis='index')

## Feature Analysis

In [114]:
len(features)

31

In [94]:
features = dataset_surtur.drop(columns=['url', 'content', 'hostname', 'js', 
                                        'label']).columns[::-1] #, 'total_url_count', 'ext_url_count']).columns[::-1]
df_to_plot = dataset_surtur

n_bins = 40

fig, axs = plt.subplots(5, 6, figsize=(20,20))

# We can set the number of bins with the `bins` kwarg
feature_counter = 0
for i in range(len(axs)):
    for j in range(len(axs[i])): 
        current_feature = df_to_plot[features[feature_counter]]
        axs[i, j].hist(current_feature[df_to_plot['label']==0.0], n_bins, fc=(0, 1, 0, 0.5))
        axs[i, j].hist(current_feature[df_to_plot['label']==1.0], n_bins, fc=(1, 0, 0, 0.5))
        axs[i, j].set_title(f'Feature: {features[feature_counter]}')
        if features[feature_counter] == 'asdf':
            axs[i, j].set_ylim([0, current_feature.value_counts().iloc[0]]) 
        else:
            axs[i, j].set_ylim([0, max(current_feature.value_counts())]) 
        feature_counter += 1
    if feature_counter > len(features):
        break

plt.show()

## Plotly Analysis

In [115]:
dataset_surtur.columns

Index(['content', 'has_IP_in_url', 'having_@_in_url', 'hostname', 'https',
       'label', 'length_hostname', 'number_subdomains', 'number_underscores',
       'ratio_digits_hostname', 'ratio_digits_url', 'tld', 'url', 'url_len',
       'who_is', 'js', 'js_len', 'js_ref', 'js_array_len_avg',
       'js_array_len_max', 'content_len', 'num_js_func_calls',
       'malicious_func_count', 'total_url_count', 'ext_url_count',
       'num_semicolons', 'num_zeros', 'num_spaces', 'num_hyphens', 'num_@s',
       'num_queries', 'num_ampersands', 'num_equals', 'domain', 'domain_len',
       'google_is_safe'],
      dtype='object')

In [116]:
# js_array_len_avg

In [180]:
feature = 'who_is'
# feature = dataset_surt_norm.columns[2]
dataset_to_plot = dataset_surtur

good_filter = dataset_to_plot[feature][dataset_to_plot['label'] == 0]#.progress_apply(lambda x: roundup(x))
bad_filter = dataset_to_plot[feature][dataset_to_plot['label'] == 1]#.progress_apply(lambda x: roundup(x))


# bad_filter = bad_filter[bad_filter!=bad_filter.max()]

trace1 = go.Histogram(
    x=good_filter,
    name='Benign',
    yaxis='y2'

)

trace2 = go.Histogram(
    x=bad_filter,
    name='Malicious',
    yaxis='y2'
)

fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(trace1)
fig.add_trace(trace2,secondary_y=True)

fig['layout'].update(height = 700, width = 1000, title = f'Feature: {feature}',xaxis=dict(tickangle=-90))
iplot(fig)

## t-SNE

In [223]:
from sklearn.manifold import TSNE

In [225]:
X_embedded_2D = TSNE(n_components = 2, random_state = 0, n_jobs=-1, init='pca').fit_transform(X)
X_embedded_3D = TSNE(n_components = 3, random_state = 0, n_jobs=-1, init='pca').fit_transform(X)

KeyboardInterrupt: 

In [None]:
X_embedded_2D.to_csv('./2D_tsne_surtur.csv', index=False)
X_embedded_3D.to_csv('./3D_tsne_surtur.csv', index=False)

### 2D Plot 

### 3D Plot

In [None]:
fig = go.Figure(data=[go.Scatter3d(x=x, y=y, z=z,
                                   mode='markers')])
fig.show()

# Machine Learning 

## Standard Set

### Train-test-split

In [181]:
dataset_surtur.columns

Index(['content', 'has_IP_in_url', 'having_@_in_url', 'hostname', 'https',
       'label', 'length_hostname', 'number_subdomains', 'number_underscores',
       'ratio_digits_hostname', 'ratio_digits_url', 'tld', 'url', 'url_len',
       'who_is', 'js', 'js_len', 'js_ref', 'js_array_len_avg',
       'js_array_len_max', 'content_len', 'num_js_func_calls',
       'malicious_func_count', 'total_url_count', 'ext_url_count',
       'num_semicolons', 'num_zeros', 'num_spaces', 'num_hyphens', 'num_@s',
       'num_queries', 'num_ampersands', 'num_equals', 'domain', 'domain_len',
       'google_is_safe'],
      dtype='object')

In [182]:
X = dataset_surtur.drop(columns = ['label', 'content', 'hostname', 'url', 
                                   'js', 'domain', 'google_is_safe']).astype('float32')
# X = np.array(X).reshape([len(X), 1])
y = dataset_surtur['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### XGBoost

In [221]:
xgboost_model = XGBClassifier(verbosity=1, 
                              max_depth=6,
                              n_estimators=10000,
                              colsample_bylevel=1, 
                              num_parallel_tree=1,
                              learning_rate=0.3,
                              tree_method='approx', 
                              booster='dart',
                              n_jobs=-1)

xgboost_model.fit(X_train, y_train)







KeyboardInterrupt: 

In [None]:
y_pred = xgboost_model.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
print(classification_report(y_test, y_pred, target_names=['benign', 'malicious'], digits=4))

In [None]:
feat_imp = xgboost_model.feature_importances_
feat_dict = {}

for i in range(len(feat_imp)):
    feat_dict[X.columns[i]] = feat_imp[i]
    
feat_dict = {k: v for k, v in sorted(feat_dict.items(), key=lambda item: item[1], reverse=True)}
    
print("{:<25} {:<25}".format('Feature' ,'Importance'))
for k, v in feat_dict.items():
    num = v
    print("{:<25} {:<25}".format(k, num))

## Checking misclassified samples

In [44]:
# for index, value in enumerate(y_pred):
#     if value != y_test[index]:
#         pass
#         #print(index)

In [150]:
# dataset_surtur.iloc[3]

### Random Forest

In [None]:
rfc_model = RandomForestClassifier(max_depth=2, random_state=0)
rfc_model.fit(X_train, y_train)

In [None]:
y_pred = rfc_model.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
print(classification_report(y_test, y_pred, 
                            target_names=['benign', 'malicious'], 
                            digits=4))

### SVM

In [None]:
param_grid = {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}

svm_model = svm.SVC(verbose=3)
clf = GridSearchCV(svm_model, param_grid, n_jobs=-1, cv=3, verbose=3)

clf.fit(X_train, y_train)

In [None]:
print(clf.best_params_)
print(clf.best_estimator_)

In [None]:
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
print(classification_report(y_test, y_pred, 
                            target_names=['benign', 'malicious'], 
                            digits=4))

## Neural Network

In [None]:
X_train.shape

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Dense, AveragePooling2D, Flatten, Dropout
from tensorflow.python.client import device_lib

model = Sequential([
#     Conv2D(filters=6,
#            kernel_size=(3, 3),
#            activation='relu',
#            input_shape=X_train.shape),
#     Conv2D(filters=16, kernel_size=(3, 3), activation='relu'),
#     AveragePooling2D(),
#     Flatten(),
    Dense(units=1210, activation='relu'),
    Dense(units=841, activation='relu'),
    Dense(units=1, activation='softmax')
])


model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model_history = model.fit(X_train, y_train, #validation_data=(X_valid_NN, y_valid_NN), 
                          epochs=5, batch_size=32, verbose=True)

In [None]:
import seaborn as sn

plt.subplots(figsize=(30,20))
corrMatrix = X.corr()
sn.heatmap(corrMatrix, annot=True)
plt.show()

In [None]:
def histogram_intersection(a, b):
    v = np.minimum(a, b).sum().round(decimals=1)
    return v

X.corr(method=histogram_intersection)

## Feature Selection - Chi Squared

In [None]:
X_new = SelectKBest(chi2, k=8).fit_transform(X, y)
X_new.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.33, random_state=42)

### XGBoost

In [None]:
xgboost_model = XGBClassifier()
xgboost_model.fit(X_train, y_train)

In [None]:
y_pred = xgboost_model.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
print(classification_report(y_test, y_pred, target_names=['benign', 'malicious'], digits=4))