# Dataset Surtur

In [1]:
import pandas as pd 
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500


import numpy as np
from glob import glob as globlin
from bs4 import BeautifulSoup
import re
from urlextract import URLExtract
import statistics
import os
import csv
import onemillion
import socket

import io
from imageio import imread

import geoip2.database

from tqdm import tqdm
tqdm.pandas()

import signal
from contextlib import contextmanager

from xgboost import XGBClassifier

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

import matplotlib.pyplot as plt

import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import plot, iplot
import plotly.express as px

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences

import cv2
import base64

@contextmanager
def timeout(time):
    """Abort the enclosed ``with`` body after ``time`` seconds via SIGALRM.

    If the alarm fires, ``raise_timeout`` raises ``TimeoutError`` inside the
    body; that exception is swallowed here, so execution simply continues
    after the ``with`` block.  Any ``TimeoutError`` raised by the body itself
    is swallowed the same way.

    Args:
        time: Whole number of seconds passed to ``signal.alarm``.
    """
    # Register a function to raise a TimeoutError on the signal, remembering
    # the handler that was installed before so it can be put back afterwards.
    previous_handler = signal.signal(signal.SIGALRM, raise_timeout)
    if previous_handler is None:
        # ``None`` means the old handler was not installed from Python and
        # cannot be re-installed through ``signal.signal``.
        previous_handler = signal.SIG_DFL
    # Schedule the signal to be sent after ``time``.
    signal.alarm(time)

    try:
        yield
    except TimeoutError:
        # A value returned from a generator-based context manager is
        # discarded, so there is nothing useful to return here; falling
        # through suppresses the exception for the caller.
        pass
    finally:
        # Cancel the alarm first so it cannot fire after the block has
        # finished, then restore the caller's handler.
        signal.alarm(0)
        signal.signal(signal.SIGALRM, previous_handler)


def raise_timeout(signum, frame):
    """Signal handler used by ``timeout``: raise ``TimeoutError``."""
    raise TimeoutError

## Reading New Data

In [15]:
def read_data(main_path):
    """Read every CSV matching the glob pattern ``main_path`` and stack them.

    Returns the row-wise ``pd.concat`` of all matched files.
    """
    frames = [pd.read_csv(csv_path) for csv_path in globlin(main_path)]
    return pd.concat(frames)

In [None]:
dataset_surtur = read_data('./data_construction/4 - final_data/*.csv').drop(columns=['Unnamed: 0.1'])
df_mal_sup = read_data('./data_construction/supp_malicious_1.csv')

In [None]:
df_mal_sup = df_mal_sup.drop(columns=['Label'])

In [None]:
dataset_surtur = dataset_surtur.drop(columns=['status', 'https.1'])

In [None]:
df_mal_sup.columns = ['url', 'has_IP_in_url', 'number_subdomains', 'hostname',
       'length_hostname', 'ratio_digits_url', 'having_@_in_url',
       'ratio_digits_hostname', 'number_underscores', 'tld', 'url_len',
       'https', 'who_is', 'content', 'label']

df_mal_sup.columns

In [None]:
dataset_surtur = pd.concat([dataset_surtur, df_mal_sup], axis=0, sort=False).drop_duplicates()
dataset_surtur.head()

In [None]:
dataset_surtur['label'].value_counts()

## Extracting JS

In [None]:
def get_js(content, supp=False):
    """Concatenate the inline JavaScript found in ``content``.

    Every ``<script>`` element parsed from the HTML is rendered back to text;
    when that text contains one of the two plain opening tags below, the
    opening tag and ``</script>`` are stripped and the remainder is kept.
    The pieces are joined in document order and returned as UTF-8 bytes.
    ``supp`` is accepted but not used.
    """
    opening_tags = (
        '<script type="text/javascript">',
        '<script>',
    )
    closing_tag = '</script>'

    pieces = []
    for script in BeautifulSoup(content, 'html.parser').find_all('script'):
        markup = str(script)
        for opener in opening_tags:
            if opener in markup:
                pieces.append(
                    markup.replace(opener, '').replace(closing_tag, ''))
    return ''.join(pieces).encode('utf-8')

In [None]:
dataset_surtur['js'] = dataset_surtur['content'].progress_apply(lambda content: get_js(str(content)))

## JS Length

In [None]:
dataset_surtur['js_len'] = dataset_surtur['js'].progress_apply(lambda js: len(js))

In [None]:
ex_string_thing = dataset_surtur[dataset_surtur['js_len']==0].iloc[1]['content']

## Counting Script tag references in html page 

In [None]:
def script_references(js):
    """Return half the number of ``'<script'`` occurrences in ``js``."""
    occurrences = js.count('<script')
    return occurrences / 2

In [None]:
dataset_surtur['js_ref'] = dataset_surtur['content'].progress_apply(
    lambda content: script_references(str(content)))

## Getting array lengths from JS 

In [None]:
# Non-empty text between "(" and the next ")".  Written as a raw string so
# the backslashes are regex escapes rather than invalid string escapes.
_PAREN_CONTENT_RE = re.compile(r'\(([^\)]+)\)')


def _parenthesized_lengths(js):
    """Return the length of every non-empty parenthesised group in ``js``."""
    return [len(group) for group in _PAREN_CONTENT_RE.findall(js)]


def get_max_array_length(js):
    """Return the longest parenthesised group length in ``js`` (0 if none)."""
    lengths = _parenthesized_lengths(js)
    if not lengths:
        return 0
    return max(lengths)


def get_avg_array_length(js):
    """Return the mean parenthesised group length in ``js`` (0 if none)."""
    lengths = _parenthesized_lengths(js)
    if not lengths:
        return 0
    return statistics.mean(lengths)

In [None]:
dataset_surtur['js_array_len_avg'] = dataset_surtur['js'].progress_apply(
    lambda js: get_avg_array_length(str(js)))
dataset_surtur['js_array_len_max'] = dataset_surtur['js'].progress_apply(
    lambda js: get_max_array_length(str(js)))

## Content Length

In [None]:
dataset_surtur['content_len'] = dataset_surtur['content'].progress_apply(
    lambda x: len(str(x)))

## Number of Function calls

In [None]:
dataset_surtur['js'][1]

In [None]:
def get_func_calls(js):
    """Estimate the number of function calls in ``js``.

    The estimate is the number of non-empty parenthesised groups plus the
    number of literal ``()`` pairs.

    The empty-pair count uses ``str.count``; ``len(js.split('()'))`` is always
    one higher than the number of occurrences, which made an empty string
    score 1.
    """
    full_paren = len(re.findall(r"\(([^\)]+)\)", js))
    empty_paren = js.count('()')
    return full_paren + empty_paren

In [None]:
dataset_surtur['num_js_func_calls'] = dataset_surtur['js'].progress_apply(
    lambda x: get_func_calls(str(x)))

## Suspicious Function Count

In [None]:
# Substrings looked for in each lower-cased token, so every entry must itself
# be lower case ('insertAdjacentHTML' in mixed case could never match).
# NOTE(review): 'eval()' only matches a literal empty call; 'eval(' was
# probably intended -- confirm before changing, it would alter the feature.
_SUSPICIOUS_JS_FUNCTIONS = (
    'setcookie', 'getcookie', 'createxmlhttprequest', 'unescape',
    'document.write', 'element.appendchild', 'dateobject.togmtstring',
    'new activexobject', 'document.createelement', 'getappname',
    'getuseragent', 'window.setinterval', 'window.settimeout',
    'location.assign', 'location.replace', 'eval()', 'string.indexof',
    'string.fromcharcode', 'charat', 'split',
    'string.charcodeat', 'document.writeln', 'document.appendchild',
    'innerhtml', 'insertadjacenthtml', 'outerhtml',
)


def get_sus_js_function_count(js):
    """Count the tokens of ``js`` that mention a suspicious JS function.

    ``js`` is split on spaces after newlines are turned into spaces.  A token
    counts once if its lower-cased form contains any entry of
    ``_SUSPICIOUS_JS_FUNCTIONS`` as a substring, however many entries match.
    Entries that contain a space (e.g. 'new activexobject') cannot match a
    single token.
    """
    tokens = js.replace('\n', ' ').split(' ')
    return sum(
        1
        for token in map(str.lower, tokens)
        if any(name in token for name in _SUSPICIOUS_JS_FUNCTIONS)
    )

In [None]:
dataset_surtur['malicious_func_count'] = dataset_surtur['js'].progress_apply(
    lambda js: get_sus_js_function_count(str(js)))

## Get total and external URL count in content

In [None]:
# def find_urls(string, ext_count):
#     # with timeout(1):
#     regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
#     url = re.findall(regex, string)
#     if ext_count:
#         return len(set(url))
#     return len(url)


# # Test Code
# string = 'My Profile: https://auth.geeksforgeeks.org/user/Chinmoy%20Lenka/articles in the portal of http://www.geeksforgeeks.org/'
# print("Urls: ", find_urls(string, False))

In [None]:
# Created on first use and then shared by every call: building a URLExtract
# for each row of a large frame repeats the same setup work needlessly.
_URL_EXTRACTOR = None


def find_urls(string, ext_count):
    """Count the URLs that ``URLExtract`` finds in ``string``.

    Args:
        string: Text to scan.
        ext_count: When truthy, count distinct URLs; otherwise count every
            occurrence.

    Returns:
        The URL count, or the sentinel 10000 if extraction fails.
    """
    global _URL_EXTRACTOR
    try:
        if _URL_EXTRACTOR is None:
            _URL_EXTRACTOR = URLExtract()
        urls = _URL_EXTRACTOR.find_urls(string)
    except Exception:
        # Best effort, as before -- but no longer a bare ``except``, so
        # Ctrl-C can still interrupt a long ``progress_apply`` run.
        return 10000
    return len(set(urls)) if ext_count else len(urls)

string_ex = 'My Profile: https://auth.geeksforgeeks.org/user/Chinmoy%20Lenka/articles in the portal of http://www.geeksforgeeks.org/'
    
print(find_urls(string_ex, False))

In [None]:
dataset_surtur['total_url_count'] = dataset_surtur['content'].progress_apply(
    lambda js: find_urls(str(js), False)
)

In [None]:
dataset_surtur['ext_url_count'] = dataset_surtur['content'].progress_apply(
    lambda js: find_urls(str(js), True)
)

In [None]:
dataset_surtur.columns

## More Lexical Features

In [None]:
dataset_surtur['num_semicolons'] = dataset_surtur['url'].progress_apply(lambda url: str(url).count(';'))
dataset_surtur['num_zeros'] = dataset_surtur['url'].progress_apply(lambda url: str(url).count('0'))
dataset_surtur['num_spaces'] = dataset_surtur['url'].progress_apply(lambda url: str(url).count('%20'))
dataset_surtur['num_hyphens'] = dataset_surtur['url'].progress_apply(lambda url: str(url).count('-'))
dataset_surtur['num_@s'] = dataset_surtur['url'].progress_apply(lambda url: str(url).count('@'))
dataset_surtur['num_queries'] = dataset_surtur['url'].progress_apply(lambda url: str(url).count('?'))
dataset_surtur['num_ampersands'] = dataset_surtur['url'].progress_apply(lambda url: str(url).count('&'))
dataset_surtur['num_equals'] = dataset_surtur['url'].progress_apply(lambda url: str(url).count('='))

## Safe Browsing API Judgement

In [None]:
with open('./safebrowsingkey.txt') as f:
    api_key = f.read()

In [None]:
import requests

key = 'your key here'
URL = "https://sb-ssl.google.com/safebrowsing/api/lookup?client=api&apikey={key}&appver=1.0&pver=3.0&url={url}"


def is_safe(url):
    """Ask the Safe Browsing lookup endpoint in ``URL`` about ``url``.

    Returns:
        ``True`` when the response body is anything other than ``'malware'``,
        ``False`` when it is exactly ``'malware'``, and the string
        ``'unknown'`` when the request fails or returns an HTTP error status.
    """
    from urllib.parse import quote

    try:
        lookup_url = URL.format(
            # The key is read from a file and may end with a newline.
            key=api_key.strip(),
            # Percent-encode the target so its own '?', '&' and '=' are not
            # parsed as parameters of the lookup request.
            url=quote(str(url), safe=''),
        )
        # Bounded wait so one dead connection cannot stall the whole apply.
        response = requests.get(lookup_url, timeout=10)
        # A 4xx/5xx answer says nothing about the URL; report it as unknown
        # instead of letting it fall through as "safe".
        response.raise_for_status()
        return response.text != 'malware'
    except Exception:
        return 'unknown'


print(is_safe('http://addonrock.ru/Debugger.js/'))  # prints False
print(is_safe('http://google.com'))  # prints True


dataset_surtur['google_is_safe'] = dataset_surtur['url'].progress_apply(lambda url: is_safe(url))

## Retrieving Domain names and their lengths

In [None]:
dataset_surtur['domain'] = dataset_surtur['hostname'].progress_apply(
    lambda hostname: '.'.join(str(hostname).split('.')[-2:]))

In [None]:
dataset_surtur['domain_len'] = dataset_surtur['domain'].progress_apply(lambda domain: len(domain))

## Presence in Alexa domains

In [None]:
CONFIG = {
    'domain_lists': [
        {
            'name': "alexa",
            'output_file_path': "alexa.csv",
            'url': "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip"
        }, {
            'name': "cisco umbrella",
            'output_file_path': "cisco.csv",
            'url': "http://s3-us-west-1.amazonaws.com/umbrella-static/" +
                   "top-1m.csv.zip"
        }
    ]
}

# Cache our top 1 million known domains
DEFAULT_CACHE_LOCATION = '~/.onemillion'
cache_location = os.path.expanduser(DEFAULT_CACHE_LOCATION)

# O(n) to read Csv file rows into set 
def read_onemillion_data():
    """Load the cached top-one-million domain lists into a single set.

    For every entry of ``CONFIG['domain_lists']`` the CSV named by
    ``output_file_path`` is read from ``cache_location`` and the second
    column of each row is added to the result.

    Returns:
        A set with the second-column value of every row across all lists.
    """
    domains = set()
    for domain_list in CONFIG['domain_lists']:
        list_path = os.path.join(cache_location, domain_list['output_file_path'])
        # newline='' is the csv-module convention for files handed to a reader.
        with open(list_path, 'r', newline='') as domain_csv:
            # Skip blank or single-column rows instead of failing on row[1].
            domains.update(
                row[1] for row in csv.reader(domain_csv) if len(row) > 1)
    return domains


domains_onemillion = read_onemillion_data()

# O(1) Function Run time
def domain_checker(domain):
    """Return True if ``domain`` (case-insensitive) is in ``domains_onemillion``.

    The lookup previously used the name ``domains``, which only exists as a
    local inside ``read_onemillion_data`` and raised ``NameError`` here.
    """
    return domain.lower() in domains_onemillion

In [None]:

o = onemillion.OneMillion()

def check_if_in_onemillion(domain):
    """Return 1 when ``domain`` is in ``domains_onemillion``, otherwise 0."""
    return 1 if domain in domains_onemillion else 0
    
    
dataset_surtur['is_in_alexa'] = dataset_surtur['domain'].progress_apply(
    lambda domain: check_if_in_onemillion(domain))    

## Get IP address

In [None]:
dataset_surtur.iloc[0]['url']

In [None]:
dataset_surtur.iloc[2]['url'].split('/')[2]

In [None]:
socket.gethostbyname(dataset_surtur['url'].iloc[28].split('/')[2])

In [None]:
dataset_surtur['url'].iloc[27].split('/')[2]

In [None]:
def get_ip_url(url):
    """Resolve the host of ``url`` to an IPv4 address string.

    The host is taken from the parsed URL, so a port or ``user@`` prefix in
    the network location does not break the lookup.

    Returns:
        The address from ``socket.gethostbyname``, or ``'unknown'`` when the
        URL has no host or resolution fails.
    """
    from urllib.parse import urlsplit

    try:
        host = urlsplit(url).hostname
        if not host:
            return 'unknown'
        return socket.gethostbyname(host)
    except Exception:
        # Best effort, as before -- but no longer a bare ``except``, so
        # Ctrl-C can still interrupt a long run of DNS lookups.
        return 'unknown'

dataset_surtur['ip_address'] = dataset_surtur['url'].progress_apply(
    lambda url: get_ip_url(url))

In [None]:
dataset_surtur['ip_address'].value_counts()

In [None]:
len(dataset_surtur[dataset_surtur['ip_address'] == 'unknown'])

## Get location of each site

In [None]:
reader = geoip2.database.Reader('./GeoLite2-Country.mmdb')

def get_location(ip_add):
    """Map ``ip_add`` to a country name using the module-level ``reader``.

    Returns ``'unknown'`` when ``ip_add`` is the ``'unknown'`` placeholder or
    when the lookup raises.
    """
    try:
        return 'unknown' if ip_add == 'unknown' else reader.country(ip_add).country.name
    except Exception:
        return 'unknown'

dataset_surtur['location'] = dataset_surtur['ip_address'].progress_apply(
    lambda ip_add: get_location(ip_add))

reader.close()
#df

## Label encoding (categorical values to integer codes)

In [None]:
def column_adjustor(dataset_column):
    """Replace each value of a column with an integer code.

    Codes follow order of first appearance (the first distinct value is 0,
    the next new value is 1, ...), which is the same numbering the previous
    per-row ``np.where`` scan over ``unique()`` produced.  ``pd.factorize``
    computes it in one pass.  Missing values get the code -1.

    Args:
        dataset_column: The ``pd.Series`` to encode.

    Returns:
        An integer ``pd.Series`` with the same index and name as the input.
    """
    codes, _ = pd.factorize(dataset_column)
    return pd.Series(codes, index=dataset_column.index, name=dataset_column.name)

In [None]:
dataset_surtur = dataset_surtur.dropna()

In [None]:
dataset_surtur['who_is'] = column_adjustor(dataset_surtur['who_is'])
dataset_surtur['https'] = column_adjustor(dataset_surtur['https'])
dataset_surtur['tld'] = column_adjustor(dataset_surtur['tld'])
dataset_surtur['google_is_safe'] = column_adjustor(dataset_surtur['google_is_safe'])
dataset_surtur['location'] = column_adjustor(dataset_surtur['location'])


dataset_surtur['label'] = dataset_surtur['label'].progress_apply(
    lambda label: 1 if 'bad' in label else 0)

In [None]:
dataset_surtur

# Checkpoint 

In [None]:
# dataset_surtur.to_csv('./dataset_surtur.csv', index=False)

In [None]:
dataset_surtur = pd.read_csv('./dataset_surtur.csv')
dataset_surtur.columns

In [None]:
df_surtur_alpha = pd.read_csv('./dataset_surtur_alpha.csv')
df_surtur_alpha.head()

In [None]:
dataset_surtur['label'].value_counts()

In [None]:
len(dataset_surtur)

## Loading images

In [None]:
# import base64
# import io
# import cv2
# from imageio import imread
# import matplotlib.pyplot as plt

# # reconstruct image as an numpy array
# img = imread(io.BytesIO(base64.b64decode(img_string)))

# # show image
# plt.figure()
# plt.imshow(img, cmap="gray")

# # finally convert RGB image to BGR for opencv
# # and save result
# cv2_img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

In [None]:
imgs_for_org_index = globlin('./img_extract/dataset_surtur_images_original_index_insert/*.png')
imgs_for_mal_separate = globlin('./img_extract/dataset_surt_images_malicious/*.png')
imgs_for_ben_separate = globlin('./img_extract/dataset_surtur_images_unique_index (when isolating benign)/*.png')

In [None]:
# One slot per row, pre-filled with the '0.0' placeholder that marks "no
# image".  The array must have object dtype: np.zeros(n).astype('str') is a
# fixed-width '<U32' array, and NumPy silently truncates any longer string
# assigned into it, which would cut every base64 image to 32 characters.
df_imgs = np.full(len(df_surtur_alpha), '0.0', dtype=object)

pbar = tqdm(total=len(imgs_for_org_index) + len(imgs_for_ben_separate) + len(imgs_for_mal_separate)) 

def get_saved_index(path):
    """Return the integer row index encoded in an image file name.

    The file name is expected to be ``<index>_0.png`` or ``<index>_1.png``.
    ``os.path.basename`` is used rather than splitting on '/', so paths built
    with the platform's own separator are handled as well.
    """
    file_name = os.path.basename(path)
    return int(file_name.replace('_0.png', '').replace('_1.png', ''))

def load_as_base64(path):
    """Read the file at ``path`` and return its bytes as a base64 text string."""
    with open(path, 'rb') as image_file:
        raw_bytes = image_file.read()
    return base64.b64encode(raw_bytes).decode()

### INSERTING IMAGES WITH NO ALTERED INDEXES
for path in imgs_for_org_index:
    img_index = get_saved_index(path)
    base64_image = load_as_base64(path)
    df_imgs[img_index] = base64_image
    
    pbar.update(1)
    
df_surtur_alpha['image'] = df_imgs

df_surtur_alpha_malicious = df_surtur_alpha[df_surtur_alpha['label'] == 1].reset_index()
df_surtur_alpha_benign = df_surtur_alpha[df_surtur_alpha['label'] == 0].reset_index()

df_imgs_mal = np.array(df_surtur_alpha_malicious['image'])
df_imgs_ben = np.array(df_surtur_alpha_benign['image'])

for path in imgs_for_mal_separate:
    img_index = get_saved_index(path)
    base64_image = load_as_base64(path)
    df_imgs_mal[img_index] = base64_image
    pbar.update(1)
    
for path in imgs_for_ben_separate:
    img_index = get_saved_index(path)
    base64_image = load_as_base64(path)
    df_imgs_ben[img_index] = base64_image
    pbar.update(1)
    
df_surtur_alpha_malicious['image'] = df_imgs_mal
df_surtur_alpha_benign['image'] = df_imgs_ben


df_surtur_alpha = pd.concat([df_surtur_alpha_malicious, df_surtur_alpha_benign], axis=0)

In [None]:
len(df_surtur_alpha[df_surtur_alpha['image']=='0.0'])

In [None]:
df_surtur_alpha_adjusted = df_surtur_alpha[df_surtur_alpha['url'].isin(
    dataset_surtur['url'].values.tolist())].sort_values('url')

In [None]:
dataset_surtur = dataset_surtur.sort_values('url')

In [None]:
df_surtur_alpha_adjusted['google_is_safe'] = dataset_surtur['google_is_safe'].values.tolist()
df_surtur_alpha_adjusted['ip_address'] = dataset_surtur['ip_address'].values.tolist()
df_surtur_alpha_adjusted['location'] = dataset_surtur['location'].values.tolist()

## Checkpoint 2

In [None]:
# df_surtur_alpha_adjusted.to_csv('./dataset_surtur_2_w_images.csv', index=False)

In [2]:
dataset_surtur = pd.read_csv('./dataset_surtur_2_w_images.csv')
dataset_surtur.head()

Unnamed: 0,index,content,has_IP_in_url,having_@_in_url,hostname,https,label,length_hostname,number_subdomains,number_underscores,ratio_digits_hostname,ratio_digits_url,tld,url,url_len,who_is,js,js_len,js_ref,js_array_len_avg,js_array_len_max,content_len,num_js_func_calls,malicious_func_count,total_url_count,ext_url_count,num_semicolons,num_zeros,num_spaces,num_hyphens,num_@s,num_queries,num_ampersands,num_equals,domain,domain_len,image,google_is_safe,ip_address,location
0,49707,"<html><head><meta content=""no-cache"" http-equi...",0.0,0.0,00.124.324.77.00.opteamevent.hu,0,1,31.0,6.0,0.0,0.387097,0.307692,42,http://00.124.324.77.00.opteamevent.hu/,39.0,0,b'',0,0.0,0.0,0,165,1,0,0,0,0,4,0,0,0,0,0,0,opteamevent.hu,14,iVBORw0KGgoAAAANSUhEUgAABVYAAAKZCAYAAABTIHD9AA...,0,91.82.220.51,35
1,48436,�MQTH1TDa1��fl�hF1bVOFESLCnBbRI9MRTH1PDa1SP...,0.0,0.0,00005ik.rcomhost.com,0,1,20.0,1.0,0.0,0.25,0.212121,0,http://00005ik.rcomhost.com/7fg3g,33.0,0,b'',0,0.0,0.0,0,93,1,0,0,0,0,4,0,0,0,0,0,0,rcomhost.com,12,0.0,0,206.188.193.86,0
2,97761,"<!DOCTYPE html>\n<html data-adblockkey=""MFwwDQ...",0.0,0.0,000098.ihostfull.com,0,1,20.0,1.0,0.0,0.3,0.214286,0,http://000098.ihostfull.com/,28.0,0,"b'g_pb=(function(){var\nDT=document,azx=locati...",3301,0.5,22.245902,61,4092,89,2,1,1,0,4,0,0,0,0,0,0,ihostfull.com,13,iVBORw0KGgoAAAANSUhEUgAABUUAAAKuCAYAAACGxn0DAA...,0,199.59.242.153,0
3,103181,"<html>\n<head>\n<meta content=""noarchive"" name...",0.0,0.0,000p6vl.wcomhost.com,0,1,20.0,1.0,0.0,0.2,0.0625,0,http://000p6vl.wcomhost.com/Ameli-Assurance/re...,64.0,0,b'',0,0.0,0.0,0,262,1,0,0,0,0,3,0,1,0,0,0,0,wcomhost.com,12,iVBORw0KGgoAAAANSUhEUgAABVYAAAA7CAYAAACQa9ExAA...,0,208.91.197.27,57
4,47694,"<html><head><meta content=""no-cache"" http-equi...",0.0,0.0,001.002.003.23.opteamevent.hu,0,1,29.0,5.0,0.0,0.37931,0.297297,42,http://001.002.003.23.opteamevent.hu/,37.0,0,b'',0,0.0,0.0,0,165,1,0,0,0,0,6,0,0,0,0,0,0,opteamevent.hu,14,iVBORw0KGgoAAAANSUhEUgAABVYAAAKZCAYAAABTIHD9AA...,0,91.82.220.51,35


## Image Feature Extraction 

In [3]:
def read_image(base64_image):
    """Decode a base64 image string into a 512x512 array for OpenCV.

    The decoded image is converted with ``cv2.COLOR_RGB2BGR`` and then
    resized to 512x512.
    """
    raw_bytes = base64.b64decode(base64_image)
    rgb = imread(io.BytesIO(raw_bytes))
    bgr = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
    return cv2.resize(bgr, (512, 512))

pbar = tqdm(total=len(dataset_surtur))


model = tf.keras.applications.MobileNetV2(include_top=False, 
                                          weights='imagenet', 
                                          input_shape=(512, 512, 3))


images = dataset_surtur['image'].values.tolist()

# Run every screenshot through the network one at a time.  Rows without an
# image (placeholder '0.0') and rows whose image cannot be decoded or
# processed get an all-zero block of the same shape as the other entries.
image_features = []
for base64_image in images:
    features = None
    try:
        if base64_image != '0.0':
            cv2_image = read_image(base64_image).reshape([1, 512, 512, 3])
            features = model.predict(cv2_image)
    except Exception:
        # Not a bare ``except``: Ctrl-C must be able to stop this loop
        # instead of being recorded as one more unreadable image.
        features = None
    if features is None:
        features = np.zeros((1, 16, 16, 1280))
    image_features.append(features)
    pbar.update(1)


  0%|          | 0/105485 [00:00<?, ?it/s]



100%|██████████| 105485/105485 [1:48:52<00:00, 16.59it/s] 

In [None]:
image_features_2d = pd.DataFrame(np.array(image_features).reshape(len(image_features), 1280*16*16))

100%|██████████| 105485/105485 [1:49:03<00:00, 16.59it/s]

In [None]:
# chi2 selection needs a 2-D (samples x features) matrix, so it has to run on
# the flattened ``image_features_2d`` frame, not on the raw ``image_features``
# list of 4-D arrays.
image_feat_new = SelectKBest(chi2, k=20).fit_transform(image_features_2d, dataset_surtur['label'])
image_feat_new = pd.DataFrame(image_feat_new)

# Name the selected columns image_mobnet_0 ... image_mobnet_19.
feat_array = [f'image_mobnet_{i}' for i in range(image_feat_new.shape[1])]

image_feat_new.columns = feat_array
image_feat_new.head()

In [None]:
# Attach the image features selected above.  ``content_feat_new`` is only
# created further down, in the content feature section.
dataset_surtur_mbnet_feat = pd.concat([dataset_surtur, image_feat_new], axis=1)

In [None]:
dataset_surtur_mbnet_feat.to_csv('./dataset_surtur_image_feats.csv', index=False)

## Checkpoint 3

## Content Feature Extraction 

In [None]:
content_to_change = dataset_surtur['content']
labels_for_content = dataset_surtur['label']

# Each padded sequence is later reshaped to a (75, 75, 3) "image" for
# MobileNetV2, so it must hold exactly 75 * 75 * 3 token ids; a length of 512
# cannot be reshaped to that size.
maxlen = 75 * 75 * 3

tokenizer = Tokenizer(num_words=40000)
tokenizer.fit_on_texts(content_to_change)
content_set = tokenizer.texts_to_sequences(content_to_change)
content_set = pad_sequences(content_set, padding='post', maxlen=maxlen)
# print(len(content_set))

# content_set = np.array(content_set).reshape([len(content_to_change), 75, 75, 3])

# print(np.array(content_set).reshape([11721 100 3 3]))


In [None]:
# content_full_set = []
# for i in tqdm(range(len(content_set))):
#     x = [tf.convert_to_tensor(content_set[i])]
#     x.append(labels_for_content[i])
#     content_full_set.append(x)

In [None]:
max_length = max(dataset_surtur['content_len'])

In [None]:
len(dataset_surtur['content'].iloc[0])

In [None]:
max(dataset_surtur['url_len'])

In [None]:
len(dataset_surtur['url'].iloc[103161])

In [None]:
dataset_surtur['url_len'].nlargest(5)

In [None]:
dataset_surtur['content_len'].nlargest(10)

In [None]:
from transformers import AutoTokenizer, AutoModel, pipeline, BertConfig

config = BertConfig.from_pretrained('bert-base-uncased', 
                                    output_hidden_states=True)
                                   # hidden_size=)

model = AutoModel.from_pretrained('bert-base-uncased', config=config)
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased',
                                          add_special_tokens=True,    
                                          truncation=True, 
                                          padding=True, 
                                          return_attention_mask=True, 
                                          return_tensors = "pt")

nlp = pipeline('feature-extraction', model=model, tokenizer=tokenizer)

url_features = tqdm(nlp(dataset_surtur['content'].iloc[33109]), total=len(dataset_surtur))
print(url_features)
features = np.squeeze(url_features)
print(features.shape)

In [None]:
model = tf.keras.applications.MobileNetV2(include_top=False, 
                                          weights='imagenet', 
                                          input_shape=(75, 75, 3))

content_features = []

for index, content in tqdm(enumerate(content_set), total=len(content_set)):
    content = content.reshape([1, 75, 75, 3])
    content_features.append(model.predict(content))

# pred_images = predictions.reshape(images_dataset.shape[0], -1)

In [None]:
np.array(content_features).shape

In [None]:
content_features = pd.DataFrame(np.array(content_features).reshape(len(content_features), 1280*3*3))

In [None]:
# content_features.to_csv('content_features_surtur_2.csv', index=False)

In [None]:
content_features = pd.read_csv('content_features_surtur_2.csv')

In [None]:
content_features.head()

### Chi Square selection of extracted nlp features

In [None]:
content_feat_new = SelectKBest(chi2, k=20).fit_transform(content_features, dataset_surtur['label'])
content_feat_new = pd.DataFrame(content_feat_new)

feat_array = []
for i in tqdm(range(len(content_feat_new.columns))):
     feat_array.append(f'content_mobnet_{i}')
        
content_feat_new.columns = feat_array
content_feat_new.head()

In [None]:
dataset_surtur_mbnet_feat = pd.concat([dataset_surtur, content_feat_new], axis=1)

## URL Feature Extraction 

In [None]:
url_to_change = dataset_surtur['url']
labels_for_url = dataset_surtur['label']

maxlen = 32*32*3

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(url_to_change)
url_set = tokenizer.texts_to_sequences(url_to_change)
url_set = pad_sequences(url_set, padding='post', maxlen=maxlen)
# print(len(url_set))

url_set = np.array(url_set).reshape([len(url_to_change), 32, 32, 3])



In [None]:
url_full_set = []
for i in tqdm(range(len(url_set))):
    x = [tf.convert_to_tensor(url_set[i])]
    x.append(labels_for_url[i])
    url_full_set.append(x)

In [None]:
model = tf.keras.applications.MobileNetV2(include_top=False, 
                                          weights='imagenet', 
                                          input_shape=(32, 32, 3))

url_features = []

for index, url in tqdm(enumerate(url_set), total=len(url_set)):
    url = url.reshape([1, 32, 32, 3])
    url_features.append(model.predict(url))

# pred_images = predictions.reshape(images_dataset.shape[0], -1)

In [None]:
np.array(url_features).shape

In [None]:
url_features = pd.DataFrame(np.array(url_features).reshape(len(url_features), 1280))

In [None]:
# url_features.to_csv('url_features_surtur.csv', index=False)

In [None]:
url_features = pd.read_csv('url_features_surtur.csv')

In [None]:
url_features.head()

In [None]:
url_feat_new = SelectKBest(chi2, k=5).fit_transform(url_features, dataset_surtur['label'])
url_feat_new = pd.DataFrame(url_feat_new)

feat_array = []
for i in tqdm(range(len(url_feat_new.columns))):
     feat_array.append(f'url_mobnet_{i}')
        
url_feat_new.columns = feat_array
url_feat_new.head()

In [None]:
dataset_surtur_ext_feat = pd.concat([dataset_surtur_mbnet_feat, url_feat_new], axis=1)

## Normalizing 

In [None]:
to_keep = dataset_surtur.drop(
    columns=['url', 'content', 'hostname', 'js', 'google_is_safe', 'label']).columns[::-1]

In [None]:
# Columns that are not numeric model features.  'image' holds base64 text and
# cannot be min-max scaled, so it is excluded along with the other text,
# label and identifier columns.
to_keep = dataset_surtur_ext_feat.drop(
    columns=['url', 'content', 'hostname', 'js', 'label',
             'domain', 'google_is_safe', 'ip_address', 'image']).columns[::-1]
# 'total_url_count', 'ext_url_count'


def _min_max_scale(frame):
    """Return ``frame`` scaled column-wise by a freshly fitted MinMaxScaler."""
    scaled = preprocessing.MinMaxScaler().fit_transform(frame.values)
    return pd.DataFrame(scaled, columns=frame.columns)


# Per-class scaling (for plotting): each class is scaled on its own rows, so
# values are only comparable within a class, not across the two classes.
is_malicious = dataset_surtur_ext_feat['label'] == 1
df_to_plot_malicious = _min_max_scale(dataset_surtur_ext_feat.loc[is_malicious, to_keep])
df_to_plot_malicious['label'] = 1.0

is_benign = dataset_surtur_ext_feat['label'] == 0
df_to_plot_benign = _min_max_scale(dataset_surtur_ext_feat.loc[is_benign, to_keep])
df_to_plot_benign['label'] = 0.0

normalized_per_class_dataset = pd.concat([df_to_plot_malicious, df_to_plot_benign], axis=0)

# Whole-dataset scaling.  ``x``, ``min_max_scaler`` and ``x_scaled`` are kept
# as notebook-level names in case other cells use them.
x = dataset_surtur_ext_feat[to_keep].copy()
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x.values)
dataset_surt_norm = pd.DataFrame(x_scaled, columns=x.columns)
dataset_surt_norm['label'] = dataset_surtur_ext_feat['label'].values

## Outlier removal 

In [None]:
# from scipy import stats

# num_df_surt = dataset_surt_norm.select_dtypes(include=["number"])
# cat_df_surt = dataset_surt_norm.select_dtypes(exclude=["number"])

# # idx = np.all(stats.zscore(num_df_surt) < 3, axis=1)

# # dataset_surt_norm_cleaned = pd.concat([num_df_surt.loc[idx], 
# #                                    cat_df_surt.loc[idx]], axis=1)

# Q1 = num_df_surt.quantile(0.10)
# Q3 = num_df_surt.quantile(0.80)
# IQR = Q3 - Q1
# idx = ~((num_df_surt < (Q1 - 1.5 * IQR)) | (num_df_surt > (Q3 + 1.5 * IQR))).any(axis=1)
# dataset_surt_norm_cleaned = pd.concat([num_df_surt.loc[idx], 
#                                     cat_df_surt.loc[idx]], axis=1)

In [None]:
# len(dataset_surt_norm_cleaned)

In [None]:
# indexes_to_drop = dataset_surtur.nlargest(15000, 'js_len')['js_len'].index
# dataset_surtur = dataset_surtur.drop(indexes_to_drop, axis='index')

## Feature Analysis

In [None]:
# features = dataset_surtur.drop(columns=['url', 'content', 'hostname', 'js', 
#                                         'label']).columns[::-1] #, 'total_url_count', 'ext_url_count']).columns[::-1]
# df_to_plot = dataset_surtur

# n_bins = 40

# fig, axs = plt.subplots(5, 6, figsize=(20,20))

# # We can set the number of bins with the `bins` kwarg
# feature_counter = 0
# for i in range(len(axs)):
#     for j in range(len(axs[i])): 
#         current_feature = df_to_plot[features[feature_counter]]
#         axs[i, j].hist(current_feature[df_to_plot['label']==0.0], n_bins, fc=(0, 1, 0, 0.5))
#         axs[i, j].hist(current_feature[df_to_plot['label']==1.0], n_bins, fc=(1, 0, 0, 0.5))
#         axs[i, j].set_title(f'Feature: {features[feature_counter]}')
#         if features[feature_counter] == 'asdf':
#             axs[i, j].set_ylim([0, current_feature.value_counts().iloc[0]]) 
#         else:
#             axs[i, j].set_ylim([0, max(current_feature.value_counts())]) 
#         feature_counter += 1
#     if feature_counter > len(features):
#         break

# plt.show()

## Plotly Analysis

In [None]:
dataset_surtur.columns

In [None]:
# js_array_len_avg

In [None]:
feature = 'content_mobnet_1'
# feature = dataset_surt_norm.columns[2]
dataset_to_plot = dataset_surtur_ext_feat

good_filter = dataset_to_plot[feature][dataset_to_plot['label'] == 0]#.progress_apply(lambda x: roundup(x))
bad_filter = dataset_to_plot[feature][dataset_to_plot['label'] == 1]#.progress_apply(lambda x: roundup(x))


# bad_filter = bad_filter[bad_filter!=bad_filter.max()]

trace1 = go.Histogram(
    x=good_filter,
    name='Benign',
    yaxis='y2'

)

trace2 = go.Histogram(
    x=bad_filter,
    name='Malicious',
    yaxis='y2'
)

fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(trace1)
fig.add_trace(trace2,secondary_y=True)

fig['layout'].update(height = 700, width = 1000, title = f'Feature: {feature}',xaxis=dict(tickangle=-90))
iplot(fig)

# Machine Learning 

## Standard Set

### Train-test-split

In [None]:
dataset_surtur.columns

In [None]:
# Feature matrix: drop the label, the raw text columns and the base64 'image'
# column (it cannot be cast to float32).
# NOTE(review): the checkpoint CSV also carries 'Unnamed: 0' and 'index' row
# identifiers; they look like candidates for label leakage -- confirm and
# drop them here if so.
X = dataset_surtur_ext_feat.drop(columns = ['label', 'content', 'hostname', 'url',
                                   'js', 'domain', 'google_is_safe', 'ip_address',
                                   'image']).astype('float32')
# Take the labels from the same frame as X so rows cannot get out of step.
y = dataset_surtur_ext_feat['label']

# X_new = SelectKBest(chi2, k=30).fit_transform(X, dataset_surtur['label'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## t-SNE

In [None]:
# from sklearn.manifold import TSNE

In [None]:
# X_embedded_2D = TSNE(n_components = 2, random_state = 0, n_jobs=-1, init='pca').fit_transform(X)
# X_embedded_3D = TSNE(n_components = 3, random_state = 0, n_jobs=-1, init='pca').fit_transform(X)

In [None]:
# X_embedded_2D.to_csv('./2D_tsne_surtur.csv', index=False)
# X_embedded_3D.to_csv('./3D_tsne_surtur.csv', index=False)

### 2D Plot 

### 3D Plot

In [None]:
# fig = go.Figure(data=[go.Scatter3d(x=x, y=y, z=z,
#                                    mode='markers')])
# fig.show()

## XGBoost

In [None]:
# xgboost_model = XGBClassifier(verbosity=1, 
#                               max_depth=6,
#                               n_estimators=100,
#                               colsample_bylevel=1, 
#                               num_parallel_tree=1,
#                               learning_rate=0.3,
# #                               tree_method='approx', 
# #                               booster='dart',
#                               n_jobs=-1)

# xgboost_model.fit(X_train, y_train)

In [None]:
xgboost_model = XGBClassifier()

xgboost_model.fit(X_train, y_train)

In [None]:
y_pred = xgboost_model.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
print(classification_report(y_test, y_pred, target_names=['benign', 'malicious'], digits=4))

In [None]:
feat_imp = xgboost_model.feature_importances_

# Map each column of X to its importance, then order by importance, highest first.
feat_dict = dict(zip(X.columns[:len(feat_imp)], feat_imp))
feat_dict = dict(sorted(feat_dict.items(), key=lambda pair: pair[1], reverse=True))

# Two left-aligned columns, 25 characters wide each.
print("{:<25} {:<25}".format('Feature' ,'Importance'))
for name, importance in feat_dict.items():
    print("{:<25} {:<25}".format(name, importance))

In [None]:
1280*3*3

In [None]:
# X.drop(columns=['content_mobnet_0', 'content_mobnet_2', 'malicious_func_count', 
#                 'num_equals', 'total_url_count', 'domain_len', 'num_spaces', 
#                 'content_mobnet_1'])

## Checking misclassified samples

In [None]:
# Positions in the test set where the true label is 1 but the prediction
# differs from it.
idx_misclassified = [
    position
    for position, predicted in tqdm(enumerate(y_pred), total=len(y_pred))
    if predicted != y_test.values[position] and y_test.values[position] == 1
]

In [None]:
X_test.mean() - X_test.iloc[idx_misclassified].mean()

In [None]:
# dataset_surtur.iloc[3]

### Random Forest

In [None]:
rfc_model = RandomForestClassifier(max_depth=2, random_state=0)
rfc_model.fit(X_train, y_train)

In [None]:
y_pred = rfc_model.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
print(classification_report(y_test, y_pred, 
                            target_names=['benign', 'malicious'], 
                            digits=4))

### SVM

In [None]:
param_grid = {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}

svm_model = svm.SVC(verbose=3)
clf = GridSearchCV(svm_model, param_grid, n_jobs=-1, cv=3, verbose=3)

clf.fit(X_train, y_train)

In [None]:
print(clf.best_params_)
print(clf.best_estimator_)

In [None]:
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
print(classification_report(y_test, y_pred, 
                            target_names=['benign', 'malicious'], 
                            digits=4))

## Neural Network

In [None]:
X_train.shape

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Dense, AveragePooling2D, Flatten, Dropout
from tensorflow.python.client import device_lib

# Fully-connected binary classifier: three ReLU hidden layers followed by a
# single-unit output. The convolutional front end below is a disabled
# experiment and is kept only for reference.
model = Sequential([
#     Conv2D(filters=6,
#            kernel_size=(3, 3),
#            activation='relu',
#            input_shape=X_train.shape),
#     Conv2D(filters=16, kernel_size=(3, 3), activation='relu'),
#     AveragePooling2D(),
#     Flatten(),
    Dense(units=22210, activation='relu'),
    Dense(units=11210, activation='relu'),
    Dense(units=1841, activation='relu'),
    # The output must be sigmoid: softmax normalises across units, so with a
    # single unit it always emits exactly 1.0 and the network cannot learn.
    Dense(units=1, activation='sigmoid')
])


# binary_crossentropy expects one probability per sample, which the sigmoid
# output above provides.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# No validation data is passed; the validation_data argument is left disabled.
model_history = model.fit(X_train, y_train, #validation_data=(X_valid_NN, y_valid_NN),
                          epochs=5, batch_size=256, verbose=True)

In [None]:
import seaborn as sn

# Pairwise correlation between every pair of feature columns.
corrMatrix = X.corr()

# Draw the annotated heatmap on an explicitly created, large set of axes so the
# per-cell values stay readable.
heatmap_axes = plt.subplots(figsize=(30, 20))[1]
sn.heatmap(corrMatrix, annot=True, ax=heatmap_axes)
plt.show()

In [None]:
# def histogram_intersection(a, b):
#     v = np.minimum(a, b).sum().round(decimals=1)
#     return v

# X.corr(method=histogram_intersection)

## Feature Selection - Chi Squared

In [None]:
# Keep the 8 features with the highest chi-squared score against y and display
# the shape of the reduced matrix.
# NOTE(review): chi2 only accepts non-negative feature values — confirm X
# satisfies that. The selector is also fit on all of X before the train/test
# split below, so test rows influence which features are kept.
X_new = SelectKBest(chi2, k=8).fit_transform(X, y)
X_new.shape

In [None]:
# Split the chi-squared-selected features: 33% held out for testing, with a
# fixed seed. This overwrites the previous X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.33, random_state=42)

### XGBoost

In [None]:
# Train a fresh XGBoost classifier with default hyper-parameters on the current
# training split; this replaces any earlier xgboost_model.
xgboost_model = XGBClassifier()
xgboost_model.fit(X_train, y_train)

In [None]:
# Predict the test split with the XGBoost model and display the accuracy.
y_pred = xgboost_model.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# Confusion matrix for the current y_pred: rows are true labels, columns are
# predicted labels.
confusion_matrix(y_test, y_pred)

In [None]:
# Per-class precision / recall / F1 for the current y_pred, to 4 decimals.
# target_names are matched to the labels in sorted order, so this assumes the
# labels are 0 (benign) and 1 (malicious).
print(classification_report(y_test, y_pred, target_names=['benign', 'malicious'], digits=4))

## Correlation Feature Selection

In [None]:
# Number of candidate features available to the correlation-based selector.
len(X.columns)

In [None]:
def cor_selector(X, y, num_feats):
    """Select the features most strongly correlated with the target.

    Every column of ``X`` is ranked by the absolute value of its Pearson
    correlation coefficient with ``y``; the ``num_feats`` highest-ranked
    columns are kept.

    Args:
        X: DataFrame of numeric feature columns.
        y: Target values, one per row of ``X``.
        num_feats: Maximum number of features to keep. A value larger than
            the number of columns keeps every column; zero or a negative
            value keeps none.

    Returns:
        A ``(cor_support, cor_feature)`` tuple. ``cor_support`` is a list of
        booleans, one per column of ``X`` in column order, ``True`` where the
        column was selected. ``cor_feature`` is the list of selected column
        names, ordered from weakest to strongest absolute correlation.
    """
    feature_name = X.columns.tolist()

    # ``[-0:]`` slices the *whole* array, so asking for zero features used to
    # return all of them; handle the "keep nothing" case explicitly.
    if num_feats <= 0:
        return [False] * len(feature_name), []

    # A zero-variance column makes the coefficient NaN. That case is mapped to
    # 0 just below, so the floating-point warning it triggers is only noise.
    with np.errstate(divide='ignore', invalid='ignore'):
        cor_list = [np.corrcoef(X[name], y)[0, 1] for name in feature_name]
    cor_list = [0 if np.isnan(cor) else cor for cor in cor_list]

    # argsort is ascending, so the strongest correlations sit at the end.
    strongest = np.argsort(np.abs(cor_list))[-num_feats:]
    cor_feature = X.iloc[:, strongest].columns.tolist()

    # Set membership keeps building the mask linear in the number of columns.
    selected = set(cor_feature)
    cor_support = [name in selected for name in feature_name]
    return cor_support, cor_feature

# Keep the 55 features most correlated with the target, then report how many
# were selected, a spacer line, and the selected names.
cor_support, cor_feature = cor_selector(X, y, num_feats=55)
summary_lines = (
    str(len(cor_feature)) + ' ' + 'selected features',
    ' ',
    cor_feature,
)
print(*summary_lines, sep='\n')

In [None]:
# Split using only the correlation-selected columns: 33% held out for testing,
# with a fixed seed. This overwrites the previous X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(X[cor_feature], y, test_size=0.33, random_state=42)

In [None]:
# Train a fresh XGBoost classifier with default hyper-parameters on the current
# training split; this replaces any earlier xgboost_model.
xgboost_model = XGBClassifier()
xgboost_model.fit(X_train, y_train)

In [None]:
# Predict the test split with the XGBoost model and display the accuracy.
y_pred = xgboost_model.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# Confusion matrix for the current y_pred: rows are true labels, columns are
# predicted labels.
confusion_matrix(y_test, y_pred)

In [None]:
# Per-class precision / recall / F1 for the current y_pred, to 4 decimals.
# target_names are matched to the labels in sorted order, so this assumes the
# labels are 0 (benign) and 1 (malicious).
print(classification_report(y_test, y_pred, 
                            target_names=['benign', 'malicious'], 
                            digits=4))