## Import Alexa One Million

In [1]:
import os
import onemillion
import time
import csv
import numpy as np
import pandas as pd
from tqdm import tqdm

pd.options.display.max_rows = 10000
pd.options.display.max_columns = 500

In [2]:
o = onemillion.OneMillion()
print(o.domain_in_million("google.com")) # 1
print(o.domain_in_million("gaagle.com")) # None

1
None


In [3]:
CONFIG = {
    'domain_lists': [
        {
            'name': "alexa",
            'output_file_path': "alexa.csv",
            'url': "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip"
        }, {
            'name': "cisco umbrella",
            'output_file_path': "cisco.csv",
            'url': "http://s3-us-west-1.amazonaws.com/umbrella-static/" +
                   "top-1m.csv.zip"
        }
    ]
}

# Cache our top 1 million known domains
DEFAULT_CACHE_LOCATION = '~/.onemillion'
cache_location = os.path.expanduser(DEFAULT_CACHE_LOCATION)

# O(n) to read Csv file rows into set 
def read_onemillion_data():
    """Check if the given domain is in a top on million list."""
    # TODO: parse the registered domain out of the domain parameter

    # keep track of the highest (nearest to 1) rank for the given domain
    highest_rank = None

    # see if the given domain is in the up-to-date domain lists
    domains = set()
    for domain_list in CONFIG['domain_lists']:
        # open the domain list as a CSV
        with open(os.path.join(cache_location, domain_list['output_file_path']), 'r') as domain_csv:
            domain_reader = csv.reader(domain_csv)
            for row in domain_reader:
                domains.add(row[1])
    return domains


domains = read_onemillion_data()

# O(1) Function Run time
def domain_checker(domain):
    domain = domain.lower()
    if domain in domains:
        return True
    return False

In [4]:
benign_domains = pd.DataFrame(list(domains))

In [5]:
# benign_domains.to_csv('alexa_domains.csv', index=False)

In [5]:
sample_benign_df = benign_domains.sample(n=40000, replace=False, random_state=41)

In [6]:
sample_benign_df.columns = ['url']
sample_benign_df['label'] = 'good'
sample_benign_df.head()

Unnamed: 0,url,label
1469544,compressioninfo.com,good
777207,nguk1.horizon.vmware.com,good
691198,ngpvanprodstore.blob.core.windows.net,good
979023,r3.sn-aigzrne7.c.2mdn.net,good
440384,engenhariajob.com.br,good


In [7]:
sample_benign_df.to_csv('sample_benign_df.csv', index=False)

## Getting URLS from previous datasets

In [9]:
dataset_chosen_before = pd.read_csv('./data_construction/malicious_urls.csv')

In [10]:
dataset_6 = pd.read_csv('./datasets_for_project/dataset_6.csv')
dataset_6 = dataset_6[~dataset_6['URL'].isin(dataset_chosen_before['url'])]
dataset_6_bad = dataset_6[dataset_6['Label']=='bad'][['URL', 'Label']]
# dataset_6_bad = dataset_6_bad.sample(n=50000, replace=False, random_state=41)
dataset_6_bad.columns = ['url', 'label']
print(len(dataset_6_bad))

0


In [11]:
# dataset_7 = pd.read_excel('./datasets_for_project/dataset_7.xlsx')
# dataset_7 = dataset_7[~dataset_7['URL'].isin(dataset_chosen_before['url'])]
# dataset_7_good = dataset_7[dataset_7['Result']==1][['URL', 'Result']]
# dataset_7_good.columns = ['url', 'label']
# len(dataset_7_good)

In [12]:
dataset_8 = pd.read_csv('./datasets_for_project/dataset_8.csv')
dataset_8 = dataset_8[~dataset_8['domain'].isin(dataset_chosen_before['url'])]
dataset_8_bad = dataset_8[dataset_8['label']==0][['domain', 'label']]
dataset_8_bad.columns = ['url', 'label']
len(dataset_8_bad)

39996

In [14]:
# dataset_9 = pd.read_csv('./datasets_for_project/dataset_9.csv')
# dataset_9_good = dataset_9[dataset_9['status']=='legitimate'][['url', 'status']]
# dataset_9_good.columns = ['url', 'label']
# len(dataset_9_good)

In [14]:
dataset_20_test = pd.read_csv('./datasets_for_project/dataset_20/\
Webpages_Classification_test_data.csv/Webpages_Classification_\
test_data.csv',  engine='python')

dataset_20_train = pd.read_csv('./datasets_for_project/dataset_20/\
Webpages_Classification_train_data.csv/Webpages_Classification_train_data.csv',  
                               engine='python', error_bad_lines=False)

dataset_20 = pd.concat(
    [dataset_20_train, dataset_20_test], 
    axis=0
).drop(columns=['Unnamed: 0'])

dataset_20_bad = dataset_20[dataset_20['label']=='bad'][['url', 'label']]
dataset_20_bad = dataset_20_bad[~dataset_20_bad['url'].isin(dataset_chosen_before['url'])]
dataset_20_bad = dataset_20_bad.sample(frac=1, replace=False, random_state=41)
print(len(dataset_20_bad))

Skipping line 951643: unexpected end of data


5


In [35]:
from glob import glob as globlin

dataframes = []
dataset_23_paths = globlin('./datasets_for_project/dataset_23/*.csv')
for path in dataset_23_paths:
    dataframes.append(pd.read_csv(path).drop(columns=['Unnamed: 0']))
dataset_23 = pd.concat(dataframes, axis=0, sort=True)

In [37]:
dataset_23_bad = dataset_23[dataset_23['label']=='bad'][['url', 'label']]
dataset_23_bad = dataset_23_bad[~dataset_23_bad['url'].isin(dataset_chosen_before['url'])]
dataset_23_bad = dataset_23_bad.sample(frac=1, replace=False, random_state=41)
print(len(dataset_23_bad))

0


In [38]:
dataset_24_legit = pd.read_json('./datasets_for_project/dataset_24/data_legitimate_36400.json')
dataset_24_legit.columns = ['URL']
dataset_24_legit['Label'] = 0

dataset_24_phishing = pd.read_json('./datasets_for_project/dataset_24/data_phishing_37175.json')
dataset_24_phishing.columns = ['URL']
dataset_24_phishing['Label'] = 1

dataset_24 = pd.concat([dataset_24_legit, dataset_24_phishing], axis=0)

In [45]:
dataset_24_bad = dataset_24[dataset_24['Label']==0][['URL', 'Label']]
dataset_24_bad = dataset_24_bad[~dataset_24_bad['URL'].isin(dataset_chosen_before['url'])]
dataset_24_bad = dataset_24_bad.sample(frac=1, replace=False, random_state=41)
dataset_24_bad.columns = ['url', 'label']
print(len(dataset_24_bad))

36400


In [46]:
dataset_24_bad

Unnamed: 0,url,label
23829,http://www.urbandictionary.com/define.php?term...,0
34943,http://www.mycablemart.com/help/hdmi_which_one...,0
14678,https://www.wellsfargo.com/help/faqs/activity-...,0
31541,https://ru.wikipedia.org/wiki/Wake-on-LAN,0
18131,http://ecomputernotes.com/computernetworkingno...,0
...,...,...
27064,http://www.dbsalliance.org/site/PageServer?pag...,0
31597,http://www.thefullwiki.org/MultiMediaCard,0
20450,http://www.webopedia.com/TERM/L/local_area_net...,0
931,http://www.dictionary.com/browse/iconic,0


In [47]:
dataset_15_1 = pd.read_csv('./datasets_for_project/dataset_15/data.csv')
dataset_15_2 = pd.read_csv('./datasets_for_project/dataset_15/data2.csv', header=None)
dataset_15_2.columns = ['url', 'label']
dataset_15 = pd.concat([dataset_15_1, dataset_15_2], axis=0)

In [48]:
dataset_15

Unnamed: 0,url,label
0,diaryofagameaddict.com,bad
1,espdesign.com.au,bad
2,iamagameaddict.com,bad
3,kalantzis.net,bad
4,slightlyoffcenter.net,bad
...,...,...
32871,23.227.196.215/,bad
32872,apple-checker.org/,bad
32873,apple-iclods.org/,bad
32874,apple-uptoday.org/,bad


In [49]:
dataset_15_bad = dataset_15[dataset_15['label']=='bad'][['url', 'label']]
dataset_15_bad = dataset_15_bad[~dataset_15_bad['url'].isin(dataset_chosen_before['url'])]
dataset_15_bad = dataset_15_bad.sample(frac=1, replace=False, random_state=41)
print(len(dataset_15_bad))

0


In [50]:
dataset_bad = pd.concat([dataset_24_bad,
                          dataset_8_bad], axis=0).drop_duplicates()

In [51]:
len(dataset_bad)

60456

In [54]:
dataset_bad

Unnamed: 0,url,label
23829,http://www.urbandictionary.com/define.php?term...,bad
34943,http://www.mycablemart.com/help/hdmi_which_one...,bad
14678,https://www.wellsfargo.com/help/faqs/activity-...,bad
31541,https://ru.wikipedia.org/wiki/Wake-on-LAN,bad
18131,http://ecomputernotes.com/computernetworkingno...,bad
...,...,...
95899,www.geocities.com/SiliconValley/Lab/7378/fisch...,bad
95901,sourceforge.net/projects/froofyjit/,bad
95902,www.angelfire.com/tx4/PolkLonghorns/,bad
95903,www.freewebs.com/aylesburyscrabble/index.htm,bad


In [56]:
# dataset_bad.to_csv('./data_construction/malicious_urls_2.csv')