# Data Research 

In [1]:
import pandas as pd 
import numpy as np

# Requirements 
# https://github.com/prasmussen/gdrive
# !pip install gdown

pd.options.display.max_columns = 500

## Downloading all the datasets from public drive

The datasets were collected and stored publicly here for easier access. Using the gdown cmdline interface
to download everything we need

https://drive.google.com/drive/u/0/folders/1cuJTiTcLfCE7VI7DEGJNE2fNWct4qbVg

In [8]:
!gdown https://drive.google.com/uc?id=1RGwoiYsHvlbBzvjbpOsrQ47_B9NkmvoG

Downloading...
From: https://drive.google.com/uc?id=1RGwoiYsHvlbBzvjbpOsrQ47_B9NkmvoG
To: /Users/ahmadchaiban/Desktop/Arctic Wolf/Malicious-Phishing-Detection/datasets_for_project.zip
1.86GB [15:54, 1.95MB/s]


In [17]:
import zipfile
with zipfile.ZipFile('datasets_for_project.zip', 'r') as zip_ref:
    zip_ref.extractall('./')

## Just a function to make life easier

Extracts some details on most datasets, some exceptional datasets are handled manually

In [72]:
def info_extract(dataset, label_string, malicious=1, benign=0, optional_value=None, raw_data=False):
    print("######### FEATURES #########")
    print(dataset.columns)
    print('\n')
    print("######### Dataset details #########")
    
    if raw_data == False:
        print('No raw data')
    else:
        print('Contains raw data')
        
    print(f"Total number of samples = {len(dataset)}")
    
    if optional_value is not None:
        print(f"Malicious Samples = {len(dataset[dataset[label_string]==malicious])}")
        print(f"Suspicious Samples = {len(dataset[dataset[label_string]==optional_value])}")
        print(f"Benign Samples = {len(dataset[dataset[label_string]==benign])}")
        return dataset.head()
    print(f"Malicious Samples = {len(dataset[dataset[label_string]==malicious])}")
    print(f"Benign Samples = {len(dataset[dataset[label_string]==benign])}")
    return dataset.head()

## Dataset 1
https://data.mendeley.com/datasets/h3cgnj8hft/1

### Citations and uses



In [68]:
from scipy.io import arff

data = arff.loadarff('./datasets_for_project/dataset_1.arff')
dataset_1 = pd.DataFrame(data[0])

In [69]:
info_extract(dataset_1, 'CLASS_LABEL', b'1', b'0')

######### FEATURES #########
Index(['NumDots', 'SubdomainLevel', 'PathLevel', 'UrlLength', 'NumDash',
       'NumDashInHostname', 'AtSymbol', 'TildeSymbol', 'NumUnderscore',
       'NumPercent', 'NumQueryComponents', 'NumAmpersand', 'NumHash',
       'NumNumericChars', 'NoHttps', 'RandomString', 'IpAddress',
       'DomainInSubdomains', 'DomainInPaths', 'HttpsInHostname',
       'HostnameLength', 'PathLength', 'QueryLength', 'DoubleSlashInPath',
       'NumSensitiveWords', 'EmbeddedBrandName', 'PctExtHyperlinks',
       'PctExtResourceUrls', 'ExtFavicon', 'InsecureForms',
       'RelativeFormAction', 'ExtFormAction', 'AbnormalFormAction',
       'PctNullSelfRedirectHyperlinks', 'FrequentDomainNameMismatch',
       'FakeLinkInStatusBar', 'RightClickDisabled', 'PopUpWindow',
       'SubmitInfoToEmail', 'IframeOrFrame', 'MissingTitle',
       'ImagesOnlyInForm', 'SubdomainLevelRT', 'UrlLengthRT',
       'PctExtResourceUrlsRT', 'AbnormalExtFormActionR', 'ExtMetaScriptLinkRT',
       'PctEx

Unnamed: 0,NumDots,SubdomainLevel,PathLevel,UrlLength,NumDash,NumDashInHostname,AtSymbol,TildeSymbol,NumUnderscore,NumPercent,NumQueryComponents,NumAmpersand,NumHash,NumNumericChars,NoHttps,RandomString,IpAddress,DomainInSubdomains,DomainInPaths,HttpsInHostname,HostnameLength,PathLength,QueryLength,DoubleSlashInPath,NumSensitiveWords,EmbeddedBrandName,PctExtHyperlinks,PctExtResourceUrls,ExtFavicon,InsecureForms,RelativeFormAction,ExtFormAction,AbnormalFormAction,PctNullSelfRedirectHyperlinks,FrequentDomainNameMismatch,FakeLinkInStatusBar,RightClickDisabled,PopUpWindow,SubmitInfoToEmail,IframeOrFrame,MissingTitle,ImagesOnlyInForm,SubdomainLevelRT,UrlLengthRT,PctExtResourceUrlsRT,AbnormalExtFormActionR,ExtMetaScriptLinkRT,PctExtNullSelfRedirectHyperlinksRT,CLASS_LABEL
0,3.0,1.0,5.0,72.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,21.0,44.0,0.0,0.0,0.0,0.0,0.0,0.25,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,-1.0,1.0,b'1'
1,3.0,1.0,3.0,144.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,1.0,0.0,41.0,1.0,0.0,0.0,0.0,0.0,0.0,17.0,16.0,103.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,1.0,1.0,1.0,1.0,b'1'
2,3.0,1.0,2.0,58.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,27.0,24.0,0.0,0.0,0.0,0.0,0.375,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.0,1.0,-1.0,0.0,b'1'
3,3.0,1.0,6.0,79.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,22.0,50.0,0.0,0.0,0.0,1.0,1.0,0.095238,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,-1.0,1.0,1.0,1.0,-1.0,b'1'
4,3.0,0.0,4.0,46.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,1.0,0.0,10.0,29.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,-1.0,0.0,-1.0,-1.0,b'1'


## Dataset 2 

https://www.kaggle.com/xwolf12/malicious-and-benign-websites
https://github.com/urcuqui/WhiteHat/tree/master/Research/Web%20security

### Citations and uses

In [70]:
dataset_2 = pd.read_csv('./datasets_for_project/dataset_2.csv')
info_extract(dataset_2, 'Type', 1, 0)

######### FEATURES #########
Index(['URL', 'URL_LENGTH', 'NUMBER_SPECIAL_CHARACTERS', 'CHARSET', 'SERVER',
       'CONTENT_LENGTH', 'WHOIS_COUNTRY', 'WHOIS_STATEPRO', 'WHOIS_REGDATE',
       'WHOIS_UPDATED_DATE', 'TCP_CONVERSATION_EXCHANGE',
       'DIST_REMOTE_TCP_PORT', 'REMOTE_IPS', 'APP_BYTES', 'SOURCE_APP_PACKETS',
       'REMOTE_APP_PACKETS', 'SOURCE_APP_BYTES', 'REMOTE_APP_BYTES',
       'APP_PACKETS', 'DNS_QUERY_TIMES', 'Type'],
      dtype='object')


######### Dataset details #########
No raw data
Total number of samples = 1781
Malicious Samples = 216
Benign Samples = 1565


Unnamed: 0,URL,URL_LENGTH,NUMBER_SPECIAL_CHARACTERS,CHARSET,SERVER,CONTENT_LENGTH,WHOIS_COUNTRY,WHOIS_STATEPRO,WHOIS_REGDATE,WHOIS_UPDATED_DATE,TCP_CONVERSATION_EXCHANGE,DIST_REMOTE_TCP_PORT,REMOTE_IPS,APP_BYTES,SOURCE_APP_PACKETS,REMOTE_APP_PACKETS,SOURCE_APP_BYTES,REMOTE_APP_BYTES,APP_PACKETS,DNS_QUERY_TIMES,Type
0,M0_109,16,7,iso-8859-1,nginx,263.0,,,10/10/2015 18:21,,7,0,2,700,9,10,1153,832,9,2.0,1
1,B0_2314,16,6,UTF-8,Apache/2.4.10,15087.0,,,,,17,7,4,1230,17,19,1265,1230,17,0.0,0
2,B0_911,16,6,us-ascii,Microsoft-HTTPAPI/2.0,324.0,,,,,0,0,0,0,0,0,0,0,0,0.0,0
3,B0_113,17,6,ISO-8859-1,nginx,162.0,US,AK,7/10/1997 4:00,12/09/2013 0:45,31,22,3,3812,39,37,18784,4380,39,8.0,0
4,B0_403,17,6,UTF-8,,124140.0,US,TX,12/05/1996 0:00,11/04/2017 0:00,57,2,5,4278,61,62,129889,4586,61,4.0,0


## Dataset 3

https://www.kaggle.com/ahmednour/website-phishing-data-set

### Citations and uses

In [73]:
dataset_3 = pd.read_csv('./datasets_for_project/dataset_3.csv')
info_extract(dataset_3, 'Result', -1, 1, 0)

######### FEATURES #########
Index(['SFH', 'popUpWidnow', 'SSLfinal_State', 'Request_URL', 'URL_of_Anchor',
       'web_traffic', 'URL_Length', 'age_of_domain', 'having_IP_Address',
       'Result'],
      dtype='object')


######### Dataset details #########
No raw data
Total number of samples = 1353
Malicious Samples = 702
Suspicious Samples = 103
Benign Samples = 548


Unnamed: 0,SFH,popUpWidnow,SSLfinal_State,Request_URL,URL_of_Anchor,web_traffic,URL_Length,age_of_domain,having_IP_Address,Result
0,1,-1,1,-1,-1,1,1,1,0,0
1,-1,-1,-1,-1,-1,0,1,1,1,1
2,1,-1,0,0,-1,0,-1,1,0,1
3,1,0,1,-1,-1,0,1,1,0,0
4,-1,-1,1,-1,0,0,-1,1,0,1


## Dataset 4

https://www.kaggle.com/eswarchandt/phishing-website-detector?select=phishing.csv

### Citations and uses

In [74]:
dataset_4 = pd.read_csv('./datasets_for_project/dataset_4.csv')
info_extract(dataset_4, 'class', -1, 1)

######### FEATURES #########
Index(['Index', 'UsingIP', 'LongURL', 'ShortURL', 'Symbol@', 'Redirecting//',
       'PrefixSuffix-', 'SubDomains', 'HTTPS', 'DomainRegLen', 'Favicon',
       'NonStdPort', 'HTTPSDomainURL', 'RequestURL', 'AnchorURL',
       'LinksInScriptTags', 'ServerFormHandler', 'InfoEmail', 'AbnormalURL',
       'WebsiteForwarding', 'StatusBarCust', 'DisableRightClick',
       'UsingPopupWindow', 'IframeRedirection', 'AgeofDomain', 'DNSRecording',
       'WebsiteTraffic', 'PageRank', 'GoogleIndex', 'LinksPointingToPage',
       'StatsReport', 'class'],
      dtype='object')


######### Dataset details #########
No raw data
Total number of samples = 11054
Malicious Samples = 4897
Benign Samples = 6157


Unnamed: 0,Index,UsingIP,LongURL,ShortURL,Symbol@,Redirecting//,PrefixSuffix-,SubDomains,HTTPS,DomainRegLen,Favicon,NonStdPort,HTTPSDomainURL,RequestURL,AnchorURL,LinksInScriptTags,ServerFormHandler,InfoEmail,AbnormalURL,WebsiteForwarding,StatusBarCust,DisableRightClick,UsingPopupWindow,IframeRedirection,AgeofDomain,DNSRecording,WebsiteTraffic,PageRank,GoogleIndex,LinksPointingToPage,StatsReport,class
0,0,1,1,1,1,1,-1,0,1,-1,1,1,-1,1,0,-1,-1,1,1,0,1,1,1,1,-1,-1,0,-1,1,1,1,-1
1,1,1,0,1,1,1,-1,-1,-1,-1,1,1,-1,1,0,-1,-1,-1,-1,0,1,1,1,1,1,-1,1,-1,1,0,-1,-1
2,2,1,0,1,1,1,-1,-1,-1,1,1,1,-1,-1,0,0,-1,1,1,0,1,1,1,1,-1,-1,1,-1,1,-1,1,-1
3,3,1,0,-1,1,1,-1,1,1,-1,1,1,1,1,0,0,-1,1,1,0,-1,1,-1,1,-1,-1,0,-1,1,1,1,1
4,4,-1,0,-1,1,-1,-1,1,1,-1,1,1,-1,1,0,0,-1,-1,-1,0,1,1,1,1,1,1,1,-1,1,-1,-1,1


## Dataset 5

https://www.kaggle.com/akashkr/phishing-website-dataset

### Citations and uses

In [75]:
dataset_5 = pd.read_csv('./datasets_for_project/dataset_5.csv')
info_extract(dataset_5, 'Result', -1, 1)

######### FEATURES #########
Index(['index', 'having_IPhaving_IP_Address', 'URLURL_Length',
       'Shortining_Service', 'having_At_Symbol', 'double_slash_redirecting',
       'Prefix_Suffix', 'having_Sub_Domain', 'SSLfinal_State',
       'Domain_registeration_length', 'Favicon', 'port', 'HTTPS_token',
       'Request_URL', 'URL_of_Anchor', 'Links_in_tags', 'SFH',
       'Submitting_to_email', 'Abnormal_URL', 'Redirect', 'on_mouseover',
       'RightClick', 'popUpWidnow', 'Iframe', 'age_of_domain', 'DNSRecord',
       'web_traffic', 'Page_Rank', 'Google_Index', 'Links_pointing_to_page',
       'Statistical_report', 'Result'],
      dtype='object')


######### Dataset details #########
No raw data
Total number of samples = 11055
Malicious Samples = 4898
Benign Samples = 6157


Unnamed: 0,index,having_IPhaving_IP_Address,URLURL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,port,HTTPS_token,Request_URL,URL_of_Anchor,Links_in_tags,SFH,Submitting_to_email,Abnormal_URL,Redirect,on_mouseover,RightClick,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
0,1,-1,1,1,1,-1,-1,-1,-1,-1,1,1,-1,1,-1,1,-1,-1,-1,0,1,1,1,1,-1,-1,-1,-1,1,1,-1,-1
1,2,1,1,1,1,1,-1,0,1,-1,1,1,-1,1,0,-1,-1,1,1,0,1,1,1,1,-1,-1,0,-1,1,1,1,-1
2,3,1,0,1,1,1,-1,-1,-1,-1,1,1,-1,1,0,-1,-1,-1,-1,0,1,1,1,1,1,-1,1,-1,1,0,-1,-1
3,4,1,0,1,1,1,-1,-1,-1,1,1,1,-1,-1,0,0,-1,1,1,0,1,1,1,1,-1,-1,1,-1,1,-1,1,-1
4,5,1,0,-1,1,1,-1,1,1,-1,1,1,1,1,0,0,-1,1,1,0,-1,1,-1,1,-1,-1,0,-1,1,1,1,1


## Dataset 6

https://www.kaggle.com/taruntiwarihp/phishing-site-urls

### Citations and uses

In [79]:
dataset_6 = pd.read_csv('./datasets_for_project/dataset_6.csv')
info_extract(dataset_6, 'Label', 'bad', 'good', raw_data=True)

######### FEATURES #########
Index(['URL', 'Label'], dtype='object')


######### Dataset details #########
Contains raw data
Total number of samples = 549346
Malicious Samples = 156422
Benign Samples = 392924


Unnamed: 0,URL,Label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad
3,mail.printakid.com/www.online.americanexpress....,bad
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad


## Dataset 7

https://www.kaggle.com/akshaya1508/phishing-websites-detection

### Citations and uses

In [78]:
dataset_7 = pd.read_excel('./datasets_for_project/dataset_7.xlsx')
info_extract(dataset_7, 'Result', -1, 1, raw_data=True)

######### FEATURES #########
Index(['URL', 'Having @ Symbol', 'Presence of IP Address', 'Length of URL',
       'No. of Slashes', 'Special Character', 'No.of Dots',
       'No. of Hyphen in Host Address', '"Email" Keyword', 'TLS ',
       'Age of URL', 'Result'],
      dtype='object')


######### Dataset details #########
Contains raw data
Total number of samples = 1400
Malicious Samples = 900
Benign Samples = 500


Unnamed: 0,URL,Having @ Symbol,Presence of IP Address,Length of URL,No. of Slashes,Special Character,No.of Dots,No. of Hyphen in Host Address,"""Email"" Keyword",TLS,Age of URL,Result
0,https://pogodasub.com/2Ge6I0rd/4p0OksK6-inteli...,1,1,1,-1,-1,-1,1,1,-1,-1,-1
1,https://pogodasub.com/2Ge6I0rd/Ly6I98Se-ipko/l...,1,1,1,-1,-1,-1,1,1,-1,-1,-1
2,https://pogodasub.com/2Ge6I0rd/hz6hZnA,1,1,1,-1,1,1,1,1,-1,-1,-1
3,https://resicorphome.com/brian/srr/austin/,1,1,1,-1,1,1,1,1,-1,-1,-1
4,https://kurier24-oplata.com/PzVxLK7l/WdJslUDi-...,1,1,1,-1,-1,-1,1,1,-1,-1,-1


## Dataset 8

https://www.kaggle.com/aman9d/phishing-data

### Citations and uses 

In [81]:
dataset_8 = pd.read_csv('./datasets_for_project/dataset_8.csv')
info_extract(dataset_8, 'label', 1, 0, raw_data=True)

######### FEATURES #########
Index(['domain', 'ranking', 'isIp', 'valid', 'activeDuration', 'urlLen', 'is@',
       'isredirect', 'haveDash', 'domainLen', 'nosOfSubdomain', 'label'],
      dtype='object')


######### Dataset details #########
Contains raw data
Total number of samples = 95910
Malicious Samples = 55914
Benign Samples = 39996


Unnamed: 0,domain,ranking,isIp,valid,activeDuration,urlLen,is@,isredirect,haveDash,domainLen,nosOfSubdomain,label
0,www.voting-yahoo.com,10000000,0,0,0,20,0,0,1,20,2,1
1,www.zvon.org/xxl/WSDL1.1/Output/index.html,194914,0,1,7305,42,0,0,0,12,2,0
2,tecportais.com/file-security-update-infonfmati...,10000000,0,0,0,155,0,0,0,14,1,1
3,bima.astro.umd.edu/nemo/linuxastro/,7001,0,0,0,35,0,0,0,18,3,0
4,huarui-tec.com/js/?us.battle.net/login/en/?ref...,10000000,0,1,730,79,0,0,1,14,1,1


## Dataset 9

https://www.kaggle.com/manishkc06/web-page-phishing-detection

In [83]:
dataset_9 = pd.read_csv('./datasets_for_project/dataset_9.csv')
info_extract(dataset_9, 'status', 'phishing', 'legitimate', raw_data=True)

######### FEATURES #########
Index(['url', 'length_url', 'length_hostname', 'ip', 'nb_dots', 'nb_hyphens',
       'nb_at', 'nb_qm', 'nb_and', 'nb_or', 'nb_eq', 'nb_underscore',
       'nb_tilde', 'nb_percent', 'nb_slash', 'nb_star', 'nb_colon', 'nb_comma',
       'nb_semicolumn', 'nb_dollar', 'nb_space', 'nb_www', 'nb_com',
       'nb_dslash', 'http_in_path', 'https_token', 'ratio_digits_url',
       'ratio_digits_host', 'punycode', 'port', 'tld_in_path',
       'tld_in_subdomain', 'abnormal_subdomain', 'nb_subdomains',
       'prefix_suffix', 'random_domain', 'shortening_service',
       'path_extension', 'nb_redirection', 'nb_external_redirection',
       'length_words_raw', 'char_repeat', 'shortest_words_raw',
       'shortest_word_host', 'shortest_word_path', 'longest_words_raw',
       'longest_word_host', 'longest_word_path', 'avg_words_raw',
       'avg_word_host', 'avg_word_path', 'phish_hints', 'domain_in_brand',
       'brand_in_subdomain', 'brand_in_path', 'suspecious_tld',


Unnamed: 0,url,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,nb_eq,nb_underscore,nb_tilde,nb_percent,nb_slash,nb_star,nb_colon,nb_comma,nb_semicolumn,nb_dollar,nb_space,nb_www,nb_com,nb_dslash,http_in_path,https_token,ratio_digits_url,ratio_digits_host,punycode,port,tld_in_path,tld_in_subdomain,abnormal_subdomain,nb_subdomains,prefix_suffix,random_domain,shortening_service,path_extension,nb_redirection,nb_external_redirection,length_words_raw,char_repeat,shortest_words_raw,shortest_word_host,shortest_word_path,longest_words_raw,longest_word_host,longest_word_path,avg_words_raw,avg_word_host,avg_word_path,phish_hints,domain_in_brand,brand_in_subdomain,brand_in_path,suspecious_tld,statistical_report,nb_hyperlinks,ratio_intHyperlinks,ratio_extHyperlinks,ratio_nullHyperlinks,nb_extCSS,ratio_intRedirection,ratio_extRedirection,ratio_intErrors,ratio_extErrors,login_form,external_favicon,links_in_tags,submit_email,ratio_intMedia,ratio_extMedia,sfh,iframe,popup_window,safe_anchor,onmouseover,right_clic,empty_title,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,http://www.progarchives.com/album.asp?id=61737,46,20,zero,3,zero,0,1,0,0,1,0,0,0,3,0,1,0,0,0,0,1,0,0,0,1,0.108696,0.0,0,0,0,0,0,3,0,0,0,0,0,0,6,3,2,3,2,12,12,5,5.0,7.5,3.75,0,0,0,0,0,0,143,0.93007,0.06993,0,1,0,0.0,0,0.0,0,1,73.913043,0,100.0,0.0,0,0,0,77.777778,0,0,0,1,one,0,627,6678,78526,0,0,5,phishing
1,http://signin.eday.co.uk.ws.edayisapi.dllsign....,128,120,0,10,0,0,0,0,0,0,0,0,0,3,0,1,0,0,0,0,0,0,0,0,1,0.054688,0.058333,0,0,0,0,0,3,0,0,0,0,0,0,10,6,2,2,0,35,35,0,10.7,10.7,0.0,2,0,0,0,0,0,0,0.0,0.0,0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0.0,0,0,0,0.0,0,0,1,1,zero,0,300,65,0,0,1,0,phishing
2,http://www.avevaconstruction.com/blesstool/ima...,52,25,0,3,0,0,0,0,0,0,0,0,0,4,0,1,0,0,0,0,1,0,0,0,1,0.0,0.0,0,0,0,0,0,3,0,0,0,0,1,0,5,5,3,3,3,17,17,9,7.4,10.0,5.666667,0,0,0,0,0,0,3,1.0,0.0,0,0,0,0.0,0,0.0,0,0,100.0,0,0.0,0.0,0,0,0,0.0,0,0,0,1,zero,0,119,1707,0,0,1,0,phishing
3,http://www.jp519.com/,21,13,0,2,0,0,0,0,0,0,0,0,0,3,0,1,0,0,0,0,1,0,0,0,1,0.142857,0.230769,0,0,0,0,0,2,0,1,0,0,0,0,2,3,3,3,0,5,5,0,4.0,4.0,0.0,0,0,0,0,0,0,404,0.962871,0.037129,0,0,0,0.133333,0,0.0,0,0,100.0,0,92.307692,7.692308,0,0,0,82.539683,0,0,0,1,one,0,130,1331,0,0,0,0,legitimate
4,https://www.velocidrone.com/,28,19,0,2,0,0,0,0,0,0,0,0,0,3,0,1,0,0,0,0,1,0,0,0,0,0.0,0.0,0,0,0,0,0,2,0,0,0,0,0,0,2,3,3,3,0,11,11,0,7.0,7.0,0.0,0,0,0,0,0,0,57,0.684211,0.315789,0,3,0,0.0,0,0.0,0,1,55.555556,0,50.0,50.0,0,0,0,81.081081,0,0,0,0,zero,0,164,1662,312044,0,0,4,legitimate


## Dataset 10

https://www.kaggle.com/tlhcelik/website-phishing-dataset

## Dataset 11

https://www.kaggle.com/kunal4892/phishingandlegitimateurls

In [84]:
dataset_11 = pd.read_csv('./datasets_for_project/dataset_11.csv')
info_extract(dataset_11, 'label', 1, 0, raw_data=True)

######### FEATURES #########
Index(['domain', 'ranking', 'isIp', 'valid', 'activeDuration', 'urlLen', 'is@',
       'isredirect', 'haveDash', 'domainLen', 'nosOfSubdomain', 'label'],
      dtype='object')


######### Dataset details #########
Contains raw data
Total number of samples = 95910
Malicious Samples = 55914
Benign Samples = 39996


Unnamed: 0,domain,ranking,isIp,valid,activeDuration,urlLen,is@,isredirect,haveDash,domainLen,nosOfSubdomain,label
0,www.voting-yahoo.com,10000000,0,0,0,20,0,0,1,20,2,1
1,www.zvon.org/xxl/WSDL1.1/Output/index.html,194914,0,1,7305,42,0,0,0,12,2,0
2,tecportais.com/file-security-update-infonfmati...,10000000,0,0,0,155,0,0,0,14,1,1
3,bima.astro.umd.edu/nemo/linuxastro/,7001,0,0,0,35,0,0,0,18,3,0
4,huarui-tec.com/js/?us.battle.net/login/en/?ref...,10000000,0,1,730,79,0,0,1,14,1,1


## Dataset 12

https://www.kaggle.com/aktank/url-detection

In [93]:
dataset_12 = pd.read_csv('./datasets_for_project/dataset_12.txt', delimiter=',', header=None)
dataset_12.columns = ['URL', 'Label']
info_extract(dataset_12, 'Label', 1, 0, raw_data=True)

######### FEATURES #########
Index(['URL', 'Label'], dtype='object')


######### Dataset details #########
Contains raw data
Total number of samples = 858
Malicious Samples = 330
Benign Samples = 528


Unnamed: 0,URL,Label
0,http://google.com,0
1,http://facebook.com,0
2,http://youtube.com,0
3,http://yahoo.com,0
4,http://baidu.com,0


## Dataset 13

https://www.kaggle.com/murataltay3504/phishing

In [94]:
dataset_13 = pd.read_csv('./datasets_for_project/dataset_13.csv')
info_extract(dataset_13, 'Result', -1, 1)

######### FEATURES #########
Index(['id', 'having_IP_Address', 'URL_Length', 'Shortining_Service',
       'having_At_Symbol', 'double_slash_redirecting', 'Prefix_Suffix',
       'having_Sub_Domain', 'SSLfinal_State', 'Domain_registeration_length',
       'Favicon', 'port', 'HTTPS_token', 'Request_URL', 'URL_of_Anchor',
       'Links_in_tags', 'SFH', 'Submitting_to_email', 'Abnormal_URL',
       'Redirect', 'on_mouseover', 'RightClick', 'popUpWidnow', 'Iframe',
       'age_of_domain', 'DNSRecord', 'web_traffic', 'Page_Rank',
       'Google_Index', 'Links_pointing_to_page', 'Statistical_report',
       'Result'],
      dtype='object')


######### Dataset details #########
No raw data
Total number of samples = 11055
Malicious Samples = 4898
Benign Samples = 6157


Unnamed: 0,id,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,port,HTTPS_token,Request_URL,URL_of_Anchor,Links_in_tags,SFH,Submitting_to_email,Abnormal_URL,Redirect,on_mouseover,RightClick,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
0,1,-1,1,1,1,-1,-1,-1,-1,-1,1,1,-1,1,-1,1,-1,-1,-1,0,1,1,1,1,-1,-1,-1,-1,1,1,-1,-1
1,2,1,1,1,1,1,-1,0,1,-1,1,1,-1,1,0,-1,-1,1,1,0,1,1,1,1,-1,-1,0,-1,1,1,1,-1
2,3,1,0,1,1,1,-1,-1,-1,-1,1,1,-1,1,0,-1,-1,-1,-1,0,1,1,1,1,1,-1,1,-1,1,0,-1,-1
3,4,1,0,1,1,1,-1,-1,-1,1,1,1,-1,-1,0,0,-1,1,1,0,1,1,1,1,-1,-1,1,-1,1,-1,1,-1
4,5,1,0,-1,1,1,-1,1,1,-1,1,1,1,1,0,0,-1,1,1,0,-1,1,-1,1,-1,-1,0,-1,1,1,1,1


## Dataset 14

https://www.kaggle.com/pkylas/phishing-dataset-from-sep-0124

In [101]:
dataset_14_1 = pd.read_csv('./datasets_for_project/dataset_14/Phish101.csv')
dataset_14_2 = pd.read_csv('./datasets_for_project/dataset_14/Phish102.csv')

print('######### FEATURES #########')
print(dataset_14_1.columns)
print('\n')
print('######### Dataset details #########')
print('Contains raw data')
print(f'Total number of Samples (all malicious) = {len(dataset_14_1) + len(dataset_14_2)}')
dataset_14_1.head()

######### FEATURES #########
Index(['URL_name', 'URL_Status', 'URL_SubmittedBy', 'URL_SubmittedBy_url'], dtype='object')


######### Dataset details #########
Contains raw data
Total number of Samples (all malicious) = 2180


Unnamed: 0,URL_name,URL_Status,URL_SubmittedBy,URL_SubmittedBy_url
0,http://www.omeda.pl/ortopedia/tpl/numt/\nadded...,Offline,GovCERTCH,https://www.phishtank.com/user.php?username=Go...
1,http://griffithphoto.com/dropbox.html\nadded o...,ONLINE,SWITCHCERT,https://www.phishtank.com/user.php?username=SW...
2,http://www.shipchandlerdurban.com/wpbtcblockch...,ONLINE,SWITCHCERT,https://www.phishtank.com/user.php?username=SW...
3,http://jakuboweb.com/dropbox.html\nadded on Se...,Offline,SWITCHCERT,https://www.phishtank.com/user.php?username=SW...
4,https://sumtsel.com/secure/index.php\nadded on...,Offline,SWITCHCERT,https://www.phishtank.com/user.php?username=SW...


## Dataset 15

https://github.com/faizann24/Using-machine-learning-to-detect-malicious-URLs/tree/master/data

In [113]:
dataset_15_1 = pd.read_csv('./datasets_for_project/dataset_15/data.csv')
dataset_15_2 = pd.read_csv('./datasets_for_project/dataset_15/data2.csv', header=None)
dataset_15_2.columns = ['url', 'label']
dataset_15 = pd.concat([dataset_15_1, dataset_15_2], axis=0)

info_extract(dataset_15, 'label', 'bad', 'good', raw_data=True)

######### FEATURES #########
Index(['url', 'label'], dtype='object')


######### Dataset details #########
Contains raw data
Total number of samples = 453340
Malicious Samples = 108519
Benign Samples = 344821


Unnamed: 0,url,label
0,diaryofagameaddict.com,bad
1,espdesign.com.au,bad
2,iamagameaddict.com,bad
3,kalantzis.net,bad
4,slightlyoffcenter.net,bad


## Dataset 16

https://archive.ics.uci.edu/ml/datasets/Website+Phishing

In [114]:
dataset_16_arff = arff.loadarff('./datasets_for_project/dataset_16.arff')
dataset_16 = pd.DataFrame(dataset_16_arff[0])

In [116]:
info_extract(dataset_16, 'Result', b'1', b'0')

######### FEATURES #########
Index(['SFH', 'popUpWidnow', 'SSLfinal_State', 'Request_URL', 'URL_of_Anchor',
       'web_traffic', 'URL_Length', 'age_of_domain', 'having_IP_Address',
       'Result'],
      dtype='object')


######### Dataset details #########
No raw data
Total number of samples = 1353
Malicious Samples = 548
Benign Samples = 103


Unnamed: 0,SFH,popUpWidnow,SSLfinal_State,Request_URL,URL_of_Anchor,web_traffic,URL_Length,age_of_domain,having_IP_Address,Result
0,b'1',b'-1',b'1',b'-1',b'-1',b'1',b'1',b'1',b'0',b'0'
1,b'-1',b'-1',b'-1',b'-1',b'-1',b'0',b'1',b'1',b'1',b'1'
2,b'1',b'-1',b'0',b'0',b'-1',b'0',b'-1',b'1',b'0',b'1'
3,b'1',b'0',b'1',b'-1',b'-1',b'0',b'1',b'1',b'0',b'0'
4,b'-1',b'-1',b'1',b'-1',b'0',b'0',b'-1',b'1',b'0',b'1'


## Dataset 17

https://archive.ics.uci.edu/ml/datasets/phishing+websites

Something in the arff file is making it throw an error when loading, but visually inspecting the data, it's similar to dataset 16 and doesn't seem worth the hassle, but kept it for reference.

In [128]:
# dataset_17_arff = arff.loadarff('./datasets_for_project/dataset_17/dataset_17.arff')
# dataset_17 = pd.DataFrame(dataset_17_arff[0])

## Dataset 18

https://bigml.com/user/jbosca/gallery/dataset/5a815bffeba31d64150002d9#info

In [127]:
dataset_18 = pd.read_csv('./datasets_for_project/dataset_18.csv')
info_extract(dataset_18, 'TIPO', 'Maligna', 'Benigna')

######### FEATURES #########
Index(['URL', 'URL_LENGTH', 'NUMBER_SPECIAL_CHARACTERS', 'CHARSET', 'SERVER',
       'CACHE_CONTROL', 'CONTENT_LENGTH', 'WHOIS_COUNTRY', 'WHOIS_STATEPROV',
       'WHOIS_REGDATE', 'UPDATE_DATE', 'WHITIN_DOMAIN',
       'TCP_CONVERSATION_EXCHANGE', 'DIST_REMOTE_TCP_PORT', 'REMOTE_IPS',
       'APP_BYTES', 'UDP_PACKETS', 'TCP_URG_PACKETS', 'SOURCE_APP_PACKETS',
       'REMOTE_APP_PACKETS', 'SOURCE_APP_BYTES', 'REMOTE_APP_BYTES',
       'APP_PACKETS', 'DNS_QUERY_TIMES', 'TIPO', 'WHOIS_REGDATE.year',
       'WHOIS_REGDATE.month', 'WHOIS_REGDATE.day-of-month',
       'WHOIS_REGDATE.day-of-week', 'WHOIS_REGDATE.hour',
       'WHOIS_REGDATE.minute', 'WHOIS_REGDATE.second',
       'WHOIS_REGDATE.millisecond', 'UPDATE_DATE.year', 'UPDATE_DATE.month',
       'UPDATE_DATE.day-of-month', 'UPDATE_DATE.day-of-week',
       'UPDATE_DATE.hour', 'UPDATE_DATE.minute', 'UPDATE_DATE.second',
       'UPDATE_DATE.millisecond'],
      dtype='object')


######### Dataset details #

Unnamed: 0,URL,URL_LENGTH,NUMBER_SPECIAL_CHARACTERS,CHARSET,SERVER,CACHE_CONTROL,CONTENT_LENGTH,WHOIS_COUNTRY,WHOIS_STATEPROV,WHOIS_REGDATE,UPDATE_DATE,WHITIN_DOMAIN,TCP_CONVERSATION_EXCHANGE,DIST_REMOTE_TCP_PORT,REMOTE_IPS,APP_BYTES,UDP_PACKETS,TCP_URG_PACKETS,SOURCE_APP_PACKETS,REMOTE_APP_PACKETS,SOURCE_APP_BYTES,REMOTE_APP_BYTES,APP_PACKETS,DNS_QUERY_TIMES,TIPO,WHOIS_REGDATE.year,WHOIS_REGDATE.month,WHOIS_REGDATE.day-of-month,WHOIS_REGDATE.day-of-week,WHOIS_REGDATE.hour,WHOIS_REGDATE.minute,WHOIS_REGDATE.second,WHOIS_REGDATE.millisecond,UPDATE_DATE.year,UPDATE_DATE.month,UPDATE_DATE.day-of-month,UPDATE_DATE.day-of-week,UPDATE_DATE.hour,UPDATE_DATE.minute,UPDATE_DATE.second,UPDATE_DATE.millisecond
0,M0_109,16,7,iso-8859-1,nginx,,263.0,,,10/10/2015 18:21,,wp9.ru,7,0,2,700,0,0,9,10,1153,832,9,2.0,Maligna,2015.0,10.0,10.0,6.0,18.0,21.0,0.0,0.0,,,,,,,,
1,B0_2314,16,6,UTF-8,Apache/2.4.10,"max-age=0, must-revalidate, public, s-maxage=8...",15087.0,,,,,siff.bg,17,7,4,1230,0,0,17,19,1265,1230,17,0.0,Benigna,,,,,,,,,,,,,,,,
2,B0_911,16,6,us-ascii,Microsoft-HTTPAPI/2.0,,324.0,,,,,esu.edu,0,0,0,0,0,0,0,0,0,0,0,0.0,Benigna,,,,,,,,,,,,,,,,
3,B0_113,17,6,ISO-8859-1,nginx,,162.0,US,AK,7/10/1997 4:00,12/09/2013 0:45,aste.org,31,22,3,3812,0,0,39,37,18784,4380,39,8.0,Benigna,1997.0,7.0,10.0,4.0,4.0,0.0,0.0,0.0,2013.0,12.0,9.0,1.0,0.0,45.0,0.0,0.0
4,B0_403,17,6,UTF-8,,"max-age=269, must-revalidate",124140.0,US,TX,12/05/1996 0:00,11/04/2017 0:00,kksf.com,57,2,5,4278,0,0,61,62,129889,4586,61,4.0,Benigna,1996.0,12.0,5.0,4.0,0.0,0.0,0.0,0.0,2017.0,11.0,4.0,6.0,0.0,0.0,0.0,0.0


## Dataset 19

https://www.kaggle.com/lsingh4/malicious-and-benign-websites

In [130]:
dataset_19 = pd.read_csv('./datasets_for_project/dataset_19.csv')
info_extract(dataset_19, 'TIPO', 'Maligna', 'Benigna')

######### FEATURES #########
Index(['URL', 'URL_LENGTH', 'NUMBER_SPECIAL_CHARACTERS', 'CHARSET', 'SERVER',
       'CACHE_CONTROL', 'CONTENT_LENGTH', 'WHOIS_COUNTRY', 'WHOIS_STATE_CITY',
       'WHOIS_REG_YEAR', 'UPDATE_YEAR', 'DOMAIN_NAME',
       'TCP_CONVERSATION_EXCHANGE', 'DIST_REMOTE_TCP_PORT', 'REMOTE_IPS',
       'APP_BYTES', 'UDP_PACKETS', 'SOURCE_APP_PACKETS', 'REMOTE_APP_PACKETS',
       'SOURCE_APP_BYTES', 'REMOTE_APP_BYTES', 'APP_PACKETS',
       'DNS_QUERY_TIMES', 'TIPO'],
      dtype='object')


######### Dataset details #########
No raw data
Total number of samples = 1611
Malicious Samples = 158
Benign Samples = 1453


Unnamed: 0,URL,URL_LENGTH,NUMBER_SPECIAL_CHARACTERS,CHARSET,SERVER,CACHE_CONTROL,CONTENT_LENGTH,WHOIS_COUNTRY,WHOIS_STATE_CITY,WHOIS_REG_YEAR,UPDATE_YEAR,DOMAIN_NAME,TCP_CONVERSATION_EXCHANGE,DIST_REMOTE_TCP_PORT,REMOTE_IPS,APP_BYTES,UDP_PACKETS,SOURCE_APP_PACKETS,REMOTE_APP_PACKETS,SOURCE_APP_BYTES,REMOTE_APP_BYTES,APP_PACKETS,DNS_QUERY_TIMES,TIPO
0,B0_113,17,6,ISO-8859-1,nginx,,162,US,California,1997,2013,aste.org,31,22,3,3812,0,39,37,18784,4380,39,8,Benigna
1,B0_403,17,6,UTF-8,Apache/2.2.0 (Fedora),"max-age=269, must-revalidate",124140,US,Texas,1996,2017,kksf.com,57,2,5,4278,0,61,62,129889,4586,61,4,Benigna
2,B0_2064,18,7,UTF-8,nginx,,11260,CN,Shanxi,2017,2017,gl-ks.org,11,6,9,894,0,11,13,838,894,11,0,Benigna
3,B0_462,18,6,iso-8859-1,Apache/2,,345,US,Colorado,2002,2016,wsims.com,12,0,3,1189,0,14,13,8559,1327,14,2,Benigna
4,B0_1128,19,6,us-ascii,Microsoft-HTTPAPI/2.0,,324,US,Florida,1997,2017,hfcuvt.com,0,0,0,0,0,0,0,0,0,0,0,Benigna


## Dataset 20

https://www.kaggle.com/aksingh2411/dataset-of-malicious-and-benign-webpages

**A strong dataset, we have features and can add more, a lot of the websites are live for image extraction, and the raw URLS will help with unsupervised learning methods, to balance that data, it might be useful to add URLS from other datasets. This might be a good starting point.**

### Citations and uses

[1] Singh, A. K., and Navneet Goyal. "MalCrawler: A crawler for seeking and crawling malicious websites." In International Conference on Distributed Computing and Internet Technology, pp. 210-223. Springer, Cham, 2017.

[2] https://developers.google.com/safe-browsing


[3] Singh, A. K., and Navneet Goyal. "A Comparison of Machine Learning Attributes for Detecting Malicious Websites." In 2019 11th International Conference on Communication Systems & Networks (COMSNETS), pp. 352-358. IEEE, 2019.

In [180]:
dataset_20_test = pd.read_csv('./datasets_for_project/dataset_20/\
Webpages_Classification_test_data.csv/Webpages_Classification_test_data.csv')

dataset_20_train = pd.read_csv('./datasets_for_project/dataset_20/\
Webpages_Classification_train_data.csv/Webpages_Classification_train_data.csv')

dataset_20 = pd.concat([dataset_20_train, dataset_20_test], axis=0).drop(columns=['Unnamed: 0'])

In [181]:
dataset_20.head()

Unnamed: 0,url,url_len,ip_add,geo_loc,tld,who_is,https,js_len,js_obf_len,content,label
0,http://members.tripod.com/russiastation/,40,42.77.221.155,Taiwan,com,complete,yes,58.0,0.0,Named themselves charged particles in a manly ...,good
1,http://www.ddj.com/cpp/184403822,32,3.211.202.180,United States,com,complete,yes,52.5,0.0,And filipino field \n \n \n \n \n \n \n \n the...,good
2,http://www.naef-usa.com/,24,24.232.54.41,Argentina,com,complete,yes,103.5,0.0,"Took in cognitivism, whose adherents argue for...",good
3,http://www.ff-b2b.de/,21,147.22.38.45,United States,de,incomplete,no,720.0,532.8,fire cumshot sodomize footaction tortur failed...,bad
4,http://us.imdb.com/title/tt0176269/,35,205.30.239.85,United States,com,complete,yes,46.5,0.0,"Levant, also monsignor georges. In 1800, lists...",good


In [182]:
info_extract(dataset_20, 'label', 'bad', 'good', raw_data=True)

######### FEATURES #########
Index(['url', 'url_len', 'ip_add', 'geo_loc', 'tld', 'who_is', 'https',
       'js_len', 'js_obf_len', 'content', 'label'],
      dtype='object')


######### Dataset details #########
Contains raw data
Total number of samples = 1561934
Malicious Samples = 35315
Benign Samples = 1526619


Unnamed: 0,url,url_len,ip_add,geo_loc,tld,who_is,https,js_len,js_obf_len,content,label
0,http://members.tripod.com/russiastation/,40,42.77.221.155,Taiwan,com,complete,yes,58.0,0.0,Named themselves charged particles in a manly ...,good
1,http://www.ddj.com/cpp/184403822,32,3.211.202.180,United States,com,complete,yes,52.5,0.0,And filipino field \n \n \n \n \n \n \n \n the...,good
2,http://www.naef-usa.com/,24,24.232.54.41,Argentina,com,complete,yes,103.5,0.0,"Took in cognitivism, whose adherents argue for...",good
3,http://www.ff-b2b.de/,21,147.22.38.45,United States,de,incomplete,no,720.0,532.8,fire cumshot sodomize footaction tortur failed...,bad
4,http://us.imdb.com/title/tt0176269/,35,205.30.239.85,United States,com,complete,yes,46.5,0.0,"Levant, also monsignor georges. In 1800, lists...",good


## Dataset 21

https://www.kaggle.com/deepak730/finding-malicious-url-through-url-features?select=url_features.csv

In [142]:
dataset_21 = pd.read_csv('./datasets_for_project/dataset_21/url_features.csv')

info_extract(dataset_21, 'malicious', 1, 0, raw_data=True)

######### FEATURES #########
Index(['token_count', 'rank_host', 'rank_country', 'ASNno', 'sec_sen_word_cnt',
       'avg_token_length', 'No_of_dots', 'malicious', 'Length_of_url',
       'avg_path_token', 'IPaddress_presence', 'Length_of_host',
       'safebrowsing', 'URL', 'host', 'avg_domain_token_length',
       'path_token_count', 'path', 'largest_domain', 'domain_token_count',
       'largest_path', 'largest_token'],
      dtype='object')


######### Dataset details #########
Contains raw data
Total number of samples = 830
Malicious Samples = 49
Benign Samples = 528


Unnamed: 0,token_count,rank_host,rank_country,ASNno,sec_sen_word_cnt,avg_token_length,No_of_dots,malicious,Length_of_url,avg_path_token,IPaddress_presence,Length_of_host,safebrowsing,URL,host,avg_domain_token_length,path_token_count,path,largest_domain,domain_token_count,largest_path,largest_token
0,3,1,1,15169,0,4.333333,1,0,17,0.0,0,10,0,http://google.com,google.com,4.5,0,,6,2,0,6
1,3,2,2,32934,0,5.0,1,0,19,0.0,0,12,0,http://facebook.com,facebook.com,5.5,0,,8,2,0,8
2,3,3,3,15169,0,4.666667,1,0,18,0.0,0,11,0,http://youtube.com,youtube.com,5.0,0,,7,2,0,7
3,3,4,4,36647,0,4.0,1,0,16,0.0,0,9,0,http://yahoo.com,yahoo.com,4.0,0,,5,2,0,5
4,3,6,1,23724,0,4.0,1,0,16,0.0,0,9,0,http://baidu.com,baidu.com,4.0,0,,5,2,0,5


## Dataset 22

https://www.kaggle.com/deepsworld/malicous-and-benign-websites

In [145]:
dataset_22 = pd.read_csv('./datasets_for_project/dataset_22.csv')

info_extract(dataset_22, 'label', 1, 0, raw_data=True)

######### FEATURES #########
Index(['url', 'label'], dtype='object')


######### Dataset details #########
Contains raw data
Total number of samples = 3655314
Malicious Samples = 1213089
Benign Samples = 2442225


Unnamed: 0,url,label
0,https://www.drive.google.com,0
1,www.gmail.google.com,0
2,http://facebook.com,0
3,https://yahoo.com,0
4,001web.net,1


## Dataset 23

https://www.kaggle.com/sharecodeweb/url-malicious-lstm

In [185]:
from glob import glob as globlin

dataframes = []

dataset_23_paths = globlin('./datasets_for_project/dataset_23/*.csv')
for path in dataset_23_paths:
    dataframes.append(pd.read_csv(path).drop(columns=['Unnamed: 0']))

dataset_23 = pd.concat(dataframes, axis=0, sort=True)

In [186]:
info_extract(dataset_23, 'label', 'bad', 'good', raw_data=True)

######### FEATURES #########
Index(['label', 'url'], dtype='object')


######### Dataset details #########
Contains raw data
Total number of samples = 391000
Malicious Samples = 195500
Benign Samples = 195500


Unnamed: 0,label,url
0,bad,kiwipainting.co.nz/quotegenerator/quoteimages/...
1,good,azstuntspecialties-larkinracing.com/az_stunts-...
2,good,archiver.rootsweb.ancestry.com/th/read/METIS/2...
3,bad,xyleo.co.uk/new/txzx
4,bad,eshopsstar.com/lookingforpcaidonline/


## Dataset 24

https://github.com/ebubekirbbr/pdd/tree/master/input

In [183]:
dataset_24_legit = pd.read_json('./datasets_for_project/dataset_24/data_legitimate_36400.json')
dataset_24_legit.columns = ['URL']
dataset_24_legit['Label'] = 0

dataset_24_phishing = pd.read_json('./datasets_for_project/dataset_24/data_phishing_37175.json')
dataset_24_phishing.columns = ['URL']
dataset_24_phishing['Label'] = 1

dataset_24 = pd.concat([dataset_24_legit, dataset_24_phishing], axis=0)

In [184]:
info_extract(dataset_24, 'Label', 1, 0, raw_data=True)

######### FEATURES #########
Index(['URL', 'Label'], dtype='object')


######### Dataset details #########
Contains raw data
Total number of samples = 73575
Malicious Samples = 37175
Benign Samples = 36400


Unnamed: 0,URL,Label
0,http://www.conceptdraw.com/How-To-Guide/Local-...,0
1,https://www.edrawsoft.com/Local-Area-Network.php,0
2,http://www.webopedia.com/TERM/L/local_area_net...,0
3,https://www.acsac.org/secshelf/book001/16.pdf,0
4,http://www.diffen.com/difference/LAN_vs_WAN,0


## Dataset 25

https://web.cs.hacettepe.edu.tr/~selman/phish-iris-dataset/

Paper link

https://arxiv.org/pdf/1905.07767.pdf