# **Import needed libraries**

In [39]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import re
import requests
import socket
from urllib.parse import urlparse

# **Import Dataset**

In [22]:
df = pd.read_csv(filepath_or_buffer='phishing_site_urls.csv', header=0)
len(df)

549346

Data preparation
- Get Top-Level-Domain
- Get IP-Address
- Get External Score
- Get Length of URL
- Get Amount of .
- Get Amount of /
- Get if a IP-Address is included in the URL
- Get Amount of Special Characters

Unshorten URLs

new_column = []    
for values in df['URL']:
    if not re.match(r'^((http|https)://)', values):
        new_column.append("http://"+values)
    else:
        new_column.append(values)

df['http_domain'] = new_column

**Delete all rows that contain IP-addresses**

In [23]:
ip_regex = re.compile(r'\b(?:\d{1,3}\.){3}\d{1,3}\b')


ip_rows = df['URL'].str.match(ip_regex)


df = df[~ip_rows]

**Get Domain of every URL**

In [24]:
new_column = []    
for values in df['URL']:
    new_column.append(re.search(r'^((http|https)://)?[^(/|\s|:]*', values).group())

df['domain'] = new_column

**Get TLD of every URL -> if not possible to allocate delete the row**

In [25]:
new_column = []    
i = 1
for values in df['domain']:
    try:
        new_column.append(re.search(r'(?<=\.)[a-z]+(?=\/|$)', values).group())
    except:
        new_column.append("n/a")

df['tld'] = new_column

In [26]:
#Delete rows without a TLD
na_rows = df.loc[df['tld'] == 'n/a']
df = df.drop(na_rows.index)

**Get length of URL**

In [36]:
new_column = []    
for values in df['URL']:
    new_column.append(len(values))

df['length'] = new_column

**Get Amount of '.'**

In [27]:
new_column = []    
for values in df['URL']:
    new_column.append(values.count("."))

df['amount_dots'] = new_column

**Get Amount of '/'**

In [28]:
new_column = []    
for values in df['URL']:
    new_column.append(values.count("/"))

df['amount_slash'] = new_column

**Get Amount of '#'**

In [29]:
new_column = []    
for values in df['URL']:
    new_column.append(values.count("="))

df['amount_same'] = new_column

**Get amount of '@'**

In [30]:
new_column = []    
for values in df['URL']:
    new_column.append(values.count("@"))

df['amount_at'] = new_column

**Get amount of '-'**

In [31]:
new_column = []    
for values in df['URL']:
    new_column.append(values.count("-"))

df['amount_hypthon'] = new_column

**Get amount of subdomains**

In [32]:
new_column = []    
for values in df['domain']:
    subdomains = values.split(".")[:-2] 
    new_column.append(len(subdomains))

df['amount_subdomains'] = new_column

**Get amount of directories**

In [40]:
new_column = []    
for values in df['URL']:
    new_column.append(len(urlparse(values).path.split('/')) - 1)

df['amount_directories'] = new_column


Redirect

In [54]:
new_column = []    
for values in df['URL']:
    response = requests.get(values, allow_redirects=False)

    if response.status_code >= 300 and response.status_code < 400:
        new_column.append("1")
    else:
        new_column.append("0")

df['redirect_flag'] = new_column

MissingSchema: Invalid URL 'nobell.it/70ffb52d079109dca5664cce6f317373782/login.SkyPe.com/en/cgi-bin/verification/login/70ffb52d079109dca5664cce6f317373/index.php?cmd=_profile-ach&outdated_page_tmpl=p/gen/failed-to-load&nav=0.5.1&login_access=1322408526': No scheme supplied. Perhaps you meant http://nobell.it/70ffb52d079109dca5664cce6f317373782/login.SkyPe.com/en/cgi-bin/verification/login/70ffb52d079109dca5664cce6f317373/index.php?cmd=_profile-ach&outdated_page_tmpl=p/gen/failed-to-load&nav=0.5.1&login_access=1322408526?

In [41]:
df.head(15)

Unnamed: 0,URL,Label,domain,tld,amount_dots,amount_slash,amount_same,amount_at,amount_hypthon,amount_subdomains,length,amount_directories
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad,nobell.it,it,225,10,4,0,4,0,225,8
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad,www.dghjdgf.com,com,81,4,2,0,2,1,81,4
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad,serviciosbys.com,com,177,11,0,0,1,0,177,11
3,mail.printakid.com/www.online.americanexpress....,bad,mail.printakid.com,com,60,2,0,0,0,1,60,2
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad,thewhiskeydregs.com,com,116,10,0,0,1,0,116,7
5,smilesvoegol.servebbs.org/voegol.php,bad,smilesvoegol.servebbs.org,org,36,1,0,0,0,1,36,1
6,premierpaymentprocessing.com/includes/boleto-2...,bad,premierpaymentprocessing.com,com,61,2,0,0,3,0,61,2
7,myxxxcollection.com/v1/js/jih321/bpd.com.do/do...,bad,myxxxcollection.com,com,60,6,0,0,0,0,60,6
8,super1000.info/docs,bad,super1000.info,info,19,1,0,0,0,0,19,1
9,horizonsgallery.com/js/bin/ssl1/_id/www.paypal...,bad,horizonsgallery.com,com,193,10,3,0,3,0,193,10
