In [94]:
import numpy as np
import pandas as pd
import json

In [95]:
import tldextract

### Reading legitimate url dataset

In [96]:
# Legitimate URLs; read_json labels the single column 0.
lg = pd.read_json(path_or_buf='data_legitimate_36400.json')
lg.head(n=5)

Unnamed: 0,0
0,http://www.conceptdraw.com/How-To-Guide/Local-...
1,https://www.edrawsoft.com/Local-Area-Network.php
2,http://www.webopedia.com/TERM/L/local_area_net...
3,https://www.acsac.org/secshelf/book001/16.pdf
4,http://www.diffen.com/difference/LAN_vs_WAN


### Reading Phishing url dataset

In [97]:
# Phishing URLs; read_json labels the single column 0.
ph = pd.read_json(path_or_buf='data_phishing_37175.json')
ph.head(n=5)

Unnamed: 0,0
0,apple-iforget.com
1,safe-id-login.com
2,manage.netflix.com.usermanagement.key.1973573....
3,manage.netflix.com.usermanagement.key.1973574....
4,helpplusinfo01.com


### Changing column name

In [98]:
# Give the URL column (labelled 0 after loading) a descriptive name.
lg = lg.rename(columns={0: 'URLs'})
print(len(lg))
lg.head()

36400


Unnamed: 0,URLs
0,http://www.conceptdraw.com/How-To-Guide/Local-...
1,https://www.edrawsoft.com/Local-Area-Network.php
2,http://www.webopedia.com/TERM/L/local_area_net...
3,https://www.acsac.org/secshelf/book001/16.pdf
4,http://www.diffen.com/difference/LAN_vs_WAN


In [99]:
# Same renaming for the phishing frame so both frames share the 'URLs' column.
ph = ph.rename(columns={0: 'URLs'})
ph.head()

Unnamed: 0,URLs
0,apple-iforget.com
1,safe-id-login.com
2,manage.netflix.com.usermanagement.key.1973573....
3,manage.netflix.com.usermanagement.key.1973574....
4,helpplusinfo01.com


### Adding a column 'isPhishing' to store the Phishing Result

#### isPhishing = 0 for legitimate websites
#### isPhishing = 1 for phishing websites


In [100]:
# Label every legitimate URL with 0, order by URL and keep the first
# occurrence of each distinct URL.
lg = (
    lg.assign(isPhishing=0)
      .sort_values('URLs')
      .drop_duplicates(subset='URLs')
)
lg.head()


Unnamed: 0,URLs,isPhishing
21891,http://0cu203.bankruptcy-law-help.com/15593752...,0
21795,http://1-drivers.org/videocards/diamond_multim...,0
5135,http://1000islandswritersfestival.ca/wp-conten...,0
603,http://1000projects.org/fundamentals-of-passiv...,0
7998,http://1000projects.org/technical-seminar-topi...,0


In [101]:
# Label every phishing URL with 1, order by URL and keep the first
# occurrence of each distinct URL.
ph = (
    ph.assign(isPhishing=1)
      .sort_values('URLs')
      .drop_duplicates(subset='URLs')
)
ph.head()

Unnamed: 0,URLs,isPhishing
111,13bmosecure.com,1
155,34zy2.sljtm.com,1
108,accountverification.online,1
50,apaypal.co/Secure/PP/Paypal/,1
33,app-findmyiphone.com,1


### Merging the 2 dataframes into a single dataframe for pre-processing

In [102]:
# Stack phishing rows on top of legitimate rows. The original row labels of
# each frame are kept, so labels are not unique in the combined frame.
df = pd.concat([ph, lg], axis=0)

print(len(df.index))
df.head(n=10)

54763


Unnamed: 0,URLs,isPhishing
111,13bmosecure.com,1
155,34zy2.sljtm.com,1
108,accountverification.online,1
50,apaypal.co/Secure/PP/Paypal/,1
33,app-findmyiphone.com,1
142,apple-accounts.me/clients/home/,1
39,apple-icloudcid.com,1
54,apple-id-support.info,1
20359,apple-iforget.com,1
166,apple-inc-help.com,1


## Feature Extraction

### Calculating raw length of url, maximum, minimum, average and standard deviation of the length of words

In [103]:
import re, statistics
def rawlength(s):
    """Return the total number of characters in the URL string ``s``."""
    n_chars = len(s)
    return n_chars

def _word_lengths(s):
    """Return the lengths of the non-empty words of the URL string ``s``.

    Words are the pieces between the delimiters '.', ' ', '-', '/' and '_'.
    The hyphen is escaped on purpose: written unescaped between ' ' and '/'
    it forms a character range that also splits on characters such as
    '&', '%', '+' and '#'.
    """
    return [len(word) for word in re.split(r'[. \-/_]', s) if word]


def lenMaxWord(s):
    """Return the length of the longest word in ``s`` (0 if there is none)."""
    return max(_word_lengths(s), default=0)


def lenAvgWord(s):
    """Return the rounded mean word length of ``s`` (0 if there is no word)."""
    lengths = _word_lengths(s)
    if not lengths:
        # Nothing but delimiters (or an empty string): avoid dividing by zero.
        return 0
    return round(sum(lengths) / len(lengths))


def lenMinWord(s):
    """Return the length of the shortest word in ``s`` (0 if there is none)."""
    return min(_word_lengths(s), default=0)


def SDlengthWord(s):
    """Return the sample standard deviation of the word lengths of ``s``.

    Returns 0.0 when ``s`` has fewer than two words, because
    ``statistics.stdev`` needs at least two data points.
    """
    lengths = _word_lengths(s)
    if len(lengths) < 2:
        return 0.0
    return statistics.stdev(lengths)


# Add the length-based features; the dict order fixes the column order.
df = df.assign(**{
    "RawLength": df["URLs"].apply(rawlength),
    "largestWord": df["URLs"].apply(lenMaxWord),
    "avgWord": df["URLs"].apply(lenAvgWord),
    "smallestWord": df["URLs"].apply(lenMinWord),
    "SDlength": df["URLs"].apply(SDlengthWord),
})
df.head(n=10)

Unnamed: 0,URLs,isPhishing,RawLength,largestWord,avgWord,smallestWord,SDlength
111,13bmosecure.com,1,15,11,7,3,5.656854
155,34zy2.sljtm.com,1,15,5,4,3,1.154701
108,accountverification.online,1,26,19,12,6,9.192388
50,apaypal.co/Secure/PP/Paypal/,1,28,7,5,2,2.408319
33,app-findmyiphone.com,1,20,12,6,3,5.196152
142,apple-accounts.me/clients/home/,1,31,8,5,2,2.387467
39,apple-icloudcid.com,1,19,9,6,3,3.05505
54,apple-id-support.info,1,21,7,4,2,2.081666
20359,apple-iforget.com,1,17,7,5,3,2.0
166,apple-inc-help.com,1,18,5,4,3,0.957427


### Check if url starts with HTTPS, contains www and contains .com

In [104]:
def isHTTPS(s):
    """Return 1 if the URL string ``s`` uses the https scheme, else 0.

    The full ``https://`` prefix is required so that scheme-less hosts that
    merely begin with those letters (e.g. ``httpsecure.com``) are not
    counted. The comparison ignores case.
    """
    return 1 if s.lower().startswith('https://') else 0

def hasWWW(s):
    """Return 1 if the substring 'www' occurs anywhere in ``s``, else 0."""
    return int('www' in s)

def hasCOM(s):
    """Return 1 if '.com' occurs anywhere in ``s`` (not only at the end), else 0."""
    found = '.com' in s
    return 1 if found else 0
    
# Add the three binary prefix/substring flags in a fixed column order.
df = df.assign(**{
    "HTTPS": df["URLs"].apply(isHTTPS),
    "WWW": df["URLs"].apply(hasWWW),
    ".COM": df["URLs"].apply(hasCOM),
})

#### Checking number of subdomains, length of domain, length of subdomain, count of number of digits in domain and subdomain

In [105]:
def numOfSubdom(site):
    """Count the dot-separated host labels of ``site``, excluding the last
    label and a 'www' label if one is present.

    The host is what remains after dropping everything up to the first '//'
    and everything from the first '/' that follows.
    """
    # Drop the scheme part when a '//' separator exists.
    _, scheme_sep, after_scheme = site.partition("//")
    if scheme_sep:
        site = after_scheme
    # Keep only what precedes the first slash (the path is discarded).
    host = site.partition("/")[0]
    labels = host.split(".")
    # one label is the domain extension, e.g. com, edu, in
    count = len(labels) - 1
    if 'www' in labels:
        count -= 1
    return count

def lendomain(site):
    """Return the length of the ``domain`` part that tldextract reports for ``site``."""
    return len(tldextract.extract(site).domain)

def lenSubdomain(site):
    """Return the length of the ``subdomain`` part that tldextract reports for ``site``."""
    parts = tldextract.extract(site)
    return len(parts.subdomain)

def DigDomain(site):
    """Return how many digit characters the ``domain`` part of ``site`` contains."""
    name = tldextract.extract(site).domain
    return sum(1 for ch in name if ch.isdigit())

def DigSubDomain(site):
    """Return how many digit characters the ``subdomain`` part of ``site`` contains."""
    prefix = tldextract.extract(site).subdomain
    digits = [ch for ch in prefix if ch.isdigit()]
    return len(digits)

# Add the host-structure features; the dict order fixes the column order.
df = df.assign(**{
    "subdomainLen": df["URLs"].apply(lenSubdomain),
    "domainLen": df["URLs"].apply(lendomain),
    "countSubDomain": df["URLs"].apply(numOfSubdom),
    "countDigitDom": df["URLs"].apply(DigDomain),
    "countDigitSub": df["URLs"].apply(DigSubDomain),
})
df.head(n=5)

Unnamed: 0,URLs,isPhishing,RawLength,largestWord,avgWord,smallestWord,SDlength,HTTPS,WWW,.COM,subdomainLen,domainLen,countSubDomain,countDigitDom,countDigitSub
111,13bmosecure.com,1,15,11,7,3,5.656854,0,0,1,0,11,1,2,0
155,34zy2.sljtm.com,1,15,5,4,3,1.154701,0,0,1,5,5,2,0,3
108,accountverification.online,1,26,19,12,6,9.192388,0,0,0,0,19,1,0,0
50,apaypal.co/Secure/PP/Paypal/,1,28,7,5,2,2.408319,0,0,0,0,7,1,0,0
33,app-findmyiphone.com,1,20,12,6,3,5.196152,0,0,1,0,16,1,0,0


#### Load Brand Names

In [106]:
# Read one brand name per line. The context manager closes the file handle,
# which was previously left open.
with open("BrandName.txt", "r") as brand:
    brandname = brand.read()
# Drop blank lines and surrounding whitespace (including a stray '\r'): an
# empty entry, e.g. from a trailing newline, is a substring of every URL and
# would make every substring test against the list succeed.
brands = [line.strip() for line in brandname.split('\n') if line.strip()]

#### Check if a brandname is in URL and find count of brandnames

In [107]:
def checkBrandName(s):
    """Return 1 if any entry of the module-level ``brands`` list occurs in
    ``s`` (case-insensitive substring match), else 0."""
    return int(any(name.lower() in s.lower() for name in brands))

def countBrandName(s):
    """Return how many entries of the module-level ``brands`` list occur in
    ``s`` (case-insensitive substring match; each entry counts at most once)."""
    return sum(1 for name in brands if name.lower() in s.lower())

# Add the brand presence flag and the brand hit count.
df = df.assign(
    brandName=df["URLs"].apply(checkBrandName),
    countBrand=df["URLs"].apply(countBrandName),
)
df.head(n=5)

Unnamed: 0,URLs,isPhishing,RawLength,largestWord,avgWord,smallestWord,SDlength,HTTPS,WWW,.COM,subdomainLen,domainLen,countSubDomain,countDigitDom,countDigitSub,brandName,countBrand
111,13bmosecure.com,1,15,11,7,3,5.656854,0,0,1,0,11,1,2,0,0,0
155,34zy2.sljtm.com,1,15,5,4,3,1.154701,0,0,1,5,5,2,0,3,0,0
108,accountverification.online,1,26,19,12,6,9.192388,0,0,0,0,19,1,0,0,0,0
50,apaypal.co/Secure/PP/Paypal/,1,28,7,5,2,2.408319,0,0,0,0,7,1,0,0,1,1
33,app-findmyiphone.com,1,20,12,6,3,5.196152,0,0,1,0,16,1,0,0,1,1


#### Loading keywords

In [108]:
# Read one keyword per line. The context manager closes the file handle,
# which was previously left open.
with open("keywords.txt", "r") as key:
    keys = key.read()
# Drop blank lines and surrounding whitespace (including a stray '\r'): an
# empty entry, e.g. from a trailing newline, is a substring of every URL and
# would add one to every keyword count.
keywords = [line.strip() for line in keys.split('\n') if line.strip()]

#### Count of keywords

In [109]:
def countKeyword(s):
    """Return how many entries of the module-level ``keywords`` list occur in
    ``s`` (case-insensitive substring match; each entry counts at most once)."""
    hits = [word for word in keywords if word.lower() in s.lower()]
    return len(hits)

# Add the number of keyword hits per URL.
df = df.assign(keywordCount=df["URLs"].apply(countKeyword))
df.head(n=10)

Unnamed: 0,URLs,isPhishing,RawLength,largestWord,avgWord,smallestWord,SDlength,HTTPS,WWW,.COM,subdomainLen,domainLen,countSubDomain,countDigitDom,countDigitSub,brandName,countBrand,keywordCount
111,13bmosecure.com,1,15,11,7,3,5.656854,0,0,1,0,11,1,2,0,0,0,2
155,34zy2.sljtm.com,1,15,5,4,3,1.154701,0,0,1,5,5,2,0,3,0,0,1
108,accountverification.online,1,26,19,12,6,9.192388,0,0,0,0,19,1,0,0,0,0,3
50,apaypal.co/Secure/PP/Paypal/,1,28,7,5,2,2.408319,0,0,0,0,7,1,0,0,1,1,1
33,app-findmyiphone.com,1,20,12,6,3,5.196152,0,0,1,0,16,1,0,0,1,1,3
142,apple-accounts.me/clients/home/,1,31,8,5,2,2.387467,0,0,0,0,14,1,0,0,1,1,2
39,apple-icloudcid.com,1,19,9,6,3,3.05505,0,0,1,0,15,1,0,0,1,2,1
54,apple-id-support.info,1,21,7,4,2,2.081666,0,0,0,0,16,1,0,0,1,1,2
20359,apple-iforget.com,1,17,7,5,3,2.0,0,0,1,0,13,1,0,0,1,1,2
166,apple-inc-help.com,1,18,5,4,3,0.957427,0,0,1,0,14,1,0,0,1,1,4


#### Check for Top Level Domain

In [110]:
def checktld(site):
    """Return 1 if the host of ``site`` ends in one of a fixed set of
    top-level domains (com, org, net, gov, in), else 0.

    The host is what follows the first '//' (if any) up to the first '/',
    '?' or '#'; userinfo ('user@') and a ':port' suffix are ignored. The
    comparison ignores case. An empty host yields 0 instead of raising.
    """
    common_tlds = ("com", "org", "net", "gov", "in")
    # Drop the scheme part when a '//' separator exists.
    _, scheme_sep, after_scheme = site.partition("//")
    if scheme_sep:
        site = after_scheme
    # Keep only the authority: cut at the start of the path, query or fragment.
    authority = re.split(r"[/?#]", site, maxsplit=1)[0]
    # Strip optional userinfo and port so only the host name remains.
    host = authority.rpartition("@")[2].partition(":")[0]
    # The TLD is the text after the last dot (the whole host if there is none).
    tld = host.rsplit(".", 1)[-1].lower()
    return 1 if tld in common_tlds else 0

# Add the common-TLD flag.
df = df.assign(tld=df["URLs"].apply(checktld))

df.head(n=10)

Unnamed: 0,URLs,isPhishing,RawLength,largestWord,avgWord,smallestWord,SDlength,HTTPS,WWW,.COM,subdomainLen,domainLen,countSubDomain,countDigitDom,countDigitSub,brandName,countBrand,keywordCount,tld
111,13bmosecure.com,1,15,11,7,3,5.656854,0,0,1,0,11,1,2,0,0,0,2,1
155,34zy2.sljtm.com,1,15,5,4,3,1.154701,0,0,1,5,5,2,0,3,0,0,1,1
108,accountverification.online,1,26,19,12,6,9.192388,0,0,0,0,19,1,0,0,0,0,3,0
50,apaypal.co/Secure/PP/Paypal/,1,28,7,5,2,2.408319,0,0,0,0,7,1,0,0,1,1,1,0
33,app-findmyiphone.com,1,20,12,6,3,5.196152,0,0,1,0,16,1,0,0,1,1,3,1
142,apple-accounts.me/clients/home/,1,31,8,5,2,2.387467,0,0,0,0,14,1,0,0,1,1,2,0
39,apple-icloudcid.com,1,19,9,6,3,3.05505,0,0,1,0,15,1,0,0,1,2,1,1
54,apple-id-support.info,1,21,7,4,2,2.081666,0,0,0,0,16,1,0,0,1,1,2,0
20359,apple-iforget.com,1,17,7,5,3,2.0,0,0,1,0,13,1,0,0,1,1,2,1
166,apple-inc-help.com,1,18,5,4,3,0.957427,0,0,1,0,14,1,0,0,1,1,4,1


#### Read Alexa top 1 million sites


In [111]:
# Read one entry per line. The context manager closes the file handle,
# which was previously left open.
with open("Alexa_top_1m.txt", "r") as alexa:
    alexatop = alexa.read()
# Drop blank lines and surrounding whitespace (including a stray '\r') so
# that entries compare cleanly against URL strings.
alexatop1m = [line.strip() for line in alexatop.split('\n') if line.strip()]

#### Check if url is in Alexa top 1 million sites

In [112]:
def isinAlexa(s):
    """Return 1 if ``s`` is exactly equal to an entry of the module-level
    ``alexatop1m`` collection, else 0.

    The leftover debug ``print`` that fired on every match has been removed.
    """
    # NOTE(review): the whole URL string is compared, so a URL with a scheme
    # or path can only match if the list stores entries in that same form -
    # confirm against the format of Alexa_top_1m.txt.
    return 1 if s in alexatop1m else 0
# Exact-match membership against the Alexa entries. Building a set once turns
# each lookup into a hash probe; testing every URL against the list directly
# scans the whole list once per row.
df["Alexa_top_1m"] = df["URLs"].isin(set(alexatop1m)).astype("int64")
df.head(10)

Unnamed: 0,URLs,isPhishing,RawLength,largestWord,avgWord,smallestWord,SDlength,HTTPS,WWW,.COM,subdomainLen,domainLen,countSubDomain,countDigitDom,countDigitSub,brandName,countBrand,keywordCount,tld,Alexa_top_1m
111,13bmosecure.com,1,15,11,7,3,5.656854,0,0,1,0,11,1,2,0,0,0,2,1,0
155,34zy2.sljtm.com,1,15,5,4,3,1.154701,0,0,1,5,5,2,0,3,0,0,1,1,0
108,accountverification.online,1,26,19,12,6,9.192388,0,0,0,0,19,1,0,0,0,0,3,0,0
50,apaypal.co/Secure/PP/Paypal/,1,28,7,5,2,2.408319,0,0,0,0,7,1,0,0,1,1,1,0,0
33,app-findmyiphone.com,1,20,12,6,3,5.196152,0,0,1,0,16,1,0,0,1,1,3,1,0
142,apple-accounts.me/clients/home/,1,31,8,5,2,2.387467,0,0,0,0,14,1,0,0,1,1,2,0,0
39,apple-icloudcid.com,1,19,9,6,3,3.05505,0,0,1,0,15,1,0,0,1,2,1,1,0
54,apple-id-support.info,1,21,7,4,2,2.081666,0,0,0,0,16,1,0,0,1,1,2,0,0
20359,apple-iforget.com,1,17,7,5,3,2.0,0,0,1,0,13,1,0,0,1,1,2,1,0
166,apple-inc-help.com,1,18,5,4,3,0.957427,0,0,1,0,14,1,0,0,1,1,4,1,0


#### Checking for special characters in the urls

In [113]:
def dash(s):
    """Return 1 if ``s`` contains a '-' character, else 0."""
    return int('-' in s)

def dot(s):
    """Return 1 if ``s`` contains a '.' character, else 0."""
    return 1 if '.' in s else 0

def slash(s):
    """Return 1 if ``s`` contains a '/' character, else 0."""
    has_slash = '/' in s
    return int(has_slash)

def at(s):
    """Return 1 if ``s`` contains an '@' character, else 0."""
    return int('@' in s)

def question(s):
    """Return 1 if ``s`` contains a '?' character, else 0."""
    return 1 if '?' in s else 0

def ampersand(s):
    """Return 1 if ``s`` contains an '&' character, else 0."""
    has_ampersand = '&' in s
    return 1 if has_ampersand else 0

def equal(s):
    """Return 1 if ``s`` contains an '=' character, else 0."""
    return int('=' in s)

def underscore(s):
    """Return 1 if ``s`` contains a '_' character, else 0.

    The check previously looked for '=' (a copy-paste slip), which made this
    feature a duplicate of ``equal``.
    """
    return 1 if '_' in s else 0

# Add one binary flag per special character; the dict order fixes the
# column order.
df = df.assign(**{
    "underscore": df["URLs"].apply(underscore),
    "equal": df["URLs"].apply(equal),
    "ampersand": df["URLs"].apply(ampersand),
    "question": df["URLs"].apply(question),
    "at": df["URLs"].apply(at),
    "dash": df["URLs"].apply(dash),
    "dot": df["URLs"].apply(dot),
    "slash": df["URLs"].apply(slash),
})
df.head(n=10)

Unnamed: 0,URLs,isPhishing,RawLength,largestWord,avgWord,smallestWord,SDlength,HTTPS,WWW,.COM,...,tld,Alexa_top_1m,underscore,equal,ampersand,question,at,dash,dot,slash
111,13bmosecure.com,1,15,11,7,3,5.656854,0,0,1,...,1,0,0,0,0,0,0,0,1,0
155,34zy2.sljtm.com,1,15,5,4,3,1.154701,0,0,1,...,1,0,0,0,0,0,0,0,1,0
108,accountverification.online,1,26,19,12,6,9.192388,0,0,0,...,0,0,0,0,0,0,0,0,1,0
50,apaypal.co/Secure/PP/Paypal/,1,28,7,5,2,2.408319,0,0,0,...,0,0,0,0,0,0,0,0,1,1
33,app-findmyiphone.com,1,20,12,6,3,5.196152,0,0,1,...,1,0,0,0,0,0,0,1,1,0
142,apple-accounts.me/clients/home/,1,31,8,5,2,2.387467,0,0,0,...,0,0,0,0,0,0,0,1,1,1
39,apple-icloudcid.com,1,19,9,6,3,3.05505,0,0,1,...,1,0,0,0,0,0,0,1,1,0
54,apple-id-support.info,1,21,7,4,2,2.081666,0,0,0,...,0,0,0,0,0,0,0,1,1,0
20359,apple-iforget.com,1,17,7,5,3,2.0,0,0,1,...,1,0,0,0,0,0,0,1,1,0
166,apple-inc-help.com,1,18,5,4,3,0.957427,0,0,1,...,1,0,0,0,0,0,0,1,1,0


In [115]:
# Persist the feature table; the row labels are not written.
df.to_csv(path_or_buf='cleaned_data.csv', index=False)