# 1. Imports

In [37]:
import numpy as np
import pandas as pd
import os
import math
import whois
import re
from tld import get_tld
from urllib.parse import urlparse
from datetime import datetime, timezone
from requests import get
from pyquery import PyQuery
from sklearn.preprocessing import LabelEncoder


# 2. Data Loading

In [54]:
parent_dir = os.path.dirname(os.getcwd())
final_path = os.path.join(parent_dir, 'dataset', 'malicious_phish.csv')
df = pd.read_csv(final_path)
print(df.shape)
print(df.head())
print(df.type.value_counts())

(651191, 2)
                                                 url        type
0                                   br-icloud.com.br    phishing
1                mp3raid.com/music/krizz_kaliko.html      benign
2                    bopsecrets.org/rexroth/cr/1.htm      benign
3  http://www.garage-pirenne.be/index.php?option=...  defacement
4  http://adventure-nicaragua.net/index.php?optio...  defacement
benign        428103
defacement     96457
phishing       94111
malware        32520
Name: type, dtype: int64


# 3. Obtain Features

In [14]:
'''
The feature extraction code references the article "Predicting the Maliciousness of URLs"
written by Ruth Eneyi Ikwu.
'''
class URL_Extractor:
    '''
    Extracts useful features from a single URL. Three types of features are processed:\n
        -string lexical characteristics \n
        -domain characteristics (whois information) \n
        -page content characteristics (from the GET request)
    '''

    def __init__(self, url):
        self.url = url
        # Remove the possible http:// part, and the part after the /
        self.domain = url.split('//')[-1].split('/')[0]
        # if http:// is not included, we should concatenate it to the front of the string 
        if "http" not in url:
            self.query_url = "http://" + url
        else:
            self.query_url = url
        try:
            self.whois = whois.whois(self.domain).__dict__
        except:
            #print("Failed to obtain domain whois information for {}".format(self.domain))
            self.whois = None
        try:
            # The requests.Response() Object contains the server's response to the HTTP request.
            self.response = get(self.query_url)
        except:
            #print("Failed to obtain response from URL for {}".format(self.query_url))
            self.response = None    

        if self.response is not None:    
            try:
                # The PyQuery object allows us to make jQuery queries on the XML from the response
                self.pq = PyQuery(self.response.text)
            except:
                #print("Failed to create PyQuery object from response's XML for {}".format(self.query_url))
                self.pq = None
        else:
            self.pq = None  

    # String lexical features.   
    def entropy(self):
        '''
        Calculate the Shannon Entropy of a character in this URL string, \n
        or the average level of uncertainty inherent in the character's possible outcomes. 
        '''
        string = self.url.strip()
        # The probability of each character appearing in the string based on count
        prob = [float(string.count(c)) / len(string) for c in dict.fromkeys(list(string))]
        entropy = -sum([(p * math.log(p) / math.log(2.0)) for p in prob])
        return entropy

    def numDigits(self):
        digits = [i for i in self.url if i.isdigit()]
        return len(digits)

    def urlLength(self):
        return len(self.url)
    
    def numParameters(self):
        params = self.url.split('&')
        return len(params) - 1
    
    def numFragments(self):
        fragments = self.url.split('#')
        return len(fragments) - 1
    
    def numSubDomains(self):
        subdomains = self.url.split('http')[-1].split('//')[-1].split('/')
        return len(subdomains)-1
    
    def domainExtension(self):
        ext = self.url.split('.')[-1].split('/')[0]
        return ext
    
    # Domain features
    def hasHttp(self):
        return self.url.startswith('http:')

    def hasHttps(self):
        return self.url.startswith('https:')

    def urlIsLive(self):
        return self.response == 200
    
    # Page features
    def bodyLength(self):
        if self.pq is not None:
            return len(self.pq('html').text()) if self.urlIsLive else 0
        else:
            return 0

    def numTitles(self):
        if self.pq is not None:
            titles = ['h{}'.format(i) for i in range(7)]
            titles = [self.pq(i).items() for i in titles]
            return len([item for s in titles for item in s])
        else:
            return 0

    def numImages(self):
        if self.pq is not None:
            return len([i for i in self.pq('img').items()])
        else:
            return 0

    def numLinks(self):
        if self.pq is not None:
            return len([i for i in self.pq('a').items()])
        else:
            return 0

    def scriptLength(self):
        if self.pq is not None:
            return len(self.pq('script').text())
        else:
            return 0

    def specialCharacters(self):
        if self.pq is not None:
            bodyText = self.pq('html').text()
            schars = [i for i in bodyText if not i.isdigit() and not i.isalpha()]
            return len(schars)
        else:
            return 0

    def scriptToSpecialCharsRatio(self):
        if self.pq is not None:
            sscr = self.scriptLength()/self.specialCharacters()
        else:
            sscr = 0
        return sscr

    def scriptTobodyRatio(self):
        if self.pq is not None:
            sbr = self.scriptLength()/self.bodyLength()
        else:
            sbr = 0
        return sbr

    def bodyToSpecialCharRatio(self):
        if self.pq is not None:
            bscr = self.specialCharacters()/self.bodyLength()
        else:
            bscr = 0
        return bscr
    
    def run(self):
        data = {}
        data['entropy'] = self.entropy()
        data['numDigits'] = self.numDigits()
        data['urlLength'] = self.urlLength()
        data['numParams'] = self.numParameters()
        data['hasHttp'] = self.hasHttp()
        data['hasHttps'] = self.hasHttps()
        data['urlIsLive'] = self.urlIsLive()
        data['bodyLength'] = self.bodyLength()
        data['numTitles'] = self.numTitles()
        data['numImages'] = self.numImages()
        data['numLinks'] = self.numLinks()
        data['scriptLength'] = self.scriptLength()
        data['specialChars'] = self.specialCharacters()
        data['ext'] = self.domainExtension()
        data['sscr'] = self.scriptToSpecialCharsRatio()
        data['sbr'] = self.scriptTobodyRatio()
        data['bscr'] = self.bodyToSpecialCharRatio()
        return data

In [42]:
'''
References Kaggle notebook by dataset author at:
https://www.kaggle.com/sid321axn/malicious-url-detection-using-ml-feat-engg#Plotting-ensemble-Feature-Importance
'''

def suspicious_words(url):
    '''Tries to find manually-listed suspicous words in the url'''
    match = re.search('PayPal|login|signin|bank|account|update|free|lucky|service|bonus|ebayisapi|webscr',url)
    if match:
        return 1
    else:
        return 0

def no_of_dir(url):
    urldir = urlparse(url).path
    return urldir.count('/')

def no_of_embed(url):
    urldir = urlparse(url).path
    return urldir.count('//')

def having_ip_address(url):
    match = re.search(
        '(([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.'
        '([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\/)|'  # IPv4
        '((0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\/)' # IPv4 in hexadecimal
        '(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}', url)  # Ipv6
    if match:
        return 1
    else:
        return 0

def abnormal_url(url):
    hostname = urlparse(url).hostname
    hostname = str(hostname)
    match = re.search(hostname, url)
    if match:
        return 1
    else:
        return 0

def shortening_service(url):
    match = re.search('bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|'
                      'yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|'
                      'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|'
                      'doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|'
                      'db\.tt|qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|'
                      'q\.gs|is\.gd|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|'
                      'x\.co|prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|'
                      'tr\.im|link\.zip\.net',
                      url)
    if match:
        return 1
    else:
        return 0

def fd_length(url):
    urlpath= urlparse(url).path
    try:
        return len(urlpath.split('/')[1])
    except:
        return 0

def tld_length(tld):
    try:
        return len(tld)
    except:
        return -1        
        
def digit_count(url):
    digits = 0
    for i in url:
        if i.isnumeric():
            digits = digits + 1
    return digits

def letter_count(url):
    letters = 0
    for i in url:
        if i.isalpha():
            letters = letters + 1
    return letters

In [40]:
def create_df_with_features(df):    
    df['use_of_ip'] = df['url'].apply(lambda i: having_ip_address(i))
    print("Finished ip")

    df['abnormal_url'] = df['url'].apply(lambda i: abnormal_url(i))
    print("Finished abnormal url")

    df['count_.'] = df['url'].apply(lambda i: i.count('.'))
    print("Finished count_.")

    df['count_www'] = df['url'].apply(lambda i: i.count('www'))
    print("Finished count_www.")

    df['count_@'] = df['url'].apply(lambda i: i.count('@'))
    print("Finished count_@")

    df['count_dir'] = df['url'].apply(lambda i: no_of_dir(i))
    print("Finished count_dir")

    df['count_embed_domain'] = df['url'].apply(lambda i: no_of_embed(i))
    print("Finished count_embed_domain")

    df['short_url'] = df['url'].apply(lambda i: shortening_service(i))
    print("Finished short_url")

    df['count_https'] = df['url'].apply(lambda i : i.count('https:'))
    print("Finished count_https")

    df['count_http'] = df['url'].apply(lambda i : i.count('http:'))
    print("Finished count_http")

    df['count_%'] = df['url'].apply(lambda i: i.count('%'))
    print("Finished count_%")
    
    df['count_?'] = df['url'].apply(lambda i: i.count('?'))
    print("Finished count_?")

    df['count_'] = df['url'].apply(lambda i: i.count('-'))
    print("Finished count_'")

    df['count_='] = df['url'].apply(lambda i: i.count('='))
    print("Finished count_=")

    df['url_length'] = df['url'].apply(lambda i: len(str(i)))
    print("Finished url_length")

    df['hostname_length'] = df['url'].apply(lambda i: len(urlparse(i).netloc))
    print("Finished hostname_length")

    df['sus_url'] = df['url'].apply(lambda i: suspicious_words(i))
    print("Finished sus_url")

    df['fd_length'] = df['url'].apply(lambda i: fd_length(i))
    print("Finished fd_length")

    df['tld'] = df['url'].apply(lambda i: get_tld(i,fail_silently=True))
    print("Finished tld")

    df['tld_length'] = df['tld'].apply(lambda i: tld_length(i))
    print("Finished tld_length")

    df = df.drop("tld",1)

    df['count_digits']= df['url'].apply(lambda i: digit_count(i))
    print("Finihed count_digits")

    df['count_letters']= df['url'].apply(lambda i: letter_count(i))
    print("Finished count_letters")

In [43]:
df_with_features = df.copy()
create_df_with_features(df_with_features)
df_with_features.head()

Finished ip
Finished abnormal url
Finished count_.
Finished count_www.
Finished count_@
Finished count_dir
Finished count_embed_domain
Finished short_url
Finished count_https
Finished count_http
Finished count_%
Finished count_?
Finished count_'
Finished count_=
Finished url_length
Finished hostname_length
Finished sus_url
Finished fd_length
Finished tld
Finished tld_length


  df = df.drop("tld",1)


Finihed count_digits
Finished count_letters
Finished type_code


Unnamed: 0,url,type,use_of_ip,abnormal_url,count_.,count_www,count_@,count_dir,count_embed_domain,short_url,...,count_%,count_?,count_,count_=,url_length,hostname_length,sus_url,fd_length,tld,tld_length
0,br-icloud.com.br,phishing,0,0,2,0,0,0,0,0,...,0,0,1,0,16,0,0,0,,-1
1,mp3raid.com/music/krizz_kaliko.html,benign,0,0,2,0,0,2,0,0,...,0,0,0,0,35,0,0,5,,-1
2,bopsecrets.org/rexroth/cr/1.htm,benign,0,0,2,0,0,3,0,0,...,0,0,0,0,31,0,0,7,,-1
3,http://www.garage-pirenne.be/index.php?option=...,defacement,0,1,3,1,0,1,0,0,...,0,1,1,4,88,21,0,9,be,2
4,http://adventure-nicaragua.net/index.php?optio...,defacement,0,1,2,0,0,1,0,0,...,0,1,1,3,235,23,0,9,net,3


In [53]:
lb_make = LabelEncoder()
df_with_features["type_code"] = lb_make.fit_transform(df_with_features["type"])
print("Finished type_code")
df_with_features['type_code'] = df['type_code'].astype(np.int16)
print(df_with_features['type_code'].head())

Finished type_code
0    3
1    0
2    0
3    1
4    1
Name: type_code, dtype: int16


In [55]:
df_with_features.to_csv(os.path.join(parent_dir, 'dataset', 'features.csv'))

In [56]:
features_path = os.path.join([parent_dir], 'dataset', 'features.csv')
df_with_features = pd.read_csv(features_path)
print(df_with_features.shape)
print(df_with_features.head())
print(df_with_features.type_code.value_counts())

(651191, 23)
   Unnamed: 0                                                url        type  \
0           0                                   br-icloud.com.br    phishing   
1           1                mp3raid.com/music/krizz_kaliko.html      benign   
2           2                    bopsecrets.org/rexroth/cr/1.htm      benign   
3           3  http://www.garage-pirenne.be/index.php?option=...  defacement   
4           4  http://adventure-nicaragua.net/index.php?optio...  defacement   

   use_of_ip  abnormal_url  count_.  count_www  count_@  count_dir  \
0          0             0        2          0        0          0   
1          0             0        2          0        0          2   
2          0             0        2          0        0          3   
3          0             1        3          1        0          1   
4          0             1        2          0        0          1   

   count_embed_domain  ...  count_%  count_?  count_  count_=  url_length  \
0       