# Malicious URL Detection (Data Collection & Feature Extraction)
<p>
The malicious urls can be detected using the lexical features along with tokenization of the url strings. WE aim to build a basic binary classifier which would help classify the URLs as malicious or benign.
</p>

## 1. Data Collection

In [1]:
import pandas as pd

In [2]:
f_path = "../data/balanced_urls.csv"
df = pd.read_csv(f_path)

In [3]:
df.head()

Unnamed: 0,url,label,result
0,https://www.google.com,benign,0
1,https://www.youtube.com,benign,0
2,https://www.facebook.com,benign,0
3,https://www.baidu.com,benign,0
4,https://www.wikipedia.org,benign,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 632508 entries, 0 to 632507
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     632508 non-null  object
 1   label   632508 non-null  object
 2   result  632508 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 14.5+ MB


In [5]:
df.shape

(632508, 3)

In [6]:
# Printing number of legit and fraud domain urls (0-benign, 1-phishing)
df["label"].value_counts()
df["result"].value_counts()

result
0    316254
1    316254
Name: count, dtype: int64

## 2. DATA PREPROCESSING
The following features will be extracted from the URL for classification.

<ol>
<li>
    Length Features
    <ul>
        <li>Length Of Url</li>
        <li>Length of Hostname</li>
        <li>Length Of Path</li>
        <li>Length Of First Directory</li>
    </ul>
</li>
   <br> 
<li>
    Count Features
    <ul>
        <li>Count Of '-'</li>
        <li>Count Of '@'</li>
        <li>Count Of '?'</li>
        <li>Count Of '%'</li>
        <li>Count Of '.'</li>
        <li>Count Of '='</li>
        <li>Count Of 'http'</li>
        <li>Count Of 'www'</li>
        <li>Count Of Digits</li>
        <li>Count Of Letters</li>
        <li>Count Of Number Of Directories</li>
    </ul>
</li>
    <br>
<li>
    Binary Features
    <ul>
        <li>Use of IP or not</li>
        <li>Use of Shortening URL or not</li>
    </ul>
</li>
</ol>

In [7]:
#Importing dependencies
from urllib.parse import urlparse
import os.path
import ipaddress

# changing dataframe variable
df = df.drop(["label"], axis=1)
url_data = df
url_data = url_data.rename(columns = {'result':'label'})

### 2.1 Length Features

In [8]:
#Length of URL (Phishers can use long URL to hide the doubtful part in the address bar)
url_data['url_length'] = url_data['url'].apply(lambda i: len(str(i)))

#Hostname Length
url_data['hostname_length'] = url_data['url'].apply(lambda i: len(urlparse(i).netloc))

#Path Length
url_data['path_length'] = url_data['url'].apply(lambda i: len(urlparse(i).path))

In [9]:
#First Directory Length
def fd_length(url):
    url_path= urlparse(url).path
    try:
        return len(url_path.split('/')[1])
    except:
        return 0

url_data['fd_length'] = url_data['url'].apply(lambda i: fd_length(i))

In [10]:
# printing first few rows
url_data.head(10)

Unnamed: 0,url,label,url_length,hostname_length,path_length,fd_length
0,https://www.google.com,0,22,14,0,0
1,https://www.youtube.com,0,23,15,0,0
2,https://www.facebook.com,0,24,16,0,0
3,https://www.baidu.com,0,21,13,0,0
4,https://www.wikipedia.org,0,25,17,0,0
5,https://www.reddit.com,0,22,14,0,0
6,https://www.yahoo.com,0,21,13,0,0
7,https://www.google.co.in,0,24,16,0,0
8,https://www.qq.com,0,18,10,0,0
9,https://www.amazon.com,0,22,14,0,0


### 2.2 Count Features

In [11]:
# Count of how many times a special character appearsin url

url_data['count@'] = url_data['url'].apply(lambda i: i.count('@'))

url_data['count?'] = url_data['url'].apply(lambda i: i.count('?'))

url_data['count%'] = url_data['url'].apply(lambda i: i.count('%'))

url_data['count.'] = url_data['url'].apply(lambda i: i.count('.'))

url_data['count='] = url_data['url'].apply(lambda i: i.count('='))

url_data['count-http'] = url_data['url'].apply(lambda i : i.count('http'))

url_data['count-https'] = url_data['url'].apply(lambda i : i.count('https'))

url_data['count-www'] = url_data['url'].apply(lambda i: i.count('www'))

In [12]:
def digit_count(url):
    digits = 0
    for i in url:
        if i.isnumeric():
            digits = digits + 1
    return digits
url_data['count-digits']= url_data['url'].apply(lambda i: digit_count(i))

In [13]:
def letter_count(url):
    letters = 0
    for i in url:
        if i.isalpha():
            letters = letters + 1
    return letters
url_data['count-letters']= url_data['url'].apply(lambda i: letter_count(i))

In [14]:
def no_of_dir(url):
    url_dir = urlparse(url).path
    return url_dir.count('/')
url_data['count_dir'] = url_data['url'].apply(lambda i: no_of_dir(i))

In [15]:
def redirection(url):
  pos = url.rfind('//')
  if pos > 6:
    if pos > 7:
      return 1
    else:
      return 0
  else:
    return 0

url_data['count_redirection'] = url_data['url'].apply(lambda i: redirection(i))

In [16]:
# printing first few rows
url_data.head(10)

Unnamed: 0,url,label,url_length,hostname_length,path_length,fd_length,count@,count?,count%,count.,count=,count-http,count-https,count-www,count-digits,count-letters,count_dir,count_redirection
0,https://www.google.com,0,22,14,0,0,0,0,0,2,0,1,1,1,0,17,0,0
1,https://www.youtube.com,0,23,15,0,0,0,0,0,2,0,1,1,1,0,18,0,0
2,https://www.facebook.com,0,24,16,0,0,0,0,0,2,0,1,1,1,0,19,0,0
3,https://www.baidu.com,0,21,13,0,0,0,0,0,2,0,1,1,1,0,16,0,0
4,https://www.wikipedia.org,0,25,17,0,0,0,0,0,2,0,1,1,1,0,20,0,0
5,https://www.reddit.com,0,22,14,0,0,0,0,0,2,0,1,1,1,0,17,0,0
6,https://www.yahoo.com,0,21,13,0,0,0,0,0,2,0,1,1,1,0,16,0,0
7,https://www.google.co.in,0,24,16,0,0,0,0,0,3,0,1,1,1,0,18,0,0
8,https://www.qq.com,0,18,10,0,0,0,0,0,2,0,1,1,1,0,13,0,0
9,https://www.amazon.com,0,22,14,0,0,0,0,0,2,0,1,1,1,0,17,0,0


### 2.3 Binary Features

In [17]:
import re

#Use of IP or not in domain
def having_ip_address(url):
    match = re.search(
        '(([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.'
        '([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\/)|'  # IPv4
        '((0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\/)' # IPv4 in hexadecimal
        '(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}', url)  # Ipv6
    if match:
        # print match.group()
        return -1
    else:
        # print 'No matching pattern found'
        return 1
url_data['use_of_ip'] = url_data['url'].apply(lambda i: having_ip_address(i))

In [18]:
def prefixSuffix(url):
    if '-' in urlparse(url).netloc:
        return 1            # phishing
    else:
        return 0
    
url_data['prefix-Suffix'] = url_data['url'].apply(lambda i: prefixSuffix(i))    

In [19]:
# use of url shortening service
def shortening_service(url):
    match = re.search('bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|'
                      'yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|'
                      'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|'
                      'doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|'
                      'db\.tt|qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|'
                      'q\.gs|is\.gd|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|'
                      'x\.co|prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|'
                      'tr\.im|link\.zip\.net',
                      url)
    if match:
        return -1
    else:
        return 1
url_data['short_url'] = url_data['url'].apply(lambda i: shortening_service(i))

In [20]:
# printing first few rows
url_data.head(10)

Unnamed: 0,url,label,url_length,hostname_length,path_length,fd_length,count@,count?,count%,count.,...,count-http,count-https,count-www,count-digits,count-letters,count_dir,count_redirection,use_of_ip,prefix-Suffix,short_url
0,https://www.google.com,0,22,14,0,0,0,0,0,2,...,1,1,1,0,17,0,0,1,0,1
1,https://www.youtube.com,0,23,15,0,0,0,0,0,2,...,1,1,1,0,18,0,0,1,0,1
2,https://www.facebook.com,0,24,16,0,0,0,0,0,2,...,1,1,1,0,19,0,0,1,0,1
3,https://www.baidu.com,0,21,13,0,0,0,0,0,2,...,1,1,1,0,16,0,0,1,0,1
4,https://www.wikipedia.org,0,25,17,0,0,0,0,0,2,...,1,1,1,0,20,0,0,1,0,1
5,https://www.reddit.com,0,22,14,0,0,0,0,0,2,...,1,1,1,0,17,0,0,1,0,-1
6,https://www.yahoo.com,0,21,13,0,0,0,0,0,2,...,1,1,1,0,16,0,0,1,0,1
7,https://www.google.co.in,0,24,16,0,0,0,0,0,3,...,1,1,1,0,18,0,0,1,0,1
8,https://www.qq.com,0,18,10,0,0,0,0,0,2,...,1,1,1,0,13,0,0,1,0,1
9,https://www.amazon.com,0,22,14,0,0,0,0,0,2,...,1,1,1,0,17,0,0,1,0,1


In [21]:
url_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 632508 entries, 0 to 632507
Data columns (total 21 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   url                632508 non-null  object
 1   label              632508 non-null  int64 
 2   url_length         632508 non-null  int64 
 3   hostname_length    632508 non-null  int64 
 4   path_length        632508 non-null  int64 
 5   fd_length          632508 non-null  int64 
 6   count@             632508 non-null  int64 
 7   count?             632508 non-null  int64 
 8   count%             632508 non-null  int64 
 9   count.             632508 non-null  int64 
 10  count=             632508 non-null  int64 
 11  count-http         632508 non-null  int64 
 12  count-https        632508 non-null  int64 
 13  count-www          632508 non-null  int64 
 14  count-digits       632508 non-null  int64 
 15  count-letters      632508 non-null  int64 
 16  count_dir          6

In [22]:
#Save the data
url_data.to_csv("URL_Processed.csv")