In [1]:
# importing required packages
import pandas as pd

## Loading the raw data

In [2]:
# used pandas to read the legitimate URLs as a data frame
legitimate_raw = pd.read_csv("datasets/legitimate.txt", header=None, names=['URLs'])

legitimate_raw.head()


Unnamed: 0,URLs
0,http://www.geocities.com/timessquare/arcade/78...
1,https://www.google.com
2,https://chat.openai.com/c/65223e58-8a2a-4b78-a...
3,https://www.figma.com/file/qcOD3dWSoECp0SyiDdc...
4,https://www.bing.com/translator


In [4]:
legitimate_raw.shape

(25, 1)

In [8]:
# reading the phishing URLs as a data frame
phish_url = pd.read_csv("datasets/phishing.txt", header=None, names=['URLs'])

phish_url.head()


Unnamed: 0,URLs
0,http://orgcontalex.com.br/acesso.criptografado...
1,http://www.verification-mobile-nab.com/cgi/e8e...
2,http://ms-0ffice365supportplanquota.us.com0203...
3,http://yahoo.co.in/email_open_log_pic.php?mid=...
4,https://www.microsoft@53.com/en-us/bing?form=M...


Feature Extraction

In this step, features are extracted from the URLs dataset.

The extracted features are categorized into

Address Bar based Features
Domain based Features
HTML & Javascript based Features

Many features can be extracted that can be consided as address bar base features. Out of them, below mentioned were considered for this project.

Domain of URL
IP Address in URL
"@" Symbol in URL
Length of URL
Depth of URL
Redirection "//" in URL
"http/https" in Domain name
Using URL Shortening Services “TinyURL”
Prefix or Suffix "-" in Domain

In [9]:
# importing required packages
from urllib.parse import urlparse, urlencode
import ipaddress
import re

In [10]:
# 1. Domain of the URL (Domain) 
def getDomain(url):  
  domain = urlparse(url).netloc
  if re.match(r"^www.", domain):
        domain = domain.replace("www.", "")
        return domain

In [11]:
# 2. Checks for IP address in URL (Have_IP)

# If the domain part of URL has IP address, the value assigned to this feature is 1 (phishing) or else 0 (legitimate).

def havingIP(url):
  try:
    ipaddress.ip_address(url)
    ip = 1
  except:
    ip = 0
  return ip


In [12]:
# 3.Checks the presence of @ in URL (Have_At)

def have_At_Sign(url):
  if "@" in url:
    at = 1    
  else:
    at = 0    
  return at


In [13]:
# 4. Finding the length of URL and categorizing (URL_Length)

def getLength(url):
  if len(url) < 54:
    length = 0          # legitimate
  elif len(url) >= 54 and len(url) <= 75:
    return 2            # suspicious
  else:
    length = 1          # phishing  
  return length

In [14]:
# 5. Checking for redirection '//' in the url (Redirection)

def redirection(url):
  if "//" in urlparse(url).path:
    return 1            # phishing
  else:
    return 0            # legitimate
#   pos = url.rfind('//')
#   if pos > 6:
#     if pos > 7:
#       return 1
#     else:
#       return 0
#   else:
#     return 0

In [15]:
# 6. Existence of “HTTPS” Token in the Domain Part of the URL (https_Domain)

def httpDomain(url):
    match=re.search('https://|http://',url)
    try:
        if match.start(0)==0 and match.start(0) is not None:
            url=url[match.end(0):]
            match=re.search('http|https',url)
            if match:
                return 1
            else:
                return 0
    except:
        return 1
#   domain = urlparse(url).netloc
#   if 'https' in domain:
#     return 1
#   else:
#     return 0

In [16]:
# 7. Checking for Shortening Services in URL (Tiny_URL)

# shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \
#                       r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \
#                       r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \
#                       r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|" \
#                       r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|" \
#                       r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|" \
#                       r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|" \
#                       r"tr\.im|link\.zip\.net"

def tinyURL(url):
    match=re.search('bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|'
                    'yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|'
                    'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|'
                    'doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|'
                    'db\.tt|qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|'
                    'q\.gs|is\.gd|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|'
                    'x\.co|prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|tr\.im|link\.zip\.net',url)
    if match:
        return 1               # phishing
    else:
        return 0               # legitimate
     
    # match=re.search(shortening_services,url)
    # if match:
    #     return 1
    # else:
    #     return 0

In [17]:
# 8. Checking for Prefix or Suffix Separated by (-) in the Domain (Prefix/Suffix)

def prefixSuffix(url):
    if '-' in urlparse(url).netloc:
        return 1            # phishing
    else:
        return 0            # legitimate

Domain Based Features

In [18]:
# importing required packages for this section
import re
from bs4 import BeautifulSoup
import whois
import urllib
import urllib.request
from datetime import datetime

In [19]:
# 9. DNS

In [20]:
# 10. Web traffic (Web_Traffic)

def web_traffic(url):
  try:
    #Filling the whitespaces in the URL if any
    url = urllib.parse.quote(url)
    rank = BeautifulSoup(urllib.request.urlopen("http://data.alexa.com/data?cli=10&dat=s&url=" + url).read(), "xml").find(
        "REACH")['RANK']
    rank = int(rank)
  except TypeError:
        return 1
  if rank <100000:
    return 1
  else:
    return 0

In [21]:
# 11. Survival time of domain: The difference between termination time and creation time (Domain_Age)  

def domainAge(domain_name):
  creation_date = domain_name.creation_date
  expiration_date = domain_name.expiration_date
  if (isinstance(creation_date,str) or isinstance(expiration_date,str)):
    try:
      creation_date = datetime.strptime(creation_date,'%Y-%m-%d')
      expiration_date = datetime.strptime(expiration_date,"%Y-%m-%d")
    except:
      return 1
  if ((expiration_date is None) or (creation_date is None)):
      return 1
  elif ((type(expiration_date) is list) or (type(creation_date) is list)):
      return 1
  else:
    ageofdomain = abs((expiration_date - creation_date).days)
    if ((ageofdomain/30) < 6):
      age = 1
    else:
      age = 0
  return age

In [22]:
# 12. End time of domain: The difference between termination time and current time (Domain_End) 

def domainEnd(domain_name):
  expiration_date = domain_name.expiration_date
  if isinstance(expiration_date,str):
    try:
      expiration_date = datetime.strptime(expiration_date,"%Y-%m-%d")
    except:
      return 1
  if (expiration_date is None):
      return 1
  elif (type(expiration_date) is list):
      return 1
  else:
    today = datetime.now()
    end = abs((expiration_date - today).days)
    if ((end/30) < 6):
      end = 0
    else:
      end = 1
  return end

HTML/JS Based Features

In [23]:
# importing required packages for this section

import requests

In [24]:
# 13. IFrame Redirection (iFrame)

def iframe(response):
  if response == "":
      return 1
  else:
      if re.findall(r"[|]", response.text):
          return 0
      else:
          return 1

In [25]:
# 14. Checks the effect of mouse over on status bar (Mouse_Over)

def mouseOver(response): 
  if response == "" :
    return 1
  else:
    if re.findall("", response.text):
      return 1
    else:
      return 0

In [26]:
# 15. Checks the status of the right click attribute (Right_Click)

def rightClick(response):
  if response == "":
    return 1
  else:
    if re.findall(r"event.button ?== ?2", response.text):
      return 0
    else:
      return 1

In [27]:
# 16. Checks the number of forwardings (Web_Forwards)

def forwarding(response):
  if response == "":
    return 1
  else:
    if len(response.history) <= 2:
      return 0
    else:
      return 1

Computing URL Function

In [44]:

# Function to extract features

def featureExtraction(url, label):

  features = []

  #Address bar based features (8)

  features.append(getDomain(url))
  features.append(havingIP(url))
  features.append(have_At_Sign(url))
  features.append(getLength(url))
  features.append(redirection(url))
  features.append(httpDomain(url))
  features.append(tinyURL(url))
  features.append(prefixSuffix(url))
  
  #Domain based features (3)

  dns = 0
  try:
    domain_name = whois.whois(urlparse(url).netloc)
  except:
    dns = 1

  features.append(dns)
  # features.append(web_traffic(url))
  features.append(1 if dns == 1 else domainAge(domain_name))
  features.append(1 if dns == 1 else domainEnd(domain_name))

  # HTML & Javascript based features (4)
  
  try:
    response = requests.get(url)
  except:
    response = ""
  features.append(iframe(response))
  features.append(mouseOver(response))
  features.append(rightClick(response))
  features.append(forwarding(response))
  features.append(label)

  return features

Extracting Features of Legitimate URLs

In [31]:
legitimate_raw.shape


(25, 1)

In [45]:
# Extracting the feautres & storing them in a list

leg_feat = []
label = 0
rows = len(legitimate_raw['URLs'])
print("rows :", rows)

for i in range(0, rows):
  url = legitimate_raw['URLs'][i]
  print("I:", i), print(url)
  leg_feat.append(featureExtraction(url, label))

rows : 25
I: 0
http://www.geocities.com/timessquare/arcade/7810/p_hgames.htm
I: 1
https://www.google.com
I: 2
https://chat.openai.com/c/65223e58-8a2a-4b78-a3f6-f5b86d54172f
I: 3
https://www.figma.com/file/qcOD3dWSoECp0SyiDdclX1/PrudentSalesApp_Draft-(Copy)?type=design&node-id=4012-14533&mode=design
I: 4
https://www.bing.com/translator
I: 5
https://www.google.com/search?q=bing&rlz=1C1RLNS_enNG810NG810&oq=bing&gs_lcrp=EgZjaHJvbWUyBggAEEUYOTIGCAEQRRg7MgYIAhBFGDvSAQgxMTAyajBqN6gCALACAA&sourceid=chrome&ie=UTF-8
I: 6
https://www.microsoft.com/en-us/bing?form=MA13FV
I: 7
http://www.three-musketeers.net/mike/animeopinions.html
I: 8
https://www.godaddy.com/forsale/gle.com?utm_source=TDFS_BINNS&utm_medium=BINNS&utm_campaign=TDFS_BINNS&traffic_type=TDFS_BINNS&traffic_id=binns&
I: 9
https://getbootstrap.com/docs/5.3/forms/form-control/
I: 10
https://mail.google.com/mail/u/0/?tab=rm&ogbl#inbox
I: 11
https://mail.tribearc.com/l/2srryYOstLeTrfmZAXoBaQ/Osc57fcpHJ3YwAx763JGhn5A/ZCpNNjEokgYjmJRxDGBhRg
I

In [46]:
#converting the list to dataframe

feature_names = ['Domain', 'Have_IP', 'Have_At', 'URL_Length', 'Redirection', 
                      'https_Domain', 'TinyURL', 'Prefix/Suffix', 'DNS_Record', 
                      'Domain_Age', 'Domain_End', 'iFrame', 'Mouse_Over','Right_Click', 'Web_Forwards', 'Label']

url_leg = pd.DataFrame(leg_feat, columns= feature_names)
url_leg.head()

Unnamed: 0,Domain,Have_IP,Have_At,URL_Length,Redirection,https_Domain,TinyURL,Prefix/Suffix,DNS_Record,Domain_Age,Domain_End,iFrame,Mouse_Over,Right_Click,Web_Forwards,Label
0,geocities.com,0,0,2,0,0,0,0,1,1,1,0,1,1,1,0
1,google.com,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0
2,,0,0,2,0,0,0,0,1,1,1,0,1,1,0,0
3,figma.com,0,0,1,0,0,0,0,1,1,1,0,1,1,0,0
4,bing.com,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0


In [47]:
# Storing the extracted legitimate URLs fatures to csv file

url_leg.to_csv('legitimate_urls.csv', index= False)

Extracting Features of Phishing URLs

In [48]:
# Extracting the feautres & storing them in a list

phish_feat = []
label = 1
rows = len(phish_url['URLs'])
print("rows :", rows)

for i in range(0, rows):
  url = phish_url['URLs'][i]
  print("I:", i), print(url)
  phish_feat.append(featureExtraction(url,label))


rows : 25
I: 0
http://orgcontalex.com.br/acesso.criptografado2/cliente.conectado1/pessoa.fisica.recadastrar/cadastrar.referencia/acesso.pendente.comunicacao/index.html
I: 1
http://www.verification-mobile-nab.com/cgi/e8e11a52ed442dc4b327eaa2f19c2521/login/
I: 2
http://ms-0ffice365supportplanquota.us.com0203499083934.elegantvapour.co.uk/0-009021--0203499083934.l3992389100283901233232400349902132443940090132/hotmail.login/client_id=00000002-0000-0ff1-ce00-000000000000&amp;/000-pc.domain/officelogin1.html?run=login_cmd&amp;statuts=f17ca2c829680ada2fec9fc87bc5f606913063b5e3c451761e1f9d5888ada562
I: 3
http://yahoo.co.in/email_open_log_pic.php?mid=9f8fd3e2a108a256bff453d09c965c25&amp;s=a
I: 4
https://www.microsoft@53.com/en-us/bing?form=MA13FV
I: 5
http://anniey06l5.myjino.ru/byte/Cookies/Cookies/Pelewura/onlineverifica@tion/Preview.htm
I: 6
https://forms.zohopublic.com/koogee/form/ACTIVATIONFORM//formperma/g51Bb2g53DaChBA4be0h2kCA_
I: 7
http://store2.apple.ch.email-forwarding.webstore.websis

In [49]:
# converting the list to dataframe

feature_names = ['Domain', 'Have_IP', 'Have_At', 'URL_Length', 'Redirection', 
                      'https_Domain', 'TinyURL', 'Prefix/Suffix', 'DNS_Record',
                      'Domain_Age', 'Domain_End', 'iFrame', 'Mouse_Over','Right_Click', 'Web_Forwards', 'Label']

url_phish = pd.DataFrame(phish_feat, columns= feature_names)
url_phish.head()

Unnamed: 0,Domain,Have_IP,Have_At,URL_Length,Redirection,https_Domain,TinyURL,Prefix/Suffix,DNS_Record,Domain_Age,Domain_End,iFrame,Mouse_Over,Right_Click,Web_Forwards,Label
0,,0,0,1,0,0,1,0,1,1,1,1,1,1,1,1
1,verification-mobile-nab.com,0,0,1,0,0,0,1,1,1,1,1,1,1,1,1
2,,0,0,1,0,0,0,1,1,1,1,1,1,1,0,1
3,,0,0,1,0,0,0,0,1,1,1,0,1,1,1,1
4,microsoft@53.com,0,1,0,0,0,0,0,1,1,1,1,1,1,0,1


In [50]:
# Storing the extracted legitimate URLs fatures to csv file

url_phish.to_csv('phishing_urls.csv', index= False)

Final Dataset

In [51]:
# Concatenating the dataframes into one

leg_phish_data = pd.concat([url_leg, url_phish]).reset_index(drop=True)
leg_phish_data.head()

Unnamed: 0,Domain,Have_IP,Have_At,URL_Length,Redirection,https_Domain,TinyURL,Prefix/Suffix,DNS_Record,Domain_Age,Domain_End,iFrame,Mouse_Over,Right_Click,Web_Forwards,Label
0,geocities.com,0,0,2,0,0,0,0,1,1,1,0,1,1,1,0
1,google.com,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0
2,,0,0,2,0,0,0,0,1,1,1,0,1,1,0,0
3,figma.com,0,0,1,0,0,0,0,1,1,1,0,1,1,0,0
4,bing.com,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0


In [52]:
leg_phish_data.tail()

Unnamed: 0,Domain,Have_IP,Have_At,URL_Length,Redirection,https_Domain,TinyURL,Prefix/Suffix,DNS_Record,Domain_Age,Domain_End,iFrame,Mouse_Over,Right_Click,Web_Forwards,Label
45,,0,0,1,0,0,0,0,1,1,1,1,1,1,1,1
46,,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1
47,,0,0,1,0,0,0,0,1,1,1,1,1,1,1,1
48,,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1
49,,0,0,1,0,1,0,1,1,1,1,1,1,1,1,1


In [53]:
leg_phish_data.shape

(50, 16)

In [54]:
# Storing the data in CSV file

leg_phish_data.to_csv('leg_phish_urls.csv', index=False)