In [None]:
import pandas as pd
import re
from math import log2
from urllib.parse import urlparse
from sklearn.preprocessing import LabelEncoder
from googlesearch import search
import string
import hashlib

In [None]:
from tld import get_tld

In [None]:
from tldextract import extract
import tldextract

In [None]:
# Load the labelled URL dataset into the working frame `url_data`.
# NOTE(review): hard-coded absolute path (looks like a Colab Google Drive mount);
# the notebook only runs where this exact directory layout exists.
url_data = pd.read_csv('/content/drive/MyDrive/javas_project/data/malicious_phish.csv')

In [None]:
# Class balance: number of rows per distinct value of the 'type' label column.
url_data['type'].value_counts()

In [None]:
# Fit a LabelEncoder on the 'type' labels and store the encoded values in a
# new 'type_code' column; `lb_make` keeps the fitted label <-> code mapping.
lb_make = LabelEncoder()
url_data["type_code"] = lb_make.fit_transform(url_data["type"])
# Display the row count per encoded class.
url_data["type_code"].value_counts()

In [None]:
def calculate_entropy(s):
    """Return the Shannon entropy, in bits, of the characters in ``s``.

    Each distinct character's probability is its relative frequency in ``s``;
    the result is ``-sum(p * log2(p))`` over those probabilities.

    Args:
        s: The string to measure.

    Returns:
        The entropy as a float. An empty string yields ``0.0`` (it contains no
        symbols, so there is no uncertainty to measure).
    """
    total = len(s)
    if total == 0:
        # Without this guard the frequency division below raises
        # ZeroDivisionError for an empty string.
        return 0.0

    probabilities = [s.count(ch) / total for ch in set(s)]
    # Every probability is strictly positive because each character is drawn
    # from set(s), so log2 is always defined here.
    return -sum(p * log2(p) for p in probabilities)

def analyze_domain(row):
    """Compute length, word count and character entropy for a row's URL.

    The URL is first normalised by removing one leading ``http://`` or
    ``https://`` scheme and then one leading ``www.`` host prefix. Only the
    leading occurrences are removed: a ``www.`` that appears later in the URL
    (for example inside the path or query string) is part of the data and is
    kept, so it still contributes to the measurements.

    Args:
        row: A mapping (e.g. a DataFrame row) with a string under ``'url'``.

    Returns:
        A tuple ``(length, word_count, entropy)`` where ``length`` is the
        number of characters in the normalised URL, ``word_count`` is the
        number of ``\\w+`` runs in it, and ``entropy`` is the value returned by
        ``calculate_entropy`` (``0.0`` when nothing is left after
        normalisation).
    """
    url = row['url']

    # Strip at most one scheme prefix; 'https://' is tested first only for
    # readability, the two prefixes cannot both match the same string start.
    for prefix in ('https://', 'http://'):
        if url.startswith(prefix):
            url = url[len(prefix):]
            break

    # Strip only a leading 'www.', never occurrences further into the URL.
    if url.startswith('www.'):
        url = url[len('www.'):]

    length = len(url)
    word_count = len(re.findall(r'\w+', url))
    # An empty remainder (e.g. the URL was just 'http://www.') has no
    # characters to measure, so skip the entropy call entirely.
    entropy = calculate_entropy(url) if url else 0.0

    return length, word_count, entropy

# Apply analyze_domain to every row and unpack the resulting
# (length, word_count, entropy) tuples into three new feature columns.
# NOTE(review): despite its name, 'DomainLen' is computed over the whole
# normalised URL string that analyze_domain builds, not just the host part.
url_data['DomainLen'], url_data['WordCnt'], url_data['Entropy'] = zip(*url_data.apply(analyze_domain, axis=1))

In [None]:
# One IPv4 octet in the range 0-255; leading zeros (e.g. "001") are accepted.
_IPV4_OCTET = r'(?:25[0-5]|2[0-4][0-9]|[01]?[0-9]?[0-9])'
# Dotted-quad IPv4 address made of four valid octets.
_IPV4_PATTERN = re.compile(r'\b(?:' + _IPV4_OCTET + r'\.){3}' + _IPV4_OCTET + r'\b')
# Fully expanded IPv6 address: eight colon-separated groups of 1-4 hex digits.
# Compressed forms that use '::' are not recognised by this pattern.
_IPV6_PATTERN = re.compile(r'\b(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\b')


def has_ip(url):
    """Return 1 if ``url`` contains a literal IP address, otherwise 0.

    An IPv4 match requires four dot-separated octets that are each between 0
    and 255, so digit runs such as ``999.999.999.999`` are not reported as an
    address. An IPv6 match requires the full eight-group notation.

    Args:
        url: The URL string to scan; the address may appear anywhere in it.

    Returns:
        ``1`` when an IPv4 or IPv6 address is found, ``0`` otherwise.
    """
    if _IPV4_PATTERN.search(url) or _IPV6_PATTERN.search(url):
        return 1
    return 0

# Binary feature: 1 when has_ip finds a literal IP address in the URL, else 0.
url_data['use_ip'] = url_data['url'].apply(has_ip)

In [None]:
def extr_pri_domain(url):
    """Return the network location of ``url`` as resolved by ``get_tld``.

    ``get_tld`` is called with ``fix_protocol=True`` and
    ``fail_silently=False``, so a URL it cannot resolve raises; that failure
    is treated as "no domain" and reported as ``None``.

    Args:
        url: The URL to inspect.

    Returns:
        ``res.parsed_url.netloc`` of the ``get_tld`` result, or ``None`` when
        the lookup fails for any reason.
    """
    try:
        res = get_tld(url, as_object=True, fail_silently=False, fix_protocol=True)
        pri_domain = res.parsed_url.netloc
    except Exception:
        # Deliberately best-effort: any lookup error maps to None. Catching
        # Exception rather than using a bare `except` lets KeyboardInterrupt
        # and SystemExit propagate, so a long `.apply` run can still be
        # interrupted instead of silently producing None rows.
        pri_domain = None
    return pri_domain

# Host portion of each URL as returned by extr_pri_domain (None when extraction fails).
url_data['pri_domain'] = url_data['url'].apply(extr_pri_domain)

In [None]:
def count_letters(url):
    """Return how many characters of ``url`` are alphabetic (``str.isalpha``)."""
    return len([ch for ch in url if ch.isalpha()])

def count_digits(url):
    """Return how many characters of ``url`` are digits (``str.isdigit``)."""
    return len([ch for ch in url if ch.isdigit()])

def count_special_chars(url):
    """Return how many characters of ``url`` belong to ``string.punctuation``."""
    punctuation = frozenset(string.punctuation)
    total = 0
    for ch in url:
        if ch in punctuation:
            total += 1
    return total

# Character-class counts over the raw URL string: letters, digits, punctuation.
url_data['lets_cnt'] = url_data['url'].apply(count_letters)
url_data['dgts_cnt'] = url_data['url'].apply(count_digits)
url_data['spl_chars_cnt'] = url_data['url'].apply(count_special_chars)

In [None]:
def count_occurrences(url, substr):
    """Return the number of non-overlapping occurrences of ``substr`` in ``url``."""
    occurrences = url.count(substr)
    return occurrences

def count_embed(url):
    """Return how many times ``'//'`` occurs in the path component of ``url``."""
    path = urlparse(url).path
    return path.count('//')

# Substring counts over the whole URL. Note that the 'http' count also
# includes every 'https', because 'https' contains 'http'.
url_data['cnt_http'] = url_data['url'].apply(count_occurrences, args=('http',))
url_data['cnt_https'] = url_data['url'].apply(count_occurrences, args=('https',))
# Path-based counts: '//' sequences and '/' separators inside the parsed path.
url_data['cnt_domain'] = url_data['url'].apply(count_embed)
url_data['cnt_dir'] = url_data['url'].apply(lambda i: urlparse(i).path.count('/'))
# Single-token counts over the whole URL.
url_data['cnt_www'] = url_data['url'].apply(count_occurrences, args=('www',))
url_data['cnt_dot'] = url_data['url'].apply(count_occurrences, args=('.',))
url_data['cnt_at'] = url_data['url'].apply(count_occurrences, args=('@',))
url_data['cnt_per'] = url_data['url'].apply(count_occurrences, args=('%',))
url_data['cnt_ques'] = url_data['url'].apply(count_occurrences, args=('?',))
url_data['cnt_hyph'] = url_data['url'].apply(count_occurrences, args=('-',))
url_data['cnt_eq'] = url_data['url'].apply(count_occurrences, args=('=',))

In [None]:
def shortn_svc(url, shortening_services):
    """Return 1 if the URL's domain label is a known shortening service, else 0.

    The label examined is the one directly before the final dot-separated
    component of the host matched by the pattern (``bit`` in
    ``http://bit.ly/x``). The comparison is case-insensitive on both sides,
    so an entry such as ``'BudURL'`` matches ``http://budurl.com``.

    Only URLs containing an explicit ``http://`` or ``https://`` are
    examined; anything else returns 0 because the pattern requires a scheme.

    Args:
        url: The URL string to inspect.
        shortening_services: Iterable of service labels to match against.

    Returns:
        ``1`` when the extracted label is one of ``shortening_services``,
        ``0`` otherwise.
    """
    match = re.search(r'https?://(?:www\.)?(?:\w+\.)*(\w+)\.\w+', url)
    if match is None:
        return 0

    # Normalise the configured names as well: the extracted label is
    # lower-cased, so mixed-case entries could otherwise never match.
    known_services = {name.lower() for name in shortening_services}
    return 1 if match.group(1).lower() in known_services else 0

# Domain labels of known URL-shortening services (the part before the final
# dot-separated component, e.g. 'bit' for bit.ly).
# NOTE(review): the list contains duplicates ('bit', 'goo', 'tinyurl', 'ow',
# 't', 'is', 'short', 'qr', 'u') and two mixed-case entries ('BudURL',
# 'Just'); shortn_svc lower-cases the extracted label before the membership
# test, so mixed-case entries only match if the lookup normalises case too.
common_shortening_services = ['bit', 'goo', 'tinyurl', 'ow', 't', 'is',
                                      'cli', 'yfrog', 'migre', 'ff', 'url4', 'twit',
                                      'su', 'snipurl', 'short', 'BudURL', 'ping',
                                      'post', 'Just', 'bkite', 'snipr', 'fic',
                                      'loopt', 'doiop', 'short', 'kl', 'wp',
                                      'rubyurl', 'om', 'to', 'bit', 't', 'lnkd',
                                      'db', 'qr', 'adf', 'goo', 'bitly', 'cur',
                                      'tinyurl', 'ow', 'bit', 'ity', 'q', 'is',
                                      'po', 'bc', 'twitthis', 'u', 'j', 'buzurl',
                                      'cutt', 'u', 'yourls', 'x', 'prettylinkpro',
                                      'scrnch', 'filoops', 'vzturl', 'qr', '1url',
                                      'tweez', 'v', 'tr', 'link', 'zip']
# Binary feature: 1 when shortn_svc recognises the URL's domain label, else 0.
url_data['short_url'] = url_data['url'].apply(lambda i: shortn_svc(i, common_shortening_services))

In [None]:
def ab_url(url):
    """Return 1 if ``url`` parses with a non-empty network location, else 0.

    ``urlparse`` only fills ``netloc`` when the URL contains ``//`` before the
    host (``http://host/...`` or ``//host/...``), so a scheme-less string such
    as ``example.com/page`` yields 0.

    Args:
        url: The URL string to inspect.

    Returns:
        ``1`` when ``urlparse(url).netloc`` is non-empty, ``0`` otherwise.
    """
    # The previous implementation compared str(netloc) with netloc, which is
    # always equal for text URLs; the result depends only on netloc presence.
    return 1 if urlparse(url).netloc else 0

# Binary feature: 1 when the URL parses with a network location, else 0.
url_data['ab_url'] = url_data['url'].apply(ab_url)

In [None]:
def secure_http(url):
    """Return 1 when the parsed scheme of ``url`` is ``https``, otherwise 0."""
    scheme = urlparse(url).scheme
    if scheme == 'https':
        return 1
    return 0

# Binary feature: 1 for URLs whose scheme is https, else 0.
url_data['secure_http'] = url_data['url'].apply(secure_http)

In [None]:
# Missing-value count per column (extr_pri_domain returns None on failure,
# so 'pri_domain' is the column expected to have gaps).
url_data.isnull().sum()

In [None]:
# Replace every missing value in every column with the integer 0, in place.
# NOTE(review): this also puts 0 into the text column 'pri_domain'; later
# cells stringify that column and drop rows whose root_domain equals '0',
# so the 0 presumably serves as the "no domain" sentinel — confirm.
url_data.fillna(0,inplace=True)

In [None]:
def extract_root_domain(url):
    """Return the ``domain`` field of ``tldextract.extract(url)``."""
    return tldextract.extract(url).domain
# Derive 'root_domain' from 'pri_domain'. Each value is passed through str()
# first because the column can hold non-string entries (the 0 written by the
# earlier fillna), which therefore reach extract_root_domain as '0'.
url_data['root_domain'] = url_data['pri_domain'].apply(lambda i: extract_root_domain(str(i)))

In [None]:
# Frequency of each extracted root domain in the full frame.
url_data['root_domain'].value_counts()

In [None]:
# Build the modelling frame: drop the raw URL, the textual label and the
# intermediate host column. 'type_code' and 'root_domain' stay in `data`.
data = url_data.drop(columns=['url','type','pri_domain'])

In [None]:
# Same root-domain frequency check, now on the reduced frame `data`.
data['root_domain'].value_counts()

In [None]:
# Keep only rows whose root_domain is not the string '0'.
# NOTE(review): '0' presumably marks rows where no host could be extracted
# (None -> fillna(0) -> str -> '0'); verify that tldextract yields '0' there.
data = data[data['root_domain'] != '0']

In [None]:
def hash_encode(category):
    """Map a string to an integer in ``[0, 10**8)`` derived from its MD5 digest.

    The string is encoded with the default ``str.encode`` encoding, hashed
    with MD5, and the digest (read as one big-endian integer) is reduced
    modulo ``10 ** 8``. Distinct inputs can therefore share a code.
    """
    digest = hashlib.md5(category.encode()).digest()
    return int.from_bytes(digest, 'big') % (10 ** 8)
# Replace each root-domain string with its numeric hash code.
# NOTE(review): not idempotent — hash_encode calls .encode(), so re-running
# this cell on the already-hashed integer column raises AttributeError.
data['root_domain'] = data['root_domain'].apply(hash_encode)

In [None]:
# Renumber rows 0..n-1 after the filtering above; the old index is discarded.
data = data.reset_index(drop=True)

In [None]:
# Summary of column dtypes and non-null counts for the final feature frame.
data.info()

In [None]:
# Display the feature frame (pandas truncates the rendering of large frames).
data

In [None]:
# Pairwise correlation matrix of the columns in `data`, including the target
# 'type_code' and the hashed 'root_domain'.
# NOTE(review): both are arbitrary category codes, so their correlation
# coefficients carry no ordinal meaning.
data.corr()