In [2]:
import requests
from urllib.parse import urlparse, parse_qs
import tldextract
from bs4 import BeautifulSoup
import re
import hashlib
from collections import Counter

def url_to_id(url):
    """Map a URL string to an integer in ``[0, 10**8)`` derived from its MD5 digest.

    The URL is encoded with the default ``str.encode()`` encoding, hashed
    with MD5, and the hex digest is read as one big integer before being
    reduced modulo ``10**8``. The same URL always yields the same id.
    """
    digest_hex = hashlib.md5(url.encode()).hexdigest()
    digest_value = int(digest_hex, 16)
    return digest_value % (10 ** 8)

def _parse_absolute(link):
    """Parse ``link`` only when it names a host explicitly.

    Links starting with ``http`` (absolute http/https URLs) or ``//``
    (scheme-relative URLs) are handed to ``urlparse``. Anything else
    (relative paths, ``#`` anchors, ``javascript:``, ``mailto:`` ...) and any
    link ``urlparse`` rejects with ``ValueError`` yields the parse of the
    empty string, whose ``scheme`` and ``netloc`` are both ``''``.
    """
    if link.startswith(('http', '//')):
        try:
            return urlparse(link)
        except ValueError:
            # One malformed link on the page must not abort the whole extraction.
            pass
    return urlparse('')


def extract_features(url):
    """Compute lexical and page-content features for ``url``.

    Parameters
    ----------
    url : str
        The URL to analyse. ``http://`` is prepended when it does not already
        start with ``http://`` or ``https://`` (checked case-insensitively).

    Returns
    -------
    dict
        Feature name -> numeric value. URL-string features are computed from
        the URL itself; the remaining features are computed from the page
        HTML.

    Notes
    -----
    Performs one ``requests.get`` with a 5 second timeout. If fetching or
    parsing fails for any reason, the HTML-based features are computed on an
    empty document instead of raising.

    The HTML is lowercased before parsing, so every host comparison below is
    made against the lowercased host of ``url``.
    """
    features = {}
    # Ensure URL has a scheme (case-insensitive, so "HTTP://..." is not prefixed again)
    if not url.lower().startswith(('http://', 'https://')):
        url = 'http://' + url
    parsed = urlparse(url)
    scheme, netloc, path, query = parsed.scheme, parsed.netloc, parsed.path, parsed.query
    ext = tldextract.extract(url)
    subdomain, domain, suffix = ext.subdomain, ext.domain, ext.suffix

    # Lowercased forms used wherever values taken from the (lowercased) HTML are compared
    page_host = netloc.lower()
    page_url = url.lower()

    # URL string features
    features['id'] = url_to_id(url)
    features['NumDots'] = url.count('.')
    features['SubdomainLevel'] = len(subdomain.split('.')) if subdomain else 0
    features['PathLevel'] = path.count('/') - (1 if path.startswith('/') else 0)
    features['UrlLength'] = len(url)
    features['NumDash'] = url.count('-')
    features['NumDashInHostname'] = netloc.count('-')
    features['AtSymbol'] = 1 if '@' in url else 0
    features['TildeSymbol'] = 1 if '~' in url else 0
    features['NumUnderscore'] = url.count('_')
    features['NumPercent'] = url.count('%')
    features['NumQueryComponents'] = len(parse_qs(query))
    features['NumAmpersand'] = url.count('&')
    features['NumHash'] = url.count('#')
    features['NumNumericChars'] = sum(c.isdigit() for c in url)
    features['NoHttps'] = 1 if scheme != 'https' else 0

    # Heuristic for random string in path: a long segment mixing letters and digits
    random_flag = 0
    for part in [p for p in path.split('/') if p]:
        if len(part) > 15 and re.search(r'[A-Za-z]', part) and re.search(r'[0-9]', part):
            random_flag = 1
            break
    features['RandomString'] = random_flag

    # IP address check. Uses the hostname (netloc without userinfo/port) so that
    # e.g. "1.2.3.4:8080" is still recognised as an IP host.
    hostname = parsed.hostname or ''
    features['IpAddress'] = 1 if re.fullmatch(r'\d+\.\d+\.\d+\.\d+', hostname) else 0
    # Domain in subdomain or path
    features['DomainInSubdomains'] = 0
    if subdomain:
        if (suffix and suffix in subdomain.split('.')) or (domain and domain in subdomain.split('.')):
            features['DomainInSubdomains'] = 1
    features['DomainInPaths'] = 1 if ((suffix and suffix in path) or (domain and domain in path)) else 0
    # HTTPS obfuscation
    features['HttpsInHostname'] = 1 if 'https' in netloc else 0
    # Lengths
    features['HostnameLength'] = len(netloc)
    features['PathLength'] = len(path)
    features['QueryLength'] = len(query)
    # Double slash in path
    features['DoubleSlashInPath'] = 1 if '//' in path else 0
    # Sensitive words
    words = ["secure","account","webscr","login","signin","banking","confirm","ebayisapi"]
    features['NumSensitiveWords'] = sum(word in page_url for word in words)

    # Fetch and parse HTML
    try:
        res = requests.get(url, timeout=5)
        res.raise_for_status()
        html = res.text.lower()
        soup = BeautifulSoup(html, 'html.parser')
    except Exception:
        # Deliberate best-effort: fall back to an empty document so the
        # URL-string features are still returned when the page is unreachable.
        soup = BeautifulSoup("", 'html.parser')
        html = ""

    # Placeholder for brand-name embedding
    features['EmbeddedBrandName'] = 0

    # Analyze hyperlinks
    anchors = soup.find_all('a', href=True)
    total_links = len(anchors)
    ext_links = null_self = 0
    for a in anchors:
        href = a['href'].lower()
        href_dom = _parse_absolute(href).netloc
        # External link
        if href_dom and href_dom != page_host:
            ext_links += 1
        # Null/self
        if href == "" or href.startswith('#') or href == page_url or href.startswith('file:') or href.startswith('javascript:'):
            null_self += 1
    features['PctExtHyperlinks'] = (ext_links/total_links) if total_links>0 else 0
    # Always 0 here: a link that names an external host cannot also be a
    # '#'/'javascript:' link, so there is nothing to count. The key is kept
    # because add_rule_based_features() stores its own value under this name,
    # derived from PctNullSelfRedirectHyperlinks.
    features['PctExtNullSelfRedirectHyperlinksRT'] = 0
    features['PctNullSelfRedirectHyperlinks'] = (null_self/total_links) if total_links>0 else 0

    # Analyze resource links (images, scripts, CSS)
    resources = soup.find_all(['img','script','link'])
    total_res = ext_res = 0
    for tag in resources:
        url_attr = tag.get('src') or tag.get('href') or ""
        if url_attr:
            total_res += 1
            res_dom = _parse_absolute(url_attr).netloc
            if res_dom and res_dom != page_host:
                ext_res += 1
    features['PctExtResourceUrls'] = (ext_res/total_res) if total_res>0 else 0
    features['PctExtResourceUrlsRT'] = (ext_res/total_res) if total_res>0 else 0

    # Favicon
    favicon = soup.find('link', rel=lambda x: x and 'icon' in x.lower())
    if favicon and favicon.has_attr('href'):
        fav_dom = _parse_absolute(favicon['href']).netloc
        features['ExtFavicon'] = 1 if (fav_dom and fav_dom != page_host) else 0
    else:
        features['ExtFavicon'] = 0

    # Form-related features
    forms = soup.find_all('form')
    insecure = rel = ext_form = abnormal = abnormal_ext = images_only = 0
    for form in forms:
        action = form.get('action','').strip().lower()
        act_parsed = _parse_absolute(action)
        act_dom = act_parsed.netloc
        if action and act_parsed.scheme == 'http':
            insecure = 1
        if not action or (act_dom=='' and not action.startswith(('javascript:','data:','blob:'))):
            rel = 1
        if act_dom and act_dom != page_host:
            ext_form = 1
        if action == "" or action.startswith('#') or action=='about:blank' or action.startswith('javascript:'):
            abnormal = 1
        if (act_dom and act_dom != page_host) or action=="" or action=='about:blank':
            abnormal_ext = 1
        inputs = form.find_all('input')
        # Inputs of any of these types mean the form is not "images only"
        typed_inputs = [inp for inp in inputs if inp.get('type') in ['text','password','email','number','submit','button']]
        if inputs and not typed_inputs:
            if all(inp.get('type')=='image' for inp in inputs):
                images_only = 1
    features['InsecureForms'] = insecure
    features['RelativeFormAction'] = rel
    features['ExtFormAction'] = ext_form
    features['AbnormalFormAction'] = abnormal
    features['AbnormalExtFormActionR'] = abnormal_ext
    features['SubmitInfoToEmail'] = 1 if soup.find('a', href=re.compile(r'^mailto:')) else 0

    # JavaScript-based features
    features['FakeLinkInStatusBar'] = 1 if re.search(r'onmouseover.+window\.status', html) else 0
    features['RightClickDisabled'] = 1 if re.search(r'on(contextmenu|mousedown|mouseup)', html) else 0
    features['PopUpWindow'] = 1 if re.search(r'window\.open', html) else 0

    # IFrame and Title
    features['IframeOrFrame'] = 1 if soup.find(['frame','iframe']) else 0
    # get_text() is used instead of .string because .string is None for an
    # empty <title> or one with nested children, which made .strip() raise.
    title_tag = soup.title
    title_text = title_tag.get_text(strip=True) if title_tag is not None else ''
    features['MissingTitle'] = 0 if title_text else 1
    features['ImagesOnlyInForm'] = images_only

    # Frequent domain mismatch: links without an explicit host count as the page's own host
    all_domains = []
    for tag in anchors + resources:
        attr = tag.get('href') or tag.get('src') or ""
        dom = _parse_absolute(attr).netloc or page_host
        all_domains.append(dom)
    if all_domains:
        most_common = Counter(all_domains).most_common(1)[0][0]
        features['FrequentDomainNameMismatch'] = 1 if (most_common and most_common != page_host) else 0
    else:
        features['FrequentDomainNameMismatch'] = 0

    # External meta/script/link tags fraction (denominator counts every script/link tag)
    ext_ms = total_ms = 0
    for tag in soup.find_all(['script','link']):
        total_ms += 1
        url_attr = tag.get('src') if tag.name=='script' else tag.get('href')
        if url_attr:
            dom = _parse_absolute(url_attr).netloc
            if dom and dom != page_host:
                ext_ms += 1
    features['ExtMetaScriptLinkRT'] = (ext_ms/total_ms) if total_ms>0 else 0

    return features


In [3]:
def add_rule_based_features(features):
    """Derive the rule-encoded features from the raw feature dict.

    Each rule maps a raw value onto one of ``1``, ``0`` or ``-1``. For the
    threshold rules the value is ``1`` below the lower bound, ``0`` inside
    the closed interval between the bounds and ``-1`` otherwise.

    Parameters
    ----------
    features : dict
        Must contain ``SubdomainLevel``, ``UrlLength``, ``PctExtResourceUrls``,
        ``AbnormalExtFormActionR``, ``ExtMetaScriptLinkRT`` and
        ``PctNullSelfRedirectHyperlinks``.

    Returns
    -------
    dict
        A new dict holding only the six rule-encoded entries; ``features``
        itself is not modified.
    """
    def tier(value, low, high):
        # 1 below `low`, 0 within [low, high], -1 for everything else.
        if value < low:
            return 1
        return 0 if low <= value <= high else -1

    rule_features = {}

    # Subdomain level rule (too many subdomains → suspicious)
    level = features['SubdomainLevel']
    rule_features['SubdomainLevelRT'] = 1 if level <= 1 else (0 if level == 2 else -1)

    # URL length rule
    rule_features['UrlLengthRT'] = tier(features['UrlLength'], 54, 75)

    # External resource URLs rule
    rule_features['PctExtResourceUrlsRT'] = tier(features['PctExtResourceUrls'], 0.22, 0.61)

    # Abnormal external form action: any truthy raw flag becomes -1
    rule_features['AbnormalExtFormActionR'] = 1 if not features['AbnormalExtFormActionR'] else -1

    # ExtMetaScriptLink ratio
    rule_features['ExtMetaScriptLinkRT'] = tier(features['ExtMetaScriptLinkRT'], 0.17, 0.81)

    # Null/self redirect hyperlinks (read from PctNullSelfRedirectHyperlinks)
    rule_features['PctExtNullSelfRedirectHyperlinksRT'] = tier(
        features['PctNullSelfRedirectHyperlinks'], 0.31, 0.67
    )

    return rule_features


In [4]:
# Extract the raw features for one sample page, then derive the rule encodings from them.
raw_features = extract_features("https://en.wikipedia.org/wiki/IIT_Ropar")
rule_based = add_rule_based_features(raw_features)

# Merge both dicts; where a key exists in both, the rule-based value replaces the raw one.
full_features = dict(raw_features)
full_features.update(rule_based)
print(full_features)

{'id': 6936925, 'NumDots': 2, 'SubdomainLevel': 1, 'PathLevel': 1, 'UrlLength': 39, 'NumDash': 0, 'NumDashInHostname': 0, 'AtSymbol': 0, 'TildeSymbol': 0, 'NumUnderscore': 1, 'NumPercent': 0, 'NumQueryComponents': 0, 'NumAmpersand': 0, 'NumHash': 0, 'NumNumericChars': 0, 'NoHttps': 0, 'RandomString': 0, 'IpAddress': 0, 'DomainInSubdomains': 0, 'DomainInPaths': 0, 'HttpsInHostname': 0, 'HostnameLength': 16, 'PathLength': 15, 'QueryLength': 0, 'DoubleSlashInPath': 0, 'NumSensitiveWords': 0, 'EmbeddedBrandName': 0, 'PctExtHyperlinks': 0, 'PctExtNullSelfRedirectHyperlinksRT': 1, 'PctNullSelfRedirectHyperlinks': 0, 'PctExtResourceUrls': 0, 'PctExtResourceUrlsRT': 1, 'ExtFavicon': 0, 'InsecureForms': 0, 'RelativeFormAction': 0, 'ExtFormAction': 0, 'AbnormalFormAction': 0, 'AbnormalExtFormActionR': 1, 'SubmitInfoToEmail': 0, 'FakeLinkInStatusBar': 0, 'RightClickDisabled': 0, 'PopUpWindow': 0, 'IframeOrFrame': 0, 'MissingTitle': 1, 'ImagesOnlyInForm': 0, 'FrequentDomainNameMismatch': 0, 'ExtMe

In [5]:
# Show the merged feature dict as this cell's output.
full_features

{'id': 6936925,
 'NumDots': 2,
 'SubdomainLevel': 1,
 'PathLevel': 1,
 'UrlLength': 39,
 'NumDash': 0,
 'NumDashInHostname': 0,
 'AtSymbol': 0,
 'TildeSymbol': 0,
 'NumUnderscore': 1,
 'NumPercent': 0,
 'NumQueryComponents': 0,
 'NumAmpersand': 0,
 'NumHash': 0,
 'NumNumericChars': 0,
 'NoHttps': 0,
 'RandomString': 0,
 'IpAddress': 0,
 'DomainInSubdomains': 0,
 'DomainInPaths': 0,
 'HttpsInHostname': 0,
 'HostnameLength': 16,
 'PathLength': 15,
 'QueryLength': 0,
 'DoubleSlashInPath': 0,
 'NumSensitiveWords': 0,
 'EmbeddedBrandName': 0,
 'PctExtHyperlinks': 0,
 'PctExtNullSelfRedirectHyperlinksRT': 1,
 'PctNullSelfRedirectHyperlinks': 0,
 'PctExtResourceUrls': 0,
 'PctExtResourceUrlsRT': 1,
 'ExtFavicon': 0,
 'InsecureForms': 0,
 'RelativeFormAction': 0,
 'ExtFormAction': 0,
 'AbnormalFormAction': 0,
 'AbnormalExtFormActionR': 1,
 'SubmitInfoToEmail': 0,
 'FakeLinkInStatusBar': 0,
 'RightClickDisabled': 0,
 'PopUpWindow': 0,
 'IframeOrFrame': 0,
 'MissingTitle': 1,
 'ImagesOnlyInForm'

In [7]:
import os

import pandas as pd

# Columns, in this order, that are selected from the merged features for export.
# NOTE(review): presumably this is the column order the downstream consumer expects — confirm.
feature_columns = [
    'id','NumDots','SubdomainLevel','PathLevel','UrlLength','NumDash','NumDashInHostname','AtSymbol',
    'TildeSymbol','NumUnderscore','NumPercent','NumQueryComponents','NumAmpersand','NumHash',
    'NumNumericChars','NoHttps','RandomString','IpAddress','DomainInSubdomains','DomainInPaths',
    'HttpsInHostname','HostnameLength','PathLength','QueryLength','DoubleSlashInPath','NumSensitiveWords',
    'EmbeddedBrandName','PctExtHyperlinks','PctExtResourceUrls','ExtFavicon','InsecureForms','RelativeFormAction',
    'ExtFormAction','AbnormalFormAction','PctNullSelfRedirectHyperlinks','FrequentDomainNameMismatch',
    'FakeLinkInStatusBar','RightClickDisabled','PopUpWindow','SubmitInfoToEmail','IframeOrFrame',
    'MissingTitle','ImagesOnlyInForm','SubdomainLevelRT','UrlLengthRT','PctExtResourceUrlsRT',
    'AbnormalExtFormActionR','ExtMetaScriptLinkRT','PctExtNullSelfRedirectHyperlinksRT'
]

# Single source of truth for the output location, so the confirmation message
# below always names the file that was actually written.
output_path = "data/yeahtest_featuresv1.csv"

df = pd.DataFrame([full_features])
df = df[feature_columns]

# to_csv does not create missing parent directories, so make sure "data/" exists.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)

print("✅ Features saved to " + output_path)

✅ Features saved to test_featuresv1.csv


In [10]:
import json

# Turn the one-row frame into a list of records and wrap it under a "data" key,
# then pretty-print the resulting JSON document.
json_ready_data = df.to_dict(orient="records")
payload = dict(data=json_ready_data)
print(json.dumps(payload, indent=2))

{
  "data": [
    {
      "id": 6936925,
      "NumDots": 2,
      "SubdomainLevel": 1,
      "PathLevel": 1,
      "UrlLength": 39,
      "NumDash": 0,
      "NumDashInHostname": 0,
      "AtSymbol": 0,
      "TildeSymbol": 0,
      "NumUnderscore": 1,
      "NumPercent": 0,
      "NumQueryComponents": 0,
      "NumAmpersand": 0,
      "NumHash": 0,
      "NumNumericChars": 0,
      "NoHttps": 0,
      "RandomString": 0,
      "IpAddress": 0,
      "DomainInSubdomains": 0,
      "DomainInPaths": 0,
      "HttpsInHostname": 0,
      "HostnameLength": 16,
      "PathLength": 15,
      "QueryLength": 0,
      "DoubleSlashInPath": 0,
      "NumSensitiveWords": 0,
      "EmbeddedBrandName": 0,
      "PctExtHyperlinks": 0,
      "PctExtResourceUrls": 0,
      "ExtFavicon": 0,
      "InsecureForms": 0,
      "RelativeFormAction": 0,
      "ExtFormAction": 0,
      "AbnormalFormAction": 0,
      "PctNullSelfRedirectHyperlinks": 0,
      "FrequentDomainNameMismatch": 0,
      "FakeLinkInStatus