In [None]:
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import whois
import pickle
import time
import pandas as pd

In [None]:
def is_internal_link(link, base_domain):
    """Return True when *link* belongs to the same site as *base_domain*.

    A link is internal when it has no network location at all (relative
    URLs, fragments, ``mailto:`` and similar), or when its hostname equals
    *base_domain* or is a subdomain of it.

    Hostnames are compared case-insensitively and without userinfo, port or
    trailing dot. Matching is done on whole DNS labels, so look-alike hosts
    such as ``notexample.com``, ``example.com.evil.org`` or
    ``example.com@evil.org`` are NOT treated as internal to ``example.com``.

    Args:
        link: URL (absolute or relative) to classify.
        base_domain: Network location of the page being analysed, e.g. the
            ``netloc`` of its URL.

    Returns:
        bool: True for internal links, False for external ones.
    """
    parsed = urlparse(link)
    if parsed.netloc == '':
        # No host component: the link can only resolve against the page itself.
        return True

    # Prefixing "//" makes urlparse treat the value as a netloc, so that
    # .hostname strips userinfo/port and lower-cases the host for us.
    base_host = (urlparse(f'//{base_domain}').hostname or '').rstrip('.')
    if not base_host:
        # Unknown base host: keep the historical "everything is internal" result.
        return True

    link_host = (parsed.hostname or '').rstrip('.')
    return link_host == base_host or link_host.endswith('.' + base_host)

def is_null_link(link):
    """Tell whether an href is a placeholder that navigates nowhere.

    Missing/empty hrefs, fragment-only hrefs (``#...``), ``javascript:``
    pseudo-URLs and a bare ``/`` are all considered null. The comparison
    ignores surrounding whitespace and letter case.
    """
    if not link:
        return True

    normalized = link.strip().lower()
    if normalized == '/':
        return True
    # "#" and the "javascript:void(0)" variants are covered by these prefixes.
    return normalized.startswith(('#', 'javascript:'))

def safe_request(url):
    """Fetch *url* with GET, following redirects, without ever raising.

    Uses a 5 second timeout. Any exception raised while performing the
    request is swallowed and reported as ``None``.

    Returns:
        The ``requests`` response object, or ``None`` when the request failed.
        Callers should compare the result against ``None`` explicitly: a
        ``requests.Response`` evaluates as falsy when its status is >= 400.
    """
    try:
        return requests.get(url, timeout=5, allow_redirects=True)
    except Exception:
        # Best-effort probe: a failed lookup simply yields no response.
        return None

def extract_domain(url):
    """Return the network-location (``netloc``) component of *url*.

    The scheme, path, query and fragment are dropped. A URL without a
    ``//`` authority part yields an empty string.
    """
    components = urlparse(url)
    return components.netloc


def calculate_features(url, soup, with_redirect=False):
    """
    Count link-related features of a parsed page.

    Args:
        url: URL of the page; relative hrefs are resolved against it and its
            network location decides what counts as internal.
        soup: Parsed document exposing ``find_all`` (a BeautifulSoup tree).
        with_redirect: When True, every non-null hyperlink is additionally
            requested over the network to count redirects and HTTP errors.

    Returns:
        dict: Counters for total/internal/external/null hyperlinks,
        internal/external stylesheets, HTTP errors and favicons. The
        ``internal_redirects`` / ``external_redirects`` keys are only present
        when ``with_redirect`` is True, and the error counters are only ever
        incremented in that mode.
    """
    # Key order is kept stable: callers turn this dict into table columns.
    features = {
        "total_links": 0,
        "internal_links": 0,
        "external_links": 0,
        "null_links": 0,
        "internal_css": 0,
        "external_css": 0,
        "internal_errors": 0,
        "external_errors": 0,
        "internal_favicon": 0,
        "external_favicon": 0
    }

    if with_redirect:
        features.update({
            "internal_redirects": 0,
            "external_redirects": 0
        })

    base_domain = extract_domain(url)

    def scope_of(target):
        # Feature-key prefix ("internal"/"external") for a resolved URL.
        return "internal" if is_internal_link(target, base_domain) else "external"

    # 1. Hyperlinks
    links = [a.get("href") for a in soup.find_all("a", href=True)]
    features["total_links"] = len(links)

    for link in links:
        try:
            if is_null_link(link):
                features["null_links"] += 1
                continue

            full_link = urljoin(url, link)
        except Exception:
            # Malformed hrefs are skipped rather than aborting the whole page.
            continue

        link_scope = scope_of(full_link)
        features[f"{link_scope}_links"] += 1

        # Optional redirect/error checks (one network request per link).
        if not with_redirect:
            continue

        response = safe_request(full_link)
        # Compare against None explicitly: a requests.Response is falsy for
        # status codes >= 400, which previously hid every error response.
        if response is None:
            continue

        if response.history:  # at least one redirect was followed
            # Redirects are classified by where the request finally landed.
            features[f"{scope_of(response.url)}_redirects"] += 1
        if response.status_code >= 400:
            # Errors are classified by the link that was requested.
            features[f"{link_scope}_errors"] += 1

    # 2. CSS
    css_links = [
        tag.get("href")
        for tag in soup.find_all("link", rel="stylesheet")
        if tag.get("href")
    ]
    for css in css_links:
        features[f"{scope_of(urljoin(url, css))}_css"] += 1

    # 3. Favicon
    favicons = [
        tag.get("href")
        for tag in soup.find_all("link", rel=lambda v: v and "icon" in v)
        if tag.get("href")
    ]
    for icon in favicons:
        features[f"{scope_of(urljoin(url, icon))}_favicon"] += 1

    return features


def main(url, soup, with_redirect=False, log=False):
    """Compute the feature dict for one page and optionally print it.

    Args:
        url: URL of the page being analysed.
        soup: Parsed document passed through to ``calculate_features``.
        with_redirect: Forwarded to ``calculate_features``.
        log: When True, print one ``key: value`` line per feature.

    Returns:
        dict: The features returned by ``calculate_features``.
    """
    summary = calculate_features(url, soup, with_redirect)
    if not log:
        return summary

    print(f"Feature summary for {url}:")
    for name in summary:
        print(f"{name}: {summary[name]}")
    return summary


In [None]:
def decorate_message(message: str):
    """Print *message* framed above and below by a row of asterisks.

    Each asterisk row is exactly as long as the message.
    """
    border = '*' * len(message)
    print(border, message, border, sep='\n')

In [None]:
import os

dom_folder = './doms/'
output_folder = './hyperlinks_without_redirects/'

# Create the destination up front: otherwise a missing folder is only
# discovered when saving, after the whole file has been processed.
os.makedirs(output_folder, exist_ok=True)

for i in range(8):
    input_filename = f'dom_data{i}.pkl'

    message = f"Extracting hyperlink data from {input_filename}"
    decorate_message(message)

    print(f"Reading {input_filename}")
    start = time.time()
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(os.path.join(dom_folder, input_filename), 'rb') as dom_file:
        doms = pickle.load(dom_file)
    print(f"{input_filename} read, Time taken: {time.time() - start:.2f}s")

    # Shallow copy: the row objects are shared with `doms` and edited in place.
    output = doms[:]

    beginning = start  # start of this file's processing, including the read
    total_doms = len(output)
    start = time.time()

    # Each row is indexed as [id, url, soup, label]; slot 2 (the parsed DOM)
    # is replaced by the feature dict computed from it.
    # NOTE(review): with_redirect=True performs one HTTP request per link,
    # yet the output folder is named "without_redirects" - confirm intent.
    for index, row in enumerate(output):
        row[2] = main(row[1], row[2], True)
        end = time.time()
        # Report every 1000 rows, or when >30s passed since the last report.
        if (index % 1000 == 0 and index) or (end - start) > 30:
            print(f"Current Progress: {index}/{total_doms}, Time Taken: {end - start:.2f}s")
            start = time.time()

    filename = f'hyperlink_data{i}.pkl'
    print(f"Saving {input_filename}'s hyperlink data to file: {filename}")

    with open(os.path.join(output_folder, filename), 'wb') as output_file:
        pickle.dump(output, output_file)

    print(f"Total time taken: {time.time() - beginning:.2f}s")

In [None]:
hyperlink_folder = output_folder = './hyperlinks_without_redirects/'

# Collect one frame per pickle and concatenate once at the end: concatenating
# inside the loop re-copies the accumulated data on every iteration and
# starts from an empty DataFrame, which pandas deprecates as concat input.
frames = []

for i in range(8):
    input_filename = f'hyperlink_data{i}.pkl'

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(hyperlink_folder+input_filename, 'rb') as hyperlink_file:
        hyperlink_data = pickle.load(hyperlink_file)

    if not hyperlink_data:
        # An empty file has no 'index' column to set; nothing to add.
        continue

    # Each row is indexed as [id, url, feature dict, label]; the feature
    # dict is flattened into individual columns.
    current_df = pd.DataFrame(
        [
            {
                'index': row[0],
                'url': row[1],
                **row[2],
                'label': row[3]
            }
            for row in hyperlink_data
        ]
    ).set_index('index')
    frames.append(current_df)

hyperlink_df = pd.concat(frames, ignore_index=False) if frames else pd.DataFrame()

In [None]:
# Write the combined feature table next to the per-file pickles.
# NOTE(review): index=False omits the frame's index from the CSV; if the
# index holds the row ids they will not be exported - confirm this is intended.
csv_path = output_folder+'hyperlink_data.csv'
hyperlink_df.to_csv(csv_path, index=False)