# ELROND Dataset

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.request import urlopen
from urllib.request import urljoin
from urllib.request import Request
from urllib.parse import urlparse
import os
from pathlib import Path  

# Seed pages for the crawl; each entry is the entry point of one
# documentation site whose internal links are harvested.
url_list = [
    "https://developer.toradex.com",
    "https://docs.u-boot.org/en/latest/index.html",
    "https://docs.kernel.org",
    "https://docs.yoctoproject.org",
    "https://elinux.org/Main_Page",
    "https://wiki.archlinux.org/title/Table_of_contents",
]

In [None]:
# Shared crawl state, read and mutated by scrape_links().
# Links that have already been recorded, used to skip duplicates.
visited_urls = set()
# Every recorded link in discovery order; this is what gets written to CSV.
internal_links = []
# scrape_links() returns immediately once its depth argument exceeds this.
max_depth = 10  # Set the maximum depth for recursion
# Number of links recorded so far; a checkpoint is saved every 100 links.
count = 0

# Location of the checkpoint CSV, relative to the current working directory.
filepath = Path('../../dataset/checkpoint_links_list.csv')  
# Create the target directory (and any missing parents) up front so the
# CSV writes cannot fail on a missing folder.
filepath.parent.mkdir(parents=True, exist_ok=True)  

def save_links(links, filename=None):
    """Write the collected links to a single-column CSV file.

    Args:
        links: List of URL strings; each one becomes a row under the
            "URL" column.
        filename: Destination path of the CSV. When omitted, the
            module-level ``filepath`` checkpoint location is used.
    """
    # Resolve the default at call time so the function follows the current
    # value of ``filepath`` rather than the one captured at definition time.
    if filename is None:
        filename = filepath

    df = pd.DataFrame(links, columns=["URL"])
    # Write to the requested destination. The argument used to be ignored in
    # favour of the global path, so the printed message could name a file
    # that was never written.
    df.to_csv(filename, index=False)
    print(f"Saved {len(links)} links to {filename}")

# Function to extract and filter internal links
def get_internal_links(base_url, soup):
    """Return the set of same-site page URLs linked from a parsed page.

    Every ``<a href=...>`` element is resolved against ``base_url`` and kept
    only if it is an http(s) URL on the same host as ``base_url``. Fragments
    are removed so that ``page.html#a`` and ``page.html#b`` collapse into a
    single entry instead of being treated as different pages.

    This single pass also covers links inside Sphinx ``toctree`` list items,
    since those are ordinary anchors too.

    Args:
        base_url: URL of the page that ``soup`` was parsed from.
        soup: Parsed document exposing ``find_all('a', href=True)`` whose
            results support ``.get('href')`` (e.g. a BeautifulSoup object).

    Returns:
        A set of absolute URL strings without fragments.
    """
    # Host names are case-insensitive, so compare them lower-cased.
    base_host = urlparse(base_url).netloc.lower()
    internal_links = set()

    for link in soup.find_all('a', href=True):
        href = link.get('href')
        if not href:
            continue

        try:
            # urljoin handles absolute, root-relative, document-relative,
            # protocol-relative ("//host/x") and fragment-only hrefs alike.
            parsed = urlparse(urljoin(base_url, href.strip()))
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal); skip just this
            # link rather than losing the whole page.
            continue

        # Drops mailto:, javascript:, tel:, ftp: and similar targets.
        if parsed.scheme not in ('http', 'https'):
            continue
        # Decide "internal" by host, not by string prefix, so absolute links
        # to sibling pages are kept and protocol-relative links to other
        # hosts are rejected.
        if parsed.netloc.lower() != base_host:
            continue

        internal_links.add(parsed._replace(fragment='').geturl())

    return internal_links

def scrape_links(url, depth, timeout=10):
    """Recursively collect internal links reachable from ``url``.

    Fetches ``url``, extracts its internal links with
    ``get_internal_links`` and, for each link not seen before, records it in
    the module-level ``visited_urls`` / ``internal_links`` and recurses into
    it with ``depth + 1``. A checkpoint is written via ``save_links`` every
    100 newly recorded links.

    Args:
        url: Page to fetch.
        depth: Current recursion depth; nothing is fetched once this
            exceeds the module-level ``max_depth``.
        timeout: Seconds to wait for the server before giving up on a
            request. Without a timeout a single unresponsive host would
            stall the whole crawl indefinitely.

    Errors while fetching or parsing a page are printed and that page is
    skipped; they do not abort the crawl.
    """
    global count
    if depth > max_depth:
        return

    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to retrieve the webpage at {url}. Status code: {response.status_code}")
            return

        soup = BeautifulSoup(response.text, 'html.parser')
        for link in get_internal_links(url, soup):
            if link in visited_urls:
                continue

            # Mark as seen before recursing so cycles between pages cannot
            # cause the same link to be fetched again.
            visited_urls.add(link)
            internal_links.append(link)
            count += 1

            # Periodic checkpoint so a long crawl can be resumed or inspected.
            if count % 100 == 0:
                save_links(internal_links)

            # Recursively scrape the link
            scrape_links(link, depth + 1, timeout)
    except Exception as e:
        # Deliberately broad: the crawl is best-effort and one bad page
        # (network error, timeout, parser failure) must not stop it.
        print(f"An error occurred while scraping {url}: {e}")


In [None]:
# Crawl each seed site in turn, beginning at recursion depth zero.
for base_url in url_list:
    scrape_links(base_url, depth=0)

# Final flush: persist whatever was collected since the last periodic
# checkpoint (skipped when nothing was found at all).
if internal_links:
    save_links(internal_links)