In [74]:
import os
import re
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup


def is_allowed(url: str, allowed_domains: list[str]) -> bool:
    """Return True if *url*'s host is an allowed domain or a subdomain of one.

    Matching is case-insensitive and ignores a single leading ``www.`` label
    on both the URL host and each allow-list entry. The port and any
    ``user:pass@`` prefix in the URL are ignored.

    Args:
        url: Absolute URL to check (must include a scheme so a host can be
            parsed).
        allowed_domains: Bare domain names such as ``"hhs.texas.gov"``.

    Returns:
        True when the host equals an entry or ends with ``"." + entry``;
        False otherwise, including when no host can be parsed from *url*.
    """
    # ``hostname`` (unlike ``netloc``) is lowercased and excludes the port
    # and userinfo, so "hhs.texas.gov:443" still matches "hhs.texas.gov".
    host = urlparse(url).hostname
    if not host:
        return False

    # Strip only a *leading* "www."; replacing it anywhere would turn
    # "exawww.mple.com" into "example.com" and wrongly allow it.
    host = host.removeprefix("www.")

    for d in allowed_domains:
        d = d.lower().removeprefix("www.")
        if not d:
            # An empty entry must not match anything.
            continue
        # The "." guard keeps "evilhhs.texas.gov" from matching "hhs.texas.gov".
        if host == d or host.endswith("." + d):
            return True
    return False


def url_to_filename(url: str) -> str:
    """Derive a filesystem-safe ``.html`` filename from *url*.

    The name is built from the host and path (plus the query string, when
    present), with every run of characters outside ``[a-zA-Z0-9._-]``
    collapsed to a single underscore. A path ending in ``/`` is treated as
    ``.../index``.

    Args:
        url: The URL the saved page was fetched from.

    Returns:
        A filename ending in ``.html`` (no directory component).
    """
    parsed = urlparse(url)
    name = parsed.netloc + parsed.path
    if name.endswith("/"):
        name += "index"
    # Include the query so URLs differing only by query (e.g. ?id=1 vs ?id=2)
    # do not map to the same file and overwrite each other.
    if parsed.query:
        name += "?" + parsed.query
    name = re.sub(r"[^a-zA-Z0-9._-]+", "_", name)
    return name + ".html"


# ---- main ----
# Download every seed URL whose host passes the allow-list check and save a
# prettified copy of its HTML under ``save_dir``.
# ``config`` is not defined in this cell; it is expected to already exist
# with a "sources" mapping holding "seed_urls" and "allowed_domains".
urls = config["sources"]["seed_urls"]
allowed_domains = config["sources"]["allowed_domains"]

# Output directory (relative to the current working directory); created if
# it does not exist yet.
save_dir = "notebook/ingest"
os.makedirs(save_dir, exist_ok=True)

# URLs that passed / failed the allow-list check. A URL counts as "legal"
# even if its download later fails.
legal_urls = []
illegal_urls = []

for url in urls:
    # Never fetch anything outside the allow-list.
    if not is_allowed(url, allowed_domains):
        illegal_urls.append(url)
        continue

    legal_urls.append(url)

    try:
        response = requests.get(url, timeout=20)
        # Turn 4xx/5xx responses into an exception so they are reported as
        # failures below instead of being saved as pages.
        response.raise_for_status()

        # NOTE(review): response.text relies on requests' own charset
        # detection — confirm the target pages decode correctly.
        soup = BeautifulSoup(response.text, "html.parser")

        file_name = url_to_filename(url)
        file_path = os.path.join(save_dir, file_name)

        # Opening with "w" overwrites any previously saved copy.
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(soup.prettify())

        print("Saved:", file_path)

    except Exception as e:
        # Best-effort: report the failure (network, HTTP status, parsing or
        # file write) and move on to the next URL.
        print("Failed:", url, "|", e)

# Summary counts of the allow-list check (not of successful downloads).
print("\nLegal URLs:", len(legal_urls))
print("Illegal URLs:", len(illegal_urls))

Saved: notebook/ingest/www.hhs.texas.gov_services_food_snap-food-benefits.html
Saved: notebook/ingest/www.yourtexasbenefits.com_Learn_Home.html
Saved: notebook/ingest/www.hhs.texas.gov_handbooks_texas-works-handbook.html
Failed: https://www.hhs.texas.gov/services/food/snap-food-benefits/report-changes | 404 Client Error: Not Found for url: https://www.hhs.texas.gov/services/food/snap-food-benefits/report-changes
Failed: https://www.hhs.texas.gov/services/food/snap-food-benefits/how-apply | 404 Client Error: Not Found for url: https://www.hhs.texas.gov/services/food/snap-food-benefits/how-apply

Legal URLs: 5
Illegal URLs: 0


In [75]:
# Debug check: ``response`` and ``url`` are the globals left over from the
# ingest loop in the previous cell, not values defined in this cell.
print(f"HTTP {response.status_code} {url}")

HTTP 404 https://www.hhs.texas.gov/services/food/snap-food-benefits/how-apply
