In [None]:
# Source page: https://www.citizensinformation.ie/en/social-welfare/irish-social-welfare-system/personal-public-service-number

In [1]:
import os
import time
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Setup ChromeDriver
# The chromedriver binary location can be overridden through the
# CHROMEDRIVER_PATH environment variable so the notebook is not tied to one
# machine's layout; the default is the previously hard-coded location.
CHROMEDRIVER_PATH = os.environ.get("CHROMEDRIVER_PATH", "/usr/bin/chromedriver")
service = Service(CHROMEDRIVER_PATH)
options = webdriver.ChromeOptions()
# options.add_argument('--headless')  # Enable this if you want headless mode
driver = webdriver.Chrome(service=service, options=options)

# Crawl configuration
# Root folder that receives the page text files and downloaded documents.
OUTPUT_DIR = "IRP"

# Normalized URLs that have already been processed during this run.
VISITED = set()

# URL prefixes used by is_valid_link to decide which page links are in scope;
# the crawl is also started from each of them.
BASE_URLS = [
    "https://www.citizensinformation.ie/en/social-welfare/irish-social-welfare-system/personal-public-service-number/",
]

# Create the output root up front (no-op when it is already there).
os.makedirs(OUTPUT_DIR, exist_ok=True)

def sanitize_path(url):
    """Map a URL to a file path (without extension handling) under OUTPUT_DIR.

    Only the path component of the URL is used: the host, query string and
    fragment are ignored, so URLs that differ only in those parts map to the
    same file. Hyphens are replaced by underscores, and a URL with an empty
    path maps to ``index``.

    Args:
        url: The URL whose path should be mirrored on disk.

    Returns:
        ``OUTPUT_DIR`` joined with the sanitized URL path.
    """
    path = urlparse(url).path.strip("/")
    # After strip("/") the path can never end with a slash, so the only case
    # that needs a placeholder file name is the empty (site root) path.
    if not path:
        path = "index"
    path = path.replace("-", "_")
    return os.path.join(OUTPUT_DIR, path)

def normalize_url(url, base=None):
    """Return ``url`` as an absolute URL without fragment or trailing slash.

    Args:
        url: Absolute or relative URL / href value.
        base: URL that relative references are resolved against. When omitted,
            the first entry of ``BASE_URLS`` is used so that relative links
            resolve to the site being crawled.

    Returns:
        The resolved URL with any ``#fragment`` removed and trailing ``/``
        characters stripped. Absolute URLs are unaffected by ``base``.
    """
    if base is None:
        # The base used to be a hard-coded host copied from a different
        # scraper; that only went unnoticed because callers pass absolute
        # URLs. Resolve against the crawled site instead.
        base = BASE_URLS[0] if BASE_URLS else ""
    return urljoin(base, url).split("#")[0].rstrip("/")

def is_valid_link(href):
    """Tell whether ``href`` points inside one of the ``BASE_URLS`` sections.

    Args:
        href: Link target to test; empty or ``None`` is rejected.

    Returns:
        ``True`` when the normalized URL equals a base URL or lies below it
        (next character is ``/`` or ``?``), ``False`` otherwise.
    """
    if not href:
        return False
    abs_url = normalize_url(href)
    for base in BASE_URLS:
        root = base.rstrip("/")
        # A bare prefix test would also accept sibling pages whose path merely
        # starts with the same characters (e.g. ".../pps-number-other"), so
        # require a path or query boundary right after the prefix.
        if abs_url == root or abs_url.startswith((root + "/", root + "?")):
            return True
    return False

def download_document(url, save_path, timeout=30):
    """Download ``url`` and write the response body to ``save_path``.

    The file extension found in the URL path is appended to ``save_path`` when
    it is not already there. Failures are reported with a printed message and
    do not raise, so one bad link does not stop the crawl.

    Args:
        url: Address of the document to fetch.
        save_path: Destination file path; parent directories are created.
        timeout: Seconds to wait for the server before giving up.
    """
    ext = os.path.splitext(urlparse(url).path)[1]
    if not save_path.endswith(ext):
        save_path += ext

    try:
        # Without a timeout a stalled server would hang the whole crawl.
        r = requests.get(url, timeout=timeout)
        # Do not store an HTML error page under a document file name.
        r.raise_for_status()
        target_dir = os.path.dirname(save_path)
        if target_dir:  # os.makedirs("") raises for a bare file name
            os.makedirs(target_dir, exist_ok=True)
        with open(save_path, "wb") as f:
            f.write(r.content)
        print(f"[DOC] {url} -> {save_path}")
    except (requests.RequestException, OSError) as e:
        print(f"Download failed: {url} ({e})")

def extract_markdown_from_body(body):
    """Render the text of selected elements under ``body`` as Markdown-like text.

    Every descendant of ``body`` is visited in order. Headings (h1-h3),
    paragraphs, list items and block quotes contribute their stripped text
    with a matching prefix when that text is non-empty; ``pre`` elements are
    always emitted as a fenced block. All other nodes are ignored.

    Args:
        body: Object exposing ``descendants``; each descendant provides
            ``name`` and ``get_text``.

    Returns:
        The collected pieces joined by blank lines.
    """
    line_prefix = {
        "h1": "# ",
        "h2": "## ",
        "h3": "### ",
        "p": "",
        "li": "- ",
        "blockquote": "> ",
    }
    fence = "```"
    chunks = []

    for node in body.descendants:
        tag_name = node.name
        if tag_name == "pre":
            # Code blocks keep their inner whitespace; only the ends are trimmed.
            chunks.extend([fence, node.get_text().strip(), fence])
        elif tag_name in line_prefix:
            content = node.get_text(strip=True)
            if content:
                chunks.append(line_prefix[tag_name] + content)

    return "\n\n".join(chunks)

def scrape_page(url):
    """Render ``url`` in the browser, save its text, and follow its links.

    The page is skipped when its normalized URL is already in ``VISITED``.
    Otherwise the rendered HTML is parsed, the text extracted from ``<body>``
    is written to ``sanitize_path(url) + ".txt"``, and every ``<a href>`` is
    examined: PDF links are downloaded, links accepted by ``is_valid_link``
    are scraped recursively.

    Args:
        url: Absolute URL of the page to scrape.
    """
    norm_url = normalize_url(url)
    if norm_url in VISITED:
        return
    VISITED.add(norm_url)

    driver.get(url)
    time.sleep(1.5)  # Let JavaScript render
    soup = BeautifulSoup(driver.page_source, "html.parser")
    body = soup.body

    if body:
        # Drop non-visible content before extracting text.
        for tag in body(["script", "style", "noscript"]):
            tag.decompose()
        markdown_text = extract_markdown_from_body(body)

        if markdown_text:
            rel_path = sanitize_path(url) + ".txt"
            os.makedirs(os.path.dirname(rel_path), exist_ok=True)
            with open(rel_path, "w", encoding="utf-8") as f:
                f.write(markdown_text)
            print(f"[PAGE] {url} -> {rel_path}")
        else:
            print(f"No visible text found at {url}")

    # Recurse on links
    document_extensions = (".pdf",)
    for a in soup.find_all("a", href=True):
        href = a["href"]
        abs_url = urljoin(url, href)
        norm_abs_url = normalize_url(abs_url)

        if norm_abs_url in VISITED:
            continue

        # Test the extension on the URL path only, so links carrying a query
        # string or fragment ("form.pdf?v=2") are still recognised.
        if urlparse(abs_url).path.lower().endswith(document_extensions):
            # Record the document so a file linked several times (on this or
            # another page) is fetched once; a failed download is not retried.
            VISITED.add(norm_abs_url)
            save_path = sanitize_path(abs_url)
            download_document(abs_url, save_path)
        elif is_valid_link(abs_url):
            scrape_page(abs_url)

# Start crawling
try:
    for url in BASE_URLS:
        scrape_page(url)
finally:
    # Always shut the browser down, even when a page raises mid-crawl, so no
    # browser/driver process is left running.
    driver.quit()


[PAGE] https://www.citizensinformation.ie/en/social-welfare/irish-social-welfare-system/personal-public-service-number/ -> IRP/en/social_welfare/irish_social_welfare_system/personal_public_service_number.txt
[DOC] https://assets.gov.ie/static/documents/application-form-pps-number-for-a-child-under-18-reg1m.pdf -> IRP/static/documents/application_form_pps_number_for_a_child_under_18_reg1m.pdf
[DOC] https://assets.gov.ie/static/documents/application-form-pps-number-reg1.pdf -> IRP/static/documents/application_form_pps_number_reg1.pdf
[DOC] https://assets.gov.ie/static/documents/pps-number-third-party-consent-form.pdf -> IRP/static/documents/pps_number_third_party_consent_form.pdf
[DOC] https://assets.gov.ie/static/documents/pps-number-questionnaire.pdf -> IRP/static/documents/pps_number_questionnaire.pdf


In [6]:
# List the page text files written for the scraped section.
!ls IRP/en/social_welfare/irish_social_welfare_system

personal_public_service_number.txt


In [10]:
# List the PDF documents downloaded during the crawl.
!ls IRP/static/documents/

application_form_pps_number_for_a_child_under_18_reg1m.pdf
application_form_pps_number_reg1.pdf
pps_number_questionnaire.pdf
pps_number_third_party_consent_form.pdf
