In [11]:
# ============================================================
# Install dependencies (for Exterior.html extraction only)
# ============================================================
!pip -q install beautifulsoup4 lxml readability-lxml


[notice] A new release of pip is available: 25.1.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:
# ============================================================
# Imports + extraction helper (Exterior.html only)
# ============================================================
import os
from urllib.parse import urljoin, urldefrag

from bs4 import BeautifulSoup


def extract_visible_text(html: str, remove_header_footer: bool = True, base_url: str = None) -> str:
    """
    Extract readable text from HTML (Exterior.html structure).
    Removes modals, header, footer, nav. Targets #main content.
    If base_url is set, preserves links as "link text (full_url)".
    """
    soup = BeautifulSoup(html, "lxml")

    # Remove modals and dialogs first (so they are not mistaken for main content)
    for el in soup.select(".modal, [role='dialog'], .modal-dialog"):
        el.decompose()

    if remove_header_footer:
        for tag in soup.find_all(["header", "footer"]):
            tag.decompose()
        for sel in ["nav", "#searchbar", ".breadcrumb", ".migas"]:
            for el in soup.select(sel):
                el.decompose()

    # Target main content: Exterior.html (policia) uses id="main"
    main = soup.find(id="main")
    if main:
        for skip in main.select("#calendarioFechaHora, .migas"):
            skip.decompose()
        content_root = main
    else:
        content_root = soup

    for tag in content_root(["script", "style", "noscript", "svg", "canvas", "iframe", "form"]):
        tag.decompose()

    if base_url:
        for a in content_root.find_all("a", href=True):
            href = a.get("href", "").strip()
            if not href or href.startswith("javascript:"):
                continue
            absolute, _ = urldefrag(urljoin(base_url, href))
            link_text = a.get_text(separator=" ", strip=True) or "(link)"
            a.clear()
            a.append(link_text + " (" + absolute + ")")

    text = content_root.get_text(separator="\n")
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    return "\n".join(lines)

In [13]:
# ============================================================
# Configuration â€” Exterior.html only
# ============================================================
HTML_FILE = os.path.join(os.getcwd(), "Exterior.html")
OUTPUT_DIR = os.getcwd()
os.makedirs(OUTPUT_DIR, exist_ok=True)
OUTPUT_TXT = os.path.join(OUTPUT_DIR, "Exterior_extracted.txt")

# Base URL for this Exterior.html (Spanish Police NIE page); change if your file is from a different site
SOURCE_URL = "https://sede.policia.gob.es/portalCiudadano/_ca-valencia/tramites_extranjeria_tramite_asignacion_nie.php"

print("HTML file:", HTML_FILE)
print("Source URL:", SOURCE_URL)
print("Output:", OUTPUT_TXT)
print("Exists:", os.path.isfile(HTML_FILE))

HTML file: c:\Users\grego\Downloads\New folder (10)\Exterior.html
Source URL: https://www.exteriores.gob.es/Embajadas/telaviv/en/ServiciosConsulares/Paginas/Consular/NIE.aspx
Output: c:\Users\grego\Downloads\New folder (10)\Exterior_extracted.txt
Exists: True


In [14]:
# ============================================================
# Load Exterior.html
# ============================================================
with open(HTML_FILE, "r", encoding="utf-8") as f:
    html_content = f.read()

print(f"Loaded {len(html_content):,} characters from Exterior.html")

Loaded 64,217 characters from Exterior.html


In [15]:
# ============================================================
# Extract text (header/footer removed, links preserved) -> save .txt
# ============================================================
text = extract_visible_text(html_content, remove_header_footer=True, base_url=SOURCE_URL)

with open(OUTPUT_TXT, "w", encoding="utf-8") as f:
    f.write("Source: " + SOURCE_URL + "\n\n")
    f.write(text)

print("Done. Saved to:", OUTPUT_TXT)
print("Extracted length:", len(text), "characters")

Done. Saved to: c:\Users\grego\Downloads\New folder (10)\Exterior_extracted.txt
Extracted length: 111 characters
