In [1]:
# ============================================================
# Cell 1) Install dependencies
# ============================================================
!pip -q install beautifulsoup4 lxml readability-lxml requests tqdm


[notice] A new release of pip is available: 25.1.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# ============================================================
# Cell 2) Imports + helpers
# ============================================================
import os
import re
import time
import hashlib
from urllib.parse import urljoin, urlparse, urldefrag

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# readability-lxml helps remove menus/boilerplate when possible
from readability import Document

# Default output location: the current working directory. The filesystem root
# ("/") is normally not writable and is not a sensible place for extracted
# text. The configuration cell further down may reassign OUTPUT_DIR.
OUTPUT_DIR = os.getcwd()
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Request headers sent by fetch_html(); identifies this tool to the server.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; TextExtractor/1.0; +https://colab.research.google.com/)"
}

def safe_filename_from_url(url: str, max_len: int = 120) -> str:
    """Derive a deterministic, filesystem-safe ``.txt`` filename from a URL.

    The name is built from the URL's host and path (query string and fragment
    are not part of the readable stem), with every run of characters outside
    ``[a-zA-Z0-9._-]`` replaced by a single underscore. A 10-hex-digit SHA-1
    prefix of the *full* URL is appended so that URLs differing only in
    query/fragment still map to different files.

    Args:
        url: The URL to convert.
        max_len: Maximum length of the readable stem (hash and extension are
            added on top of this).

    Returns:
        A string of the form ``<stem>_<hash10>.txt``; the stem is ``root``
        when the URL has neither host nor path.
    """
    parts = urlparse(url)
    # Host + path, falling back to "root" for an empty result
    stem = f"{parts.netloc}{parts.path}".strip("/") or "root"
    # Sanitize first, then truncate, then trim underscores left at the edges
    stem = re.sub(r"[^a-zA-Z0-9._-]+", "_", stem)[:max_len].strip("_")
    digest = hashlib.sha1(url.encode("utf-8")).hexdigest()
    return stem + "_" + digest[:10] + ".txt"

def fetch_html(url: str, timeout: int = 25) -> str:
    """Download ``url`` and return the response body decoded as text.

    Args:
        url: Absolute HTTP(S) URL to fetch.
        timeout: Value passed to ``requests.get`` as its ``timeout``.

    Returns:
        The decoded response body.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status
            (raised by ``raise_for_status``).
    """
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    response.raise_for_status()

    # Trust a charset the server explicitly declared. Only when none is
    # declared (requests then has no encoding, or falls back to a header-less
    # default) use the encoding detected from the body. Unconditionally
    # overriding with the detected encoding can mis-decode pages whose
    # declared charset was correct.
    content_type = response.headers.get("Content-Type", "")
    if response.encoding is None or "charset" not in content_type.lower():
        response.encoding = response.apparent_encoding
    return response.text

def extract_visible_text(html: str, remove_header_footer: bool = True) -> str:
    """
    Turn an HTML document into plain, readable text.

    Steps: optionally strip site chrome, let readability-lxml isolate the
    main content, drop non-content tags, flatten to text, tidy whitespace.

    Args:
        html: Raw HTML markup.
        remove_header_footer: When True, <header>/<footer> elements and a
            fixed list of nav/breadcrumb/sponsor selectors are removed
            before readability runs.

    Returns:
        The stripped, non-empty text lines joined by newlines.
    """
    chrome_tags = ["header", "footer"]
    # Common nav/site chrome selectors (adjust if needed for your HTML)
    chrome_selectors = (
        "nav",
        ".header__main-Wrapper",
        ".header__nav",
        ".footer__info-Wrapper",
        ".list__sponsor-Wrapper",
        ".list__links-Wrapper",
        ".breadcrumb",
        ".header__breadcrumb-static",
    )
    non_content_tags = ["script", "style", "noscript", "svg", "canvas", "iframe", "form"]

    page = BeautifulSoup(html, "lxml")

    # Strip headers/footers/navigation up front so readability never sees them
    if remove_header_footer:
        for node in page.find_all(chrome_tags):
            node.decompose()
        for css in chrome_selectors:
            for node in page.select(css):
                node.decompose()

    # Best effort: narrow down to the article-like content. If readability
    # (or re-parsing its output) fails for any reason, keep the full page.
    try:
        main_html = Document(str(page)).summary(html_partial=True)
        content = BeautifulSoup(main_html, "lxml")
    except Exception:
        content = page

    # Tags whose text is never meant for the reader
    for node in content.find_all(non_content_tags):
        node.decompose()

    # One text node per line, then trim each line and drop the empty ones
    stripped = (raw.strip() for raw in content.get_text(separator="\n").splitlines())
    return "\n".join(piece for piece in stripped if piece)

def get_links_from_page(html: str, base_url: str):
    """Collect every hyperlink target on a page as an absolute URL.

    Args:
        html: Raw HTML markup of the page.
        base_url: URL the page was loaded from; relative hrefs are resolved
            against it.

    Returns:
        A list of absolute URLs with any ``#fragment`` removed, in document
        order. Duplicates are kept; anchors with a blank href are skipped.
    """
    anchors = BeautifulSoup(html, "lxml").select("a[href]")
    hrefs = (anchor.get("href", "").strip() for anchor in anchors)
    # urldefrag returns (url, fragment); only the fragment-free URL is kept
    return [urldefrag(urljoin(base_url, href))[0] for href in hrefs if href]

def same_domain(url: str, domain: str) -> bool:
    """Return True when the network location of ``url`` equals ``domain``.

    The comparison is case-insensitive and exact: the netloc includes any
    port or userinfo present in the URL, and subdomains do not match.
    """
    host = urlparse(url).netloc
    return host.lower() == domain.lower()



In [None]:
# ============================================================
# Cell 3) Configuration — read from local Exterior.html
# ============================================================
# Input HTML and extracted text both live in the current working directory
# (the folder the notebook is run from).
OUTPUT_DIR = os.getcwd()
os.makedirs(OUTPUT_DIR, exist_ok=True)

HTML_FILE = os.path.join(os.getcwd(), "Exterior.html")
OUTPUT_TXT = os.path.join(OUTPUT_DIR, "Exterior_extracted.txt")

# Echo the resolved paths so a wrong working directory is obvious immediately
print(f"HTML file: {HTML_FILE}")
print(f"Output text: {OUTPUT_TXT}")
print(f"HTML exists: {os.path.isfile(HTML_FILE)}")

HTML file: c:\Users\grego\Downloads\New folder (10)\Exterior.html
Output text: c:\Users\grego\Downloads\New folder (10)\Exterior_extracted.txt
HTML exists: True


In [4]:
# ============================================================
# Cell 4) Load HTML from Exterior.html
# ============================================================
# Read the whole file as UTF-8 text; the handle is closed when the block exits
with open(HTML_FILE, mode="r", encoding="utf-8") as html_fh:
    html_content = html_fh.read()

print("Loaded {:,} characters from {}".format(len(html_content), HTML_FILE))

Loaded 40,546 characters from c:\Users\grego\Downloads\New folder (10)\Exterior.html


In [None]:
# ============================================================
# Cell 5) Extract text (headers/footers removed) -> save .txt
# ============================================================
# Run the extraction on the HTML loaded earlier, with site chrome removed
text = extract_visible_text(html_content, remove_header_footer=True)

with open(OUTPUT_TXT, "w", encoding="utf-8") as f:
    # Label the output with the actual input file name instead of a
    # hard-coded one, so the header stays correct if HTML_FILE is changed.
    f.write(f"Source: {os.path.basename(HTML_FILE)}\n\n")
    f.write(text)

print("Done.")
print("Saved to:", OUTPUT_TXT)
print("Extracted length:", len(text), "characters")

Done.
Saved to: c:\Users\grego\Downloads\New folder (10)\Exterior_extracted.txt
Extracted length: 3362 characters
