In [None]:
# Install/upgrade the scraping and text-extraction stack plus spaCy (-q: quiet, -U: upgrade).
# NOTE(review): aiohttp and nest_asyncio are imported by a later cell but are not installed
# here — presumably they are preinstalled in the Colab runtime; confirm.
!pip -q install -U spacy pandas requests tldextract beautifulsoup4 lxml trafilatura readability-lxml newspaper3k tqdm
# Adds spacy-transformers (spaCy itself is listed a second time here).
!pip -q install -U spacy spacy-transformers
# Download the English transformer pipeline that is later loaded with spacy.load("en_core_web_trf").
!python -m spacy download en_core_web_trf

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m125.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m90.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.4/107.4 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.4/106.4 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [None]:
import spacy

# Ask spaCy to use a GPU when one is available; the returned flag is kept and printed
# so the reader can see which device the run used.
gpu_active = spacy.prefer_gpu()
print("✅ GPU active:", gpu_active)



✅ GPU active: True


In [None]:
import os, sys, glob, logging
# Timestamped INFO-level logging for everything that follows.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

# 👉 Set your uploaded CSV paths here (adjust if your names differ)
# Each CSV is expected to carry a URL column; entries that do not exist on disk are
# run through autofind_missing() later in this cell before the existence check.
INPUT_FILES = [
    "/content/tesla_gdelt_2025_revised.csv",
    "/content/elon_related_2025.csv",
]

# Quick helper: if any file missing, try to auto-find a close match in /content
def autofind_missing(paths, search_dir="/content"):
    """Replace paths that do not exist with a similarly named CSV from ``search_dir``.

    For every missing path, the first whitespace-delimited token of its
    extension-less file name is used as a prefix, and ``search_dir`` is searched
    for ``<prefix>*.csv``. Existing paths, and missing paths with no match, are
    returned unchanged, so the result always has the same length and order as
    ``paths``.

    Args:
        paths: Iterable of file paths to check.
        search_dir: Directory to search for replacements (defaults to the Colab
            upload folder ``/content``).

    Returns:
        list: One path per input entry, with substitutions applied where found.
    """
    fixed = []
    for p in paths:
        if os.path.exists(p):
            fixed.append(p)
            continue
        base = os.path.basename(p)
        tokens = os.path.splitext(base)[0].split()
        if not tokens:
            # Empty or whitespace-only file name: there is no prefix to search for.
            fixed.append(p)
            continue
        # Escape glob metacharacters so names such as "data[1].csv" are matched literally.
        pattern = os.path.join(glob.escape(search_dir), glob.escape(tokens[0]) + "*.csv")
        # glob returns matches in arbitrary order; sort so the pick is reproducible.
        candidates = sorted(glob.glob(pattern))
        if candidates:
            logging.info(f"Auto-selected for {base}: {candidates[0]}")
            fixed.append(candidates[0])
        else:
            fixed.append(p)
    return fixed

# Swap any missing input for a look-alike CSV before validating.
INPUT_FILES = autofind_missing(INPUT_FILES)

# Final outputs and scratch files live in separate folders; create both up front.
OUTPUT_DIR = "/content/entity_output_fast"
TMP_DIR = "/content/_entity_tmp"
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(TMP_DIR, exist_ok=True)

# Collect the inputs that are still absent, then report every input with a tick or cross.
print("Looking for files:")
missing = [f for f in INPUT_FILES if not os.path.exists(f)]
for f in INPUT_FILES:
    print("  ", f, "✗" if f in missing else "✓")

# Stop early: nothing downstream can run without the input CSVs.
if missing:
    raise FileNotFoundError(f"Missing files: {missing}\nUpload them to Colab (left Files panel) or correct INPUT_FILES.")


Looking for files:
   /content/tesla_gdelt_2025_revised.csv ✓
   /content/elon_related_2025.csv ✓


In [None]:
import re
import pandas as pd
import tldextract
from bs4 import BeautifulSoup
import trafilatura
from readability import Document as ReadabilityDocument

# Column names probed, in this order and ignoring case, to locate the URL column of an input CSV.
URL_COLUMNS_CANDIDATES = ["SOURCEURL", "DocumentIdentifier", "sourceurl", "url", "URL"]
# Minimum length (in characters) an extracted text must reach to be accepted.
MIN_TEXT_CHARS = 250  # shorter pages are usually nav/boilerplate

def pick_url_column(df: pd.DataFrame, candidates=None) -> str:
    """Return the name of the column in ``df`` that holds URLs.

    Candidates are tried in order. For each one, an exact column-name match is
    preferred; otherwise a case-insensitive match is accepted. Column labels that
    are not strings (e.g. integer labels) are ignored rather than crashing the
    lookup.

    Args:
        df: Frame whose columns are inspected.
        candidates: Column names to look for, in priority order. Defaults to the
            module-level ``URL_COLUMNS_CANDIDATES``.

    Returns:
        str: The actual column label as it appears in ``df``.

    Raises:
        KeyError: If none of the candidates matches a column.
    """
    if candidates is None:
        candidates = URL_COLUMNS_CANDIDATES
    exact = set(df.columns)
    # Only string labels can take part in the case-insensitive comparison.
    cols_lower = {c.lower(): c for c in df.columns if isinstance(c, str)}
    for cand in candidates:
        if cand in exact:
            return cand
        match = cols_lower.get(cand.lower())
        if match is not None:
            return match
    raise KeyError(f"None of the expected URL columns found: {candidates}")

def load_urls(files) -> pd.DataFrame:
    """Read the URL column of every CSV and return the unique http(s) URLs.

    Args:
        files: Iterable of CSV paths. Each file must contain one of the columns
            recognised by ``pick_url_column``.

    Returns:
        pd.DataFrame: A single-column frame (``SOURCEURL``) of stripped,
        de-duplicated URLs starting with ``http://`` or ``https://``, with a
        fresh 0..n-1 index. An empty ``files`` iterable yields an empty frame
        with that same column.

    Raises:
        KeyError: Propagated from ``pick_url_column`` when a file has no URL column.
    """
    frames = []
    for f in files:
        df = pd.read_csv(f, low_memory=False)
        url_col = pick_url_column(df)
        # Drop missing values BEFORE casting: astype(str) would turn NaN into the
        # literal string "nan", which a later dropna() can no longer detect.
        cleaned = df[url_col].dropna().astype(str).str.strip()
        frames.append(cleaned.to_frame(name="SOURCEURL"))
    if not frames:
        # pd.concat raises on an empty list; return a well-formed empty result instead.
        return pd.DataFrame(columns=["SOURCEURL"])
    all_urls = pd.concat(frames, ignore_index=True)
    is_http = all_urls["SOURCEURL"].str.startswith(("http://", "https://"), na=False)
    all_urls = all_urls[is_http]
    all_urls = all_urls.drop_duplicates(subset=["SOURCEURL"]).reset_index(drop=True)
    return all_urls

def domain_of(url: str) -> str:
    """Return ``domain.suffix`` for ``url``, leaving out whichever part tldextract reports as empty."""
    parsed = tldextract.extract(url)
    non_empty_parts = filter(None, (parsed.domain, parsed.suffix))
    return ".".join(non_empty_parts)

def _page_title(soup):
    """Return the stripped text of the parsed document's <title>, or None if it has none."""
    node = soup.title
    if node and node.string:
        return node.string.strip()
    return None


def extract_main_text(html_text: str):
    """Extract the main article text from an HTML document.

    Three strategies are tried in order and the first one that yields at least
    ``MIN_TEXT_CHARS`` characters wins:

    1. trafilatura (comments and tables excluded),
    2. readability's summary, flattened to text,
    3. all visible text with script/style/noscript removed.

    Each strategy is best-effort: any exception makes it fall through to the next.

    Args:
        html_text: Raw HTML of the page.

    Returns:
        tuple: ``(title, text, char_count)`` on success, where ``title`` may be
        ``None``; ``(None, None, 0)`` when the input is empty/blank or no strategy
        produced enough text.
    """
    # Empty or whitespace-only bodies cannot yield text; bail out before the parsers
    # are invoked so they do not log "Document is empty" errors for every such page.
    if not html_text or not html_text.strip():
        return None, None, 0

    # 1) trafilatura (usually best for articles)
    try:
        txt = trafilatura.extract(html_text, include_comments=False, include_tables=False)
    except Exception:
        txt = None  # best-effort: fall through to the next strategy
    if txt and len(txt) >= MIN_TEXT_CHARS:
        # The title is a nice-to-have: a failure here must not discard good text.
        try:
            title = _page_title(BeautifulSoup(html_text, "lxml"))
        except Exception:
            title = None
        return title, txt, len(txt)

    # 2) readability
    try:
        rd = ReadabilityDocument(html_text)
        cleaned = rd.summary(html_partial=False)
        txt = BeautifulSoup(cleaned, "lxml").get_text(separator="\n").strip()
        if txt and len(txt) >= MIN_TEXT_CHARS:
            return rd.short_title() or None, txt, len(txt)
    except Exception:
        pass  # best-effort: fall through to the visible-text fallback

    # 3) fallback: visible text
    try:
        soup = BeautifulSoup(html_text, "lxml")
        for tag in soup(["script", "style", "noscript"]):
            tag.extract()
        txt = soup.get_text(separator="\n").strip()
        # Collapse runs of three or more newlines into a single blank line.
        txt = re.sub(r"\n{3,}", "\n\n", txt)
        if txt and len(txt) >= MIN_TEXT_CHARS:
            return _page_title(soup), txt, len(txt)
    except Exception:
        pass  # best-effort: report "nothing extracted" below

    return None, None, 0


In [None]:
import math, asyncio, nest_asyncio, aiohttp
from tqdm.auto import tqdm

# ---- Speed tunables ----
CONCURRENCY = 30      # parallel connections (try 20–50)
WAVE_SIZE   = 3000    # URLs per wave to keep memory reasonable
HEADERS     = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) FastEntityMiner/1.0"}  # sent with every request

# temp files (parquet) so we don't keep everything in RAM
RESULTS_PARQUET = f"{TMP_DIR}/_results.parquet"   # extracted text per URL
FAILED_PARQUET  = f"{TMP_DIR}/_failed.parquet"    # URLs that failed, with a reason
META_PARQUET    = f"{TMP_DIR}/_meta.parquet"      # per-URL metadata without the text body

# clean previous temp files if re-running
# (append_parquet adds to existing files, so stale ones would leak into this run)
for p in [RESULTS_PARQUET, FAILED_PARQUET, META_PARQUET]:
    if os.path.exists(p):
        os.remove(p)

def append_parquet(df: pd.DataFrame, path: str):
    """Add the rows of ``df`` to the parquet file at ``path``, creating it if needed.

    A ``None`` or empty frame is ignored. When the file already exists it is read
    back, concatenated with ``df`` (index discarded) and written out again in full.
    """
    nothing_to_write = df is None or df.empty
    if nothing_to_write:
        return

    if not os.path.exists(path):
        # First batch: the frame becomes the file.
        df.to_parquet(path, index=False)
        return

    existing = pd.read_parquet(path)
    combined = pd.concat([existing, df], ignore_index=True)
    combined.to_parquet(path, index=False)

async def fetch_one(session: aiohttp.ClientSession, url: str):
    """Download ``url`` and extract its main text.

    Returns a ``(url, payload, error)`` triple. On success ``payload`` is
    ``(title, text, char_count)`` and ``error`` is ``None``. On failure ``payload``
    is ``None`` and ``error`` is one of ``"fetch_failed"`` (HTTP status >= 400),
    ``"no_extractable_text"`` or ``"fetch_error"`` (any exception while fetching
    or extracting). The coroutine itself never raises an ``Exception``.
    """
    try:
        async with session.get(url, headers=HEADERS) as response:
            if response.status >= 400:
                outcome = (url, None, "fetch_failed")
            else:
                # Undecodable bytes are dropped rather than failing the whole page.
                body = await response.text(errors="ignore")
                page_title, page_text, n_chars = extract_main_text(body)
                if page_text:
                    outcome = (url, (page_title, page_text, n_chars), None)
                else:
                    outcome = (url, None, "no_extractable_text")
    except Exception:
        return url, None, "fetch_error"
    return outcome

async def fetch_all(urls):
    """Fetch every URL in ``urls`` with at most ``CONCURRENCY`` requests in flight.

    ``CONCURRENCY`` is applied as the connector's overall connection limit, and a
    semaphore gates when each request is started. Without the gate, all requests
    of a wave are issued at once and sit queued for a pooled connection while
    aiohttp's request timeout is already running, so late URLs can fail without
    ever being sent.

    Args:
        urls: Iterable of URL strings.

    Returns:
        list: One ``(url, payload, error)`` triple per URL as produced by
        ``fetch_one``, in completion order (not input order).
    """
    # `limit` caps total simultaneous connections; `limit_per_host` alone would
    # leave the overall cap at aiohttp's default instead of CONCURRENCY.
    connector = aiohttp.TCPConnector(limit=CONCURRENCY)
    gate = asyncio.Semaphore(CONCURRENCY)

    async def _bounded_fetch(session, url):
        # Hold a slot for the whole request so the timeout only covers real work.
        async with gate:
            return await fetch_one(session, url)

    async with aiohttp.ClientSession(connector=connector) as session:
        tasks = [_bounded_fetch(session, u) for u in urls]
        results = []
        for coro in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Downloading"):
            results.append(await coro)
        return results

# Load URLs
# One de-duplicated list of http(s) URLs gathered from all input CSVs.
urls_df = load_urls(INPUT_FILES)
urls = urls_df["SOURCEURL"].tolist()
print(f"Total unique URLs: {len(urls)}")

# Apply nest_asyncio to use the notebook loop
# (run_until_complete below is called on this loop once per wave).
nest_asyncio.apply()
loop = asyncio.get_event_loop()

# Fetch in waves to avoid huge memory spikes
num_waves = math.ceil(len(urls) / WAVE_SIZE)
total_ok, total_fail = 0, 0

for w in range(num_waves):
    # Half-open slice [start, end) of the URL list; the last wave may be shorter.
    start, end = w * WAVE_SIZE, min((w+1) * WAVE_SIZE, len(urls))
    wave = urls[start:end]
    print(f"\nWave {w+1}/{num_waves} — {len(wave)} URLs")

    # Blocks until every URL of this wave has produced a (url, data, err) triple.
    wave_results = loop.run_until_complete(fetch_all(wave))

    # Split the wave into successes (text + metadata rows) and failures.
    rows_txt, rows_fail, rows_meta = [], [], []
    for url, data, err in wave_results:
        if err or not data:
            rows_fail.append({"SOURCEURL": url, "reason": err or "unknown"})
            continue
        title, txt, char_count = data
        # NOTE(review): domain_of(url) is evaluated twice per successful row.
        rows_txt.append({"SOURCEURL": url, "domain": domain_of(url), "title": title, "text": txt})
        rows_meta.append({"SOURCEURL": url, "domain": domain_of(url), "title": title, "char_count": char_count})

    total_ok += len(rows_txt)
    total_fail += len(rows_fail)

    # Persist this wave before starting the next one; empty frames are skipped by append_parquet.
    append_parquet(pd.DataFrame(rows_txt),  RESULTS_PARQUET)
    append_parquet(pd.DataFrame(rows_fail), FAILED_PARQUET)
    append_parquet(pd.DataFrame(rows_meta), META_PARQUET)

print(f"\n✅ Downloaded: {total_ok} pages")
print(f"❌ Failed:     {total_fail} pages")

# Save failed + meta as CSVs for convenience
# (each parquet file only exists if at least one row of that kind was written)
if os.path.exists(FAILED_PARQUET):
    pd.read_parquet(FAILED_PARQUET).to_csv(f"{OUTPUT_DIR}/failed_urls.csv", index=False)
if os.path.exists(META_PARQUET):
    pd.read_parquet(META_PARQUET).to_csv(f"{OUTPUT_DIR}/url_metadata.csv", index=False)


Total unique URLs: 69656

Wave 1/24 — 3000 URLs


Downloading:   0%|          | 0/3000 [00:00<?, ?it/s]

ERROR:trafilatura.utils:lxml parsing failed: Document is empty
ERROR:trafilatura.utils:lxml parser bytestring Document is empty
ERROR:trafilatura.core:empty HTML tree: None
ERROR:readability.readability:error getting summary: 
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 227, in summary
    self._html(True)
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 153, in _html
    self.html = self._parse(self.input)
                ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 166, in _parse
    doc, self.encoding = build_doc(input)
                         ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/htmls.py", line 20, in build_doc
    doc = lxml.html.document_fromstring(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/lxml/html/__init__.py", line 742, in do


Wave 2/24 — 3000 URLs


Downloading:   0%|          | 0/3000 [00:00<?, ?it/s]

ERROR:trafilatura.utils:lxml parsing failed: Document is empty
ERROR:trafilatura.utils:lxml parser bytestring Document is empty
ERROR:trafilatura.core:empty HTML tree: None
ERROR:readability.readability:error getting summary: 
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 227, in summary
    self._html(True)
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 153, in _html
    self.html = self._parse(self.input)
                ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 166, in _parse
    doc, self.encoding = build_doc(input)
                         ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/htmls.py", line 20, in build_doc
    doc = lxml.html.document_fromstring(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/lxml/html/__init__.py", line 742, in do


Wave 3/24 — 3000 URLs


Downloading:   0%|          | 0/3000 [00:00<?, ?it/s]

ERROR:trafilatura.utils:lxml parsing failed: Document is empty
ERROR:trafilatura.utils:lxml parser bytestring Document is empty
ERROR:trafilatura.core:empty HTML tree: None
ERROR:readability.readability:error getting summary: 
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 227, in summary
    self._html(True)
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 153, in _html
    self.html = self._parse(self.input)
                ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 166, in _parse
    doc, self.encoding = build_doc(input)
                         ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/htmls.py", line 20, in build_doc
    doc = lxml.html.document_fromstring(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/lxml/html/__init__.py", line 742, in do


Wave 4/24 — 3000 URLs


Downloading:   0%|          | 0/3000 [00:00<?, ?it/s]

ERROR:trafilatura.utils:lxml parsing failed: Document is empty
ERROR:trafilatura.utils:lxml parser bytestring Document is empty
ERROR:trafilatura.core:empty HTML tree: None
ERROR:readability.readability:error getting summary: 
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 227, in summary
    self._html(True)
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 153, in _html
    self.html = self._parse(self.input)
                ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 166, in _parse
    doc, self.encoding = build_doc(input)
                         ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/htmls.py", line 20, in build_doc
    doc = lxml.html.document_fromstring(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/lxml/html/__init__.py", line 742, in do


Wave 5/24 — 3000 URLs


Downloading:   0%|          | 0/3000 [00:00<?, ?it/s]

ERROR:trafilatura.utils:lxml parsing failed: Document is empty
ERROR:trafilatura.utils:lxml parser bytestring Document is empty
ERROR:trafilatura.core:empty HTML tree: None
ERROR:readability.readability:error getting summary: 
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 227, in summary
    self._html(True)
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 153, in _html
    self.html = self._parse(self.input)
                ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 166, in _parse
    doc, self.encoding = build_doc(input)
                         ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/htmls.py", line 20, in build_doc
    doc = lxml.html.document_fromstring(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/lxml/html/__init__.py", line 742, in do


Wave 6/24 — 3000 URLs


Downloading:   0%|          | 0/3000 [00:00<?, ?it/s]

ERROR:trafilatura.utils:lxml parsing failed: Document is empty
ERROR:trafilatura.utils:lxml parser bytestring Document is empty
ERROR:trafilatura.core:empty HTML tree: None
ERROR:readability.readability:error getting summary: 
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 227, in summary
    self._html(True)
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 153, in _html
    self.html = self._parse(self.input)
                ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 166, in _parse
    doc, self.encoding = build_doc(input)
                         ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/htmls.py", line 20, in build_doc
    doc = lxml.html.document_fromstring(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/lxml/html/__init__.py", line 742, in do


Wave 7/24 — 3000 URLs


Downloading:   0%|          | 0/3000 [00:00<?, ?it/s]

ERROR:trafilatura.utils:lxml parsing failed: Document is empty
ERROR:trafilatura.utils:lxml parser bytestring Document is empty
ERROR:trafilatura.core:empty HTML tree: None
ERROR:readability.readability:error getting summary: 
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 227, in summary
    self._html(True)
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 153, in _html
    self.html = self._parse(self.input)
                ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 166, in _parse
    doc, self.encoding = build_doc(input)
                         ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/htmls.py", line 20, in build_doc
    doc = lxml.html.document_fromstring(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/lxml/html/__init__.py", line 742, in do


Wave 8/24 — 3000 URLs


Downloading:   0%|          | 0/3000 [00:00<?, ?it/s]

ERROR:trafilatura.utils:lxml parsing failed: Document is empty
ERROR:trafilatura.utils:lxml parser bytestring Document is empty
ERROR:trafilatura.core:empty HTML tree: None
ERROR:readability.readability:error getting summary: 
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 227, in summary
    self._html(True)
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 153, in _html
    self.html = self._parse(self.input)
                ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 166, in _parse
    doc, self.encoding = build_doc(input)
                         ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/htmls.py", line 20, in build_doc
    doc = lxml.html.document_fromstring(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/lxml/html/__init__.py", line 742, in do


Wave 9/24 — 3000 URLs


Downloading:   0%|          | 0/3000 [00:00<?, ?it/s]

ERROR:trafilatura.utils:lxml parsing failed: Document is empty
ERROR:trafilatura.utils:lxml parser bytestring Document is empty
ERROR:trafilatura.core:empty HTML tree: None
ERROR:readability.readability:error getting summary: 
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 227, in summary
    self._html(True)
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 153, in _html
    self.html = self._parse(self.input)
                ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 166, in _parse
    doc, self.encoding = build_doc(input)
                         ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/htmls.py", line 20, in build_doc
    doc = lxml.html.document_fromstring(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/lxml/html/__init__.py", line 742, in do


Wave 10/24 — 3000 URLs


Downloading:   0%|          | 0/3000 [00:00<?, ?it/s]

ERROR:trafilatura.utils:lxml parsing failed: Document is empty
ERROR:trafilatura.utils:lxml parser bytestring Document is empty
ERROR:trafilatura.core:empty HTML tree: None
ERROR:readability.readability:error getting summary: 
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 227, in summary
    self._html(True)
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 153, in _html
    self.html = self._parse(self.input)
                ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 166, in _parse
    doc, self.encoding = build_doc(input)
                         ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/htmls.py", line 20, in build_doc
    doc = lxml.html.document_fromstring(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/lxml/html/__init__.py", line 742, in do


Wave 11/24 — 3000 URLs


Downloading:   0%|          | 0/3000 [00:00<?, ?it/s]

ERROR:trafilatura.utils:lxml parsing failed: Document is empty
ERROR:trafilatura.utils:lxml parser bytestring Document is empty
ERROR:trafilatura.core:empty HTML tree: None
ERROR:readability.readability:error getting summary: 
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 227, in summary
    self._html(True)
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 153, in _html
    self.html = self._parse(self.input)
                ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 166, in _parse
    doc, self.encoding = build_doc(input)
                         ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/htmls.py", line 20, in build_doc
    doc = lxml.html.document_fromstring(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/lxml/html/__init__.py", line 742, in do


Wave 12/24 — 3000 URLs


Downloading:   0%|          | 0/3000 [00:00<?, ?it/s]

ERROR:trafilatura.utils:lxml parsing failed: Document is empty
ERROR:trafilatura.utils:lxml parser bytestring Document is empty
ERROR:trafilatura.core:empty HTML tree: None
ERROR:readability.readability:error getting summary: 
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 227, in summary
    self._html(True)
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 153, in _html
    self.html = self._parse(self.input)
                ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 166, in _parse
    doc, self.encoding = build_doc(input)
                         ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/htmls.py", line 20, in build_doc
    doc = lxml.html.document_fromstring(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/lxml/html/__init__.py", line 742, in do


Wave 13/24 — 3000 URLs


Downloading:   0%|          | 0/3000 [00:00<?, ?it/s]

ERROR:trafilatura.utils:lxml parsing failed: Document is empty
ERROR:trafilatura.utils:lxml parser bytestring Document is empty
ERROR:trafilatura.core:empty HTML tree: None
ERROR:readability.readability:error getting summary: 
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 227, in summary
    self._html(True)
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 153, in _html
    self.html = self._parse(self.input)
                ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 166, in _parse
    doc, self.encoding = build_doc(input)
                         ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/htmls.py", line 20, in build_doc
    doc = lxml.html.document_fromstring(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/lxml/html/__init__.py", line 742, in do


Wave 14/24 — 3000 URLs


Downloading:   0%|          | 0/3000 [00:00<?, ?it/s]

ERROR:trafilatura.utils:lxml parsing failed: Document is empty
ERROR:trafilatura.utils:lxml parser bytestring Document is empty
ERROR:trafilatura.core:empty HTML tree: None
ERROR:readability.readability:error getting summary: 
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 227, in summary
    self._html(True)
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 153, in _html
    self.html = self._parse(self.input)
                ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 166, in _parse
    doc, self.encoding = build_doc(input)
                         ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/htmls.py", line 20, in build_doc
    doc = lxml.html.document_fromstring(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/lxml/html/__init__.py", line 742, in do


Wave 15/24 — 3000 URLs


Downloading:   0%|          | 0/3000 [00:00<?, ?it/s]

ERROR:trafilatura.utils:lxml parsing failed: Document is empty
ERROR:trafilatura.utils:lxml parser bytestring Document is empty
ERROR:trafilatura.core:empty HTML tree: None
ERROR:readability.readability:error getting summary: 
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 227, in summary
    self._html(True)
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 153, in _html
    self.html = self._parse(self.input)
                ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 166, in _parse
    doc, self.encoding = build_doc(input)
                         ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/htmls.py", line 20, in build_doc
    doc = lxml.html.document_fromstring(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/lxml/html/__init__.py", line 742, in do


Wave 16/24 — 3000 URLs


Downloading:   0%|          | 0/3000 [00:00<?, ?it/s]

ERROR:trafilatura.utils:lxml parsing failed: Document is empty
ERROR:trafilatura.utils:lxml parser bytestring Document is empty
ERROR:trafilatura.core:empty HTML tree: None
ERROR:readability.readability:error getting summary: 
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 227, in summary
    self._html(True)
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 153, in _html
    self.html = self._parse(self.input)
                ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 166, in _parse
    doc, self.encoding = build_doc(input)
                         ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/htmls.py", line 20, in build_doc
    doc = lxml.html.document_fromstring(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/lxml/html/__init__.py", line 742, in do


Wave 17/24 — 3000 URLs


Downloading:   0%|          | 0/3000 [00:00<?, ?it/s]

ERROR:trafilatura.utils:lxml parsing failed: Document is empty
ERROR:trafilatura.utils:lxml parser bytestring Document is empty
ERROR:trafilatura.core:empty HTML tree: None
ERROR:readability.readability:error getting summary: 
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 227, in summary
    self._html(True)
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 153, in _html
    self.html = self._parse(self.input)
                ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 166, in _parse
    doc, self.encoding = build_doc(input)
                         ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/htmls.py", line 20, in build_doc
    doc = lxml.html.document_fromstring(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/lxml/html/__init__.py", line 742, in do


Wave 18/24 — 3000 URLs


Downloading:   0%|          | 0/3000 [00:00<?, ?it/s]

ERROR:trafilatura.utils:lxml parsing failed: Document is empty
ERROR:trafilatura.utils:lxml parser bytestring Document is empty
ERROR:trafilatura.core:empty HTML tree: None
ERROR:readability.readability:error getting summary: 
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 227, in summary
    self._html(True)
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 153, in _html
    self.html = self._parse(self.input)
                ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 166, in _parse
    doc, self.encoding = build_doc(input)
                         ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/htmls.py", line 20, in build_doc
    doc = lxml.html.document_fromstring(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/lxml/html/__init__.py", line 742, in do


Wave 19/24 — 3000 URLs


Downloading:   0%|          | 0/3000 [00:00<?, ?it/s]

ERROR:trafilatura.utils:lxml parsing failed: Document is empty
ERROR:trafilatura.utils:lxml parser bytestring Document is empty
ERROR:trafilatura.core:empty HTML tree: None
ERROR:readability.readability:error getting summary: 
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 227, in summary
    self._html(True)
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 153, in _html
    self.html = self._parse(self.input)
                ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 166, in _parse
    doc, self.encoding = build_doc(input)
                         ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/htmls.py", line 20, in build_doc
    doc = lxml.html.document_fromstring(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/lxml/html/__init__.py", line 742, in do


Wave 20/24 — 3000 URLs


Downloading:   0%|          | 0/3000 [00:00<?, ?it/s]

ERROR:trafilatura.utils:lxml parsing failed: Document is empty
ERROR:trafilatura.utils:lxml parser bytestring Document is empty
ERROR:trafilatura.core:empty HTML tree: None
ERROR:readability.readability:error getting summary: 
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 227, in summary
    self._html(True)
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 153, in _html
    self.html = self._parse(self.input)
                ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 166, in _parse
    doc, self.encoding = build_doc(input)
                         ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/htmls.py", line 20, in build_doc
    doc = lxml.html.document_fromstring(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/lxml/html/__init__.py", line 742, in do


Wave 21/24 — 3000 URLs


Downloading:   0%|          | 0/3000 [00:00<?, ?it/s]

ERROR:trafilatura.utils:lxml parsing failed: Document is empty
ERROR:trafilatura.utils:lxml parser bytestring Document is empty
ERROR:trafilatura.core:empty HTML tree: None
ERROR:readability.readability:error getting summary: 
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 227, in summary
    self._html(True)
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 153, in _html
    self.html = self._parse(self.input)
                ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 166, in _parse
    doc, self.encoding = build_doc(input)
                         ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/htmls.py", line 20, in build_doc
    doc = lxml.html.document_fromstring(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/lxml/html/__init__.py", line 742, in do


Wave 22/24 — 3000 URLs


Downloading:   0%|          | 0/3000 [00:00<?, ?it/s]

ERROR:trafilatura.utils:lxml parsing failed: Document is empty
ERROR:trafilatura.utils:lxml parser bytestring Document is empty
ERROR:trafilatura.core:empty HTML tree: None
ERROR:readability.readability:error getting summary: 
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 227, in summary
    self._html(True)
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 153, in _html
    self.html = self._parse(self.input)
                ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 166, in _parse
    doc, self.encoding = build_doc(input)
                         ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/htmls.py", line 20, in build_doc
    doc = lxml.html.document_fromstring(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/lxml/html/__init__.py", line 742, in do


Wave 23/24 — 3000 URLs


Downloading:   0%|          | 0/3000 [00:00<?, ?it/s]

ERROR:trafilatura.utils:lxml parsing failed: Document is empty
ERROR:trafilatura.utils:lxml parser bytestring Document is empty
ERROR:trafilatura.core:empty HTML tree: None
ERROR:readability.readability:error getting summary: 
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 227, in summary
    self._html(True)
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 153, in _html
    self.html = self._parse(self.input)
                ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/readability.py", line 166, in _parse
    doc, self.encoding = build_doc(input)
                         ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/readability/htmls.py", line 20, in build_doc
    doc = lxml.html.document_fromstring(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/lxml/html/__init__.py", line 742, in do


Wave 24/24 — 656 URLs


Downloading:   0%|          | 0/656 [00:00<?, ?it/s]




✅ Downloaded: 53465 pages
❌ Failed:     16191 pages


In [None]:
import spacy
from collections import Counter

# Tunables for the NER pass
CHUNK_ROWS = 5000  # rows handed to write_entities_chunk per call, to keep memory low
BATCH_SIZE = 8     # smaller batch size for GPU models
N_PROCESS = 1      # keep 1, transformers don't parallelize well

PER_URL_CSV = f"{OUTPUT_DIR}/entities_per_url.csv"

# Results are appended to this file chunk by chunk, so drop whatever a
# previous run left behind before starting again
if os.path.exists(PER_URL_CSV):
    os.remove(PER_URL_CSV)

# Load the fetched page texts; without them there is nothing to annotate
if os.path.exists(RESULTS_PARQUET):
    texts_df = pd.read_parquet(RESULTS_PARQUET)
else:
    raise RuntimeError("No downloaded pages found. Make sure the download step completed.")
print(f"Texts ready for NER: {len(texts_df)}")


# The transformer pipeline is used as shipped; no sentencizer is added on top.
# NOTE(review): only doc.ents is consumed by the NER step — consider whether
# the remaining pipeline components are needed at all.
nlp = spacy.load("en_core_web_trf")

def write_entities_chunk(df_chunk: pd.DataFrame, header: bool):
    """Run NER over one chunk of pages and append per-URL entity counts to PER_URL_CSV.

    For every row, the entities found in ``text`` are counted per
    (entity text, label) pair and one CSV row is emitted per pair, carrying
    the row's ``SOURCEURL``, ``domain`` and ``title``.

    Parameters
    ----------
    df_chunk : pd.DataFrame
        Rows to process. Must provide the columns "text", "SOURCEURL",
        "domain" and "title".
    header : bool
        Request the CSV header row. The header is additionally written
        whenever PER_URL_CSV does not exist yet or is empty, so the file still
        gets its column names when earlier chunks produced no entities and
        therefore never created it.

    Returns
    -------
    None
        Rows are appended to PER_URL_CSV. Nothing is written when the chunk
        yields no entities.
    """
    columns = ["SOURCEURL", "domain", "title", "entity", "label", "count"]

    metas = df_chunk[["SOURCEURL", "domain", "title"]].to_dict(orient="records")
    # Skip rows whose text is missing (None/NaN) or blank: they cannot yield
    # entities, and a non-string value is not valid input for the pipeline.
    usable = [
        (text, meta)
        for text, meta in zip(df_chunk["text"].tolist(), metas)
        if isinstance(text, str) and text.strip()
    ]
    if not usable:
        return

    texts = [text for text, _ in usable]
    metas = [meta for _, meta in usable]

    rows = []
    docs = nlp.pipe(texts, batch_size=BATCH_SIZE, n_process=N_PROCESS)
    for doc, meta in zip(docs, metas):
        # Count identical (stripped text, label) mentions within one document
        counts = Counter(
            (e.text.strip(), e.label_) for e in doc.ents if e.text.strip()
        )
        for (ent, label), c in counts.items():
            rows.append({
                "SOURCEURL": meta["SOURCEURL"],
                "domain": meta["domain"],
                "title": meta["title"],
                "entity": ent,
                "label": label,
                "count": c
            })

    if not rows:
        return

    # The caller only asks for a header on its first chunk; if that chunk wrote
    # nothing, the file is still missing/empty here and needs the header now.
    file_is_empty = (
        not os.path.exists(PER_URL_CSV) or os.path.getsize(PER_URL_CSV) == 0
    )
    out_df = pd.DataFrame(rows, columns=columns)
    out_df.to_csv(PER_URL_CSV, index=False, mode="a", header=header or file_is_empty)

# Walk texts_df in CHUNK_ROWS-sized slices; only the very first slice asks
# write_entities_chunk for the CSV header
total_rows = len(texts_df)
for chunk_no, lo in enumerate(range(0, total_rows, CHUNK_ROWS)):
    hi = min(lo + CHUNK_ROWS, total_rows)
    print(f"NER chunk {lo}:{hi}")
    write_entities_chunk(texts_df.iloc[lo:hi].copy(), header=(chunk_no == 0))

print("✅ NER finished. Wrote:", PER_URL_CSV)


Texts ready for NER: 53465
NER chunk 0:5000
NER chunk 5000:10000
NER chunk 10000:15000
NER chunk 15000:20000
NER chunk 20000:25000
NER chunk 25000:30000
NER chunk 30000:35000
NER chunk 35000:40000
NER chunk 40000:45000
NER chunk 45000:50000
NER chunk 50000:53465
✅ NER finished. Wrote: /content/entity_output_fast/entities_per_url.csv


In [None]:
import csv
from collections import defaultdict

SUMMARY_CSV = f"{OUTPUT_DIR}/entities_summary.csv"

if not os.path.exists(PER_URL_CSV):
    raise FileNotFoundError("entities_per_url.csv not found — run the NER cell first.")

# Aggregate row by row while streaming the per-URL file, so the whole CSV is
# never held in memory at once. Both mappings are keyed by (entity, label).
agg_counts = defaultdict(int)   # total mentions across all URLs
agg_urls   = defaultdict(set)   # distinct SOURCEURLs mentioning the entity

with open(PER_URL_CSV, "r", newline="", encoding="utf-8") as per_url_file:
    for record in csv.DictReader(per_url_file):
        entity_key = (record["entity"], record["label"])
        agg_counts[entity_key] += int(record["count"])
        agg_urls[entity_key].add(record["SOURCEURL"])

# One summary row per (entity, label) pair
summary_columns = ["entity", "label", "total_mentions", "url_count", "sample_urls"]

summary_rows = []
for (entity, label), total_mentions in agg_counts.items():
    urls = agg_urls[(entity, label)]
    summary_rows.append({
        "entity": entity,
        "label": label,
        "total_mentions": total_mentions,
        "url_count": len(urls),
        # Set iteration order for strings can differ between interpreter
        # sessions; sorting first keeps the sample identical across re-runs.
        # The list is serialized to the CSV as its Python repr.
        "sample_urls": sorted(urls)[:3],
    })

# Explicit columns keep sort_values from raising KeyError when no entity
# rows were aggregated at all.
summary_df = pd.DataFrame(summary_rows, columns=summary_columns).sort_values(
    ["total_mentions", "url_count"], ascending=[False, False]
)
summary_df.to_csv(SUMMARY_CSV, index=False)

print("✅ Summary written:", SUMMARY_CSV)

# Zip everything for easy download
!zip -r /content/entity_output_fast.zip /content/entity_output_fast -q
print("📦 ZIP ready → /content/entity_output_fast.zip")


✅ Summary written: /content/entity_output_fast/entities_summary.csv
📦 ZIP ready → /content/entity_output_fast.zip
