In [4]:
from __future__ import annotations

import json
import re
import time
from functools import lru_cache
from pathlib import Path
from typing import List, Tuple

import requests


WIKI_SUMMARY_URL = "https://en.wikipedia.org/api/rest_v1/page/summary/{}"
WIKI_SEARCH_URL = "https://en.wikipedia.org/w/api.php"

# Put something descriptive here (Wikimedia expects this)
WIKI_HEADERS = {
    "User-Agent": "MWAHAHA/1.0 (contact: dardemtum@gmail.com) humor-generation"
}


def normalize_one_line(s: str) -> str:
    s = "" if s is None else str(s)
    s = re.sub(r"\s+", " ", s).strip()
    return s


def safe_word(s: str) -> str:
    """
    Conservative sanitizer for Wikipedia titles:
    - keeps letters, numbers, spaces, hyphens, apostrophes
    - strips everything else
    """
    s = normalize_one_line(s)
    if not s:
        return ""
    s = re.sub(r"[^A-Za-z0-9 \-']", "", s).strip()
    s = re.sub(r"\s+", " ", s).strip()
    return s


def looks_like_disambiguation(extract: str, page_type: str | None) -> bool:
    t = (extract or "").strip().lower()
    if page_type and page_type.strip().lower() == "disambiguation":
        return True
    if "may refer to:" in t:
        return True
    return False


def word_variants_for_wiki(word: str) -> List[str]:
    w = safe_word(word)
    if not w:
        return []

    variants = [w]
    wl = w.lower()

    # all caps noise
    if w.isupper() and len(w) >= 4:
        variants.append(w.title())
        variants.append(w.lower())

    # bullies -> bully
    if wl.endswith("ies") and len(wl) > 4:
        variants.append(w[:-3] + "y")

    # watches -> watch, boxes -> box
    if wl.endswith("es") and len(wl) > 4:
        variants.append(w[:-2])

    # astronauts -> astronaut
    if wl.endswith("s") and len(wl) > 3 and not wl.endswith("ss"):
        variants.append(w[:-1])

    # deduplicate case-insensitive
    seen = set()
    out = []
    for v in variants:
        key = v.lower()
        if key in seen:
            continue
        seen.add(key)
        out.append(v)
    return out


def fetch_summary(title: str) -> Tuple[str, str]:
    t = safe_word(title)
    if not t:
        return ("", "")

    url = WIKI_SUMMARY_URL.format(requests.utils.quote(t))
    try:
        r = requests.get(url, headers=WIKI_HEADERS, timeout=12)
        if r.status_code != 200:
            return ("", "")
        data = r.json()
        extract = normalize_one_line(data.get("extract", "") or "")
        page_type = normalize_one_line(data.get("type", "") or "")
        return (extract, page_type)
    except Exception:
        return ("", "")


@lru_cache(maxsize=4096)
def search_titles(query: str) -> List[str]:
    q = safe_word(query)
    if not q:
        return []

    params = {
        "action": "query",
        "list": "search",
        "srsearch": q,
        "format": "json",
        "utf8": 1,
        "srlimit": 6,
    }
    try:
        r = requests.get(WIKI_SEARCH_URL, params=params, headers=WIKI_HEADERS, timeout=12)
        if r.status_code != 200:
            return []
        data = r.json()
        items = (data.get("query", {}) or {}).get("search", []) or []
        titles = []
        for it in items:
            title = normalize_one_line(it.get("title", "") or "")
            if title:
                titles.append(title)
        return titles
    except Exception:
        return []


@lru_cache(maxsize=8192)
def get_wikipedia_extract_cached(word: str) -> str:
    w = safe_word(word)
    if not w:
        return ""

    # 1) direct try (with variants)
    for cand in word_variants_for_wiki(w):
        extract, page_type = fetch_summary(cand)
        if not extract:
            continue
        if looks_like_disambiguation(extract, page_type):
            continue
        return extract

    # 2) resolve via search (skip obvious disambiguation titles)
    for q in word_variants_for_wiki(w):
        for title in search_titles(q):
            if title.lower().endswith("(disambiguation)"):
                continue
            extract, page_type = fetch_summary(title)
            if not extract:
                continue
            if looks_like_disambiguation(extract, page_type):
                continue
            return extract

    return ""


In [5]:
CACHE_IN = Path("wiki_extract_cache.json")
CACHE_OUT = Path("wiki_extract_cache_updated.json")

extracts = json.loads(CACHE_IN.read_text(encoding="utf-8"))
print("Loaded cache entries:", len(extracts))

def is_bad_extract(x: str) -> bool:
    t = (x or "").strip().lower()
    return (t == "") or ("may refer to:" in t)

bad_words = [w for w, ex in extracts.items() if is_bad_extract(ex)]
print("Bad entries to rerun:", len(bad_words))
print("Sample bad words:", bad_words[:30])


Loaded cache entries: 8892
Bad entries to rerun: 2711
Sample bad words: ['AAA', 'ACKBAR', 'AHAHAHAHAHAHAHAHAHA', 'AMA', 'ANGRY', 'APR', 'ARRRGHH', 'ARRRRRR', 'ATV', 'AYE', 'AYYEEEE', 'Aaaand', 'Abe', 'Abu', 'Accord', 'Act', 'Adbl', 'Addictionary', 'Advi', 'Age', 'Ages', 'Ajit', 'Alan', 'Albanian', 'Albert', 'Alcohol', 'Alentine', 'Alexa', 'Alfred', 'Algaebra']


In [7]:
import time
import json
from pathlib import Path

# Inputs/outputs
CACHE_IN = Path("wiki_extract_cache.json")
RERUN_RESULTS_JSON = Path("wiki_rerun_results.json")
RERUN_LOG_TSV = Path("wiki_rerun_log.tsv")
CACHE_OUT = Path("wiki_extract_cache_updated.json")

SLEEP_SECONDS = 0.20
PRINT_EVERY = 50

extracts = json.loads(CACHE_IN.read_text(encoding="utf-8"))

def is_bad_extract(x: str) -> bool:
    t = (x or "").strip().lower()
    return (t == "") or ("may refer to:" in t)

bad_words = [w for w, ex in extracts.items() if is_bad_extract(ex)]
print("Loaded cache entries:", len(extracts))
print("Bad entries to rerun:", len(bad_words))


def word_variants(word: str) -> list[str]:
    """
    Generates a small set of normalized variants to try.
    We will use these variants as:
    - direct summary title queries
    - search queries (resolver)
    """
    w = safe_word(word)
    if not w:
        return []

    vars_ = [w]

    # lower/title variants (helps for ALL CAPS or weird casing)
    vars_.append(w.lower())
    vars_.append(w.title())

    wl = w.lower()

    # plural / inflection heuristics
    if wl.endswith("ies") and len(wl) > 4:
        vars_.append(w[:-3] + "y")

    if wl.endswith("es") and len(wl) > 4:
        vars_.append(w[:-2])

    if wl.endswith("s") and len(wl) > 3 and not wl.endswith("ss"):
        vars_.append(w[:-1])

    # de-duplicate case-insensitive
    seen = set()
    out = []
    for v in vars_:
        v2 = safe_word(v)
        if not v2:
            continue
        key = v2.lower()
        if key in seen:
            continue
        seen.add(key)
        out.append(v2)
    return out


def get_extract_with_trace(word: str) -> dict:
    """
    Returns a trace dict:
      {
        "input_word": ...,
        "normalized_variants": [...],
        "best_extract": "...",
        "best_title": "...",
        "method": "direct|search|none",
        "status": "ok|empty|disambiguation",
      }
    """
    variants = word_variants(word)
    trace = {
        "input_word": word,
        "normalized_variants": variants,
        "best_extract": "",
        "best_title": "",
        "method": "none",
        "status": "empty",
    }
    if not variants:
        return trace

    # 1) Direct attempts with variants
    for cand in variants:
        ex, page_type = fetch_summary(cand)
        ex = normalize_one_line(ex)
        if not ex:
            continue
        if looks_like_disambiguation(ex, page_type):
            trace["status"] = "disambiguation"
            continue

        trace["best_extract"] = ex
        trace["best_title"] = cand
        trace["method"] = "direct"
        trace["status"] = "ok"
        return trace

    # 2) Search resolver attempts
    # Use each variant as a query; pick first non-disambiguation title with a real extract
    for q in variants:
        titles = search_titles(q)
        for title in titles:
            if title.lower().endswith("(disambiguation)"):
                continue
            ex, page_type = fetch_summary(title)
            ex = normalize_one_line(ex)
            if not ex:
                continue
            if looks_like_disambiguation(ex, page_type):
                trace["status"] = "disambiguation"
                continue

            trace["best_extract"] = ex
            trace["best_title"] = title
            trace["method"] = "search"
            trace["status"] = "ok"
            return trace

    # nothing found
    return trace


# Rerun and store separate results
rerun_results = {}
log_rows = []
updated = 0
still_bad = 0

t0 = time.time()
for i, w in enumerate(bad_words, start=1):
    old_ex = normalize_one_line(extracts.get(w, ""))

    trace = get_extract_with_trace(w)
    new_ex = trace["best_extract"]

    # Save rerun result separately regardless of whether we update the main cache
    rerun_results[w] = trace

    # Decide if we should update main cache:
    # update only if new extract is:
    # - non-empty
    # - not disambiguation
    # - longer than old extract
    should_update = bool(new_ex) and ("may refer to:" not in new_ex.lower()) and (len(new_ex) > len(old_ex))

    if should_update:
        extracts[w] = new_ex
        updated += 1
        action = "UPDATED"
    else:
        still_bad += 1
        action = "SKIPPED"

    log_rows.append({
        "word": w,
        "action": action,
        "old_len": len(old_ex),
        "new_len": len(new_ex),
        "method": trace["method"],
        "best_title": trace["best_title"],
        "status": trace["status"],
        "old_extract_preview": old_ex[:120],
        "new_extract_preview": new_ex[:120],
    })

    if i % PRINT_EVERY == 0:
        elapsed = time.time() - t0
        print(f"{i}/{len(bad_words)} rerun (elapsed {elapsed:.1f}s). updated={updated}, skipped={still_bad}. example={w!r}")
        if trace["best_title"]:
            print("  best_title:", trace["best_title"])
        if new_ex:
            print("  new:", new_ex[:200])

    time.sleep(SLEEP_SECONDS)

# Save rerun-only results (for later inspection)
RERUN_RESULTS_JSON.write_text(json.dumps(rerun_results, ensure_ascii=False, indent=2), encoding="utf-8")
print("Saved rerun results:", RERUN_RESULTS_JSON.resolve())

# Save human-readable log
import pandas as pd
pd.DataFrame(log_rows).to_csv(RERUN_LOG_TSV, sep="\t", index=False)
print("Saved rerun log:", RERUN_LOG_TSV.resolve())

# Save updated cache as a new file (does not overwrite original)
CACHE_OUT.write_text(json.dumps(extracts, ensure_ascii=False, indent=2), encoding="utf-8")
print("\nDone.")
print("Updated entries:", updated)
print("Skipped (still bad or not improved):", still_bad)
print("Saved updated cache:", CACHE_OUT.resolve())


Loaded cache entries: 8892
Bad entries to rerun: 2711
50/2711 rerun (elapsed 37.5s). updated=40, skipped=10. example='Areptile'
100/2711 rerun (elapsed 144.4s). updated=85, skipped=15. example='Belle'
  best_title: Belle Delphine
  new: Mary-Belle Kirschner, better known as Belle Delphine, is a South African-born British social media personality, pornographic actress, model, and YouTuber. Her social media accounts feature erotic and 
150/2711 rerun (elapsed 244.0s). updated=131, skipped=19. example='CASHEWWWW'
200/2711 rerun (elapsed 325.1s). updated=169, skipped=31. example='Clop'
  best_title: CLOP
  new: CLOP is a 2012 Flash game made by Australian game designer Bennett Foddy. It is considered a spiritual successor to Foddy's previous game, QWOP, released four years prior which had gone on to become a
250/2711 rerun (elapsed 422.7s). updated=213, skipped=37. example='Dalai'
  best_title: 14th Dalai Lama
  new: The 14th Dalai Lama is the incumbent Dalai Lama, the highest spiritual le

In [8]:
import json
import re
from pathlib import Path

# Use your actual filenames
ORIG_CACHE = Path("wiki_extract_cache.json")
UPDATED_CACHE = Path("wiki_extract_cache_updated.json")

CLEAN_UPDATED_OUT = Path("wiki_extract_cache_updated_cleaned.json")
MERGED_OUT = Path("wiki_extract_cache_merged.json")

def normalize_one_line(s: str) -> str:
    s = "" if s is None else str(s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def is_disambiguation(s: str) -> bool:
    t = normalize_one_line(s).lower()
    if not t:
        return False
    return ("may refer to" in t) or ("commonly refers to" in t)

# Load
orig = json.loads(ORIG_CACHE.read_text(encoding="utf-8"))
upd = json.loads(UPDATED_CACHE.read_text(encoding="utf-8"))

print("Loaded original:", len(orig), "entries from", ORIG_CACHE.resolve())
print("Loaded updated :", len(upd),  "entries from", UPDATED_CACHE.resolve())

# 1) Clean updated: blank out disambiguation extracts
cleaned_upd = {}
blanked = 0
for k, v in upd.items():
    v_norm = normalize_one_line(v)
    if is_disambiguation(v_norm):
        cleaned_upd[k] = ""
        blanked += 1
    else:
        cleaned_upd[k] = v_norm

CLEAN_UPDATED_OUT.write_text(json.dumps(cleaned_upd, ensure_ascii=False, indent=2), encoding="utf-8")
print("\nSaved cleaned updated cache:", CLEAN_UPDATED_OUT.resolve())
print("Blanked disambiguation entries:", blanked)

# 2) Merge into original: replace only when updated value is non-empty
merged = dict(orig)
replaced = 0

for k, v_new in cleaned_upd.items():
    if not v_new:
        continue
    if k in merged and normalize_one_line(merged[k]) != v_new:
        merged[k] = v_new
        replaced += 1

MERGED_OUT.write_text(json.dumps(merged, ensure_ascii=False, indent=2), encoding="utf-8")
print("\nSaved merged cache:", MERGED_OUT.resolve())
print("Replaced entries:", replaced)


Loaded original: 8892 entries from /Users/andrey/Documents/_Artemis_tum/Semester5/MWAHAHA_Competition/sft_prep/wiki_extract_cache.json
Loaded updated : 8892 entries from /Users/andrey/Documents/_Artemis_tum/Semester5/MWAHAHA_Competition/sft_prep/wiki_extract_cache_updated.json

Saved cleaned updated cache: /Users/andrey/Documents/_Artemis_tum/Semester5/MWAHAHA_Competition/sft_prep/wiki_extract_cache_updated_cleaned.json
Blanked disambiguation entries: 394

Saved merged cache: /Users/andrey/Documents/_Artemis_tum/Semester5/MWAHAHA_Competition/sft_prep/wiki_extract_cache_merged.json
Replaced entries: 2284
