In [1]:
import json
import re
import time
from pathlib import Path
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup

BASE = "https://stardewvalleywiki.com"

HEADERS = {
    "User-Agent": ("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                   "KHTML, like Gecko) Chrome/124.0 Safari/537.36"),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
    "Referer": BASE + "/",
    "Upgrade-Insecure-Requests": "1",
}

BANNED_SECTION_TITLES = {"references", "history"}  # H2s to skip (case-insensitive)


# -----------------------------
# Helpers
# -----------------------------

def fetch_html(url: str) -> str:
    """Fetch page HTML with a soft retry on transient codes."""
    s = requests.Session()
    s.headers.update(HEADERS)
    r = s.get(url, timeout=30)
    if r.status_code in (403, 429, 500, 502, 503):
        time.sleep(2.0)
        r = s.get(url, timeout=30)
    r.raise_for_status()
    return r.text

def clean_text(el) -> str:
    """Visible text with collapsed whitespace."""
    return " ".join(el.get_text(" ", strip=True).split())

def sanitize_filename(title: str) -> str:
    s = re.sub(r"[^\w\-_. ]+", "_", title).strip()
    return re.sub(r"\s+", "_", s) or "page"


# -----------------------------
# Section roots & text extraction
# -----------------------------

def get_section_root(soup: BeautifulSoup):
    """Prefer the real content container used by MediaWiki."""
    root = soup.select_one("#mw-content-text .mw-parser-output")
    if not root:
        root = soup.select_one("#mw-content-text")
    return root or soup

def extract_text_from_html_fragment(html_fragment: str) -> str:
    """
    Collect paragraph/list/code text from an HTML fragment string.
    """
    tmp = BeautifulSoup(html_fragment, "html.parser")

    # Remove common clutter within this scope
    for sel in [".toc", ".navbox", ".infobox", ".mw-editsection", "script", "style"]:
        for el in tmp.select(sel):
            el.decompose()

    parts = []
    for el in tmp.find_all(["p", "li", "pre", "code"], recursive=True):
        txt = clean_text(el)
        if txt:
            parts.append(txt)

    text = "\n".join(parts)
    text = re.sub(r"\n{3,}", "\n\n", text).strip()
    return text


# -----------------------------
# Split H2, then flatten H3 into H2 text
# -----------------------------

def split_h2_sections(content_root):
    """
    Returns a list of dicts:
      - {'section_title': 'main', 'nodes': [...]}
      - {'section_title': <H2 title>, 'nodes': [...]}
    """
    sections = []
    current = {"section_title": "main", "nodes": []}

    for child in content_root.children:
        name = getattr(child, "name", None)
        if not name:
            continue  # skip strings/whitespace

        if name.lower() == "h2":
            if current["nodes"]:
                sections.append(current)
            headline = child.select_one(".mw-headline")
            title = clean_text(headline if headline else child) or "Untitled"
            current = {"section_title": title, "nodes": []}
        else:
            current["nodes"].append(child)

    if current["nodes"]:
        sections.append(current)

    if not sections:
        sections = [{"section_title": "main", "nodes": []}]

    return sections

def flatten_h3_into_text(nodes) -> str:
    """
    Given the node list that belongs to one H2 section, produce a single text:
      H2 lead text
      <H3 title>
      <H3 text>
      <next H3 title>
      <next H3 text>
      ...
    """
    # Work over a temporary soup so we can iterate block-level elements
    html_fragment = "".join(str(n) for n in nodes)
    tmp = BeautifulSoup(html_fragment, "html.parser")

    # Split into H2 lead and H3 chunks
    lead_nodes = []
    h3_chunks = []  # list of (title, [nodes])
    current_h3_title = None
    current_h3_nodes = []

    for child in tmp.children:
        name = getattr(child, "name", None)
        if not name:
            # text node/whitespace → attach to current context
            if current_h3_title is None:
                lead_nodes.append(child)
            else:
                current_h3_nodes.append(child)
            continue

        if name.lower() == "h3":
            # Finish previous H3 chunk if any
            if current_h3_title is not None:
                h3_chunks.append((current_h3_title, current_h3_nodes))
            # Start new H3
            headline = child.select_one(".mw-headline")
            current_h3_title = clean_text(headline if headline else child) or "Untitled"
            current_h3_nodes = []
        else:
            if current_h3_title is None:
                lead_nodes.append(child)
            else:
                current_h3_nodes.append(child)

    # Push the last H3 if exists
    if current_h3_title is not None:
        h3_chunks.append((current_h3_title, current_h3_nodes))

    # Build final flattened text
    parts = []

    # H2 lead text
    lead_html = "".join(str(n) for n in lead_nodes)
    lead_text = extract_text_from_html_fragment(lead_html)
    if lead_text:
        parts.append(lead_text)

    # Each H3 title + text
    for title, nodes_list in h3_chunks:
        h3_html = "".join(str(n) for n in nodes_list)
        h3_text = extract_text_from_html_fragment(h3_html)
        if not h3_text:
            continue
        # Add title on its own line to preserve some structure
        parts.append(title)
        parts.append(h3_text)

    text = "\n\n".join(p for p in parts if p.strip())
    return text.strip()


# -----------------------------
# Public API
# -----------------------------

def extraction(url: str, out_path: str | None = None) -> str:
    """
    Extract a Stardew Valley Wiki page into JSON with:
      - title
      - link (page URL only)
      - sections: [{ "section_title": "main"|<H2>, "text": "<flattened text>" }, ...]
    - H3 content is flattened into the parent H2 text with H3 titles inline.
    - Skips H2 sections titled 'References' or 'History' (case-insensitive).
    - Does NOT extract tables or per-section links.
    """
    html = fetch_html(url)
    soup = BeautifulSoup(html, "html.parser")

    # Title
    title_el = soup.select_one("#firstHeading") or soup.select_one("title")
    title = clean_text(title_el) if title_el else urlparse(url).path.rsplit("/", 1)[-1].replace("_", " ") or "Untitled"

    # Section root and pre-clean (page-level)
    content_root = get_section_root(soup)
    for sel in [".catlinks", ".mw-editsection", ".navbox", "script", "style"]:
        for el in content_root.select(sel):
            el.decompose()

    # First split by H2
    h2_sections = split_h2_sections(content_root)

    out = {
        "title": title,
        "link": url,
        "sections": []
    }

    for sec in h2_sections:
        h2_title = sec["section_title"] or "main"

        # Skip banned H2s
        if h2_title.lower() in BANNED_SECTION_TITLES:
            continue

        if h2_title == "main":
            # Everything before first H2 (no H3 here)
            html_fragment = "".join(str(n) for n in sec["nodes"])
            text = extract_text_from_html_fragment(html_fragment)
        else:
            # Flatten H3s into this H2 text
            text = flatten_h3_into_text(sec["nodes"])

        if text:
            out["sections"].append({
                "section_title": "main" if h2_title.lower() == "contents" else h2_title,
                "text": text
            })

    # Decide output path
    if out_path is None:
        fname = sanitize_filename(title) + ".json"
        out_dir = Path("data/json_pages")
        out_dir.mkdir(parents=True, exist_ok=True)
        out_path = str(out_dir / fname)

    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    return out_path



In [5]:
# save as: build_single_json.py
# usage:   python build_single_json.py

import json
import os
from pathlib import Path

# from stardew_extract import extraction  # <-- import your function

INPUT_LINKS = "links_clean.txt"
OUTPUT_JSON = "data/stardew_all.json"
PER_PAGE_DIR = Path("data/json_pages")  # where extraction() writes the per-page files

def read_links(path: str) -> list[str]:
    links = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            links.append(line)
    return links

def main():
    # Ensure output directories exist
    PER_PAGE_DIR.mkdir(parents=True, exist_ok=True)
    Path(OUTPUT_JSON).parent.mkdir(parents=True, exist_ok=True)

    links = read_links(INPUT_LINKS)
    print(f"Found {len(links)} links in {INPUT_LINKS}")

    pages = []
    for i, url in enumerate(links[500:], 1):
        try:
            print(f"[{i}/{len(links)}] Extracting: {url}")
            out_path = extraction(url)  # returns the path to the per-page JSON
            # Load the page JSON and append to the combined list
            with open(out_path, "r", encoding="utf-8") as f:
                page_obj = json.load(f)
            pages.append(page_obj)
            time.sleep(0.5)  # be nice to the server
        except Exception as e:
            # Keep going on errors; record a minimal stub for debugging
            print(f"  !! Error on {url}: {e}")
            pages.append({"title": None, "link": url, "sections": [], "error": str(e)})

    # Write the single combined JSON
    combined = {"pages": pages}
    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(combined, f, ensure_ascii=False, indent=2)

    print(f"\nWrote combined JSON with {len(pages)} pages to: {OUTPUT_JSON}")

if __name__ == "__main__":
    main()


Found 2821 links in links_clean.txt
[1/2821] Extracting: https://stardewvalleywiki.com/Common_Mushroom
[2/2821] Extracting: https://stardewvalleywiki.com/Community_Center
[3/2821] Extracting: https://stardewvalleywiki.com/Community_Centre
[4/2821] Extracting: https://stardewvalleywiki.com/Community_Upgrade
[5/2821] Extracting: https://stardewvalleywiki.com/Community_center
[6/2821] Extracting: https://stardewvalleywiki.com/Complete_Breakfast
[7/2821] Extracting: https://stardewvalleywiki.com/ConcernedApe
[8/2821] Extracting: https://stardewvalleywiki.com/ConcernedApe_Hat
[9/2821] Extracting: https://stardewvalleywiki.com/Cone_Hat
[10/2821] Extracting: https://stardewvalleywiki.com/Console_Version_History
[11/2821] Extracting: https://stardewvalleywiki.com/Construction_Bundle
[12/2821] Extracting: https://stardewvalleywiki.com/Controls
[13/2821] Extracting: https://stardewvalleywiki.com/Cookie
[14/2821] Extracting: https://stardewvalleywiki.com/Cookies
[15/2821] Extracting: https://star

In [6]:
import json

# Load both files
with open("data/stardew_all.json", "r", encoding="utf-8") as f1, \
     open("data/stardew_all - Copy.json", "r", encoding="utf-8") as f2:
    data1 = json.load(f2)
    data2 = json.load(f1)

# Combine, avoiding duplicate links
seen = set()
combined_pages = []

for page in data1.get("pages", []):
    link = page.get("link")
    if link and link not in seen:
        combined_pages.append(page)
        seen.add(link)

for page in data2.get("pages", []):
    link = page.get("link")
    if link and link not in seen:
        combined_pages.append(page)
        seen.add(link)

# Save to a new file
with open("data/stardew_all_combined.json", "w", encoding="utf-8") as out:
    json.dump({"pages": combined_pages}, out, ensure_ascii=False, indent=2)

print(f"Combined {len(combined_pages)} unique pages.")

Combined 2821 unique pages.
