# In [None]:
# file: scrape_valorant_patch_notes.py
import asyncio
import json
import re
import time
from pathlib import Path
from typing import List, Dict, Optional

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

from playwright.async_api import async_playwright

# Site root and the locale-specific listing page that tags every patch-notes post.
BASE = "https://playvalorant.com"
ARCHIVE_PATH = "/{locale}/news/tags/patch-notes/"
LOCALE = "en-us"  # swap for another locale such as "en-gb", "es-es", "ja-jp"
ARCHIVE_URL = BASE + ARCHIVE_PATH.format(locale=LOCALE)

# Output files; both paths are relative, so they land in the working directory.
OUT_JSONL, OUT_CSV = (
    Path("valorant_patch_notes.jsonl"),
    Path("valorant_patch_notes.csv"),
)

# Request headers; the User-Agent identifies this scraper to the site.
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; ValorantPatchScraper/1.0; +https://example.com)"}

# Versions containing at least one dot ("8.08", "v1.0", "1.2.3"). Tried first so a
# stray integer earlier in the title (e.g. "Episode 2") cannot shadow the real number.
_DOTTED_VERSION_RE = re.compile(r'\bv?\d+(?:\.\d+){1,2}\b', flags=re.IGNORECASE)
# Looser fallback that also accepts a bare integer ("Patch 5").
_LOOSE_VERSION_RE = re.compile(r'\bv?\d+(?:\.\d+){0,2}\b', flags=re.IGNORECASE)


def extract_patch_number(title: str) -> Optional[str]:
    """Pull a patch/version number out of an article title.

    Looks for things like "Patch Notes 8.08" / "11.04" / "v1.0". A dotted
    version anywhere in the title is preferred; only when none exists does the
    first standalone integer (optionally prefixed with "v") count.

    Args:
        title: The article title to search.

    Returns:
        The matched version text exactly as it appears in the title (including
        a leading "v"/"V" if present), or None when the title has no number.
    """
    m = _DOTTED_VERSION_RE.search(title) or _LOOSE_VERSION_RE.search(title)
    return m.group(0) if m else None

def _extract_published(soup) -> str:
    """Return the publish date string from <time> or the article meta tag, else ""."""
    time_el = soup.find("time")
    if time_el:
        # Prefer the machine-readable attribute, then the visible text.
        date_text = time_el.get("datetime") or time_el.get_text(strip=True)
        if date_text:
            return date_text
    # Reached when there is no <time> element OR it carried no usable value,
    # so an empty <time> no longer blocks the meta fallback.
    meta_date = soup.find("meta", {"property": "article:published_time"})
    if meta_date and meta_date.get("content"):
        return meta_date["content"]
    return ""


def _is_nested_in(el, root, names) -> bool:
    """Return True if *el* has an ancestor below *root* whose tag name is in *names*."""
    for parent in el.parents:
        if parent is root:
            return False
        if parent.name in names:
            return True
    return False


def _collect_text_blocks(soup) -> List[str]:
    """Return the non-empty text of each outermost text element in the article."""
    root = soup.select_one("div[itemprop='articleBody']") or soup.select_one("article")
    if root:
        names = ("p", "li")
    else:
        # Last resort: all paragraphs on the page.
        root = soup
        names = ("p",)

    blocks = []
    for el in root.find_all(list(names)):
        # get_text() on an outer element already includes its descendants, so
        # emitting a nested <p>/<li> as well would duplicate that text.
        if _is_nested_in(el, root, names):
            continue
        text = el.get_text(" ", strip=True)
        if text:
            blocks.append(text)
    return blocks


def parse_article(url: str) -> Dict:
    """Download and parse a single patch notes page.

    Args:
        url: URL of the article to fetch.

    Returns:
        A dict with the keys "title", "url", "published", "patch_number" and
        "content". Any piece that cannot be found on the page is an empty
        string; "content" holds one text block per line.

    Raises:
        requests.RequestException: If the request fails or the server answers
            with an error status (via ``raise_for_status``).
    """
    r = requests.get(url, headers=HEADERS, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    title_el = soup.find("h1")
    title = title_el.get_text(strip=True) if title_el else ""

    return {
        "title": title,
        "url": url,
        "published": _extract_published(soup),
        "patch_number": extract_patch_number(title) or "",
        "content": "\n".join(_collect_text_blocks(soup)).strip(),
    }

async def collect_archive_links() -> List[str]:
    """Use Playwright to scroll the archive and collect all news article links.

    Opens ARCHIVE_URL in headless Chromium, then repeatedly harvests the hrefs
    of visible anchors and scrolls to the bottom until the page height stays
    the same for several consecutive checks.

    Returns:
        A sorted list of unique absolute URLs on BASE whose href contains
        "/news/". Callers are expected to narrow this down further.
    """
    links = set()

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context(user_agent=HEADERS["User-Agent"])
            page = await context.new_page()
            await page.goto(ARCHIVE_URL, wait_until="domcontentloaded")

            last_height = 0
            same_height_hits = 0

            # Scroll until height stops increasing several times (archive fully loaded)
            while True:
                # Read every visible anchor's href in one in-page call. This
                # avoids a round trip per anchor and the stale-element errors
                # that a per-anchor loop would otherwise have to swallow.
                hrefs = await page.locator("a:visible").evaluate_all(
                    "els => els.map(el => el.getAttribute('href'))"
                )
                for href in hrefs:
                    if not href or "/news/" not in href:
                        continue
                    if href.startswith("/"):
                        # Site-relative link: make it absolute.
                        links.add(BASE + href)
                    elif href.startswith(BASE):
                        links.add(href)

                # Auto-scroll, then give lazy-loaded cards a moment to render
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                await page.wait_for_timeout(800)

                # Check if we loaded more
                height = await page.evaluate("document.body.scrollHeight")
                if height == last_height:
                    same_height_hits += 1
                else:
                    same_height_hits = 0
                last_height = height

                # Stop after several no-growth checks
                if same_height_hits >= 4:
                    break
        finally:
            # Always release the browser, even if navigation or scraping failed.
            await browser.close()

    # Every stored link already contains "/news/" and the set is already unique.
    return sorted(links)

def save_jsonl(rows: List[Dict], path: Path):
    """Write *rows* to *path* as JSON Lines, replacing any existing file.

    Each row becomes one JSON document on its own line. The file is UTF-8 and
    non-ASCII characters are written as-is rather than escaped.
    """
    serialized = (json.dumps(record, ensure_ascii=False) + "\n" for record in rows)
    with path.open("w", encoding="utf-8") as handle:
        handle.writelines(serialized)

def main():
    """Scrape every patch-notes article and write the results to JSONL and CSV.

    Steps: collect links from the archive page, fetch and parse each article
    (a failure on one page is reported and skipped, not fatal), then save the
    rows that have both a title and content to OUT_JSONL and OUT_CSV.
    """
    print(f"[1/3] Collecting patch links from: {ARCHIVE_URL}")
    all_links = asyncio.run(collect_archive_links())

    # Keep only patch-notes articles. Tag listing pages such as the archive
    # itself (".../news/tags/patch-notes/") also contain "patch-notes" but are
    # indexes, not articles, so they are excluded.
    unique_patch_links = set()
    for u in all_links:
        lowered = u.lower()
        if "/news/" in u and "patch-notes" in lowered and "/news/tags/" not in lowered:
            unique_patch_links.add(u)
    patch_links = sorted(unique_patch_links)

    print(f"Found {len(all_links)} news links, {len(patch_links)} look like patch notes.")

    results = []

    print("[2/3] Fetching and parsing patch pages…")
    for url in tqdm(patch_links):
        try:
            item = parse_article(url)
        except Exception as e:
            # Best-effort scrape: report the failing page and move on.
            print(f"Error parsing {url}: {e}")
        else:
            # Only keep if it has some content
            if item["title"] and item["content"]:
                results.append(item)
        finally:
            # Be polite, including after a failed request.
            time.sleep(0.6)

    print(f"[3/3] Saving {len(results)} patch notes to files…")
    save_jsonl(results, OUT_JSONL)
    # Explicit columns keep the CSV header present even when nothing was scraped.
    columns = ["title", "url", "published", "patch_number", "content"]
    pd.DataFrame(results, columns=columns).to_csv(OUT_CSV, index=False)

    print(f"Done.\n- JSONL: {OUT_JSONL.resolve()}\n- CSV:   {OUT_CSV.resolve()}")

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
