In [None]:
import requests
import time
import csv
from urllib.parse import quote

# Load titles from your earlier step
titles = list(titles)  # already extracted from WikiExtractor

# Optional: limit to first N for testing
# titles = titles[:50]

# MediaWiki Action API endpoint; loop-invariant, so defined once.
base_url = "https://en.wikipedia.org/w/api.php"
# Seconds to wait for a response; without a timeout a stalled connection
# would block the whole crawl indefinitely.
request_timeout = 30
# Polite pause (seconds) after each request to avoid rate-limiting.
request_pause = 0.5
# Wikimedia asks API clients to identify themselves with a descriptive
# User-Agent. NOTE(review): add contact information (URL or e-mail) here.
user_agent = "edit-history-fetcher/1.0 (python-requests)"

# Output CSV: one row per revision of every title. The Session reuses the
# underlying HTTP connection across the (potentially many) requests.
with open("edit_history.csv", "w", newline='', encoding='utf-8') as f, \
        requests.Session() as session:
    session.headers.update({"User-Agent": user_agent})
    writer = csv.writer(f)
    writer.writerow(["title", "timestamp", "user", "comment", "size"])

    for i, title in enumerate(titles):
        print(f"[{i}] Processing: {title}")
        safe_title = title.replace(" ", "_")

        # Initial query; continuation parameters are merged in on later pages.
        params = {
            "action": "query",
            "prop": "revisions",
            "titles": safe_title,
            "rvlimit": "500",
            "rvprop": "timestamp|user|comment|size",
            "format": "json"
        }

        # Page through the revision history until the API stops returning a
        # "continue" object. Any failure abandons the current title only, so
        # one bad page does not abort the whole export.
        while True:
            try:
                response = session.get(
                    base_url, params=params, timeout=request_timeout
                )
            except requests.RequestException as exc:
                print(f"  ✖ Failed: {exc}")
                break

            if response.status_code != 200:
                print(f"  ✖ Failed: {response.status_code}")
                break

            try:
                data = response.json()
            except ValueError as exc:
                # Body was not valid JSON (e.g. an HTML error page).
                print(f"  ✖ Failed: invalid JSON response ({exc})")
                break

            # The API reports request-level problems with HTTP 200 and an
            # "error" member instead of a "query" result.
            if "error" in data:
                print(f"  ✖ Failed: API error: {data['error']}")
                break

            pages = data.get("query", {}).get("pages", {})

            for page_data in pages.values():
                # Missing/invalid pages carry no "revisions" key; skip them.
                for rev in page_data.get("revisions", []):
                    writer.writerow([
                        title,
                        rev.get("timestamp"),
                        rev.get("user", "N/A"),
                        rev.get("comment", "N/A"),
                        rev.get("size", "N/A")
                    ])

            time.sleep(request_pause)  # Polite pause to avoid rate-limiting

            continuation = data.get("continue")
            if not continuation:
                break
            # Forward every continuation key (not just "rvcontinue"), as the
            # MediaWiki continuation protocol specifies.
            params.update(continuation)
