# In [None]:
import requests
import time
import csv
from urllib.parse import quote
from bs4 import BeautifulSoup
import glob

# STEP 1: Extract titles from WikiExtractor files
# Every <doc ...> element found in the extracted text files contributes its
# "title" attribute. A set is used while collecting so that a title appearing
# in more than one file is only kept once.
unique_titles = set()
for filename in glob.glob("extracted_text/*/*.txt"):
    with open(filename, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file.read(), "html.parser")
    # The file is already closed here; parsing works on the in-memory text.
    for doc in soup.find_all("doc"):
        title = doc.get("title")
        # doc.get() returns None when the attribute is absent. Letting None (or
        # an empty string) through would crash the later title.replace() call
        # and cannot name a real article anyway, so such entries are dropped.
        if title:
            unique_titles.add(title)

# Sorted rather than list(set): set iteration order for strings changes from
# run to run, which made the output order and the "[i]" log indices
# irreproducible.
titles = sorted(unique_titles)

# STEP 2: Set up output file and API date range
# For every title, daily pageview counts between start_date and end_date
# (YYYYMMDD strings) are requested from the Wikimedia pageviews REST API and
# appended to pageviews.csv as (title, date, views) rows. Failures for a
# single title are logged and skipped so the run continues (best effort).
start_date = "20240101"
end_date = "20240601"

PAGEVIEWS_ENDPOINT = (
    "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"
    "en.wikipedia/all-access/all-agents"
)
REQUEST_TIMEOUT = 30  # seconds; without a timeout one stalled socket blocks the whole run
REQUEST_DELAY = 0.5   # seconds between requests (politeness delay)
# NOTE(review): Wikimedia asks API clients to send a descriptive User-Agent.
# TODO: replace this placeholder with a string containing real contact details.
USER_AGENT = "wikipedia-pageviews-collector/1.0 (python-requests)"

with open("pageviews.csv", "w", newline='', encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["title", "date", "views"])

    # One session for the whole run reuses the HTTP connection between titles
    # instead of opening a new one per request.
    with requests.Session() as session:
        session.headers.update({"User-Agent": USER_AGENT})

        for i, title in enumerate(titles):
            if not title:
                # A missing/empty title cannot be turned into an article URL.
                print(f"[{i}] Skipped: {title} - missing title")
                continue

            # safe="" also percent-encodes "/" (quote() leaves it untouched by
            # default), otherwise a title such as "AC/DC" would add an extra
            # path segment to the URL.
            safe_title = quote(title.replace(" ", "_"), safe="")
            url = f"{PAGEVIEWS_ENDPOINT}/{safe_title}/daily/{start_date}/{end_date}"

            # Rows are collected first and written only once the whole payload
            # has been parsed, so a malformed response never leaves a partial
            # set of rows for a title in the CSV.
            rows = []
            try:
                response = session.get(url, timeout=REQUEST_TIMEOUT)
                if response.status_code == 200:
                    rows = [
                        [title, item["timestamp"], item["views"]]
                        for item in response.json()["items"]
                    ]
                else:
                    print(f"[{i}] Skipped: {title} - {response.status_code}")
            except (requests.RequestException, ValueError, KeyError, TypeError) as e:
                # Network failure, undecodable JSON, or a payload without the
                # expected "items"/"timestamp"/"views" structure.
                print(f"[{i}] Error: {title} - {e}")

            writer.writerows(rows)

            time.sleep(REQUEST_DELAY)  # Politeness delay to avoid rate limiting
