In [2]:
import requests
import re
import pandas as pd
import time

# ------------------------------------------------------------
# Helper: safe GET request with JSON protection
# ------------------------------------------------------------
def safe_get(url, params=None, retries=3, delay=1):
    """Fetch *url* and return its decoded JSON body, retrying on failure.

    Args:
        url: Address to request.
        params: Optional query-string parameters forwarded to ``requests.get``.
        retries: Maximum number of attempts to make.
        delay: Seconds to sleep between one failed attempt and the next.

    Returns:
        The parsed JSON payload, or ``None`` when every attempt failed
        (request error, timeout, or a body that is not valid JSON).
    """
    for attempt in range(retries):
        try:
            response = requests.get(
                url,
                params=params,
                headers={"User-Agent": "Sample-Pizza-Checker/1.0"},
                timeout=5,
            )
            return response.json()
        except (requests.RequestException, ValueError):
            # Only request/transport failures and undecodable bodies are
            # retried. A bare ``except`` would also swallow KeyboardInterrupt,
            # making the cell impossible to interrupt while it is retrying.
            # ValueError covers JSON decode errors that are not raised as a
            # RequestException subclass.
            if attempt < retries - 1:
                # No point in waiting after the final attempt.
                time.sleep(delay)
    return None  # every attempt failed


# ------------------------------------------------------------
# Get a random article (title + pageid)
# ------------------------------------------------------------
def get_random_page():
    """Ask the English Wikipedia API for one random page.

    The query uses ``list=random`` restricted to namespace 0 with a limit
    of one result.

    Returns:
        A ``(title, pageid)`` tuple, or ``(None, None)`` when the request
        failed or the response did not contain a random-page entry.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "list": "random",
        "rnnamespace": 0,
        "rnlimit": 1,
        "format": "json",
    }

    data = safe_get(url, params=params)
    if data is None:
        return None, None

    try:
        page = data["query"]["random"][0]
        return page["title"], page["id"]
    except (KeyError, IndexError, TypeError):
        # Valid JSON that lacks the expected structure (for example an API
        # error payload) is reported like any other failure so the caller's
        # retry path handles it instead of crashing.
        return None, None


# ------------------------------------------------------------
# Get article text
# ------------------------------------------------------------
def get_page_text(pageid):
    """Return the plain-text extract of the page identified by *pageid*.

    Args:
        pageid: Page id as returned by ``get_random_page``.

    Returns:
        The value of the page's ``extract`` field, or ``""`` when the request
        failed, the response was malformed, or the page has no extract.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "pageids": pageid,
        "prop": "extracts",
        "explaintext": True,
        "format": "json",
    }

    data = safe_get(url, params=params)
    if data is None:
        return ""

    try:
        # The response keys pages by their id rendered as a string.
        page = data["query"]["pages"][str(pageid)]
    except (KeyError, TypeError):
        # Valid JSON without the expected structure (e.g. an API error
        # payload): fall back to empty text like any other failure.
        return ""

    return page.get("extract", "")


# ------------------------------------------------------------
# Sample 10 articles safely
# ------------------------------------------------------------
# Whole-word, case-insensitive match for "pizza"; compiled once for all samples.
pizza_pattern = re.compile(r"\bpizza\b", re.IGNORECASE)

results = []

for sample_number in range(1, 11):
    # Keep asking until Wikipedia hands back a usable random page,
    # pausing one second after each failed attempt.
    while True:
        title, pageid = get_random_page()
        if title is not None:
            break
        time.sleep(1)

    text = get_page_text(pageid)
    found = pizza_pattern.search(text) is not None

    results.append(
        {
            "sample_number": sample_number,
            "title": title,
            "contains_pizza": found,
        }
    )

# ------------------------------------------------------------
# Make DataFrame + compute proportion
# ------------------------------------------------------------
# One row per sampled article; the overall share of articles mentioning
# "pizza" is repeated on every row as ``sample_proportion``.
df = pd.DataFrame(results).assign(
    sample_proportion=lambda frame: frame["contains_pizza"].mean()
)

# Bare expression so the notebook renders the table as the cell output.
df


Unnamed: 0,sample_number,title,contains_pizza,sample_proportion
0,1,"The Walk, Dubai",False,0.0
1,2,The Hep Stars (album),False,0.0
2,3,Athletics at the 2006 Commonwealth Games – Men...,False,0.0
3,4,Eddie Meador,False,0.0
4,5,Pioneers of Alaska,False,0.0
5,6,John Gyetuah,False,0.0
6,7,"West Grove, Indiana",False,0.0
7,8,Julian Paget,False,0.0
8,9,Hannya,False,0.0
9,10,Lee McConnell,False,0.0
