In [None]:
# --- Setup / Installs ---
# Install third-party dependencies before importing them; --quiet trims the pip log.
!pip install --upgrade pip --quiet
!pip install requests pandas tqdm python-dateutil dateparser lxml_html_clean newspaper3k --quiet

# Standard library
import os, time, random, math, urllib.parse
from datetime import datetime, timedelta, timezone
# Third-party (installed above)
import requests
import pandas as pd
from tqdm import tqdm
from dateutil import parser as duparser
import dateparser

# Create the logs/ directory up front; exist_ok=True makes re-running this cell safe.
os.makedirs('logs', exist_ok=True)


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/1.8 MB[0m [31m10.8 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.8/1.8 MB[0m [31m25.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[33m  DEPRECATION: Building 'tinysegmenter' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possib

In [None]:
# --- Config ---
# Guardian API key (free): https://open-platform.theguardian.com/access/
# The key is read from the GUARDIAN_API_KEY environment variable so the secret is
# never stored in (or committed with) the notebook. Set it before starting the
# kernel, e.g. `export GUARDIAN_API_KEY=<your key>`. When the variable is unset
# the placeholder below is used, which guardian_search() rejects with a clear
# ValueError instead of sending a bad request.
# NOTE: any key that was previously hardcoded here should be treated as leaked
# and regenerated.
GUARDIAN_API_KEY = os.environ.get("GUARDIAN_API_KEY", "PASTE_YOUR_KEY_HERE")

# Inclusive date window for the Guardian search, as YYYY-MM-DD strings.
START_DATE = "2020-01-01"  # historical range supported by Guardian API
END_DATE   = datetime.now(timezone.utc).strftime("%Y-%m-%d")  # "today" in UTC

# Search terms per faith group. The dict key becomes the `faith_group` label on
# every collected row; multi-word terms are sent to the API as quoted phrases.
KEYWORDS = {
    "muslim":    ["muslim", "islam", "islamic", "mosque", "hijab", "halal", "imam", "ramadan", "eid"],
    "christian": ["christian", "church", "priest", "bible", "archbishop"],
    "jewish":    ["jewish", "jew", "synagogue", "rabbi", "kosher", "yom kippur"]
}

# Optional: GDELT top-up (last 90 days only)
USE_GDELT_TOPUP = True
GDELT_DOC_API = "https://api.gdeltproject.org/api/v2/doc/doc"
# Domain → display name for the UK outlets accepted from GDELT results.
UK_ALLOWLIST = {
    'bbc.co.uk': 'BBC News', 'bbc.com': 'BBC News',
    'theguardian.com': 'The Guardian',
    'independent.co.uk': 'The Independent',
    'telegraph.co.uk': 'The Telegraph',
    'dailymail.co.uk': 'Daily Mail',
    'thesun.co.uk': 'The Sun',
    'news.sky.com': 'Sky News'
}


In [None]:
# --- Guardian helpers ---
GUARDIAN_SEARCH_URL = "https://content.guardianapis.com/search"

def guardian_search(q, from_date, to_date, page=1, page_size=50):
    """Request one page of Guardian search results and return the decoded JSON.

    Args:
        q: Query string passed to the API as the `q` parameter.
        from_date: Lower bound of the date window (sent as `from-date`).
        to_date: Upper bound of the date window (sent as `to-date`).
        page: Page number to request.
        page_size: Number of results requested per page.

    Returns:
        The JSON body of the response, parsed by `requests`.

    Raises:
        ValueError: If GUARDIAN_API_KEY is empty or still the placeholder.
        requests.HTTPError: If the API answers with an error status.
    """
    key_is_unset = (not GUARDIAN_API_KEY) or GUARDIAN_API_KEY == "PASTE_YOUR_KEY_HERE"
    if key_is_unset:
        raise ValueError("Please set GUARDIAN_API_KEY in the Config cell.")

    # Built in stages, but in the same key order as a single literal would give.
    query_params = {"q": q, "from-date": from_date, "to-date": to_date}
    query_params.update({"page": page, "page-size": page_size})
    query_params.update({
        "order-by": "newest",                  # newest articles come first
        "show-fields": "bodyText,headline",    # ask for full text and headline
        "api-key": GUARDIAN_API_KEY,
    })

    response = requests.get(GUARDIAN_SEARCH_URL, params=query_params, timeout=30)
    response.raise_for_status()
    return response.json()

def guardian_collect_for_terms(terms, start_date, end_date, faith_group, max_pages=250):
    """Collect Guardian articles matching any of `terms` over a date range.

    The terms are OR-ed into a single query (terms containing a space are quoted
    so they are searched as phrases) and the result pages are fetched one by one.

    Args:
        terms: Iterable of search terms.
        start_date: Start of the date window, forwarded to guardian_search.
        end_date: End of the date window, forwarded to guardian_search.
        faith_group: Label stored in the `faith_group` field of every row.
        max_pages: Upper limit on the number of result pages fetched.

    Returns:
        list[dict]: One record per result with the keys `source`, `title`,
        `content`, `date`, `url` and `faith_group`.

    Raises:
        Whatever guardian_search raises (missing key, HTTP errors).
    """
    q = " OR ".join(f'"{t}"' if " " in t else t for t in terms)
    first = guardian_search(q, start_date, end_date, page=1)
    resp = first.get("response", {})
    available_pages = resp.get("pages", 1)
    total = resp.get("total", 0)
    pages = min(available_pages, max_pages)
    rows = []

    print(f"Guardian: '{faith_group}' → total matches reported: {total}, pages pulled: {pages}")
    if available_pages > pages:
        # guardian_search orders results newest-first, so capping the page count
        # drops the OLDEST matches. Say so instead of truncating silently.
        print(
            f"  WARNING: only {pages} of {available_pages} pages will be fetched "
            f"(max_pages={max_pages}); the oldest matches for '{faith_group}' are omitted."
        )

    for p in tqdm(range(1, pages + 1)):
        # Page 1 was already downloaded to learn the page count; reuse it.
        data = first if p == 1 else guardian_search(q, start_date, end_date, page=p)
        for it in data.get("response", {}).get("results", []):
            fields = it.get("fields") or {}  # "fields" may be missing or null
            rows.append({
                "source": "The Guardian",
                # Prefer the headline field, fall back to the result's webTitle.
                "title": fields.get("headline") or it.get("webTitle"),
                "content": fields.get("bodyText", ""),
                "date": it.get("webPublicationDate"),
                "url": it.get("webUrl"),
                "faith_group": faith_group
            })
        if p < pages:
            # Pause only when another request follows; no need to wait after the last page.
            time.sleep(random.uniform(0.2, 0.5))
    return rows


In [None]:
# --- Collect Guardian articles 2020 → today for all groups ---
all_guardian_rows = []
for fg, terms in KEYWORDS.items():
    rows = guardian_collect_for_terms(terms, START_DATE, END_DATE, fg, max_pages=250)
    all_guardian_rows.extend(rows)

# Name the columns explicitly: with zero rows a bare pd.DataFrame([]) has no
# columns at all, so the "date" lookup below would raise KeyError before the
# empty-result guard is reached.
guard_df = pd.DataFrame(
    all_guardian_rows,
    columns=["source", "title", "content", "date", "url", "faith_group"],
)
print("Guardian rows (raw):", len(guard_df))

# Clean & dedupe
# Unparseable dates become NaT rather than raising.
guard_df["date"] = pd.to_datetime(guard_df["date"], errors="coerce")
if len(guard_df):
    # drop_duplicates keeps the first occurrence, so an article matched by several
    # groups is kept once, labelled with the group that was collected first.
    guard_df = guard_df.dropna(subset=["url", "title", "content"]).drop_duplicates("url")
    guard_df = guard_df.sort_values("date").reset_index(drop=True)
print("Guardian rows (clean):", len(guard_df))
guard_df.head(3)


Guardian: 'muslim' → total matches reported: 14852, pages pulled: 250


100%|██████████| 250/250 [06:38<00:00,  1.59s/it]


Guardian: 'christian' → total matches reported: 9700, pages pulled: 194


100%|██████████| 194/194 [04:51<00:00,  1.50s/it]


Guardian: 'jewish' → total matches reported: 10124, pages pulled: 203


100%|██████████| 203/203 [05:25<00:00,  1.60s/it]


Guardian rows (raw): 32324
Guardian rows (clean): 29311


Unnamed: 0,source,title,content,date,url,faith_group
0,The Guardian,Anti- and pro-Brexit campaign heads join open ...,The former directors of both the leave and rem...,2020-01-01 00:01:49+00:00,https://www.theguardian.com/politics/2020/jan/...,jewish
1,The Guardian,"Let's resolve to reconnect, says Welby in new ...",The archbishop of Canterbury will urge people ...,2020-01-01 00:01:49+00:00,https://www.theguardian.com/uk-news/2020/jan/0...,christian
2,The Guardian,Messiah review – it's Homeland ... with a divi...,Is he the messiah? Or is he a very naughty boy...,2020-01-01 06:00:56+00:00,https://www.theguardian.com/tv-and-radio/2020/...,christian


In [None]:
# --- Keyword flags + save ---
def detect_hits(text, terms):
    """Return the terms found in `text` as a sorted, ';'-separated string.

    Matching is case-insensitive and by plain substring containment, so a term
    also matches inside a longer word (e.g. "jew" matches "jewellery"). Each
    matching term is reported once, in its original spelling. A falsy `text`
    (None or "") yields an empty string.
    """
    haystack = text.lower() if text else ""
    matched = set()
    for term in terms:
        if haystack.find(term.lower()) != -1:
            matched.add(term)
    ordered = list(matched)
    ordered.sort()
    return ";".join(ordered)

if not len(guard_df):
    print("No Guardian rows to save. Check API key or queries.")
else:
    # For every article, record which of its own group's terms appear in the
    # headline or body text.
    guard_df["keywords_detected"] = [
        detect_hits(f"{title} {content}", KEYWORDS.get(group, []))
        for title, content, group in zip(
            guard_df["title"], guard_df["content"], guard_df["faith_group"]
        )
    ]
    guardian_path = "media_articles_guardian_2020_2025.csv"
    guard_df.to_csv(guardian_path, index=False)  # index=False: no row-number column
    print(f"Saved Guardian dataset → {guardian_path} ({len(guard_df)} rows)")


Saved Guardian dataset → media_articles_guardian_2020_2025.csv (29311 rows)
