In [1]:
from dotenv import load_dotenv
from pathlib import Path
import requests
import os
import re

In [None]:
# The project root is taken to be the parent of the current working directory
# (the notebook is expected to be run from a sub-directory of the project).
project_root = Path.cwd().parent

# Prefer the .env file stored inside .venv; when it is absent, use the .env
# located directly at the project root instead.
env_path = project_root / ".venv" / ".env"
env_path = env_path if env_path.exists() else project_root / ".env"

# Load the variables from the chosen file, then read the Guardian API key
# (None when the variable is not defined).
load_dotenv(dotenv_path=str(env_path))
guardian_key = os.getenv("GUARDIAN_APIKEY")

In [None]:
def download_guardian_articles(nb_articles, sujet, timeout=30):
    """Download Guardian articles matching a query and save each one as HTML.

    Sends a single search request to the Guardian content API and writes one
    ``<slug>.html`` file per returned article into ``<cwd parent>/data/raw``
    (the directory is created if needed). The slug is derived from the
    article headline. Articles whose target file already exists are skipped
    and reported with a ``[SKIP]`` message.

    The API key is read from the module-level ``guardian_key`` variable.

    Args:
        nb_articles: Value sent as the ``page-size`` request parameter.
        sujet: Search text sent as the ``q`` request parameter.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled connection cannot block forever.

    Returns:
        None. The function's effect is the files written to disk.

    Raises:
        requests.HTTPError: If the API answers with an error status
            (raised by ``response.raise_for_status()``).
    """
    # Slugs only contain ASCII characters, so this cap keeps "<slug>.html"
    # under the 255-byte file-name limit found on common file systems.
    max_slug_length = 200

    # Get project root dynamically (parent of ntb directory)
    project_root = Path.cwd().parent
    data_dir = project_root / "data" / "raw"
    data_dir.mkdir(parents=True, exist_ok=True)

    url = "https://content.guardianapis.com/search"

    params = {
        "q": sujet,
        "page-size": nb_articles,
        "api-key": guardian_key,
        "show-fields": "headline,body,byline"
    }

    # An explicit timeout is required: requests waits indefinitely otherwise.
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    results = response.json().get("response", {}).get("results", [])

    def slugify(text):
        """Lower-case ``text`` and collapse anything outside [a-z0-9_-] to '-'."""
        text = text.strip().lower()
        text = re.sub(r'[^a-z0-9_\-]+', '-', text)
        text = re.sub(r'-{2,}', '-', text)
        return text.strip('-')

    for article in results:
        fields = article.get('fields', {})
        titre = fields.get('headline', 'Sans titre')
        body = fields.get('body', '')
        date = article.get('webPublicationDate', 'Date inconnue')
        author = fields.get('byline', 'Auteur inconnu')
        journal = article.get('sectionName', 'The Guardian')

        meta_info = (
            f"<div>"
            f"<strong>Date :</strong> {date} &nbsp; | &nbsp; "
            f"<strong>Auteur :</strong> {author} &nbsp; | &nbsp; "
            f"<strong>Journal :</strong> {journal}</div>\n"
        )

        html_content = f"<h1>{titre}</h1>\n{meta_info}{body}"

        slug = slugify(titre)
        if not slug:
            # A headline without any ASCII letter/digit yields an empty slug;
            # every such article would otherwise share the file ".html".
            # Fall back to the article id, then to a fixed placeholder.
            slug = slugify(article.get('id', '')) or 'sans-titre'
        if len(slug) > max_slug_length:
            # Truncate over-long names instead of failing with OSError; shorter
            # slugs are untouched so previously saved files are still matched.
            slug = slug[:max_slug_length].strip('-')

        filename = f"{slug}.html"
        filepath = data_dir / filename

        if filepath.exists():
            print(f"[SKIP] Article déjà présent : {filename}")
            continue

        filepath.write_text(html_content, encoding="utf-8")

In [32]:
# Request a page of 200 articles. The empty `sujet` is sent as an empty "q"
# parameter (presumably meaning "no keyword filter" - TODO confirm against the
# Guardian API documentation). Articles already saved on disk are reported
# with a "[SKIP]" line and left untouched.
download_guardian_articles(nb_articles=200, sujet="")

[SKIP] Article déjà présent : harry-potter-and-the-4-50-chocolate-frog.html
[SKIP] Article déjà présent : what-if-the-interstellar-body-oumuamua-really-was-sent-by-aliens.html
[SKIP] Article déjà présent : how-new-moms-are-supported-or-not-in-france-v-the-us-a-feminist-cartoon.html
[SKIP] Article déjà présent : wet-docks-giant-ducks-and-the-zero-waste-city-the-best-architecture-and-design-of-2018.html
[SKIP] Article déjà présent : why-medical-students-are-practicing-abortions-on-papayas.html
[SKIP] Article déjà présent : atl-tico-get-back-to-what-they-know-and-make-la-liga-a-four-horse-race.html
[SKIP] Article déjà présent : the-house-of-government-by-yuri-slezkine-review-the-russian-revolution-told-through-one-building.html
[SKIP] Article déjà présent : theresa-may-puts-tackling-climate-change-back-on-tory-agenda.html
[SKIP] Article déjà présent : vladimir-putin-makes-triumphant-visit-to-syria-airbase.html
[SKIP] Article déjà présent : gardens-what-to-do-this-week.html
[SKIP] Article 