
#  News → GPT‑5 → JSON (date, title, interest)

Notebook demo: given the URL of a news article, we download the page, extract a plain-text snippet, and ask GPT‑5 to return the article's date (ISO `YYYY-MM-DD`), its title, and an estimated interest level for the user (0–10) as a Pydantic-validated object.

**Output:** {"date":"YYYY-MM-DD","title":"...", "interest_level": 0}

> Before running: set the OPENAI_API_KEY environment variable (or paste it in the Config cell).


In [None]:

import os, json, re
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import dateparser
from pydantic import BaseModel, Field, field_validator
from openai import OpenAI

# API key: taken from the OPENAI_API_KEY environment variable; to hard-code it
# instead, replace the empty-string fallback below with the key.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") or ""  
# Stop early with a clear message when no key was supplied.
# NOTE(review): assert statements are skipped when Python runs with -O.
assert OPENAI_API_KEY, "Set OPENAI_API_KEY (env or in this cell)."

# Shared OpenAI client used by score_with_gpt5 below.
client = OpenAI(api_key=OPENAI_API_KEY)

In [None]:
# Utilities for downloading a page and extracting its text.
# HEADERS carries a desktop-browser User-Agent that is sent with every request.
HEADERS = {
    "User-Agent": " ".join((
        "Mozilla/5.0 (X11; Linux x86_64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/122.0.0.0",
        "Safari/537.36",
    )),
}

def fetch_html(url: str, timeout: int = 20) -> str:
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Value passed to ``requests.get`` as its ``timeout`` (seconds).

    Returns:
        The decoded response body.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``). Other ``requests`` exceptions
            from the GET call propagate unchanged.
    """
    r = requests.get(url, headers=HEADERS, timeout=timeout)
    r.raise_for_status()
    # When the Content-Type header names no charset, requests assumes
    # ISO-8859-1 for text responses, which garbles UTF-8 pages. In that case
    # decode with the encoding detected from the body instead.
    if "charset" not in r.headers.get("Content-Type", "").lower():
        r.encoding = r.apparent_encoding
    return r.text

def extract_text_snippet(soup, max_chars: int = 2000) -> str:
    """Return the leading ``max_chars`` characters of the page's text.

    Script, style and page-chrome elements are removed first, so the passed-in
    ``soup`` is modified in place. Runs of whitespace are collapsed to a
    single space before truncating.

    Args:
        soup: Parsed document; it is called with a list of tag names and must
            provide ``get_text`` (a BeautifulSoup object in this notebook).
        max_chars: Maximum length of the returned string.

    Returns:
        The whitespace-normalised text, cut to ``max_chars`` characters.
    """
    boilerplate_tags = ["script", "style", "nav", "footer", "header", "noscript", "aside"]
    for element in soup(boilerplate_tags):
        # drop the element and its contents from the tree
        element.decompose()
    flattened = re.sub(r"\s+", " ", soup.get_text(" ", strip=True))
    return flattened[:max_chars]


In [None]:
# The structured output we want back from GPT-5.
# Documented with comments rather than a class docstring because Pydantic
# copies a model's docstring into the JSON schema it generates.
#   date           -- ISO YYYY-MM-DD string, or "" when no date was found
#   title          -- article title
#   interest_level -- integer score between 0 and 10 inclusive
class NewsInterest(BaseModel):
    date: str = Field(description="Date in ISO YYYY-MM-DD format or empty string if not found")
    title: str
    interest_level: int = Field(ge=0, le=10, description="Integer from 0 to 10")

    @field_validator("date")
    @classmethod
    def validate_date(cls, v: str) -> str:
        """Validate ``date`` and normalise it to ``YYYY-MM-DD``.

        An empty (or whitespace-only) value is returned as ``""``. Any other
        value must be parseable by ``datetime.fromisoformat``; a time part,
        if present, is dropped so the stored value is always a plain date.

        Raises:
            ValueError: If the value is not a valid ISO date.
        """
        v = v.strip()
        if not v:
            return v
        try:
            parsed = datetime.fromisoformat(v)
        except ValueError as exc:
            raise ValueError("date must be ISO YYYY-MM-DD or empty") from exc
        # fromisoformat also accepts full timestamps; keep only the date so
        # the field really matches the documented YYYY-MM-DD shape.
        return parsed.date().isoformat()

def score_with_gpt5(snippet: str, user_interests: str, user_dislike: str) -> NewsInterest:
    """Ask GPT-5 to rate an article snippet against the user's preferences.

    Args:
        snippet: Plain-text excerpt of the article.
        user_interests: Free-text list of topics the user likes.
        user_dislike: Free-text list of topics the user dislikes.

    Returns:
        The ``NewsInterest`` object parsed from the model response.

    Raises:
        ValueError: If the response contains no parsed object.
    """
    # Today's date is included so the model can weigh how recent the article is.
    today = datetime.now().date().isoformat()

    system_prompt = (
        "You are a system that returns a NewsInterest object with the fields "
        "date, title, interest_level. interest_level is an integer 0–10 representing relevance."
    )
    user_prompt = f"""
            USER_INTERESTS: {user_interests}
            USER_DISLIKE: {user_dislike}
            ARTICLE_TEXT_SNIPPET: {snippet}
            CURRENT_DATE: {today}

            Guidelines:
            - 0–2: low relevance; 3–5: medium/broad; 6–8: highly relevant; 9–10: extremely relevant.
            - Consider recency: older articles are less interesting at equal topic relevance.
            """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # The Responses API parses the model output straight into the Pydantic model.
    response = client.responses.parse(
        model="gpt-5",
        input=messages,
        text_format=NewsInterest,
    )
    verdict = response.output_parsed
    if verdict is not None:
        return verdict
    raise ValueError("Failed to parse NewsInterest from GPT-5 response.")

def analyze_url(url: str, user_interests: str, user_dislike: str) -> dict:
    """Fetch an article, score it with GPT-5 and return the result as a dict.

    Args:
        url: Address of the news article.
        user_interests: Free-text list of topics the user likes.
        user_dislike: Free-text list of topics the user dislikes.

    Returns:
        The ``NewsInterest`` result converted with ``model_dump()``.
    """
    page = BeautifulSoup(fetch_html(url), "html5lib")
    verdict = score_with_gpt5(
        snippet=extract_text_snippet(page),
        user_interests=user_interests,
        user_dislike=user_dislike,
    )
    return verdict.model_dump()


In [None]:
# Article about AI Datacenter
URL = "https://www.bbc.com/news/articles/ckg2ldpl9leo"

# Free-text preferences that are inserted into the scoring prompt.
USER_INTERESTS = "AI, cats"
USER_DISLIKE =  "sports"

# Fetch the page, score it, and pretty-print the resulting dict as JSON.
result = analyze_url(URL, USER_INTERESTS, USER_DISLIKE)
print(json.dumps(result, ensure_ascii=False, indent=2))

In [None]:
# Article about AI Datacenter -- same URL as the previous cell, scored here
# for a user whose interests do not include AI.
URL = "https://www.bbc.com/news/articles/ckg2ldpl9leo"

# Free-text preferences that are inserted into the scoring prompt.
USER_INTERESTS = "Fashion, cars"
USER_DISLIKE =  "sports"

# Fetch the page, score it, and pretty-print the resulting dict as JSON.
result = analyze_url(URL, USER_INTERESTS, USER_DISLIKE)
print(json.dumps(result, ensure_ascii=False, indent=2))