In [None]:
#!/usr/bin/env python3
"""
Fetch BBC News across all categories, then visit each article
and extract headline, description, section, content_topic, authors,
publish & modify times, image URLs, and full content,
then export all results into a CSV file, with a progress bar for each category.
Improved network resilience via retries/backoff to avoid SSL/network errors.
"""

import time
import json
import re
import csv
import random
import requests
from requests.exceptions import RequestException, SSLError
from requests.adapters import HTTPAdapter
from urllib3.util import Retry
from bs4 import BeautifulSoup
import bbc_feeds
from tqdm import tqdm

# --- Shared HTTP session with automatic retries and backoff ---
# GET requests that come back with a rate-limit or 5xx status are retried up
# to five times; with raise_on_status=False the last response is handed back
# once retries are exhausted instead of an exception being raised here.
retries = Retry(
    total=5,
    backoff_factor=0.5,
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["GET"],
    raise_on_status=False,
)
adapter = HTTPAdapter(max_retries=retries)

# One session for the whole run, using the same retrying adapter for both schemes.
session = requests.Session()
session.mount('https://', adapter)
session.mount('http://', adapter)

# Sent with every request so the scraper identifies itself.
HEADERS = {"User-Agent": "bbc-news-scraper/1.0 (+https://yourdomain.example)"}

# File the scraped articles are exported to.
OUTPUT_CSV = "bbc_news_articles.csv"


# Matches the inline ``window.dotcomConfig = {...};`` assignment looked for in
# the 'inline-dotcom-config' script tag.
_DOTCOM_CONFIG_RE = re.compile(r"window\.dotcomConfig\s*=\s*(\{.*\});", re.DOTALL)


def _fetch_html(url):
    """Return the response body for *url*, or None when both attempts fail."""
    try:
        resp = session.get(url, headers=HEADERS, timeout=10, verify=True)
        resp.raise_for_status()
    except RequestException as e:  # SSLError is a RequestException subclass
        print(f"[Error] Failed to fetch {url}: {e}. Retrying... ")
        # One last attempt with a longer timeout, on top of the adapter retries.
        time.sleep(2)
        try:
            resp = session.get(url, headers=HEADERS, timeout=20, verify=True)
            resp.raise_for_status()
        except RequestException as e2:
            print(f"[Fatal] Could not fetch {url} after retries: {e2}")
            return None
    return resp.text


def _first_json_ld_object(soup):
    """Return the first JSON-LD script's payload as a dict ({} if unusable)."""
    tag = soup.find("script", type="application/ld+json")
    if not (tag and tag.string):
        return {}
    try:
        payload = json.loads(tag.string)
    except json.JSONDecodeError:
        return {}

    # JSON-LD may wrap its nodes in {"@graph": [...]} or be a top-level array;
    # calling .get() on a list would raise, so reduce it to a single object.
    if (isinstance(payload, dict) and "headline" not in payload
            and isinstance(payload.get("@graph"), list)):
        payload = payload["@graph"]
    if isinstance(payload, list):
        nodes = [node for node in payload if isinstance(node, dict)]
        for node in nodes:
            if node.get("headline"):
                return node
        return nodes[0] if nodes else {}
    return payload if isinstance(payload, dict) else {}


def _dotcom_page_props(soup):
    """Return dotcomConfig['pageData']['pageProps'] as a dict ({} if absent)."""
    tag = soup.find('script', id='inline-dotcom-config')
    if not (tag and tag.string):
        return {}
    match = _DOTCOM_CONFIG_RE.search(tag.string)
    if not match:
        return {}
    try:
        config = json.loads(match.group(1))
    except json.JSONDecodeError:
        return {}
    page_data = config.get('pageData') if isinstance(config, dict) else None
    props = page_data.get('pageProps') if isinstance(page_data, dict) else None
    return props if isinstance(props, dict) else {}


def _meta_content(soup, attr_name, attr_value, default=''):
    """Return the ``content`` of the first matching <meta> tag, else *default*."""
    tag = soup.find("meta", attrs={attr_name: attr_value})
    if tag is not None and tag.has_attr("content"):
        return tag["content"]
    return default


def _page_title(soup):
    """Return the stripped <title> text, or "N/A" when the page has no <title>."""
    title_tag = soup.title
    if title_tag is None:
        return "N/A"
    # .string is None for an empty or nested <title>; get_text() handles both.
    return title_tag.get_text(strip=True)


def _as_text(value):
    """Join a list-valued metadata field into one string; pass others through."""
    if isinstance(value, (list, tuple)):
        return ", ".join(str(item) for item in value if item)
    return value


def _author_names(author_field):
    """Return author names from a JSON-LD ``author`` value (item or list)."""
    entries = author_field if isinstance(author_field, list) else [author_field]
    names = []
    for entry in entries:
        if isinstance(entry, dict) and entry.get("name"):
            names.append(str(entry["name"]))
        else:
            names.append(str(entry))
    return names


def _image_url_from(field):
    """Return a URL from an image value given as a string, dict, or list of either."""
    if isinstance(field, str):
        return field
    if isinstance(field, dict):
        url = field.get("url")
        return url if isinstance(url, str) else ''
    if isinstance(field, list):
        for item in field:
            url = _image_url_from(item)
            if url:
                return url
    return ''


def parse_article(url):
    """Download one article page and extract its metadata and body text.

    Each field is taken from the first source that provides it, in this
    order: the page's first JSON-LD script, the ``pageProps`` of the inline
    ``window.dotcomConfig`` object, then <meta> tags / <title>.

    Args:
        url: Address of the article page.

    Returns:
        A dict with the keys ``headline``, ``description``, ``section``,
        ``content_topic``, ``authors``, ``published``, ``modified``,
        ``image_url``, ``thumbnail_url``, ``main_entity``, ``content`` and
        ``url``; or ``None`` when the page could not be fetched.
    """
    html = _fetch_html(url)
    if html is None:
        return None

    soup = BeautifulSoup(html, 'html.parser')
    data = _first_json_ld_object(soup)
    props = _dotcom_page_props(soup)

    # --- Headline & description ---
    headline = data.get("headline") or props.get('page_title') or _page_title(soup)
    description = (data.get("description") or props.get('description')
                   or _meta_content(soup, "name", "description"))

    # --- Section ---
    section = _as_text(data.get("articleSection") or props.get('content_section'))
    if not section:
        section = _meta_content(soup, "property", "article:section", default='N/A')

    # --- Content topics / keywords ---
    content_topic = _as_text(data.get("keywords") or props.get('content_topic'))
    if not content_topic:
        content_topic = (_meta_content(soup, "name", "news_keywords")
                         or _meta_content(soup, "name", "keywords"))

    # --- Authors (single comma-separated string) ---
    author_field = data.get("author")
    if author_field:
        author_names = _author_names(author_field)
    elif props.get('byline'):
        author_names = [str(props['byline'])]
    else:
        meta_author = _meta_content(soup, "property", "article:author")
        author_names = [meta_author] if meta_author else []
    authors = ", ".join(author_names)

    # --- Published & modified times ---
    published = data.get("datePublished") or props.get('date_published') or ''
    date_modified = data.get("dateModified") or props.get('date_modified') or ''

    # --- Images ---
    # Fall through JSON-LD -> pageProps -> og:image so that a missing JSON-LD
    # image no longer hides the pageProps value.
    image_url = (_image_url_from(data.get("image"))
                 or _image_url_from(props.get('image'))
                 or _meta_content(soup, "property", "og:image"))
    thumbnail_url = (data.get("thumbnailUrl") or props.get('thumbnailUrl')
                     or _meta_content(soup, "property", "twitter:image:src"))

    # --- mainEntityOfPage ---
    main_entity = data.get("mainEntityOfPage") or props.get('mainEntityOfPage') or ''

    # --- Full text: every <p> inside a data-component="text-block" div ---
    paragraphs = [
        p.get_text(strip=True)
        for block in soup.find_all('div', {'data-component': 'text-block'})
        for p in block.find_all('p')
    ]
    content = "\n\n".join(paragraphs)

    return {
        'headline':      headline,
        'description':   description,
        'section':       section,
        'content_topic': content_topic,
        'authors':       authors,
        'published':     published,
        'modified':      date_modified,
        'image_url':     image_url,
        'thumbnail_url': thumbnail_url,
        'main_entity':   main_entity,
        'content':       content,
        'url':           url
    }


def fetch_all_categories(limit=500, categories=None, output_csv=None):
    """Scrape every story of each feed category and export the rows to CSV.

    Each category name is looked up as a method on the ``bbc_feeds.news()``
    client and called with ``limit=limit``; every returned story's ``link``
    is passed to ``parse_article`` and the resulting dict is written as one
    CSV row. Rows are written as they are produced.

    Args:
        limit: Value forwarded as ``limit=`` to each category feed call.
        categories: Iterable of category method names; defaults to the
            built-in list of eight categories.
        output_csv: Destination path; defaults to ``OUTPUT_CSV``.

    Returns:
        None. The output file is overwritten on every call.
    """
    if categories is None:
        categories = ["all", "world", "uk", "north_america",
                      "entertainment", "business", "tech", "science"]
    if output_csv is None:
        output_csv = OUTPUT_CSV

    client = bbc_feeds.news()
    fieldnames = [
        'headline', 'description', 'section', 'content_topic',
        'authors', 'published', 'modified', 'image_url',
        'thumbnail_url', 'main_entity', 'content', 'url'
    ]
    # The same link can be listed by several feeds (e.g. "all" and "world");
    # remember what was already handled so it is downloaded and written once.
    seen_links = set()

    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for cat in categories:
            label = cat.upper()
            stories = getattr(client, cat)(limit=limit)
            print(f"\n=== Category: {label} ({len(stories)} items) ===")
            for story in tqdm(stories, desc=label, unit="article"):
                link = story.link
                if link in seen_links:
                    continue  # no request made, so no politeness delay needed
                seen_links.add(link)

                info = parse_article(link)
                if info:
                    writer.writerow(info)
                    # tqdm.write keeps the message from breaking the progress bar.
                    tqdm.write(f"{label} -> {info['headline']}")
                # gentle random delay to avoid hammering
                time.sleep(random.uniform(1, 3))

    print(f"All data exported to '{output_csv}'")


# Entry point: run the full scrape, passing 500 as the per-category limit.
if __name__ == "__main__":
    fetch_all_categories(500)


In [None]:
import pandas as pd

# Load the exported articles back into a DataFrame.
df = pd.read_csv("bbc_news_articles.csv")

# Show the 'content' value of the second row as the cell's output.
df["content"].iloc[1]

In [None]:
# Bare expression: evaluates to the whole DataFrame (shown as the cell output).
df

In [None]:
# Keep only the rows whose 'content' value is not null.
df_filtered = df.loc[df['content'].notna()]

In [None]:
# Bare expression: evaluates to the filtered DataFrame (shown as the cell output).
df_filtered

In [None]:
import pandas as pd

# Convert the 'published' column to timezone-aware (UTC) datetimes in place.
df['published'] = pd.to_datetime(df['published'], utc=True)

# Earliest and latest publication timestamps in the data set.
min_pub, max_pub = df['published'].min(), df['published'].max()

print(f"Published date range: {min_pub} to {max_pub}")