# In [None]:
#!/usr/bin/env python3
"""
bbc_to_csv.py

Fetch BBC News across all categories, parse each article for metadata
and full content, then dump everything into bbc_news.csv.
"""

import time
import requests
from bs4 import BeautifulSoup
import bbc_feeds
import pandas as pd

# --- Configuration ---

# Headers attached to every article request.
HEADERS = {
    "User-Agent": "bbc-news-scraper/1.0 (+https://yourdomain.example)",
}

# Feed names; each one is looked up as a method on the bbc_feeds news client.
CATEGORIES = [
    "all",
    "world",
    "uk",
    "north_america",
    "entertainment",
    "business",
    "tech",
    "science",
]

# Output file for the scraped articles.
CSV_PATH = "bbc_news.csv"

# Default `limit` passed to each category feed; adjust as needed.
FETCH_LIMIT = 500

# Pause, in seconds, after each article is processed.
REQUEST_DELAY = 1

def parse_article(url):
    """Fetch an article page and extract its metadata and body text.

    Args:
        url: Address of the article page to download.

    Returns:
        A dict with the keys ``title``, ``section``, ``author``,
        ``published``, ``url`` and ``content``. Any field that cannot be
        found in the page is ``None``; ``url`` is the argument as given.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server responds with an HTTP error status.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    resp = requests.get(url, headers=HEADERS, timeout=30)
    # Do not parse error pages (404/5xx) as if they were articles.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')

    # Headline
    title_tag = soup.find(attrs={'data-component': 'headline-block'})
    title = title_tag.get_text(strip=True) if title_tag else None
    # Progress output; must come after `title` is assigned.
    print("title: ", title)

    # Section & Author (.get avoids a KeyError when `content` is missing)
    section_meta = soup.find('meta', property='article:section')
    author_meta = soup.find('meta', property='article:author')
    section = section_meta.get('content') if section_meta else None
    author = author_meta.get('content') if author_meta else None

    # Publish time: prefer the meta tag, fall back to the first <time> element
    pub_meta = soup.find('meta', property='article:published_time')
    if pub_meta and pub_meta.get('content'):
        published = pub_meta['content']
    else:
        time_tag = soup.find('time')
        published = time_tag.get('datetime') if time_tag else None

    # Full text: every <p> inside each text-block div, in document order
    paragraphs = [
        p.get_text(strip=True)
        for block in soup.find_all('div', {'data-component': 'text-block'})
        for p in block.find_all('p')
    ]
    content = "\n\n".join(paragraphs) if paragraphs else None

    return {
        "title":     title,
        "section":   section,
        "author":    author,
        "published": published,
        "url":       url,
        "content":   content,
    }

def fetch_all_categories(limit=FETCH_LIMIT):
    """Collect article data for every category in ``CATEGORIES``.

    For each category the matching method on the ``bbc_feeds`` news client
    is called with ``limit``, and each returned story's link is passed to
    ``parse_article``. A story whose page cannot be fetched or parsed is
    still included, with its parsed fields set to ``None``.

    Args:
        limit: Value passed as ``limit`` to each category feed call.

    Returns:
        A list of dicts, one per story, holding ``rss_category``,
        ``rss_title`` and ``rss_link`` followed by the keys returned by
        ``parse_article``.
    """
    client = bbc_feeds.news()
    all_articles = []

    for cat in CATEGORIES:
        stories = getattr(client, cat)(limit=limit)
        for story in stories:
            # Fetch and parse the article itself
            try:
                article_data = parse_article(story.link)
            except Exception as e:
                # Best-effort crawl: keep the RSS row, but report the failure
                # instead of hiding it so systematic errors are visible.
                print(f"Failed to parse {story.link}: {e!r}")
                article_data = {
                    "title": None, "section": None, "author": None,
                    "published": None, "url": story.link, "content": None,
                }

            # Basic RSS info first, then the parsed fields
            all_articles.append({
                "rss_category": cat,
                "rss_title": story.title,
                "rss_link": story.link,
                **article_data,
            })

            time.sleep(REQUEST_DELAY)  # polite crawling

    return all_articles

if __name__ == "__main__":
    # Scrape every category, then tabulate the resulting rows.
    articles = fetch_all_categories()
    df = pd.DataFrame(articles)

    # Write the table to disk (no index column) and report the row count.
    df.to_csv(CSV_PATH, encoding='utf-8-sig', index=False)
    print(f"Saved {len(df)} articles to {CSV_PATH}")


# In [None]:
import pandas as pd
df = pd.read_csv("bbc_news.csv")

# Show the article text stored in the second row of the CSV.
df["content"].iloc[1]