# Initialization

In [1]:
import nltk
# Print the installed NLTK version in the cell output.
print(nltk.__version__)

3.9.1


In [2]:
# Download the NLTK resources named below into the local nltk_data directory.
# NOTE(review): with NLTK 3.9.x the Punkt tokenizer appears to load the
# 'punkt_tab' resource rather than 'punkt' — confirm whether later cells
# tokenize text and, if so, download 'punkt_tab' as well.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to C:\Users\CTN new
[nltk_data]     Installation\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to C:\Users\CTN new
[nltk_data]     Installation\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to C:\Users\CTN new
[nltk_data]     Installation\AppData\Roaming\nltk_data...


True

# Part 1: Web Scraping

In [3]:
import requests
from bs4 import BeautifulSoup
import csv
import time

In [5]:
# Scrape targets: one entry per publisher, giving the site root and the
# section paths (relative to that root) keyed by category label.
news_sources = [
    dict(
        publisher="BBC",
        base_url="https://www.bbc.com",
        sections={"World": "/news/world", "Arts": "/arts"},
    ),
    dict(
        publisher="CNN",
        base_url="https://edition.cnn.com",
        sections={"Politics": "/politics", "Sports": "/sport"},
    ),
    dict(
        publisher="CNBC",
        base_url="https://www.cnbc.com",
        sections={
            "Health and Science": "/health-and-science",
            "AI": "/ai-artificial-intelligence",
        },
    ),
]

# Name of the articles CSV file; a relative path, so it resolves against
# the current working directory.
news_articles = "articles.csv"

In [6]:
# Create (or reset) the articles CSV so that it contains only the header row
def init_csv(path=None):
    """Create or truncate the articles CSV and write its header row.

    Any existing file at the destination is overwritten (mode ``"w"``); no
    article rows are written here.

    Args:
        path: Destination file. When omitted, the module-level
            ``news_articles`` file name is used.
    """
    if path is None:
        # Looked up at call time so that re-running the config cell with a
        # different file name takes effect without redefining this function.
        path = news_articles

    # newline="" lets the csv module control line endings itself.
    with open(path, mode="w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=[
            "published_date", "headline", "publisher", "category", "article_content", "url"
        ])
        writer.writeheader()

In [7]:
# Fetch list of article URLs from a section page
def get_article_links(source, category, page=1, timeout=10):
    """Return the absolute article URLs found on one section page.

    Args:
        source: Publisher entry with a ``base_url`` string and a ``sections``
            mapping of category label to section path.
        category: Key into ``source['sections']`` selecting the section.
        page: Page number; for values above 1 ``/<page>`` is appended to the
            section URL.
        timeout: Seconds to wait for the HTTP response before giving up, so a
            stalled server cannot hang the notebook.

    Returns:
        De-duplicated list of absolute URLs, in the order they first appear
        on the page.

    Raises:
        KeyError: If ``category`` is not a section of ``source``.
        requests.RequestException: On network failure, timeout, or a non-2xx
            response (via ``raise_for_status``).
    """
    url = f"{source['base_url']}{source['sections'][category]}"
    if page > 1:
        url += f"/{page}"

    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # NOTE(review): this selector and the "/news" prefix filter look
    # BBC-specific; the other publishers in the source list will probably
    # yield no links here — confirm against their page markup.
    links = []
    for a in soup.select(".gs-c-promo-heading a[href]"):
        href = a.get("href")
        if href.startswith("/news"):
            links.append(source['base_url'] + href)

    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # result is stable between runs (a set would return an arbitrary order).
    return list(dict.fromkeys(links))

In [8]:
# Parse individual article for metadata and content
def parse_article(url, publisher, category, timeout=10):
    """Download one article page and extract its metadata and body text.

    Args:
        url: Absolute URL of the article.
        publisher: Publisher label copied into the result unchanged.
        category: Category label copied into the result unchanged.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        Dict with the keys ``published_date``, ``headline``, ``publisher``,
        ``category``, ``article_content`` and ``url``. Fields that cannot be
        found on the page are empty strings.

    Raises:
        requests.RequestException: On network failure, timeout, or a non-2xx
            response (via ``raise_for_status``).
    """
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # Published date: taken from the first <time> tag (BBC example). A tag
    # without a datetime attribute yields "" rather than None, matching the
    # "missing field is an empty string" convention used for the headline.
    time_tag = soup.find("time")
    published_date = (time_tag.get("datetime") or "") if time_tag else ""

    # Headline: text of the first <h1>.
    headline_tag = soup.find("h1")
    headline = headline_tag.get_text(strip=True) if headline_tag else ""

    # Article content: every <p> under the article body, one per line.
    # NOTE(review): the second selector uses what looks like a generated CSS
    # class name and may stop matching when the site is redeployed — confirm.
    paragraphs = [
        p.get_text(strip=True)
        for p in soup.select("[property=articleBody] p, .ssrcss-uf6wea-RichTextComponentWrapper p")
    ]
    article_content = "\n".join(paragraphs)

    return {
        "published_date": published_date,
        "headline": headline,
        "publisher": publisher,
        "category": category,
        "article_content": article_content,
        "url": url
    }

In [9]:
# Main loop: iterate sources, categories, pages, and articles
def scrape_all(max_pages=3, delay=1.0):
    """Scrape every configured section and append the articles to the CSV.

    Resets the CSV via ``init_csv()``, then for each publisher in
    ``news_sources`` and each of its sections fetches pages ``1..max_pages``
    and appends one row per article to the ``news_articles`` file. Failures
    are reported with ``print`` and skipped so that one bad page or article
    does not abort the whole run.

    Args:
        max_pages: Number of pages to request per section.
        delay: Seconds to pause after each article request.
    """
    init_csv()
    for source in news_sources:
        publisher = source['publisher']
        for category in source['sections']:
            print(f"Scraping {publisher} - {category}")
            for page in range(1, max_pages + 1):
                try:
                    links = get_article_links(source, category, page)
                except Exception as e:
                    print(f"Failed to fetch page {page}: {e}")
                    continue

                for url in links:
                    try:
                        record = parse_article(url, publisher, category)
                        # Append to CSV; the record's key order matches the
                        # header columns, so rows line up with the header.
                        with open(news_articles, mode="a", newline="", encoding="utf-8") as f:
                            writer = csv.DictWriter(f, fieldnames=record.keys())
                            writer.writerow(record)
                        print(f"Saved: {record['headline']}")
                    except Exception as e:
                        print(f"Error parsing {url}: {e}")
                    finally:
                        # Pause after failures as well, so a struggling or
                        # rate-limiting server is not hit in a tight loop.
                        time.sleep(delay)