# Initialization

In [None]:
import nltk
print(nltk.__version__)

In [None]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Part 1: Web Scraping

In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import time

In [2]:
news_sources = [
    {
        "publisher": "BBC",
        "base_url": "https://www.bbc.com",
        "sections": {
            "World": "/news/world",
            "Arts": "/arts",
        }
    },
    {
        "publisher": "CNN",
        "base_url": "https://edition.cnn.com",
        "sections": {
            "Politics": "/politics",
            "Sports": "/sport",
        }
    },
    {
        "publisher": "CNBC",
        "base_url": "https://www.cnbc.com",
        "sections": {
            "Health and Science": "/health-and-science",
            "AI": "/ai-artificial-intelligence",
        }
    }
]

news_articles = "articles.csv"

In [16]:
# helper to write header to csv
def init_csv():
    with open(news_articles, mode="w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=[
            "published_date", "headline", "publisher", "category", "article_content", "url"
        ])
        writer.writeheader()

In [9]:
# Fetch list of article URLs from a section page
def get_article_links(source, category, page=1):
    """
    Returns a list of absolute article URLs for a given section and page number.
    """
    url = f"{source['base_url']}{source['sections'][category]}"
    if page > 1:
        url += f"/{page}"

    resp = requests.get(url)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    links = []
    for a in soup.select(".gs-c-promo-heading a[href]"):
        href = a.get("href")
        if href.startswith("/news"):
            full = source['base_url'] + href
            links.append(full)

    return list(set(links))

In [10]:
# Parse individual article for metadata and content
def parse_article(url, publisher, category):
    resp = requests.get(url)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # Published date (BBC example: time tag)
    time_tag = soup.find("time")
    published_date = time_tag.get("datetime") if time_tag else ""

    # Headline
    headline_tag = soup.find("h1")
    headline = headline_tag.get_text(strip=True) if headline_tag else ""

    # Article content: collect all <p> text under article body
    paragraphs = []
    for p in soup.select("[property=articleBody] p, .ssrcss-uf6wea-RichTextComponentWrapper p"):
        paragraphs.append(p.get_text(strip=True))
    article_content = "\n".join(paragraphs)

    return {
        "published_date": published_date,
        "headline": headline,
        "publisher": publisher,
        "category": category,
        "article_content": article_content,
        "url": url
    }

In [22]:
import re
import pandas as pd

In [33]:
url = "https://www.bbc.com/business"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
response.content

# save response to a text file
file_path = "response.txt"
with open(file_path, "w",  encoding="utf-8") as file:
    file.write(response.text)
    print(f"Response saved to {file_path}")

Response saved to response.txt


In [26]:
all_data = soup.text.replace('\n', "")
all_data

'BBC News - Breaking news, video and the latest top stories from the U.S. and around the worldSkip to contentBritish Broadcasting CorporationHomeNewsSportBusinessInnovationCultureArtsTravelEarthAudioVideoLiveIsrael-Gaza WarWar in UkraineUS & CanadaUKAfricaAsiaAustraliaEuropeLatin AmericaMiddle EastIn PicturesBBC InDepthBBC VerifyHomeNewsIsrael-Gaza WarWar in UkraineUS & CanadaUKUK PoliticsEnglandN. IrelandN. Ireland PoliticsScotlandScotland PoliticsWalesWales PoliticsAfricaAsiaChinaIndiaAustraliaEuropeLatin AmericaMiddle EastIn PicturesBBC InDepthBBC VerifySportBusinessExecutive LoungeTechnology of BusinessFuture of BusinessInnovationTechnologyScience & HealthArtificial IntelligenceAI v the MindCultureFilm & TVMusicArt & DesignStyleBooksEntertainment NewsArtsArts in MotionTravelDestinationsAfricaAntarcticaAsiaAustralia and PacificCaribbean & BermudaCentral AmericaEuropeMiddle EastNorth AmericaSouth AmericaWorld’s TableCulture & ExperiencesAdventuresThe SpeciaListEarthNatural WondersWea

In [23]:
# remove all digits
remove_digit = re.sub(r'[0-9]+' , '', all_data)
remove_digit

'BBC Home - Breaking News, World News, US News, Sports, Business, Innovation, Climate, Culture, Travel, Video & AudioSkip to contentBritish Broadcasting CorporationHomeNewsSportBusinessInnovationCultureArtsTravelEarthAudioVideoLiveHomeNewsIsrael-Gaza WarWar in UkraineUS & CanadaUKUK PoliticsEnglandN. IrelandN. Ireland PoliticsScotlandScotland PoliticsWalesWales PoliticsAfricaAsiaChinaIndiaAustraliaEuropeLatin AmericaMiddle EastIn PicturesBBC InDepthBBC VerifySportBusinessExecutive LoungeTechnology of BusinessFuture of BusinessInnovationTechnologyScience & HealthArtificial IntelligenceAI v the MindCultureFilm & TVMusicArt & DesignStyleBooksEntertainment NewsArtsArts in MotionTravelDestinationsAfricaAntarcticaAsiaAustralia and PacificCaribbean & BermudaCentral AmericaEuropeMiddle EastNorth AmericaSouth AmericaWorld’s TableCulture & ExperiencesAdventuresThe SpeciaListEarthNatural WondersWeather & ScienceClimate SolutionsSustainable BusinessGreen LivingAudioPodcastsRadioAudio FAQsVideoLive

In [14]:
# Main loop: iterate sources, categories, pages, and articles
def scrape_all(max_pages=3, delay=1.0):
    print("before csv init")
    init_csv()
    print("csv header initialized!")
    for source in news_sources:
        for category in source['sections']:
            print(f"Scraping {source['publisher']} - {category}")
            for page in range(1, max_pages + 1):
                try:
                    links = get_article_links(source, category, page)
                except Exception as e:
                    print(f"Failed to fetch page {page}: {e}")
                    continue

                for url in links:
                    try:
                        record = parse_article(url, source['publisher'], category)
                        # Append to CSV
                        with open(news_articles, mode="a", newline="", encoding="utf-8") as f:
                            writer = csv.DictWriter(f, fieldnames=record.keys())
                            writer.writerow(record)
                        print(f"Saved: {record['headline']}")
                        time.sleep(delay)
                    except Exception as e:
                        print(f"Error parsing {url}: {e}")

In [17]:
# Adjust max_pages to collect more articles per section
scrape_all(max_pages=5, delay=0.5)

before csv init
csv header initialized!
Scraping BBC - World
Failed to fetch page 2: 404 Client Error: Not Found for url: https://www.bbc.com/news/world/2
Failed to fetch page 3: 404 Client Error: Not Found for url: https://www.bbc.com/news/world/3
Failed to fetch page 4: 404 Client Error: Not Found for url: https://www.bbc.com/news/world/4
Failed to fetch page 5: 404 Client Error: Not Found for url: https://www.bbc.com/news/world/5
Scraping BBC - Arts
Failed to fetch page 2: 404 Client Error: Not Found for url: https://www.bbc.com/arts/2
Failed to fetch page 3: 404 Client Error: Not Found for url: https://www.bbc.com/arts/3
Failed to fetch page 4: 404 Client Error: Not Found for url: https://www.bbc.com/arts/4
Failed to fetch page 5: 404 Client Error: Not Found for url: https://www.bbc.com/arts/5
Scraping CNN - Politics
Failed to fetch page 2: 404 Client Error: Not Found for url: https://edition.cnn.com/politics/2
Failed to fetch page 3: 404 Client Error: Not Found for url: https://ed