In [243]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

## BBC Scraper

In [285]:
# Initialize browser
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 15)

In [286]:
# News Source
news_sources = [
    {
        "publisher": "BBC",
        "base_url": "https://www.bbc.com",
        "sections": {
            "Innovation": "/innovation",
            "Business": "/business",
            "Culture": "/culture",
            "Earth": "/future-planet",
            "Entertainment": "/culture/entertainment-news",
            "Arts": "/arts",
            "Travel": "/travel",
        }
    },
]

In [287]:
def gradual_scroll():
    """Scroll gradually to trigger lazy-loading"""
    current_position = 4500
    scroll_increment = 500  # pixels
    max_attempts = 10
    
    for _ in range(max_attempts):
        driver.execute_script(f"window.scrollTo(0, {current_position});")
        current_position += scroll_increment
        time.sleep(1.0)  # Adjust based on connection speed
        
        # Check if "More" section becomes visible
        try:
            driver.find_element(By.XPATH, "//h2[contains(text(), 'More')]")
            print("Found 'More' section!")
            return True
        except:
            continue

    print("Failed to find 'More on' section after scrolling")
    return False

In [288]:
def scrape_articles(url, cat):
    driver.get(url)
    articles_data = []
    
    # if not gradual_scroll():
    #     return articles_data
    
    while True:
        try:
            # 1. Find the parent container
            container = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'div[class*="sc-c5803051-0"]')  # Parent container
            ))
            
            # 2. Find INDIVIDUAL ARTICLE CARDS within the container
            cards = container.find_elements(By.CSS_SELECTOR, 'a[data-testid="internal-link"]')  # Actual article elements
            
            # print(f"Found {len(cards)} individual article cards")
            
            # Extract data from each card
            for card in cards:
                try:
                    # Extract url
                    url = card.get_attribute('href')
                    
                    # Extract headline
                    headline = card.find_element(By.CSS_SELECTOR, 'h2[data-testid="card-headline"]').text
                    
                    # Assign category automatically
                    # category = card.find_element(By.CSS_SELECTOR, 'span[data-testid="card-metadata-tag"]').text
                    
                    articles_data.append({
                        'headline': headline,
                        'category': category,
                        'url': url
                    })
                except Exception as e:
                    print(f"Skipping card due to error: {str(e)}")
                    continue
                    
            # print(f"Collected {len(articles_data)} articles so far")
        
            # Scroll to bottom to expose pagination controls
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)
            # scroll up a bit
            driver.execute_script("window.scrollBy(0, -300)")
            time.sleep(1)
            
            # Find and click next button
            next_button = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, 'button[data-testid="pagination-next-button"]:not([disabled])')))
            next_button.click()
            # print("Clicked Next Page button")
            
            # Reset scroll position for new page
            driver.execute_script("window.scrollTo(0, 0);")
            time.sleep(1.5)
            # gradual_scroll()  # Find "More" section again
            
        except Exception as e:
            print(f"Pagination ended: {str(e)}")
            break

    print(f"Collected {len(articles_data)} articles")
    
    return articles_data

In [289]:
# Scrape from all webpages
articles_data = []
for source in news_sources:
    for category in source['sections']:
        url = f"{source['base_url']}{source['sections'][category]}"
        print(f"Scraping {source['publisher']} - {category} From URL: {url}")
        articles_data.extend(scrape_articles(url, category))
print("Total articles: ",len(articles_data))

Scraping BBC - Innovation From URL: https://www.bbc.com/innovation
Pagination ended: Message: 
Stacktrace:
	GetHandleVerifier [0x0129FC03+61635]
	GetHandleVerifier [0x0129FC44+61700]
	(No symbol) [0x010C05D3]
	(No symbol) [0x0110899E]
	(No symbol) [0x01108D3B]
	(No symbol) [0x01150E12]
	(No symbol) [0x0112D2E4]
	(No symbol) [0x0114E61B]
	(No symbol) [0x0112D096]
	(No symbol) [0x010FC840]
	(No symbol) [0x010FD6A4]
	GetHandleVerifier [0x01524523+2701795]
	GetHandleVerifier [0x0151FCA6+2683238]
	GetHandleVerifier [0x0153A9EE+2793134]
	GetHandleVerifier [0x012B68C5+155013]
	GetHandleVerifier [0x012BCFAD+181357]
	GetHandleVerifier [0x012A7458+92440]
	GetHandleVerifier [0x012A7600+92864]
	GetHandleVerifier [0x01291FF0+5296]
	BaseThreadInitThunk [0x76747BA9+25]
	RtlInitializeExceptionChain [0x77B1C0CB+107]
	RtlClearBits [0x77B1C04F+191]

Collected 99 articles
Scraping BBC - Business From URL: https://www.bbc.com/business
Pagination ended: Message: element click intercepted: Element <button da

In [290]:
print(len(articles_data))

577


In [291]:
driver.quit()

In [292]:
# Save to CSV
df = pd.DataFrame(articles_data)
df = df.drop_duplicates(subset=['url'])  # Remove duplicates
df.to_csv('bbc_scraped_news_articles.csv', index=False)
print(f"Saved {len(df)} articles to CSV")

Saved 516 articles to CSV


## CNBC Scraper

## CNN Scraper