In [295]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
from bs4 import BeautifulSoup
import time

## BBC Scraper

In [286]:
# News Source
news_sources = [
    {
        "publisher": "BBC",
        "base_url": "https://www.bbc.com",
        "sections": {
            "Innovation": "/innovation",
            "Business": "/business",
            "Culture": "/culture",
            "Earth": "/future-planet",
            "Entertainment": "/culture/entertainment-news",
            "Arts": "/arts",
            "Travel": "/travel",
        }
    },
]

In [285]:
# Initialize browser
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 15)

In [293]:
def scrape_articles(url, category):
    driver.get(url)
    articles_data = []
    
    while True:
        try:
            # 1. Find the parent container
            container = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'div[class*="sc-c5803051-0"]')  # Parent container
            ))
            
            # 2. Find INDIVIDUAL ARTICLE CARDS within the container
            cards = container.find_elements(By.CSS_SELECTOR, 'a[data-testid="internal-link"]')  # Actual article elements
            
            # print(f"Found {len(cards)} individual article cards")
            
            # Extract data from each card
            for card in cards:
                try:
                    # Extract url
                    url = card.get_attribute('href')
                    
                    # Extract headline
                    headline = card.find_element(By.CSS_SELECTOR, 'h2[data-testid="card-headline"]').text
                    
                    articles_data.append({
                        'headline': headline,
                        'category': category,
                        'url': url
                    })
                except Exception as e:
                    print(f"Skipping card due to error: {str(e)}")
                    continue
                    
            # print(f"Collected {len(articles_data)} articles so far")
        
            # Scroll to bottom to expose pagination controls
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)
            # scroll up a bit
            driver.execute_script("window.scrollBy(0, -300)")
            time.sleep(1)
            
            # Find and click next button
            next_button = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, 'button[data-testid="pagination-next-button"]:not([disabled])')))
            next_button.click()
            # print("Clicked Next Page button")
            
            # Reset scroll position for new page
            driver.execute_script("window.scrollTo(0, 0);")
            time.sleep(1.5)
            # gradual_scroll()  # Find "More" section again
            
        except Exception as e:
            print(f"Pagination ended: {str(e)}")
            break

    print(f"Collected {len(articles_data)} articles")
    
    return articles_data

In [289]:
# Scrape from all webpages
articles_data = []
for source in news_sources:
    for category in source['sections']:
        url = f"{source['base_url']}{source['sections'][category]}"
        print(f"Scraping {source['publisher']} - {category} From URL: {url}")
        articles_data.extend(scrape_articles(url, category))
print("Total articles: ",len(articles_data))

Scraping BBC - Innovation From URL: https://www.bbc.com/innovation
Pagination ended: Message: 
Stacktrace:
	GetHandleVerifier [0x0129FC03+61635]
	GetHandleVerifier [0x0129FC44+61700]
	(No symbol) [0x010C05D3]
	(No symbol) [0x0110899E]
	(No symbol) [0x01108D3B]
	(No symbol) [0x01150E12]
	(No symbol) [0x0112D2E4]
	(No symbol) [0x0114E61B]
	(No symbol) [0x0112D096]
	(No symbol) [0x010FC840]
	(No symbol) [0x010FD6A4]
	GetHandleVerifier [0x01524523+2701795]
	GetHandleVerifier [0x0151FCA6+2683238]
	GetHandleVerifier [0x0153A9EE+2793134]
	GetHandleVerifier [0x012B68C5+155013]
	GetHandleVerifier [0x012BCFAD+181357]
	GetHandleVerifier [0x012A7458+92440]
	GetHandleVerifier [0x012A7600+92864]
	GetHandleVerifier [0x01291FF0+5296]
	BaseThreadInitThunk [0x76747BA9+25]
	RtlInitializeExceptionChain [0x77B1C0CB+107]
	RtlClearBits [0x77B1C04F+191]

Collected 99 articles
Scraping BBC - Business From URL: https://www.bbc.com/business
Pagination ended: Message: element click intercepted: Element <button da

In [294]:
print(len(articles_data))
driver.quit()

577


In [292]:
# Save to CSV
df = pd.DataFrame(articles_data)
df = df.drop_duplicates(subset=['url'])  # Remove duplicates
df.to_csv('bbc_scraped_news_articles.csv', index=False)
print(f"Saved {len(df)} articles to CSV")

Saved 516 articles to CSV


## CNBC Scraper

In [526]:
# News Source
news_sources = [
    {
        "publisher": "CNBC",
        "base_url": "https://www.cnbc.com",
        "sections": {
            "Economy": "/economy",
            "Technology": "/technology",
            "Investing": "/investing",
            "Climate": "/climate",
            "Sports": "/sport",
            "Media": "/media",
            "Life": "/life",
        }
    },
]

In [527]:
# Initialize browser
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 15)

In [528]:
from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException

def close_popup(driver):
    print("Attempting to close popup...")
    try:
        # Wait for the iframe itself to be present
        zephr_iframe = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'iframe.zephrIframeOutcome.OverlayApplicationPro'))
        )
        
        print(f"Found zephrIframeOutcome iframe. Switching to it (src: {zephr_iframe.get_attribute('src')}).")
        driver.switch_to.frame(zephr_iframe)
        
        # Now, within the iframe, find the close button.
        # Based on the previous HTML you shared (which might now be *inside* this iframe)
        # the button has data-testid="analytics-click" and a class starting with "CloseButton_closeButton"
        close_button_in_iframe_selector = 'button[data-testid="analytics-click"][class*="CloseButton_closeButton"]'
        
        close_btn_in_iframe = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, close_button_in_iframe_selector))
        )
        
        if close_btn_in_iframe.is_displayed():
            driver.execute_script("arguments[0].click();", close_btn_in_iframe)
            print(f"Closed zephrIframeOutcome popup from within iframe using selector: {close_button_in_iframe_selector}")
            time.sleep(1)
            driver.switch_to.default_content() # IMPORTANT: Switch back to main content
            return True

    except (TimeoutException, NoSuchElementException):
        print("zephrIframeOutcome popup or its close button not found within iframe.")
    except WebDriverException as e:
        print(f"WebDriver error interacting with zephrIframeOutcome iframe: {e}")
    finally:
        # Always attempt to switch back to default content, even if previous steps failed
        # This prevents subsequent operations from failing if we are stuck in an iframe
        try:
            driver.switch_to.default_content()
            print("Switched back to default content.")
        except Exception:
            pass # Already on default content or other issue
            
    return False

In [529]:
def scrape_articles(url, category):
    driver.get(url)
    articles = []
    num_articles = 100

    while len(articles) < num_articles:
        # Let the page load
        time.sleep(10)
        # close popup if appears
        close_popup(driver)
        
        # Click to load more content (if needed)
        load_more_button = driver.find_element(By.CLASS_NAME, "LoadMoreButton-loadMore")

        # click load more a few times
        for i in range(5):
            if load_more_button.is_displayed():
                load_more_button.click()
                time.sleep(1)
        
        # Parse the page with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        # Find all article links - adjust selector as needed
        article_links = soup.select('a[href*="/20"]')  # Links containing year-based URLs
        
        for link in article_links:
            if len(articles) >= num_articles:
                break
                
            headline = link.text.strip()
            url = link['href']
            
            # Validate URL
            if url.startswith('https://www.cnbc.com/') and headline:
                articles.append({
                    'headline': headline,
                    'url': url,
                    'category': category
                })
                # print(f"Found article {len(articles)}: {headline}")
    
    return articles

In [530]:
# Scrape from all webpages
articles_data = []
count = 0
for source in news_sources:
    for category in source['sections']:
        count+= 1
        if(count%100 == 0):
            # exit and restart
            driver.quit()
            time.sleep(10)
            # Initialize browser
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
            wait = WebDriverWait(driver, 15)
        url = f"{source['base_url']}{source['sections'][category]}"
        print(f"Scraping {source['publisher']} - {category} From URL: {url}")
        articles_data.extend(scrape_articles(url, category))
print("Total articles: ",len(articles_data))

Scraping CNBC - Economy From URL: https://www.cnbc.com/economy
Attempting to close popup...
zephrIframeOutcome popup or its close button not found within iframe.
Switched back to default content.
Scraping CNBC - Technology From URL: https://www.cnbc.com/technology
Attempting to close popup...
zephrIframeOutcome popup or its close button not found within iframe.
Switched back to default content.
Scraping CNBC - Investing From URL: https://www.cnbc.com/investing
Attempting to close popup...
Found zephrIframeOutcome iframe. Switching to it (src: https://zephr-templates.cnbc.com/components/OverlayApplicationPro?analyticsPageId=&analyticsTrackingId=&analyticsOutcomeId=zephr-overlay-ICMemorialDay2025INTL&analyticsOutcomeType=Overlay&analyticsOutcomeDetail=Zephr%7CDTCPromotion%7CInvestingClub%7CAcquisition%7COverlay%7CInternational%7CFreeArticles%7CMemorialDay2025&backgroundImageSrc=https%3A%2F%2Fimage.cnbcfm.com%2Fapi%2Fv1%2Fimage%2F108060557-1731095256460-IC_BFCM_2024_ZEPHR_OVERLAY_611x611_

In [531]:
print(len(articles_data))
driver.quit()

700


In [532]:
# Save to CSV
df = pd.DataFrame(articles_data)
df = df.drop_duplicates(subset=['url'])  # Remove duplicates
df.to_csv('cnbc_scraped_news_articles.csv', index=False)
print(f"Saved {len(df)} articles to CSV")

Saved 643 articles to CSV


## CNN Scraper