In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
from bs4 import BeautifulSoup
import time
import requests
from tqdm import tqdm
from datetime import datetime

## BBC Scraper

In [47]:
# News Source
news_sources = [
    {
        "publisher": "BBC",
        "base_url": "https://www.bbc.com",
        "sections": {
            "Business": "/business",
            "Innovation": "/innovation",
            "Culture": "/culture",
            "Earth": "/future-planet",
            "Entertainment": "/culture/entertainment-news",
            "Arts": "/arts",
            "Travel": "/travel",
        }
    },
]

In [48]:
# Initialize browser
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 15)

In [49]:
from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException

def close_popup(driver):
    print("Attempting to close popup...")
    try:
        # Wait for the iframe itself to be present
        bbc_popup = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'iframe[id^="offer-0-"]'))
        )
        
        print(f"Found BBC popup iframe. Switching to it (src: {bbc_popup.get_attribute('src')}).")
        driver.switch_to.frame(bbc_popup)
        
        # Now, within the iframe, find the close button.
        close_button_in_iframe_selector = '.pn-article__close'
        
        close_btn_in_iframe = WebDriverWait(driver, 5).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, close_button_in_iframe_selector))
        )
        
        if close_btn_in_iframe.is_displayed():
            driver.execute_script("arguments[0].click();", close_btn_in_iframe)
            print(f"Closed zephrIframeOutcome popup from within iframe using selector: {close_button_in_iframe_selector}")
            time.sleep(1)
            driver.switch_to.default_content() # IMPORTANT: Switch back to main content
            return True

    except (TimeoutException, NoSuchElementException):
        print("Popup or its close button not found within iframe.")
    except WebDriverException as e:
        print(f"WebDriver error interacting with Popup iframe: {e}")
    finally:
        # Always attempt to switch back to default content, even if previous steps failed
        # This prevents subsequent operations from failing if we are stuck in an iframe
        try:
            driver.switch_to.default_content()
            print("Switched back to default content.")
        except Exception:
            pass # Already on default content or other issue
            
    return False

In [50]:
def scrape_articles(url, category):
    driver.get(url)
    articles_data = []

    # Let the page load
    time.sleep(5)
    # close popup if appears
    close_popup(driver)
    
    while True:
        try:
            # 1. Find the parent container
            container = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'div[class*="sc-c5803051-0"]')  # Parent container
            ))
            
            # 2. Find INDIVIDUAL ARTICLE CARDS within the container
            cards = container.find_elements(By.CSS_SELECTOR, 'a[data-testid="internal-link"]')  # Actual article elements
            
            # print(f"Found {len(cards)} individual article cards")
            
            # Extract data from each card
            for card in cards:
                try:
                    # Extract url
                    url = card.get_attribute('href')
                    
                    # Extract headline
                    headline = card.find_element(By.CSS_SELECTOR, 'h2[data-testid="card-headline"]').text
                    
                    articles_data.append({
                        'headline': headline,
                        'category': category,
                        'url': url
                    })
                except Exception as e:
                    print(f"Skipping card due to error: {str(e)}")
                    continue
                    
            # print(f"Collected {len(articles_data)} articles so far")
        
            # Scroll to bottom to expose pagination controls
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)
            # scroll up a bit
            driver.execute_script("window.scrollBy(0, -300)")
            time.sleep(1)
            
            # Find and click next button
            next_button = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, 'button[data-testid="pagination-next-button"]:not([disabled])')))
            next_button.click()
            # print("Clicked Next Page button")
            
            # Reset scroll position for new page
            driver.execute_script("window.scrollTo(0, 0);")
            time.sleep(1.5)
            # gradual_scroll()  # Find "More" section again
            
        except Exception as e:
            print(f"Pagination ended: {str(e)}")
            break

    print(f"Collected {len(articles_data)} articles")
    
    return articles_data

In [51]:
# Scrape from all webpages
articles_data = []
for source in news_sources:
    for category in source['sections']:
        url = f"{source['base_url']}{source['sections'][category]}"
        print(f"Scraping {source['publisher']} - {category} From URL: {url}")
        articles_data.extend(scrape_articles(url, category))
print("Total articles: ",len(articles_data))

Scraping BBC - Business From URL: https://www.bbc.com/business
Attempting to close popup...
Popup or its close button not found within iframe.
Switched back to default content.
Pagination ended: Message: 
Stacktrace:
	GetHandleVerifier [0x004FFC03+61635]
	GetHandleVerifier [0x004FFC44+61700]
	(No symbol) [0x003205D3]
	(No symbol) [0x0036899E]
	(No symbol) [0x00368D3B]
	(No symbol) [0x003B0E12]
	(No symbol) [0x0038D2E4]
	(No symbol) [0x003AE61B]
	(No symbol) [0x0038D096]
	(No symbol) [0x0035C840]
	(No symbol) [0x0035D6A4]
	GetHandleVerifier [0x00784523+2701795]
	GetHandleVerifier [0x0077FCA6+2683238]
	GetHandleVerifier [0x0079A9EE+2793134]
	GetHandleVerifier [0x005168C5+155013]
	GetHandleVerifier [0x0051CFAD+181357]
	GetHandleVerifier [0x00507458+92440]
	GetHandleVerifier [0x00507600+92864]
	GetHandleVerifier [0x004F1FF0+5296]
	BaseThreadInitThunk [0x76747BA9+25]
	RtlInitializeExceptionChain [0x77B1C0CB+107]
	RtlClearBits [0x77B1C04F+191]

Collected 45 articles
Scraping BBC - Innovation

In [52]:
print(len(articles_data))
driver.quit()

613


In [53]:
# Save to CSV
df = pd.DataFrame(articles_data)
# df = df.drop_duplicates(subset=['url'])  # Remove duplicates
df.to_csv('bbc_scraped_news_articles.csv', index=False)
print(f"Saved {len(df)} articles to CSV")

Saved 613 articles to CSV


## CNBC Scraper

In [54]:
# News Source
news_sources = [
    {
        "publisher": "CNBC",
        "base_url": "https://www.cnbc.com",
        "sections": {
            "Economy": "/economy",
            "Technology": "/technology",
            "Travel": "/travel",
            "Climate": "/climate",
            "Entertainment": "/entertainment",
            "Media": "/media",
            "Life": "/life",
        }
    },
]

In [55]:
# Initialize browser
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 15)

In [56]:
from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException

def close_popup(driver):
    print("Attempting to close popup...")
    try:
        # Wait for the iframe itself to be present
        zephr_iframe = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'iframe.zephrIframeOutcome.OverlayApplicationPro'))
        )
        
        print(f"Found zephrIframeOutcome iframe. Switching to it (src: {zephr_iframe.get_attribute('src')}).")
        driver.switch_to.frame(zephr_iframe)
        
        # Now, within the iframe, find the close button.
        # Based on the previous HTML you shared (which might now be *inside* this iframe)
        # the button has data-testid="analytics-click" and a class starting with "CloseButton_closeButton"
        close_button_in_iframe_selector = 'button[data-testid="analytics-click"][class*="CloseButton_closeButton"]'
        
        close_btn_in_iframe = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, close_button_in_iframe_selector))
        )
        
        if close_btn_in_iframe.is_displayed():
            driver.execute_script("arguments[0].click();", close_btn_in_iframe)
            print(f"Closed zephrIframeOutcome popup from within iframe using selector: {close_button_in_iframe_selector}")
            time.sleep(1)
            driver.switch_to.default_content() # IMPORTANT: Switch back to main content
            return True

    except (TimeoutException, NoSuchElementException):
        print("zephrIframeOutcome popup or its close button not found within iframe.")
    except WebDriverException as e:
        print(f"WebDriver error interacting with zephrIframeOutcome iframe: {e}")
    finally:
        # Always attempt to switch back to default content, even if previous steps failed
        # This prevents subsequent operations from failing if we are stuck in an iframe
        try:
            driver.switch_to.default_content()
            print("Switched back to default content.")
        except Exception:
            pass # Already on default content or other issue
            
    return False

In [57]:
def scrape_articles(url, category):
    driver.get(url)
    articles = []
    num_articles = 100

    while len(articles) < num_articles:
        # Let the page load
        time.sleep(10)
        # close popup if appears
        close_popup(driver)
        
        # Click to load more content (if needed)
        load_more_button = driver.find_element(By.CLASS_NAME, "LoadMoreButton-loadMore")

        # click load more a few times
        for i in range(5):
            if load_more_button.is_displayed():
                load_more_button.click()
                time.sleep(1)
        
        # Parse the page with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        # Find all article links - adjust selector as needed
        article_links = soup.select('a[href*="/20"]')  # Links containing year-based URLs
        
        for link in article_links:
            if len(articles) >= num_articles:
                break
                
            headline = link.text.strip()
            url = link['href']
            
            # Validate URL
            if url.startswith('https://www.cnbc.com/') and headline:
                articles.append({
                    'headline': headline,
                    'url': url,
                    'category': category
                })
                # print(f"Found article {len(articles)}: {headline}")
    
    return articles

In [58]:
# Scrape from all webpages
articles_data = []
count = 0
for source in news_sources:
    for category in source['sections']:
        count+= 1
        if(count%100 == 0):
            # exit and restart
            driver.quit()
            time.sleep(10)
            # Initialize browser
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
            wait = WebDriverWait(driver, 15)
        url = f"{source['base_url']}{source['sections'][category]}"
        print(f"Scraping {source['publisher']} - {category} From URL: {url}")
        articles_data.extend(scrape_articles(url, category))
print("Total articles: ",len(articles_data))

Scraping CNBC - Economy From URL: https://www.cnbc.com/economy
Attempting to close popup...
zephrIframeOutcome popup or its close button not found within iframe.
Switched back to default content.
Scraping CNBC - Technology From URL: https://www.cnbc.com/technology
Attempting to close popup...
zephrIframeOutcome popup or its close button not found within iframe.
Switched back to default content.
Scraping CNBC - Travel From URL: https://www.cnbc.com/travel
Attempting to close popup...
Found zephrIframeOutcome iframe. Switching to it (src: https://zephr-templates.cnbc.com/components/OverlayApplicationPro?analyticsPageId=&analyticsTrackingId=&analyticsOutcomeId=zephr-overlay-ICMemorialDay2025INTL&analyticsOutcomeType=Overlay&analyticsOutcomeDetail=Zephr%7CDTCPromotion%7CInvestingClub%7CAcquisition%7COverlay%7CInternational%7CFreeArticles%7CMemorialDay2025&backgroundImageSrc=https%3A%2F%2Fimage.cnbcfm.com%2Fapi%2Fv1%2Fimage%2F108060557-1731095256460-IC_BFCM_2024_ZEPHR_OVERLAY_611x611_v01_1x

In [59]:
print(len(articles_data))
driver.quit()

700


In [60]:
# Save to CSV
df = pd.DataFrame(articles_data)
# df = df.drop_duplicates(subset=['url'])  # Remove duplicates
df.to_csv('cnbc_scraped_news_articles.csv', index=False)
print(f"Saved {len(df)} articles to CSV")

Saved 700 articles to CSV


## Reuters Scraper

In [552]:
# News Source
news_sources = [
    {
        "publisher": "Reuters",
        "base_url": "https://www.reuters.com",
        "sections": {
            "Artificial Intelligence": "/technology/artificial-intelligence",
            "Business": "/business",
            "Weather": "/world/weather",
            "Sports": "/sports",
            "Lifestyle": "/lifestyle",
            "Climate and Energy": "/sustainability/climate-energy",
            "Government": "/legal/government",
        }
    },
]

In [553]:
# Initialize browser
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 15)

In [554]:
from selenium.common.exceptions import TimeoutException

def scrape_articles(url, category):
    driver.get(url)
    articles = []
    num_articles = 100

    while len(articles) < num_articles:
        # Let the page load
        time.sleep(10)

        try:
            # Click to load more articles (if needed)
            load_more_button = driver.find_element(By.XPATH, "//button//span[text()='Load more articles']")

            # click load more a few times
            for i in range(5):
                if load_more_button.is_displayed():
                    load_more_button.click()
                    time.sleep(1)
                
        except (TimeoutException):
            pass
        
        # Parse the page with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        # Find all article links - adjust selector as needed
        article_links = soup.select('a[data-testid="TitleLink"]') 
        
        for link in article_links:
            if len(articles) >= num_articles:
                break
                
            # Extract headline from the nested span with data-testid="TitleHeading"
            headline_span = link.find('span', {'data-testid': 'TitleHeading'})
            if headline_span:
                headline = headline_span.text.strip()
            else:
                headline = link.text.strip()  # Fallback to full link text
            
            # Extract URL from href attribute
            url = link.get('href', '')
            
            # Validate URL
            if url.startswith('https://www.reuter.com/') and headline:
                articles.append({
                    'headline': headline,
                    'url': url,
                    'category': category
                })
                # print(f"Found article {len(articles)}: {headline}")
    
    return articles

In [555]:
# Scrape from all webpages
articles_data = []
count = 0
for source in news_sources:
    for category in source['sections']:
        count+= 1
        if(count%100 == 0):
            # exit and restart
            driver.quit()
            time.sleep(10)
            # Initialize browser
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
            wait = WebDriverWait(driver, 15)
        url = f"{source['base_url']}{source['sections'][category]}"
        print(f"Scraping {source['publisher']} - {category} From URL: {url}")
        articles_data.extend(scrape_articles(url, category))
print("Total articles: ",len(articles_data))

Scraping Reuters - Artificial Intelligence From URL: https://www.reuters.com/technology/artificial-intelligence


NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//button//span[text()='Load more articles']"}
  (Session info: chrome=136.0.7103.114); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x0129FC03+61635]
	GetHandleVerifier [0x0129FC44+61700]
	(No symbol) [0x010C05D3]
	(No symbol) [0x0110899E]
	(No symbol) [0x01108D3B]
	(No symbol) [0x01150E12]
	(No symbol) [0x0112D2E4]
	(No symbol) [0x0114E61B]
	(No symbol) [0x0112D096]
	(No symbol) [0x010FC840]
	(No symbol) [0x010FD6A4]
	GetHandleVerifier [0x01524523+2701795]
	GetHandleVerifier [0x0151FCA6+2683238]
	GetHandleVerifier [0x0153A9EE+2793134]
	GetHandleVerifier [0x012B68C5+155013]
	GetHandleVerifier [0x012BCFAD+181357]
	GetHandleVerifier [0x012A7458+92440]
	GetHandleVerifier [0x012A7600+92864]
	GetHandleVerifier [0x01291FF0+5296]
	BaseThreadInitThunk [0x76747BA9+25]
	RtlInitializeExceptionChain [0x77B1C0CB+107]
	RtlClearBits [0x77B1C04F+191]


In [None]:
print(len(articles_data))
driver.quit()

In [None]:
# Save to CSV
df = pd.DataFrame(articles_data)
df = df.drop_duplicates(subset=['url'])  # Remove duplicates
df.to_csv('reuters_scraped_news_articles.csv', index=False)
print(f"Saved {len(df)} articles to CSV")

# Combined Articles Data

In [61]:
def extract_article_details(url, publisher):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    try:
        start_time = time.time()
        
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Extract publication date
        published_date = None
        
        # Common meta tag approach
        meta_date = soup.find('meta', property='article:published_time')
        if meta_date:
            published_date = meta_date['content']
        else:
            # Fallback for different date formats
            time_tag = soup.find('time')
            if time_tag and 'datetime' in time_tag.attrs:
                published_date = time_tag['datetime']
            elif time_tag:
                published_date = time_tag.text.strip()

        # Convert to standard format
        if published_date:
            try:
                published_date = datetime.fromisoformat(published_date.replace('Z', '+00:00')).strftime('%Y-%m-%d %H:%M:%S')
            except Exception as e:
                print(f"⚠️ Date format conversion failed: {str(e)}")
                print(f"⏳ Keeping original date format: {published_date}")

        # Extract article content based on publisher
        article_content = []
        
        if publisher.lower() == 'bbc':
            content_blocks = soup.find_all('div', {'data-component': 'text-block'})
            article_content = [block.text.strip() for block in content_blocks]
        elif publisher.lower() == 'cnbc':
            content_blocks = soup.select('div.group p')
            article_content = [block.text.strip() for block in content_blocks]
        
        # Join paragraphs with newlines
        full_content = '\n\n'.join(article_content)

        duration = time.time() - start_time
        
        return {
            'published_date': published_date,
            'article_content': full_content
        }

    except Exception as e:
        print(f"❌ Error scraping {url}: {str(e)}")
        return None

In [62]:
# list of all publishers and their data file
publishers = [
    {
    'name':'bbc',
    'file':'bbc_scraped_news_articles.csv',
    },
    {
    'name':'cnbc',
    'file':'cnbc_scraped_news_articles.csv',
    },
]

# Load existing data
df = pd.DataFrame()
for publisher in publishers:
    # n = 5
    # temp = pd.read_csv(publisher['file'], nrows = n)
    temp = pd.read_csv(publisher['file'])
    temp['publisher'] = publisher['name']
    df = pd.concat([df, temp],ignore_index=True)

# Add new columns
df['published_date'] = None
df['article_content'] = None
# df['publisher'] = None

success = 0
total = len(df)
start_time = time.time()

# Scrape additional data with rate limiting
try:
    # Configure tqdm progress bar
    with tqdm(total=total, desc="Scraping Articles", unit="article") as pbar:
        for index, row in df.iterrows():
            result = extract_article_details(row['url'], row['publisher'])
            
            if result:
                df.at[index, 'published_date'] = result['published_date']
                df.at[index, 'article_content'] = result['article_content']
                df.at[index, 'publisher'] = row['publisher']
                success += 1
                
            pbar.update(1)
            time.sleep(0.25)  # Maintain polite delay
            
except KeyboardInterrupt:
    print("\n⚠️ User interrupted the process! Saving current progress...")

finally:
    # Reorder columns and exclude URL
    new_column_order = [
        'published_date',
        'headline', 
        'publisher',
        'article_content',
        'category'
    ]
    
    # Create new DataFrame with desired columns
    final_df = df[new_column_order]
    
    # Save updated CSV
    print("\n💾 Saving progress...")
    final_df.to_csv('combined_news_articles_data.csv', index=False)
    # print(f"💽 Saved {success_count} articles to combined_news_articles_data.csv")
    # print(f"📈 Success rate: {success_count/processed_count:.2%}") if processed_count > 0 else ""
    # Final output
    print(f"\n✅ Completed: {success}/{total} articles successfully scraped")
    print(f"💾 Saving final file...")
    print(f"⏱️ Total processing time: {time.time() - start_time:.2f} seconds")

Scraping Articles: 100%|██████████████████████████████████████████████████████| 1313/1313 [26:59<00:00,  1.23s/article]



💾 Saving progress...

✅ Completed: 1313/1313 articles successfully scraped
💾 Saving final file...
⏱️ Total processing time: 1619.58 seconds
