In [149]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

In [150]:
# Initialize browser
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 15)

In [151]:
# BBC News section URLs
section_urls = [
    'https://www.bbc.com/innovation',
    'https://www.bbc.com/business',
    'https://www.bbc.com/news/science_and_environment',
    'https://www.bbc.com/culture/entertainment-news',
    'https://www.bbc.com/arts',
    'https://www.bbc.com/travel'
]

In [152]:
def gradual_scroll():
    """Scroll gradually to trigger lazy-loading"""
    current_position = 4500
    scroll_increment = 500  # pixels
    max_attempts = 10
    
    for _ in range(max_attempts):
        driver.execute_script(f"window.scrollTo(0, {current_position});")
        current_position += scroll_increment
        time.sleep(1.0)  # Adjust based on connection speed
        
        # Check if "More" section becomes visible
        try:
            driver.find_element(By.XPATH, "//h2[contains(text(), 'More')]")
            print("Found 'More' section!")
            return True
        except:
            continue

    print("Failed to find 'More on' section after scrolling")
    return False

In [153]:
def scrape_articles(url):
    driver.get(url)
    articles_data = []
    
    if not gradual_scroll():
        return articles_data

    # print("waiting to read links")
    # time.sleep(10)
    
    while True:
        try:
            # Get all article cards
            cards = driver.find_elements(By.CSS_SELECTOR, 'div[class*="sc-c5803051-0 izsbSA"]')
            # cardstack = driver.find_elements(By.CSS_SELECTOR, 'a[data-testid="internal-link"][class*="sc-c5803051-0 izsbSA"]')
            
            # Extract data from each card
            for card in cards:
                try:
                    url = card.get_attribute('href')
                    # headline = card.find_element(By.CSS_SELECTOR, 'h2[data-testid="card-headline"]').text
                    # category = card.find_element(By.CSS_SELECTOR, 'span[data-testid="card-metadata-tag"]').text
                    
                    articles_data.append({
                        # 'headline': headline,
                        # 'category': category,
                        'url': url
                    })
                except Exception as e:
                    print(f"Error processing card: {str(e)}")
                    continue
            print(f"Collected {len(articles_data)} articles so far")
        
            # Scroll to bottom to expose pagination controls
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)
            # scroll up a bit
            driver.execute_script("window.scrollBy(0, -300)")
            time.sleep(1)
            
            # Find and click next button
            next_button = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, 'button[data-testid="pagination-next-button"]:not([disabled])')))
            next_button.click()
            print("Clicked Next Page button")
            
            # Reset scroll position for new page
            driver.execute_script("window.scrollTo(0, 0);")
            time.sleep(1.5)
            gradual_scroll()  # Find "More" section again
            
        except Exception as e:
            print(f"Pagination ended: {str(e)}")
            break
            
    return articles_data

In [154]:
# Example usage:
url = "https://www.bbc.com/innovation"
articles_data = scrape_articles(url)

Found 'More' section!
Collected 1 articles so far
Clicked Next Page button
Found 'More' section!
Collected 2 articles so far
Clicked Next Page button
Found 'More' section!
Collected 3 articles so far
Clicked Next Page button
Found 'More' section!
Collected 4 articles so far
Clicked Next Page button
Found 'More' section!
Collected 5 articles so far
Clicked Next Page button
Found 'More' section!
Collected 6 articles so far
Clicked Next Page button
Found 'More' section!
Collected 7 articles so far
Clicked Next Page button
Found 'More' section!
Collected 8 articles so far
Clicked Next Page button
Found 'More' section!
Collected 9 articles so far
Clicked Next Page button
Found 'More' section!
Collected 10 articles so far
Clicked Next Page button
Found 'More' section!
Collected 11 articles so far
Pagination ended: Message: 
Stacktrace:
	GetHandleVerifier [0x0129FC03+61635]
	GetHandleVerifier [0x0129FC44+61700]
	(No symbol) [0x010C05D3]
	(No symbol) [0x0110899E]
	(No symbol) [0x01108D3B]
	(No

In [155]:
print(len(articles_data))

11


In [156]:
print(articles_data)
driver.quit()

[{'url': None}, {'url': None}, {'url': None}, {'url': None}, {'url': None}, {'url': None}, {'url': None}, {'url': None}, {'url': None}, {'url': None}, {'url': None}]


In [132]:
# Save to CSV
df = pd.DataFrame(articles_data)
df = df.drop_duplicates(subset=['url'])  # Remove duplicates
df.to_csv('bbc_scraped_news_articles.csv', index=False)
print(f"Saved {len(df)} articles to CSV")

NameError: name 'all_articles' is not defined