In [22]:
import json
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException, NoSuchElementException

def is_valid_article_link(url) -> bool:
    """Return True when *url* looks like a link worth opening as an article.

    Args:
        url: The ``href`` value read from a stream item. May be ``None`` or
            empty when the anchor carries no ``href`` attribute.

    Returns:
        ``False`` for a missing/empty link and for any URL containing
        ``beap.gemini.yahoo.com`` (treated by this scraper as a non-article
        link); ``True`` otherwise.
    """
    # A missing href comes back as None; the substring test below would raise
    # TypeError on it, so reject missing/empty links up front.
    if not url:
        return False
    # Filter out non-article URLs
    return "beap.gemini.yahoo.com" not in url

# Scrape the Yahoo Finance stock-market news stream: collect each story's
# title, link and first paragraph, then write everything to
# 'articles_data.json'.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 10)
main_page = "https://finance.yahoo.com/topic/stock-market-news"
articles_data = []

# Everything that touches the browser lives inside this try so the driver is
# always shut down and whatever was collected is always written out, even when
# the initial page load or the scrolling fails.
try:
    driver.get(main_page)
    # Remember the listing tab so cleanup can never close it by mistake.
    main_tab = driver.current_window_handle

    # Scroll to the end of the page to ensure all dynamic content is loaded
    body = driver.find_element(By.TAG_NAME, 'body')
    for _ in range(20):  # We can also increase the number to have more scroll - i.e. more data
        body.send_keys(Keys.PAGE_DOWN)
        time.sleep(1)  # Wait for the page to load

    news_items = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'li[class*="js-stream-content"]')))
    for item in reversed(news_items):  # Iterate from bottom to top
        data = {'title': '', 'link': '', 'first_paragraph': 'First paragraph not available.'}
        try:
            title_element = item.find_element(By.TAG_NAME, 'h3')
            link = title_element.find_element(By.XPATH, './/ancestor::a').get_attribute('href')

            if not is_valid_article_link(link):
                print(f"Skipping non-article link: {link}")
                continue  # Skip this iteration and proceed with the next one

            data['title'] = title_element.text
            data['link'] = link

            # Open the article in a new tab to avoid losing the list of
            # news_items; new_window() opens the tab and focuses it in one
            # step, so no assumption about window handle order is needed.
            driver.switch_to.new_window('tab')
            driver.get(link)
            try:
                caas_body = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.caas-body')))
                first_paragraph = caas_body.find_element(By.TAG_NAME, 'p').text
                data['first_paragraph'] = first_paragraph if first_paragraph else "First paragraph not available."
            except (TimeoutException, NoSuchElementException):
                print(f"Content not found or page timed out for: {link}")

            articles_data.append(data)
        except Exception as e:
            # Best-effort scraping: report the failure and move on to the
            # next item instead of aborting the whole run.
            print(f"Error processing article: {e}")
        finally:
            # Close every tab except the listing tab, whichever one currently
            # has focus, then return to the listing tab for the next item.
            for handle in driver.window_handles:
                if handle != main_tab:
                    driver.switch_to.window(handle)
                    driver.close()
            driver.switch_to.window(main_tab)
finally:
    try:
        driver.quit()
    finally:
        # Save even if shutting the browser down raised, so scraped data is
        # never lost to a teardown error.
        with open('articles_data.json', 'w', encoding='utf-8') as f:
            json.dump(articles_data, f, ensure_ascii=False, indent=4)
        print("Data has been saved to 'articles_data.json'")


Skipping non-article link: https://a.beap.gemini.yahoo.com/mbclk?bv=3.0.0&es=AQAAAJ-1PW32-ATAZWfzqoY-zeY_YPlrQ_fInf3gGKMgnYDJV6YyxNNAorodKC1JFyR_myOnWigHj4qFCc58nMLLX5ov0gUWrfZ4KH0nI96hCVSJTVhVD5VTJ5bQNlNO4YEtiY3D1a00Ojh2-SShUqYYkjqfQsNoYX92orDpzlekJHXIc0AesjjiSfyvc75R2egms_tyOYvDyrZCPqmuOYkWejdRquT0ZL9KnXBN8sPyXWpxNUyu67ZQtDDjG6gQav4wOmizzcGuGgV6z3Njjw1Wehi0cV6lzfq5SovTMXALsq0Ic_X6jM4IeGoNKNuXLhVNKQYJCVVEHJNyMKjIwPWa8mg5Mb1X2_YUho5TT1Pc1NrWdPU_trecNgMFqszDUByQCHeYVkhzIb5NNxvPnEVX4BSVR40Dz0kGd3dKl8QFD2LRSPu1P-880cgFOtkjwXM5PrYkkZ_iUfp19EnersIN7Mhk85bxTkGvFX69U1Eb61uG_2PIAKOE-fWG8IwXlSHsm4A5CSzE469AvsGgYOCNE7YKOLRTIV020wQ_goBKKBZzxz1QOea5KGwxDL1DODIK6UTywfR2MuDPtxgbRc_xdRWDX9D0or8hru9IAVAhDWLhvGYiNU-Zizu5KYp4kpIDqEKk1q7RchJJCTV6ujzcn_1aV7uexf-0-_MJIPclYatzcgwsuMHCfuBz--d-1RxfJxsKjxGbVMnilll8iJYJ-xqdIlP1QpqWQ8wzmAaBSPzZZpXdrbJiiHfj0eGu9k-joF0PjwiADI0qPd_7kkhywcYXK2_3W-NqtjE5PkEy_LpoY0PoYSZAFGJsaXhqL5wVQ5yyQphG0THKxJStalMrsAOp-WWqshVJsIi6yDSQkSbd3jOYIRWq8PmPK7gY-K_C8IW3OVqZ68Am1nYbdw7tK2d