In [4]:
import json
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException, NoSuchElementException

def is_valid_article_link(url):
    """Return True when *url* should be scraped as an article.

    Args:
        url: The ``href`` value read from a headline anchor. May be ``None``
            or empty when the anchor carries no ``href`` attribute.

    Returns:
        ``False`` for a missing/empty URL or for any URL containing
        ``beap.gemini.yahoo.com`` (treated as a non-article link);
        ``True`` otherwise.
    """
    # A missing href would otherwise raise TypeError on the ``in`` test
    # below, and an empty one is not something the browser can navigate to.
    if not url:
        return False
    return "beap.gemini.yahoo.com" not in url

# Launch Chrome using a driver binary resolved by webdriver_manager.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 10)  # explicit waits give up after 10 seconds
main_page = "https://finance.yahoo.com/topic/stock-market-news"

# Optional (currently disabled): scroll down the page so that more
# dynamically loaded content is present before collecting the news items.
# body = driver.find_element(By.TAG_NAME, 'body')
# for _ in range(3):  # Adjust the number for more/less scrolling as needed
#     body.send_keys(Keys.PAGE_DOWN)
#     time.sleep(1)  # Wait for the page to load

# One dict per scraped article: {'title', 'link', 'content'}.
articles_data = []

try:
    # Navigation happens inside the try so the browser is always shut down
    # (and the output file written) even if the listing page fails to load.
    driver.get(main_page)
    # Remember the listing tab explicitly instead of relying on handle order.
    main_handle = driver.current_window_handle

    news_items = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'li[class*="js-stream-content"]')))
    for item in reversed(news_items):  # Iterate from bottom to top for stability
        data = {'title': '', 'link': '', 'content': 'Content not available.'}
        try:
            title_element = item.find_element(By.TAG_NAME, 'h3')
            link = title_element.find_element(By.XPATH, './/ancestor::a').get_attribute('href')

            if not is_valid_article_link(link):
                print(f"Skipping non-article link: {link}")
                continue  # Skip non-article links

            data['title'] = title_element.text
            data['link'] = link

            # Open the article in a new tab so the listing page (and the
            # element references in news_items) stays loaded.
            driver.execute_script("window.open('');")
            driver.switch_to.window(driver.window_handles[-1])
            driver.get(link)
            try:
                caas_body = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.caas-body')))
                paragraphs = caas_body.find_elements(By.TAG_NAME, 'p')
                article_text = ' '.join([paragraph.text for paragraph in paragraphs if paragraph.text])
                data['content'] = article_text if article_text else "Content not available."
            except (TimeoutException, NoSuchElementException):
                # Keep the article with its placeholder content.
                print(f"Content not found or page timed out for: {link}")

            articles_data.append(data)
        except Exception as e:
            # Best effort: one broken item must not abort the whole run.
            print(f"Error processing article: {e}")
        finally:
            # Close every tab except the listing tab, on success and on
            # error alike, so the main window can never be closed by mistake.
            for handle in driver.window_handles:
                if handle != main_handle:
                    driver.switch_to.window(handle)
                    driver.close()
            driver.switch_to.window(main_handle)
finally:
    try:
        driver.quit()
    finally:
        # Persist whatever was collected even if shutting the browser fails.
        with open('articles_data.json', 'w', encoding='utf-8') as f:
            json.dump(articles_data, f, ensure_ascii=False, indent=4)
        print("Data has been saved to 'articles_data.json'")


Skipping non-article link: https://a.beap.gemini.yahoo.com/mbclk?bv=3.0.0&es=AQAAAIYNaSJpy4qYp5XCyzea3Q1MxdEi_xl6VAotWSdSxN3A-B5REkpWGGm_cG1mpCb1jpxEBq7PltcDQYaRGMoS9UKdx-ewJFtjXB_pTVEHv_hKSvf-7RqI0LDXDFGEY9BcbxTbAoEaUpohf_2Kg7-WRZG-kjtDsgtoCls_Bowf_CAuXVcN6DzjG9TNq3NTpR5nV7vpGmn0lZVnv7vtvuy_KAcBjBb5a_NqQfEvw_4g3wbW6Nq3dotglOLaFhEdXQr0FP2JtTi3bgxWWThA5YqMzXP8gdpYSmRs9zc39PyMGd01ytClQlYAtwUJZ3dzqFEj53r3d8xOBpRvMSDXYz2pQBMYx0zIK1n4jVZfurF9NKadRDi5plZA5zjSKHT_CWy-tjy2OhS4510mvR6nrYnESdzdK1mgtlUlbkiDMgU2WN-VcS0Cmdrei4kiR-iuH-nd_wKhvhqTSIlxUUZ7GCNV5A4pqfNhx0lCm0RQ60bVPQ21Xu23IH8n_cLAoAF5oMNWNjvwad35sMjfgC9pmJ2ypRcsA8Vws3YJt5WzWmAIZpx5jRDW0vpTdkoH_-ktKqC6VBj23k1N2P_sYgg-4PTsKrhsDHEYypBt9NhAFl4sdEnDp8-w6Skhp21J4I3O4FZOAj1ckc2W8ypf9o9D0II0zr3gfZsKqRGTt7utidPfbJUvuaZCdhPZRGZaD1wgtp7c_OEvtCgZSc5DTJ4UD_3jTYZiVBdG80kELcToVHBNWM4-qr6cTBfKnsuVdLkdUFDectjfS38fdnskbzRrPkiajZbj6LJqaS4QBlLTV2n7jWaFaVA5TfltEcPAafBQE6Yf6ketb-i9yBqAZo3xw94dt3i2QkqyyXTDGagwkdj2dNclEXBgYHG83Cyh-PN1c-9-XzpRiIW1cGoMeN5UBISBDib