In [58]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time


class YahooFinanceScraper:
    """Drive a Chrome browser through Yahoo Finance and collect the "Most Active" table.

    Typical use is ``YahooFinanceScraper().run()``, which opens the homepage,
    navigates Markets -> Trending Tickers -> Most Active, walks every page of
    the table and returns the rows as a list of dicts holding raw cell text.

    Attributes:
        url: Page opened first.
        timeout: Seconds used by the shared explicit wait (``self.wait``).
        driver: The Selenium Chrome driver created in ``__init__``.
        wait: ``WebDriverWait`` bound to ``driver`` and ``timeout``.
        data: Accumulated row dicts; appended to by ``scrape_table_data``.
    """

    # Record field -> index of the <td> cell it is read from (insertion order
    # is the key order of every scraped record).
    _COLUMN_INDEX = {
        "name": 1,
        "symbol": 0,
        "price": 3,
        "change": 4,
        "volume": 6,
        "market_cap": 8,
        "pe_ratio": 9,
    }
    # A row must have at least this many cells for every index above to exist.
    _MIN_CELLS = max(_COLUMN_INDEX.values()) + 1

    # Set to True by close() so the browser is only quit once.
    _closed = False

    def __init__(self, url="https://finance.yahoo.com/", timeout=20):
        """Start Chrome and prepare the shared wait.

        Args:
            url: Page to open in ``open_homepage``.
            timeout: Seconds the shared ``WebDriverWait`` waits before raising.
        """
        self.url = url
        self.timeout = timeout
        self.driver = webdriver.Chrome()
        # NOTE(review): hard-coded coordinates outside a typical primary
        # screen; presumably targets a second monitor -- confirm.
        self.driver.set_window_position(-1280, 1080)
        self.driver.maximize_window()
        self.wait = WebDriverWait(self.driver, self.timeout)
        self.data = []

    def close(self):
        """Quit the browser; calling this more than once is a no-op."""
        if self._closed:
            return
        self._closed = True
        self.driver.quit()

    def wait_for_page_ready(self, timeout=5):
        """Block until ``document.readyState`` is "interactive" or "complete".

        Args:
            timeout: Seconds to wait (independent of ``self.timeout``).

        Raises:
            TimeoutException: If the state is still "loading" after
                ``timeout`` seconds. The browser is closed before re-raising.
        """
        try:
            print("⏳ Waiting for document.readyState ≠ 'loading'...")
            WebDriverWait(self.driver, timeout).until(
                lambda d: d.execute_script("return document.readyState") in ["interactive", "complete"]
            )
            print("✅ Page DOM is ready")
        except TimeoutException:
            print("❌ Timeout waiting for DOM readiness")
            self.close()
            raise

    def open_homepage(self):
        """Load ``self.url`` and wait for the DOM to be ready."""
        print(f"Opening URL: {self.url}")
        self.driver.get(self.url)
        self.wait_for_page_ready()

    def navigate_to_trending_tickers(self):
        """Hover the 'Markets' tab and click its 'Trending Tickers' entry.

        Both elements are located with positional XPaths, so this depends on
        the current page layout.
        """
        print("Waiting for 'Markets' tab...")
        markets_tab = self.wait.until(EC.presence_of_element_located((
            By.XPATH, "//header//ul/li[3]/a/span[text()='Markets']"
        )))

        # The submenu link only becomes clickable after hovering the tab.
        print("Hovering on 'Markets' tab...")
        actions = ActionChains(self.driver)
        actions.move_to_element(markets_tab).perform()

        print("Waiting for and clicking 'Trending Tickers'...")
        trending_link = self.wait.until(EC.element_to_be_clickable((
            By.XPATH, "/html[1]/body[1]/div[2]/header[1]/div[1]/div[1]/div[1]/div[4]/div[1]/div[1]/ul[1]/li[3]/div[1]/ul[1]/li[4]/a[1]/div[1]"
        )))
        trending_link.click()
        self.wait_for_page_ready()
        print("✅ 'Trending Tickers' page loaded successfully.")

    def click_most_active(self):
        """Click the 'Most Active' tab and wait for the DOM to be ready."""
        print("Clicking on 'Most Active'...")
        most_active = self.wait.until(EC.element_to_be_clickable((
            By.XPATH, "/html[1]/body[1]/div[2]/main[1]/section[1]/section[1]/section[1]/article[1]/section[1]/div[1]/nav[1]/ul[1]/li[1]/a[1]/span[1]"
        )))
        most_active.click()
        self.wait_for_page_ready()

    @classmethod
    def _row_to_record(cls, cells):
        """Build one record dict from a row's <td> elements (raw ``.text``)."""
        return {field: cells[index].text for field, index in cls._COLUMN_INDEX.items()}

    def scrape_table_data(self):
        """Append every row of every table page to ``self.data``.

        Pagination stops when the "next page" button does not become
        clickable within ``self.timeout`` seconds. Any other error raised
        while paging propagates, so a failure is not mistaken for the end of
        the table. Rows with fewer cells than the scraper reads are skipped
        with a warning instead of raising ``IndexError``.
        """
        print("📊 Starting to scrape data...")
        while True:
            self.wait.until(EC.presence_of_element_located((
                By.XPATH, '//*[@id="nimbus-app"]/section/section/section/article/section[1]/div/div[2]/div/table'
            )))

            rows = self.driver.find_elements(By.CSS_SELECTOR, "table tbody tr")
            for row in rows:
                cells = row.find_elements(By.TAG_NAME, "td")
                if len(cells) < self._MIN_CELLS:
                    print(f"⚠️ Skipping row with {len(cells)} cells (expected at least {self._MIN_CELLS}).")
                    continue
                self.data.append(self._row_to_record(cells))

            try:
                next_button = self.wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, "button[aria-label='Goto next page']"))
                )
            except TimeoutException:
                # The button never became clickable: treated as the last page.
                print("✅ No more pages to scrape.")
                break
            next_button.click()
            # Fixed pause to let the next page's rows render before re-reading.
            time.sleep(1)

    def run(self):
        """Execute the whole scrape and return the collected records.

        The browser is always closed, including when a step raises.

        Returns:
            ``self.data``: a list of dicts with keys name, symbol, price,
            change, volume, market_cap and pe_ratio (all raw strings).
        """
        print("🚀 Launching Yahoo Finance Scraper...")
        try:
            self.open_homepage()
            self.navigate_to_trending_tickers()
            self.click_most_active()
            self.scrape_table_data()
        finally:
            self.close()
        print(f"✅ Scraping complete. {len(self.data)} records fetched.")
        return self.data


In [59]:
# Start a Chrome session (the constructor launches the browser) and run the
# full scrape. `scraped_data` is the list of per-stock dicts returned by
# run(); every value in it is still raw cell text.
scraper = YahooFinanceScraper()
scraped_data = scraper.run()


🚀 Launching Yahoo Finance Scraper...
Opening URL: https://finance.yahoo.com/
⏳ Waiting for document.readyState ≠ 'loading'...
✅ Page DOM is ready
Waiting for 'Markets' tab...
Hovering on 'Markets' tab...
Waiting for and clicking 'Trending Tickers'...
⏳ Waiting for document.readyState ≠ 'loading'...
✅ Page DOM is ready
✅ 'Trending Tickers' page loaded successfully.
Clicking on 'Most Active'...
⏳ Waiting for document.readyState ≠ 'loading'...
✅ Page DOM is ready
📊 Starting to scrape data...
✅ No more pages to scrape.
✅ Scraping complete. 260 records fetched.


In [61]:
# Convert the scraped records (raw strings) into a typed DataFrame.
import pandas as pd

# Columns produced by the scraper, in output order. Passing them explicitly
# means the expected columns exist even when nothing was scraped.
_STOCK_COLUMNS = ["name", "symbol", "price", "change", "volume", "market_cap", "pe_ratio"]

# Power of ten denoted by each magnitude suffix in abbreviated figures.
_SUFFIX_EXPONENT = {"K": 3, "M": 6, "B": 9, "T": 12}


def _scaled_number(text, unit_exponent):
    """Convert an abbreviated figure such as "1.2B" to units of 10**unit_exponent.

    Args:
        text: Cell text, e.g. "345.67M", "1,234", "2.5T" or "--".
        unit_exponent: Power of ten of the output unit (6 = millions,
            9 = billions).

    Returns:
        The value as a float, or NaN for an empty cell or the "--" placeholder.

    Raises:
        ValueError: If the text is neither a placeholder nor a number, so a
            misaligned column is not silently turned into NaN.
    """
    cleaned = str(text).strip().replace(",", "")
    if cleaned in ("", "--"):
        return float("nan")
    exponent = _SUFFIX_EXPONENT.get(cleaned[-1].upper())
    if exponent is None:
        exponent = 0  # no suffix: the figure is already in plain units
    else:
        cleaned = cleaned[:-1]
    value = float(cleaned)
    shift = exponent - unit_exponent
    # Multiply or divide by an exact integer power of ten so that, e.g.,
    # "345.67M" in millions stays exactly 345.67.
    return value * (10 ** shift) if shift >= 0 else value / (10 ** (-shift))


stocks_df = (
    pd.DataFrame(scraped_data, columns=_STOCK_COLUMNS)
    .apply(lambda col: col.str.strip() if col.dtype == "object" else col)  # Strip whitespace from string columns
    .assign(
        # Thousands separators must go before numeric conversion.
        price=lambda df_: pd.to_numeric(df_.price.str.replace(",", "", regex=False)),
        # regex=False: "+" is a literal here, not a regex quantifier.
        change=lambda df_: pd.to_numeric(
            df_.change
            .str.replace("+", "", regex=False)
            .str.replace(",", "", regex=False)
        ),
        volume=lambda df_: df_.volume.map(lambda val: _scaled_number(val, 6)),
        market_cap=lambda df_: df_.market_cap.map(lambda val: _scaled_number(val, 9)),
        # Series.replace only matches whole values, so commas inside numbers
        # need the .str accessor; "--" is coerced to NaN.
        pe_ratio=lambda df_: pd.to_numeric(
            df_.pe_ratio.str.replace(",", "", regex=False), errors="coerce"
        ),
    )
    .rename(columns={"price": "price_usd", "volume": "volume_millions", "market_cap": "market_cap_billions"})
)

In [62]:
# Write the cleaned table to an Excel workbook at a relative path, leaving
# out the DataFrame index column.
stocks_df.to_excel("yahoo-stocks-data.xlsx", index=False)