In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from datetime import datetime, timedelta

In [64]:
# Number of result pages the scraping loop walks through.
NUM_OF_PAGES = 25
# Number of <li> items read per page; also the index of the item the loop
# waits for before it starts reading a page.
NUM_OF_ELEMENTS = 10
# Absolute XPath of the <ul> holding the headline items; "/li[i]/..." is
# appended to it to address a single item.
# NOTE(review): an absolute XPath depends on the exact page layout - confirm it
# still matches before a fresh run.
BASE_HTML_PATH = "/html/body/div[2]/div/main/div[2]/article/div/div[2]/div/div[2]/div[2]/div[2]/ul"
# Page opened by the WebDriver.
URL = "https://www.nasdaq.com/market-activity/stocks/amzn/news-headlines"

In [None]:
# Initializing the Chrome WebDriver.
# The resulting `driver` is reused by the scraping cell and released by the
# final `driver.quit()` cell.
driver = webdriver.Chrome()

# Navigating to the target URL
driver.get(URL)

In [None]:
# Scrape news headlines and publication dates, page by page.
#
# Rows are collected in a plain list and converted to a DataFrame once at the
# end: growing a DataFrame with pd.concat inside the loop re-copies every
# previously collected row on each iteration.
records = []

try:
    for page_index in range(NUM_OF_PAGES):

        # Block until the last expected item of the list is present.
        # NOTE(review): right after a "next" click this condition may already
        # hold for the previous page's list - confirm pages are not read twice.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (
                    By.XPATH,
                    f"{BASE_HTML_PATH}/li[{NUM_OF_ELEMENTS}]/div[1]/a/span",
                )
            )
        )

        for i in range(1, NUM_OF_ELEMENTS + 1):
            item_xpath = f"{BASE_HTML_PATH}/li[{i}]/div[1]"
            try:
                # Locating and extracting the news headline
                headline = driver.find_element(
                    By.XPATH, f"{item_xpath}/a/span"
                ).text

                # Locating and extracting the publication date
                published = driver.find_element(
                    By.XPATH, f"{item_xpath}/div/span[1]"
                ).text
            except NoSuchElementException:
                # Stop reading this page at the first incomplete item.
                break

            records.append({"news_header": headline, "news_date": published})

        # Nothing is read after the last page, so do not navigate past it.
        if page_index == NUM_OF_PAGES - 1:
            break

        # `until` returns the located element, so it can be clicked directly
        # instead of being looked up a second time.
        next_button = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".pagination__next"))
        )
        next_button.click()
finally:
    # Built in `finally` so rows gathered before a timeout are still available.
    df = pd.DataFrame(records, columns=["news_header", "news_date"])

In [68]:
def _split_date(date: str) -> int:
    """Return the leading count of a relative date label.

    ``"5 hours ago"`` gives ``5``. The articles ``"a"`` / ``"an"`` (as in
    ``"an hour ago"``) count as ``1``.

    Raises:
        IndexError: if ``date`` is empty or whitespace only.
        ValueError: if the first token is neither an integer nor an article.
    """
    first_token = date.split()[0]
    if first_token.lower() in ("a", "an"):
        return 1
    return int(first_token)


# Keyword looked for in a relative label -> name of the timedelta argument it
# scales. Keywords are tried in this order.
_RELATIVE_UNITS = (
    ("minute", "minutes"),
    ("hour", "hours"),
    ("day", "days"),
    ("week", "weeks"),
    ("second", "seconds"),
)


def convert_to_datetime(date_str):
    """Convert a scraped date label into a timestamp.

    Absolute labels such as ``"Nov 24, 2024"`` are parsed with the
    ``"%b %d, %Y"`` format. Relative labels such as ``"5 hours ago"`` or
    ``"an hour ago"`` are subtracted from the current local time. Any other
    input falls back to the current local time.

    Args:
        date_str: The date label to convert.

    Returns:
        The result of ``pd.to_datetime`` for absolute labels; otherwise a
        ``pandas.Timestamp`` (a ``datetime`` subclass) derived from
        ``datetime.now()``.
    """
    # Checking if it is full date
    try:
        return pd.to_datetime(date_str, format="%b %d, %Y")
    except (ValueError, TypeError):
        pass

    # Getting current time; wrapped so every branch returns the same type.
    now = pd.Timestamp(datetime.now())

    # Substring checks below only make sense for text.
    if not isinstance(date_str, str):
        return now

    for keyword, unit in _RELATIVE_UNITS:
        if keyword in date_str:
            try:
                amount = _split_date(date_str)
            except (ValueError, IndexError):
                # e.g. "yesterday" contains "day" but has no leading count:
                # treat it as an unknown format instead of crashing.
                return now
            return now - timedelta(**{unit: amount})

    return now  # Returning current date if format unknown

In [69]:
# Parse every scraped label, then store the dates back as "YYYY-MM-DD" text.
parsed_dates = df["news_date"].apply(convert_to_datetime)
df["news_date"] = parsed_dates.dt.strftime("%Y-%m-%d")

In [70]:
# Display the DataFrame for debugging purposes (the bare last expression of a
# cell is rendered as its output).
df

Unnamed: 0,news_header,news_date
0,The Bull Market Keeps Growing: 3 Reasons to Bu...,2024-11-24
1,Target Stock Keeps Missing the Mark. Is It Tim...,2024-11-24
2,"Could Investing $10,000 in QuantumScape Make Y...",2024-11-24
3,1 Monster Growth Stock Down 70% to Buy Right Now,2024-11-24
4,3 No-Brainer Fintech Stocks to Buy Right Now f...,2024-11-23
...,...,...
245,Rob Sanderson Beats the Drum on Amazon Stock,2024-11-10
246,Box Office Battle: ‘Venom: The Last Dance’ sta...,2024-11-10
247,"Will These 4 ""Magnificent Seven"" Tech Stocks G...",2024-11-10
248,Magnificent Seven: Unstoppable Tech Giants or ...,2024-11-10


In [None]:
# Saving results to a semicolon-separated file named "DailyNewsHSTO" (no file
# extension). `index` is left at its default, so the row index is written as
# the first column.
df.to_csv("DailyNewsHSTO", sep=";")

In [63]:
# Shut down the WebDriver session started in the initialization cell.
driver.quit()