In [3]:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


# Retrieves the 30 Dow Jones Industrial Average stock symbols from CNBC

# Scrape the ticker symbols listed on CNBC's Dow 30 page with a headless
# Firefox session and store them, space-separated, in the file 'DJIA-30'.
options = Options()
options.add_argument("--headless")
driver = webdriver.Firefox(options=options)

try:
    driver.get("https://www.cnbc.com/dow-30/")

    # Wait up to 8 seconds for links whose href contains "quotes" to be present.
    symbols = WebDriverWait(driver, 8).until(
        EC.presence_of_all_elements_located((By.XPATH, '//a[contains(@href, "quotes")]'))
    )

    # Read each element's text exactly once and drop links that have no text.
    symbol_texts = [text for text in (symbol.text for symbol in symbols) if text]

    if len(symbol_texts) == 30:
        # Open the output file only after a complete scrape, so a failed or
        # partial run never truncates a previously written symbol list.
        with open('DJIA-30', 'w') as f:
            # Same on-disk format as before: every symbol followed by a space.
            f.write(''.join(f'{symbol} ' for symbol in symbol_texts))
    else:
        print('missing symbols')

except Exception as e:
    print(f"Error: {e}")

finally:
    # Always shut the browser down, whether or not the scrape succeeded.
    driver.quit()


In [5]:
import finnhub
import os
import json
from datetime import datetime, timedelta
import time

# Download roughly one year of company news for every symbol listed in
# 'DJIA-30' from Finnhub, in 30-day windows, and save it to 'stock_data.json'
# as {symbol: [article, ...]}.
API_KEY = os.environ.get('FINNHUB_API_KEY')
if not API_KEY:
    # Fail fast: without a key every request below would fail, one per second.
    raise RuntimeError('FINNHUB_API_KEY environment variable is not set')
finnhub_client = finnhub.Client(api_key=API_KEY)

with open('DJIA-30', 'r') as file:
    SYMBOLS = file.read().split()

end_date = datetime.today()
from_date = end_date - timedelta(days=365)  # single clock reading for both bounds
delta = timedelta(days=30)  # 30 day intervals

stock_data = {symbol: [] for symbol in SYMBOLS}

for SYMBOL in SYMBOLS:
    current_date = from_date
    # Consecutive windows share a boundary day (one window's `to` is the next
    # window's `_from`), so an article published on that day can come back
    # twice if the API treats both bounds as inclusive. Track ids per symbol
    # to keep a single copy of each article.
    seen_ids = set()

    while current_date < end_date:
        # End of batch time, clamped so the last window does not run past today.
        to_date = min(current_date + delta, end_date)
        try:
            news_chunk = finnhub_client.company_news(
                SYMBOL,
                _from=current_date.strftime('%Y-%m-%d'),
                to=to_date.strftime('%Y-%m-%d'),
            )
            for article in news_chunk:
                article_id = article.get('id')
                if article_id is not None:
                    if article_id in seen_ids:
                        continue
                    seen_ids.add(article_id)
                # Articles without an id cannot be de-duplicated; keep them all.
                stock_data[SYMBOL].append(article)
        except Exception as e:
            # Best effort: report the failed window and carry on with the next.
            print(f"Error fetching data for {SYMBOL}: {e}")

        current_date += delta  # Move to next batch
        time.sleep(1)  # API rate limit

with open('stock_data.json', 'w') as outfile:
    json.dump(stock_data, outfile, indent=4)


In [10]:
import json
import csv
from datetime import datetime

# Flatten the per-symbol news in 'stock_data.json' into rows of
# (Date, Stock, Headline) and write them to 'cleaned_stock_data.csv'.
# Every other field of a news record is discarded.
with open('stock_data.json', 'r', encoding='utf-8') as file:
    stock_data = json.load(file)

order = ['Date', 'Stock', 'Headline']  # CSV column order

cleaned_data = []
for symbol, news_list in stock_data.items():
    for news in news_list:
        # Build each output row directly instead of mutating the loaded record.
        # NOTE(review): fromtimestamp() converts using the machine's local
        # timezone, so the calendar date can differ between machines — confirm
        # which timezone the dates are meant to be in.
        cleaned_data.append({
            'Date': datetime.fromtimestamp(news['datetime']).strftime('%Y-%m-%d'),
            'Stock': symbol,
            'Headline': news['headline'],
        })

# Explicit UTF-8: headlines may contain non-ASCII characters, which the
# platform default encoding cannot always represent.
with open('cleaned_stock_data.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=order)
    writer.writeheader()
    writer.writerows(cleaned_data)
