## Yahoo Finance Data

This code retrieves 1-minute intraday market data for $SPY and writes it to a CSV file.

The date range must match the date range covered by the r/wallstreetbets comment data.

**Note**: There is no need to run this code, as the CSV has already been generated.

In [10]:
import yfinance as yf
import pandas as pd

# Download configuration for the intraday pull.
TICKER = "SPY"  # symbol requested from Yahoo Finance

# Date window handed to yf.download as start/end.
# NOTE(review): yfinance documents `end` as exclusive, so bars for 2025-12-19
# are presumably not fetched even though the output file is named
# "1215_1219" -- confirm this is intended.
START_DATE = "2025-12-15"
END_DATE = "2025-12-19"

INTERVAL = "1m"  # bar size requested from Yahoo Finance
MARKET_TIMEZONE = "America/New_York"  # timezone the bar index is expressed in

def fetch_intraday_data(ticker, start, end, interval):
    """Download intraday bars for ``ticker`` and keep regular-session rows.

    Args:
        ticker: Symbol passed to ``yf.download`` as ``tickers``.
        start: Start date passed through to ``yf.download``.
        end: End date passed through to ``yf.download``.
        interval: Bar interval passed through to ``yf.download`` (e.g. "1m").

    Returns:
        A copy of the downloaded frame, with its index expressed in
        ``MARKET_TIMEZONE``, restricted to rows whose time of day lies
        between 09:30 and 16:00 inclusive. If the download comes back empty,
        the empty frame is returned as-is (no column or timezone handling).
    """
    data = yf.download(
        tickers=ticker,
        start=start,
        end=end,
        interval=interval,
        progress=False,
        auto_adjust=False,
    )

    if data.empty:
        return data

    # A two-level column index is flattened by dropping the second level so
    # the frame exposes plain field names.
    if isinstance(data.columns, pd.MultiIndex):
        data.columns = data.columns.droplevel(1)

    # Express the index in market time. A naive index is localized directly;
    # ambiguous or nonexistent wall-clock times become NaT and are removed by
    # the session filter below. Errors are allowed to propagate: the previous
    # catch-all handler only retried tz_convert, which cannot succeed after a
    # failure here and hid the original exception.
    if data.index.tz is None:
        data.index = data.index.tz_localize(
            MARKET_TIMEZONE, ambiguous='NaT', nonexistent='NaT'
        )
    else:
        data.index = data.index.tz_convert(MARKET_TIMEZONE)

    # Compare on minutes since midnight: 09:30 <= t <= 16:00 (both inclusive).
    # NaT rows yield NaN here, so both comparisons are False and they drop out.
    session_open = 9 * 60 + 30
    session_close = 16 * 60
    minute_of_day = data.index.hour * 60 + data.index.minute
    in_session = (minute_of_day >= session_open) & (minute_of_day <= session_close)

    return data[in_session].copy()

# Pull the configured window of bars, then preview the first rows.
spy_data = fetch_intraday_data(
    ticker=TICKER,
    start=START_DATE,
    end=END_DATE,
    interval=INTERVAL,
)
spy_data.head()

Price,Adj Close,Close,High,Low,Open,Volume
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2025-12-15 09:30:00-05:00,685.150024,685.150024,685.76001,685.109985,685.73999,2297894
2025-12-15 09:31:00-05:00,685.02002,685.02002,685.23999,684.98999,685.150024,433989
2025-12-15 09:32:00-05:00,685.030029,685.030029,685.419983,685.02002,685.02002,288648
2025-12-15 09:33:00-05:00,684.380005,684.380005,685.049988,684.344971,685.030029,330555
2025-12-15 09:34:00-05:00,683.955017,683.955017,684.386292,683.940002,684.369995,510782


In [11]:
from pathlib import Path

# Normalize the downloaded frame and write it to the per-window CSV.
# The steps below are written so the cell can be re-run safely: nothing is
# mutated in place and a missing "Adj Close" column is tolerated.
spy_data = spy_data.drop(columns=["Adj Close"], errors="ignore")
spy_data.columns = spy_data.columns.str.lower()
spy_data = spy_data.rename_axis('timestamp')

# Enforce the documented on-disk schema. Without this the columns keep the
# order of the downloaded frame rather than the order promised below.
spy_data = spy_data[["open", "high", "low", "close", "volume"]]

output_path = Path("data/spy/1215_1219.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)  # to_csv does not create directories
spy_data.to_csv(output_path, index=True)  # timestamp,open,high,low,close,volume

### Combine Datasets

In [15]:
import glob
import os

# Concatenate every per-window CSV in `input_dir` into one combined file.
input_dir = 'data/spy'

# glob returns paths in arbitrary, filesystem-dependent order; sort them so
# the row order of the combined file is reproducible across machines.
# NOTE: rows therefore follow filename order, not a timestamp sort.
all_files = sorted(glob.glob(os.path.join(input_dir, "*.csv")))
if not all_files:
    # pd.concat would otherwise fail with an opaque "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found in {input_dir!r}")

# Each file is read with its own 0..n-1 index; rebuild a single clean index
# instead of carrying duplicated per-file labels.
df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)
df.to_csv("data/full_manual_spy.csv", index=False)