In [1]:
from pathlib import Path

# Resolve project locations relative to the directory the kernel was started in.
NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = NOTEBOOK_DIR.parent  # one level above the working directory

# Raw equity files live under <project root>/data/raw/equities.
DATA_DIR = PROJECT_ROOT.joinpath("data")
RAW_EQUITIES_DIR = DATA_DIR.joinpath("raw", "equities")

# Echo the resolved locations, one per line, as a quick sanity check.
print(PROJECT_ROOT, RAW_EQUITIES_DIR, sep="\n")

c:\Users\axelz\Programming\Quant-ML\Quant-ML
c:\Users\axelz\Programming\Quant-ML\Quant-ML\data\raw\equities


In [None]:
import sys
import requests
import pandas as pd
from io import StringIO

# Page that lists the S&P 500 constituents.
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"

# Browser-like User-Agent sent with the request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}

# Without a timeout, requests waits indefinitely on an unresponsive server,
# which would hang the whole notebook run.
resp = requests.get(url, headers=headers, timeout=30)
resp.raise_for_status()

html = resp.text
tables = pd.read_html(StringIO(html))

# Select the constituents table by its "Symbol" column instead of by position,
# so a layout change on the page fails loudly rather than picking a wrong table.
constituents = next((table for table in tables if "Symbol" in table.columns), None)
if constituents is None:
    raise ValueError(f"No table with a 'Symbol' column found at {url}")

# Normalise the symbols: upper-case, trimmed, and with "." replaced by "-"
# (the captured output shows symbols such as BRK-B; presumably this matches
# the ticker format expected by the download step -- confirm).
tickers = (
    constituents["Symbol"]
    .astype(str)
    .str.upper()
    .str.strip()
    .str.replace(".", "-", regex=False)
)

print(list(tickers))


c:\Users\axelz\Programming\Quant-ML\Quant-ML\venv\Scripts\python.exe
['MMM', 'AOS', 'ABT', 'ABBV', 'ACN', 'ADBE', 'AMD', 'AES', 'AFL', 'A', 'APD', 'ABNB', 'AKAM', 'ALB', 'ARE', 'ALGN', 'ALLE', 'LNT', 'ALL', 'GOOGL', 'GOOG', 'MO', 'AMZN', 'AMCR', 'AEE', 'AEP', 'AXP', 'AIG', 'AMT', 'AWK', 'AMP', 'AME', 'AMGN', 'APH', 'ADI', 'AON', 'APA', 'APO', 'AAPL', 'AMAT', 'APP', 'APTV', 'ACGL', 'ADM', 'ANET', 'AJG', 'AIZ', 'T', 'ATO', 'ADSK', 'ADP', 'AZO', 'AVB', 'AVY', 'AXON', 'BKR', 'BALL', 'BAC', 'BAX', 'BDX', 'BRK-B', 'BBY', 'TECH', 'BIIB', 'BLK', 'BX', 'XYZ', 'BK', 'BA', 'BKNG', 'BSX', 'BMY', 'AVGO', 'BR', 'BRO', 'BF-B', 'BLDR', 'BG', 'BXP', 'CHRW', 'CDNS', 'CPT', 'CPB', 'COF', 'CAH', 'CCL', 'CARR', 'CAT', 'CBOE', 'CBRE', 'CDW', 'COR', 'CNC', 'CNP', 'CF', 'CRL', 'SCHW', 'CHTR', 'CVX', 'CMG', 'CB', 'CHD', 'CI', 'CINF', 'CTAS', 'CSCO', 'C', 'CFG', 'CLX', 'CME', 'CMS', 'KO', 'CTSH', 'COIN', 'CL', 'CMCSA', 'CAG', 'COP', 'ED', 'STZ', 'CEG', 'COO', 'CPRT', 'GLW', 'CPAY', 'CTVA', 'CSGP', 'COST', 'CTRA

In [None]:
import yfinance as yf

# Create the output folder up front so the first to_csv call cannot fail on a
# fresh checkout where data/raw/equities does not exist yet.
RAW_EQUITIES_DIR.mkdir(parents=True, exist_ok=True)

# Download daily history for each ticker and store it as one CSV per ticker.
for ticker in tickers:
    df = yf.download(
        ticker,
        period="20y",
        interval="1d",
        auto_adjust=False,
        progress=False,
    )

    # Skip tickers for which nothing was returned instead of writing an empty file.
    if df.empty:
        print(F"Warning: no data for {ticker}")
        continue

    # Build the path with pathlib: `os` is never imported in this notebook, so
    # the previous os.path.join call raised NameError on a fresh kernel.
    out_path = RAW_EQUITIES_DIR / f"{ticker}.csv"
    df.to_csv(out_path)
    print(f"Saved {ticker} to {out_path}")
    


In [4]:
import pandas as pd

# Build one long price panel (one row per ticker and date) from the per-ticker
# CSV files and store it as a single parquet file.

# Source CSV column -> output column, all converted to numeric.
numeric_columns = {
    "Adj Close": "adj_close",
    "Close": "close",
    "Open": "open",
    "High": "high",
    "Low": "low",
    "Volume": "volume",
}

frames = []
# sorted() makes the processing order (and thus the result) reproducible;
# glob() itself yields files in arbitrary order.
for path in sorted(RAW_EQUITIES_DIR.glob("*.csv")):
    ticker = path.stem  # file name without ".csv" is the ticker symbol
    df = pd.read_csv(path)

    # The dates are read from the column labelled "Price". Rows whose value does
    # not parse as YYYY-MM-DD become NaT and are dropped (presumably the extra
    # header rows written by the download step -- confirm).
    df["date"] = pd.to_datetime(df["Price"].astype(str), format="%Y-%m-%d", errors="coerce")
    df = df[df["date"].notna()].copy()

    # convert numeric columns
    for source_name, target_name in numeric_columns.items():
        df[target_name] = pd.to_numeric(df[source_name])

    # add ticker
    df["ticker"] = ticker

    df = df[["date", "ticker", "adj_close", "close", "open", "high", "low", "volume"]]

    frames.append(df)

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when the raw folder is empty.
if not frames:
    raise ValueError(f"No CSV files found in {RAW_EQUITIES_DIR}; run the download cell first")

# combine all tickers
prices = pd.concat(frames, ignore_index=True)
prices = prices.sort_values(["ticker", "date"])

# save processed panel; the target folder is created first because to_parquet
# does not create missing parent directories.
PROCESSED_EQUITIES_DIR = DATA_DIR / "processed" / "equities"
PROCESSED_EQUITIES_DIR.mkdir(parents=True, exist_ok=True)
prices.to_parquet(PROCESSED_EQUITIES_DIR / "prices.parquet")

