In [1]:
import yfinance as yf
import pandas as pd
from pyspark.sql.functions import col


# Yahoo Finance ticker -> short symbol used to label that ticker's rows.
symbols = {"BTC-USD": "BTC", "ETH-USD": "ETH", "SOL-USD": "SOL"}

# One price-history frame per symbol is accumulated here.
all_data = []

for yf_symbol, short_symbol in symbols.items():
    ticker = yf.Ticker(yf_symbol)
    # Request hourly bars over a 7-day period, move the index into a regular
    # column, and tag every row with the short symbol.
    hist = (
        ticker.history(period="7d", interval="1h")
        .reset_index()
        .assign(Symbol=short_symbol)
    )
    all_data.append(hist)

In [3]:
# Stack the per-symbol frames under a fresh 0..n-1 index and keep only the
# listed columns, in this order.
bronze_df = pd.concat(all_data, ignore_index=True)[
    ["Symbol", "Datetime", "Open", "High", "Low", "Close", "Volume"]
]
bronze_df.head()


Unnamed: 0,Symbol,Datetime,Open,High,Low,Close,Volume
0,BTC,2025-09-01 00:00:00+00:00,108250.257812,108358.640625,107695.992188,108297.648438,0
1,BTC,2025-09-01 01:00:00+00:00,108260.054688,108384.953125,107998.773438,108195.773438,219394048
2,BTC,2025-09-01 02:00:00+00:00,108167.195312,108194.625,107444.34375,107628.609375,0
3,BTC,2025-09-01 03:00:00+00:00,107598.8125,107738.507812,107423.632812,107691.960938,1168699392
4,BTC,2025-09-01 04:00:00+00:00,107681.828125,107740.960938,107298.0625,107398.101562,1013510144


In [4]:
def enforce_schema(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the known columns cast to their target dtypes.

    Only columns that appear both in the schema below and in ``df`` are
    touched; every other column is carried over unchanged. The input frame is
    never modified, so the caller keeps its original data alongside the
    typed result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding any subset of the schema columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns in the same order. Datetime columns
        are parsed with ``pd.to_datetime(errors="coerce")``, so unparseable
        values become ``NaT``. Other columns are cast with
        ``astype(errors="ignore")``, which is best-effort: a column whose cast
        fails is left as it was instead of raising.
    """
    schema = {
        "Symbol": "string",
        "Datetime": "datetime64[ns]",
        "Open": "float64",
        "High": "float64",
        "Low": "float64",
        "Close": "float64",
        "Volume": "Int64",  # nullable integer
    }

    # Work on a copy: assigning into `df` directly would rewrite the caller's
    # frame and make the returned frame an alias of the input.
    typed = df.copy()

    # Apply the schema selectively, skipping schema columns the frame lacks.
    # The loop variable is `column` rather than `col` so that it does not
    # shadow a module-level `col` import.
    for column, dtype in schema.items():
        if column not in typed.columns:
            continue
        if dtype.startswith("datetime"):
            # to_datetime keeps timezone-aware values timezone-aware, so the
            # resulting dtype is not necessarily the naive one named above.
            typed[column] = pd.to_datetime(typed[column], errors="coerce")
        else:
            typed[column] = typed[column].astype(dtype, errors="ignore")
    return typed
# Run the bronze frame through the dtype schema to obtain the silver frame,
# then preview its first rows.
silver_df = bronze_df.pipe(enforce_schema)
silver_df.head()

Unnamed: 0,Symbol,Datetime,Open,High,Low,Close,Volume
0,BTC,2025-09-01 00:00:00+00:00,108250.257812,108358.640625,107695.992188,108297.648438,0
1,BTC,2025-09-01 01:00:00+00:00,108260.054688,108384.953125,107998.773438,108195.773438,219394048
2,BTC,2025-09-01 02:00:00+00:00,108167.195312,108194.625,107444.34375,107628.609375,0
3,BTC,2025-09-01 03:00:00+00:00,107598.8125,107738.507812,107423.632812,107691.960938,1168699392
4,BTC,2025-09-01 04:00:00+00:00,107681.828125,107740.960938,107298.0625,107398.101562,1013510144
