 # Crypto Volatility Forecasting 

## 📦 Imports & Environment

In [14]:
# General Utilities 
import  random, os, pandas as pd, numpy as np
import matplotlib.pyplot as plt, datetime as dt
import dotenv, os, requests, time

# Environment & Dask Client (figure defaults, output directory, .env variables)
# Default size applied to every matplotlib figure created afterwards.
plt.rcParams['figure.figsize'] = (10,4)
# Create the output folder up front; exist_ok=True makes re-running the cell safe.
os.makedirs("OutputData", exist_ok=True)
# Load variables from the ".env" file located by find_dotenv; the API keys
# in the configuration cell are read from the environment with os.getenv.
dotenv.load_dotenv(dotenv.find_dotenv(filename=".env"))
from dask.distributed import Client, LocalCluster

# Features 
import talib

# Models
from arch import arch_model
import statsmodels.api as sm
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

## 🔧 Configuration

In [15]:
# Key Variables
# Coin whose volatility is modelled; used to build column names such as
# f"prices_{TARGET_COIN}".
TARGET_COIN = "ethereum"
# Quote currency passed to the CoinGecko wrappers.
BASE_FIAT   = "usd"
# Number of coins requested from the CoinGecko market-cap ranking.
TOP_N       = 10
# Days of history requested from CoinGecko.
LOOKBACK_DAYS = 365
# ISO date LOOKBACK_DAYS before "now" (naive local clock), evaluated once when this cell runs.
START_DATE = (dt.datetime.now() - dt.timedelta(days=LOOKBACK_DAYS)).strftime("%Y-%m-%d")
# Time zone used to turn UTC millisecond timestamps into calendar dates.
TIMEZONE = "Europe/Madrid"
# Window lengths passed to compute_ta_indicators for the SMA / EMA columns.
SMA_WINDOWS = (10, 20, 50)
EMA_WINDOWS = (10, 20, 50)
# Fraction of rows (in chronological order) used for training.
TRAIN_SPLIT = 0.90
# --- GARCH config ---
# Returns are multiplied by this factor before fitting and forecasts divided by it afterwards.
GARCH_SCALE = 100.0
# Passed as refit_every to forecast_garch_rolling.
GARCH_REFIT_EVERY = 5
# NOTE(review): SEQ_LEN sits under the GARCH header but is only used as the
# LSTM input window length below - consider moving it to the LSTM section.
SEQ_LEN = 7
# --- LSTM configuration ---
LSTM_UNITS = 64
LSTM_EPOCHS = 25
LSTM_BATCH = 16
# --- Dune API configuration ---
# Output column name -> Dune query id.
DUNE_QUERIES = {
    "economic_security": 1933076,
    "daily_dex_volume": 4388,
    "btc_etf_flows": 5795477,
    "eth_etf_flows": 5795645,
    "total_defi_users": 2972,
    "median_gas": 2981260,
}
# None when the variable is not set; dune_metrics_daily then returns an empty frame.
DUNE_API_KEY = os.getenv("DUNE_API_KEY")
# CSV of Dune metrics that the data-loading cell reads instead of calling the API.
DUNE_CSV_PATH = "OutputData/Dune_Metrics.csv"

# --- FRED API configuration ---
# None when the variable is not set; fetch_fred_series_df then returns an empty frame.
FRED_API_KEY= os.getenv("FRED_API_KEY")
# FRED series id -> output column name.
FRED_KNOWN = {
    "VIXCLS":   "vix_equity_vol",            # CBOE VIX (Equity market volatility index)
    "MOVE":     "move_bond_vol",             # ICE BofA MOVE Index (Bond market volatility)
    "OVXCLS":   "ovx_oil_vol",               # CBOE Crude Oil Volatility Index (Oil market volatility)
    "GVZCLS":   "gvz_gold_vol",              # CBOE Gold Volatility Index (Gold market volatility)
    "DTWEXBGS": "usd_trade_weighted_index",  # Trade-Weighted U.S. Dollar Index (Broad Goods)
    "DGS2":     "us_2y_treasury_yield",      # U.S. 2-Year Treasury Yield (constant maturity)
    "DGS10":    "us_10y_treasury_yield",     # U.S. 10-Year Treasury Yield (constant maturity)
}

# --- REPRODUCIBILITY ---
# Seeds NumPy's global RNG only.
# NOTE(review): Python's `random` and TensorFlow are not seeded in this cell,
# so LSTM results may still vary between runs - confirm whether that is intended.
np.random.seed(42)


##  Utilities: Metrics & Plotting

In [16]:
# MAE & MASE (predicted vs. realized)
def mae(y_true, y_pred):
    """Return the mean absolute error between realized and predicted values.

    Both inputs are coerced to float arrays and compared position by
    position (any pandas index on the inputs is ignored).
    """
    actual = pd.Series(y_true).astype(float).to_numpy()
    forecast = pd.Series(y_pred).astype(float).to_numpy()
    abs_errors = np.abs(actual - forecast)
    return float(abs_errors.mean())

def mase(y_true, y_pred):
    """Return the mean absolute scaled error of *y_pred* against *y_true*.

    The model error is scaled by the error of a one-step naive forecast
    (previous observed value). The first observation is skipped in both
    terms because the naive forecast has no predecessor there. Returns NaN
    when the naive error is not strictly positive.
    """
    actual = pd.Series(y_true).astype(float).to_numpy()
    forecast = pd.Series(y_pred).astype(float).to_numpy()

    # Naive forecast for position i (i >= 1) is the value at position i - 1.
    previous = actual[:-1]
    model_error = np.abs(actual[1:] - forecast[1:]).mean()
    naive_error = np.abs(actual[1:] - previous).mean()

    if naive_error > 0:
        return float(model_error / naive_error)
    return np.nan


##  Data Loading



In [18]:
# API Wrappers
CG_BASE = "https://api.coingecko.com/api/v3"
def cg_universe(top_n=TOP_N, vs_currency=BASE_FIAT):
    """Return the CoinGecko ids of the top *top_n* coins by market cap.

    Args:
        top_n: Number of coins to request (first page only).
        vs_currency: Quote currency used for the market-cap ranking.

    Returns:
        List of CoinGecko coin ids, in the order the API returns them.

    Raises:
        requests.HTTPError: If the API answers with an error status
            (for example when rate-limited).
        requests.RequestException: On connection problems or timeout.
    """
    url = f"{CG_BASE}/coins/markets"
    params = dict(vs_currency=vs_currency, order="market_cap_desc", per_page=top_n, page=1)
    # A timeout keeps the notebook from hanging on a stalled connection.
    r = requests.get(url, params=params, timeout=30)
    # Fail with a clear HTTP error instead of a TypeError when the body is
    # an error object rather than the expected list of coins.
    r.raise_for_status()
    return [d["id"] for d in r.json()]

def cgpriceactiondaily(coins, days=LOOKBACK_DAYS, vs_currency=BASE_FIAT):
    """Download daily price and volume history for each coin from CoinGecko.

    Args:
        coins: Iterable of CoinGecko coin ids.
        days: Number of days of history to request.
        vs_currency: Quote currency.

    Returns:
        DataFrame indexed by calendar date (in TIMEZONE) with one
        ``prices_<id>`` and one ``total_volumes_<id>`` column per coin that
        was fetched successfully. Empty DataFrame when no coin could be
        fetched.

    Coins that fail are reported on stdout and skipped (best effort).
    """
    out = {}
    for cid in coins:
        try:
            url = f"{CG_BASE}/coins/{cid}/market_chart"
            params = dict(vs_currency=vs_currency, days=days, interval="daily")
            r = requests.get(url, params=params, timeout=30)
            # Surface HTTP errors (e.g. rate limiting) instead of silently
            # producing empty, all-NaN columns for this coin.
            r.raise_for_status()
            js = r.json()
            p = pd.DataFrame(js.get("prices", []), columns=["ts", f"prices_{cid}"])
            v = pd.DataFrame(js.get("total_volumes", []), columns=["ts", f"total_volumes_{cid}"])
            dfc = pd.merge(p, v, on="ts", how="outer").sort_values("ts")
            # Timestamps are epoch milliseconds in UTC; convert to local calendar dates.
            dfc["date"] = pd.to_datetime(dfc["ts"], unit="ms").dt.tz_localize("UTC").dt.tz_convert(TIMEZONE).dt.date
            # Several points can fall on the same local date; keep the latest one.
            dfc = dfc.drop(columns=["ts"]).groupby("date").last()
            out[cid] = dfc
        except Exception as e:
            print(f"Error for {cid}: {e}")
            continue
        # Pause between successful calls to stay under the public rate limit.
        time.sleep(3)
    if not out:
        # pd.concat raises on an empty collection; return an empty frame instead.
        return pd.DataFrame()
    return pd.concat(out.values(), axis=1, join="outer").sort_index()

DERIBIT_BASE = "https://www.deribit.com/api/v2"
def deribit_dvol_daily_multi(symbols, days=None, resolution="1D"):
    """Download the Deribit volatility index (DVOL) as daily values.

    Args:
        symbols: Iterable of currency codes, e.g. ``["BTC", "ETH"]``.
        days: Days of history to request; defaults to LOOKBACK_DAYS.
        resolution: Candle resolution passed to the API.

    Returns:
        DataFrame indexed by calendar date (in TIMEZONE) with one
        ``dvol_<symbol>`` column per symbol that returned data, or an empty
        DataFrame when nothing could be fetched.

    Symbols that fail are reported on stdout and skipped (best effort).
    Only the first page of results is read; the API's ``continuation``
    token is not followed.
    """
    if not symbols:
        return pd.DataFrame()
    if days is None:
        days = LOOKBACK_DAYS
    # The endpoint requires an explicit time range in epoch milliseconds.
    end_ms = int(time.time() * 1000)
    start_ms = end_ms - int(days) * 24 * 60 * 60 * 1000
    # Column names for array-style rows: [timestamp, open, high, low, close].
    ohlc_names = ["timestamp", "open", "high", "low", "close"]
    frames = []
    for sym in symbols:
        try:
            r = requests.get(
                f"{DERIBIT_BASE}/public/get_volatility_index_data",
                params={
                    "currency": sym,
                    "start_timestamp": start_ms,
                    "end_timestamp": end_ms,
                    "resolution": resolution,
                },
                timeout=30,
            )
            r.raise_for_status()
            js = r.json()
            container = js.get("result", js.get("data", {}))
            if isinstance(container, dict):
                data = container.get("data", js.get("data", []))
            else:
                data = container
            if isinstance(data, dict) and "data" in data:
                data = data["data"]
            if not data:
                continue
            df = pd.DataFrame(data)
            # Array rows produce integer column labels 0..4; give them names.
            # Dict rows already have string labels and are left untouched.
            df = df.rename(columns=dict(enumerate(ohlc_names)))
            ts_col = next((c for c in ("timestamp", "t") if c in df.columns), None)
            val_col = next((c for c in ("value", "index", "close") if c in df.columns), None)
            if ts_col is None or val_col is None:
                continue
            df["date"] = pd.to_datetime(df[ts_col], unit="ms").dt.tz_localize("UTC").dt.tz_convert(TIMEZONE).dt.date
            # Keep the last observation of each local calendar day.
            df = df.groupby("date")[val_col].last().to_frame(name=f"dvol_{sym.lower()}")
            frames.append(df)
        except Exception as e:
            print(f"Error for {sym}: {e}")
            continue
    return pd.concat(frames, axis=1).sort_index() if frames else pd.DataFrame()

DUNE_BASE = "https://api.dune.com/api/v1"
def dune_metrics_daily(query_ids, api_key=None):
    """Fetch the latest results of several Dune queries as daily series.

    Args:
        query_ids: Mapping of output column name -> Dune query id.
        api_key: Dune API key. When None (or when *query_ids* is empty) no
            request is made.

    Returns:
        DataFrame indexed by date with one column per query that returned
        usable rows, or an empty DataFrame.

    For each query the date column is the first of
    date/day/time/ts/timestamp that exists (else the first column), and the
    value column is the first numeric column other than the date column.
    Queries that fail are reported on stdout and skipped (best effort).
    """
    if not query_ids or api_key is None:
        return pd.DataFrame()
    headers = {"X-Dune-Api-Key": api_key}
    cols = []
    for col_name, qid in query_ids.items():
        try:
            r = requests.get(f"{DUNE_BASE}/query/{qid}/results", headers=headers, timeout=60)
            r.raise_for_status()
            rows = r.json().get("result", {}).get("rows", [])
            if not rows:
                continue
            df = pd.DataFrame(rows)
            date_col = next((c for c in ["date","day","time","ts","timestamp"] if c in df.columns), df.columns[0])
            df["date"] = pd.to_datetime(df[date_col]).dt.tz_localize(None).dt.date
            # Never treat the source date column as the metric, even when it
            # is stored numerically (e.g. an epoch timestamp).
            candidates = [c for c in df.columns if c not in ("date", date_col)]
            val_col = next((c for c in candidates if pd.api.types.is_numeric_dtype(df[c])), None)
            if val_col is None:
                # Fall back to the first column that can be parsed as numbers.
                for c in candidates:
                    coerced = pd.to_numeric(df[c], errors="coerce")
                    if coerced.notna().any():
                        df[c] = coerced
                        val_col = c
                        break
            if val_col is None:
                continue
            # Keep the last row of each date and name the column after the metric.
            cols.append(df[["date", val_col]].groupby("date").last().rename(columns={val_col: col_name}))
        except Exception as e:
            print(f"Error for Dune query {qid} ({col_name}): {e}")
            continue
    return pd.concat(cols, axis=1).sort_index() if cols else pd.DataFrame()

FRED_BASE = "https://api.stlouisfed.org/fred/series/observations"
def fetch_fred_series_df(series, api_key=None, start_date= START_DATE):
    """Download FRED series and return them as forward-filled daily columns.

    Args:
        series: Mapping of FRED series id -> output column name.
        api_key: FRED API key. When None (or when *series* is empty) no
            request is made.
        start_date: Earliest observation date to request ("YYYY-MM-DD").

    Returns:
        DataFrame indexed by date with one column per series that returned
        observations, forward-filled, or an empty DataFrame.

    Series that fail are reported on stdout and skipped (best effort).
    """
    if not series or api_key is None:
        return pd.DataFrame()
    frames = []
    for fred_id, col_name in series.items():
        try:
            # Honour the caller's start_date (previously the global START_DATE
            # was used regardless of the argument).
            params = {"series_id": fred_id, "api_key": api_key, "file_type": "json", "observation_start": start_date}
            r = requests.get(FRED_BASE, params=params, timeout=30)
            r.raise_for_status()
            obs = r.json().get("observations", [])
            if not obs:
                continue
            df = pd.DataFrame(obs)[["date","value"]]
            df["date"] = pd.to_datetime(df["date"]).dt.date
            # Non-numeric placeholders become NaN and are filled by ffill below.
            df["value"] = pd.to_numeric(df["value"], errors="coerce")
            frames.append(df.groupby("date")["value"].last().to_frame(name=col_name))
        except Exception as e:
            print(f"Error for FRED series {fred_id}: {e}")
            continue
    return pd.concat(frames, axis=1).sort_index().ffill() if frames else pd.DataFrame()


In [19]:
# Technical Analysis Indicators
def compute_ta_indicators(df, price_prefix="prices_", rsi_period=14,
                          macd_fast=12, macd_slow=26, macd_signal=9,
                          sma_windows=(10,20,50), ema_windows=(10,20,50)):
    """Compute RSI, MACD, SMA and EMA columns for every price column in *df*.

    A price column is any column whose name starts with *price_prefix*; the
    remainder of the name is used as the coin suffix of the output columns.

    Returns:
        DataFrame sharing *df*'s index with, per coin, the columns
        ``rsi<period>_<coin>``, ``macd_<coin>``, ``macd_signal_<coin>``,
        ``macd_hist_<coin>``, ``sma<w>_<coin>`` and ``ema<w>_<coin>``.
        It has no columns when *df* contains no price column.
    """
    indicators = pd.DataFrame(index=df.index)
    for column in df.columns:
        if not column.startswith(price_prefix):
            continue
        coin = column[len(price_prefix):]
        # talib works on plain float arrays; unparsable entries become NaN.
        closes = pd.to_numeric(df[column], errors="coerce").values

        indicators[f"rsi{rsi_period}_{coin}"] = talib.RSI(closes, timeperiod=rsi_period)

        macd_line, signal_line, histogram = talib.MACD(
            closes, fastperiod=macd_fast, slowperiod=macd_slow, signalperiod=macd_signal
        )
        indicators[f"macd_{coin}"] = macd_line
        indicators[f"macd_signal_{coin}"] = signal_line
        indicators[f"macd_hist_{coin}"] = histogram

        for window in sma_windows:
            indicators[f"sma{window}_{coin}"] = talib.SMA(closes, timeperiod=window)
        for window in ema_windows:
            indicators[f"ema{window}_{coin}"] = talib.EMA(closes, timeperiod=window)
    return indicators


##  GARCH(1,1) & HAR-RV 

In [20]:
# HAR & GARCH
def fit_garch_11(returns, scale=100.0):
    """Fit a zero-mean GARCH(1,1) model with normal errors to *returns*.

    Missing values are dropped and the series is multiplied by *scale*
    before fitting. Returns the fitted result object from ``arch``.
    """
    scaled_returns = returns.dropna().astype(float) * scale
    spec = arch_model(scaled_returns, mean="Zero", vol="GARCH", p=1, q=1, dist="normal")
    fitted = spec.fit(disp="off")
    return fitted
def forecast_garch_rolling(returns, train_size, scale=100.0, refit_every=0):
    """Produce rolling one-step-ahead GARCH(1,1) volatility forecasts.

    For every position ``t >= train_size`` the forecast for ``t`` is based on
    the returns strictly before ``t``. Parameters are re-estimated every
    *refit_every* steps (every step when it is 0). On the steps in between,
    the last estimated parameters are applied to the extended history, so
    each forecast still reflects the newest observations instead of
    repeating the forecast made at the last refit.

    Args:
        returns: Return series; missing values are dropped.
        train_size: Number of leading observations used before the first forecast.
        scale: Factor applied to returns before fitting; forecasts are
            divided by it again.
        refit_every: Steps between parameter re-estimations (0 = every step).

    Returns:
        Series named ``garch11_vol_pred`` indexed by the forecast dates.
        Empty when ``train_size`` is not smaller than the number of returns.
    """
    r = returns.dropna().astype(float)
    idx = r.index
    preds, pred_idx = [], []
    params, last_refit = None, -1
    for t in range(train_size, len(r)):
        history = r.iloc[:t]
        needs_refit = (params is None) or (refit_every == 0) or ((t - last_refit) >= refit_every)
        if needs_refit:
            res = fit_garch_11(history, scale=scale)
            params, last_refit = res.params, t
        else:
            # Same specification as fit_garch_11, but with the parameters held
            # fixed so only the conditioning data is updated.
            am = arch_model(history * scale, mean="Zero", vol="GARCH", p=1, q=1, dist="normal")
            res = am.fix(params)
        var_next = res.forecast(horizon=1).variance.iloc[-1, 0]
        # Undo the scaling: variance is in scaled units squared.
        preds.append(float(np.sqrt(var_next)) / scale)
        pred_idx.append(idx[t])
    return pd.Series(preds, index=pred_idx, name="garch11_vol_pred")
def _har_features(rv):
    rv = rv.astype(float)
    return pd.DataFrame({"RV1": rv.shift(1), "RV5": rv.shift(1).rolling(5).mean(), "RV22": rv.shift(1).rolling(22).mean()})
def fit_har_ols(rv_train):
    """Estimate HAR coefficients by OLS on the training realized volatility.

    Returns a dict with the keys ``const``, ``RV1``, ``RV5`` and ``RV22``.
    ``const`` falls back to 0.0 if the fitted parameters have no constant.
    """
    regressors = ["RV1", "RV5", "RV22"]
    features = _har_features(rv_train)
    target = rv_train.rename("y")
    # Rows without a full set of lags are dropped before fitting.
    sample = pd.concat([features, target], axis=1).dropna()
    design = sm.add_constant(sample[regressors])
    fitted = sm.OLS(sample["y"], design).fit()
    coefficients = fitted.params.to_dict()
    result = {"const": coefficients.get("const", 0.0)}
    for name in regressors:
        result[name] = coefficients[name]
    return result
def forecast_har(rv, params, start_idx):
    """Forecast realized volatility with fixed HAR coefficients.

    For each position ``t`` from *start_idx* to the end of *rv*, the forecast
    combines the previous value and the means of the up-to-5 and up-to-22
    preceding values, using the weights in *params* (keys ``const``,
    ``RV1``, ``RV5``, ``RV22``).

    Returns a Series named ``har_vol_pred`` indexed like ``rv[start_idx:]``.
    """
    series = rv.astype(float)
    forecasts = []
    for t in range(start_idx, len(series)):
        # No predecessor exists at position 0.
        previous = series.iloc[t - 1] if t >= 1 else np.nan
        weekly_mean = series.iloc[max(0, t - 5):t].mean()
        monthly_mean = series.iloc[max(0, t - 22):t].mean()
        regressors = [1.0, previous, weekly_mean, monthly_mean]
        weights = [params["const"], params["RV1"], params["RV5"], params["RV22"]]
        forecasts.append(float(np.dot(regressors, weights)))
    return pd.Series(forecasts, index=series.index[start_idx:], name="har_vol_pred")


##  LSTM 

In [21]:
# LSTM
def make_univariate_sequences(series, seq_len):
    """Turn a series into sliding windows and their next-step targets.

    Missing values are dropped first. Returns ``(X, y)`` where ``X`` has
    shape ``(n_windows, seq_len, 1)`` and ``y[i]`` is the value that follows
    window ``i``.
    """
    values = series.dropna().astype(float).values
    starts = range(len(values) - seq_len)
    windows = [values[start:start + seq_len] for start in starts]
    targets = [values[start + seq_len] for start in starts]
    X = np.asarray(windows, float).reshape(-1, seq_len, 1)
    y = np.asarray(targets, float)
    return X, y
def build_lstm_model(input_shape, units=64, dropout=0.0):
    """Build and compile a single-layer LSTM regressor.

    The network is LSTM(units) -> optional Dropout(dropout) -> Dense(1),
    compiled with the Adam optimizer and mean-squared-error loss.
    """
    network = Sequential()
    network.add(LSTM(units, input_shape=input_shape))
    # The dropout layer is only added when a positive rate is requested.
    if dropout > 0:
        network.add(Dropout(dropout))
    network.add(Dense(1))
    network.compile(optimizer="adam", loss="mse")
    return network
def train_lstm(model, X_train, y_train, epochs=25, batch_size=16, validation_split=0.1, patience=5):
    """Fit *model* on the training sequences and return the fit history.

    Early stopping on ``val_loss`` (restoring the best weights) is enabled
    only when both *validation_split* and *patience* are truthy. Samples are
    not shuffled, so their time order is preserved.
    """
    callbacks = []
    if validation_split and patience:
        callbacks.append(
            EarlyStopping(monitor="val_loss", patience=patience, restore_best_weights=True)
        )
    return model.fit(
        X_train,
        y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=validation_split,
        callbacks=callbacks,
        verbose=0,
        shuffle=False,
    )
def predict_lstm(model, X_seq):
    """Return the model's predictions for *X_seq* as a flat 1-D array."""
    raw_output = model.predict(X_seq, verbose=0)
    return raw_output.reshape(-1)


In [22]:
# Ids of the TOP_N coins by market cap, then their daily prices and volumes.
coins = cg_universe(TOP_N, vs_currency=BASE_FIAT)
O = cgpriceactiondaily(coins, days=LOOKBACK_DAYS, vs_currency=BASE_FIAT)
# Daily Deribit volatility index for BTC and ETH (may be an empty frame).
D_dvol = deribit_dvol_daily_multi(['BTC','ETH'])
# Dune metrics are read from a previously saved CSV; the live API call is kept
# below for reference.
# U_dune = dune_metrics_daily(DUNE_QUERIES, DUNE_API_KEY) 
U_dune= pd.read_csv(DUNE_CSV_PATH).set_index('date')
# NOTE(review): this index is datetime64, while the API wrappers above build
# their indexes from `.dt.date` objects - confirm the later join aligns rows.
U_dune.index =pd.to_datetime(U_dune.index)   



In [None]:
# Macro series from FRED, requested from START_DATE onward.
M_fred = fetch_fred_series_df(FRED_KNOWN, FRED_API_KEY, START_DATE)   
# Start from the price/volume table and outer-join every non-empty extra
# source on the index, so no date from any source is lost.
df = O.copy()
for extra in [D_dvol, U_dune, M_fred]:
    if extra is not None and not extra.empty:
        df = df.join(extra, how="outer")
# Order by index, carry the last known value forward over gaps, and drop rows
# in which every column is missing.
df = df.sort_index().ffill().dropna(how="all")

In [24]:
# Preview only: show O without the columns that have fewer non-null values than
# 10% of the row count. The result is not assigned, so O itself is unchanged.
O.dropna(axis=1, thresh=int(0.1*(len(O))))

Unnamed: 0_level_0,prices_bitcoin,total_volumes_bitcoin,prices_ethereum,total_volumes_ethereum,prices_tether,total_volumes_tether,prices_ripple,total_volumes_ripple
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2024-09-27,65130.768840,3.799557e+10,2630.949837,1.668616e+10,1.000071,6.199968e+10,0.590002,1.320296e+09
2024-09-28,65791.002125,3.266492e+10,2698.192821,1.646812e+10,1.000440,3.959733e+10,0.589115,1.401604e+09
2024-09-29,65934.107094,1.534291e+10,2680.218702,9.607073e+09,1.000640,2.389775e+10,0.615009,2.770313e+09
2024-09-30,65663.689867,1.294871e+10,2659.611212,9.570811e+09,0.999926,3.519751e+10,0.642592,2.594473e+09
2024-10-01,63243.275325,3.512445e+10,2597.341152,1.753926e+10,1.000155,6.021731e+10,0.611934,2.572234e+09
...,...,...,...,...,...,...,...,...
2025-09-22,115304.479994,1.865911e+10,4452.871130,1.591068e+10,1.000400,5.412936e+10,2.973350,2.935908e+09
2025-09-23,112696.741017,6.915234e+10,4199.951774,5.298809e+10,1.000815,1.486343e+11,2.851139,9.546312e+09
2025-09-24,112022.165879,4.615488e+10,4166.190550,2.990268e+10,1.000206,9.586917e+10,2.829056,5.177734e+09
2025-09-25,113320.569085,4.667754e+10,4148.656828,2.998892e+10,1.000336,9.576239e+10,2.928589,6.006922e+09


In [None]:
# Technical indicators for every "prices_*" column, joined onto the merged table.
ta_df = compute_ta_indicators(df, price_prefix="prices_", sma_windows=SMA_WINDOWS, ema_windows=EMA_WINDOWS)
# dropna() removes every row that has a missing value in any column.
df = df.join(ta_df).dropna()

# Daily log return of the target coin and its absolute value, which serves as
# the realized-volatility proxy for the rest of the notebook.
df[f"log_returns_{TARGET_COIN}"] = np.log(df[f"prices_{TARGET_COIN}"]).diff()
df[f"realized_vol_{TARGET_COIN}"] = df[f"log_returns_{TARGET_COIN}"].abs()

# Features: first differences of every column (rows with any NaN are dropped).
X_full = df.diff().dropna()
# Target: the realized volatility of the NEXT row, stored at the current row's label.
y_full = df[f"realized_vol_{TARGET_COIN}"].shift(-1).dropna()
# Keep only the labels present in both features and target.
common = X_full.index.intersection(y_full.index)
X_full = X_full.loc[common]; y_full = y_full.loc[common]

# Chronological split: the first TRAIN_SPLIT share of rows is the training set.
split = int(TRAIN_SPLIT * len(X_full))
X_train, X_test = X_full.iloc[:split], X_full.iloc[split:]
y_train, y_test = y_full.iloc[:split], y_full.iloc[split:]


In [None]:
# Rolling GARCH(1,1) forecasts over the test period.
returns = df[f"log_returns_{TARGET_COIN}"].loc[y_full.index].dropna()
train_size_r = int(TRAIN_SPLIT * len(returns))
garch_preds = forecast_garch_rolling(returns, train_size=train_size_r, scale=GARCH_SCALE, refit_every=GARCH_REFIT_EVERY)
garch_preds = garch_preds.reindex(y_test.index).dropna()
# A forecast labelled t is built from returns before t and targets the
# volatility of day t itself. y_test[t] holds the volatility of the following
# row (shift(-1)), so score against the realized value on the forecast date.
garch_realized = df[f"realized_vol_{TARGET_COIN}"].loc[garch_preds.index]
print("GARCH MAE:", round(mae(garch_realized, garch_preds), 6))


In [None]:
# HAR-RV: fit on the training rows, forecast the test rows.
rv = df[f"realized_vol_{TARGET_COIN}"].loc[y_full.index].dropna()
har_params = fit_har_ols(rv.iloc[:split])
har_preds = forecast_har(rv, har_params, start_idx=split).reindex(y_test.index).dropna()
# forecast_har predicts rv at position t from values before t, whereas
# y_test[t] is the volatility of the following row (shift(-1)). Score against
# the realized value on the forecast date to avoid the one-row offset.
har_realized = rv.loc[har_preds.index]
print("HAR MAE:", round(mae(har_realized, har_preds), 6))


In [None]:
# Univariate LSTM on realized volatility.
rv_all = df[f"realized_vol_{TARGET_COIN}"].loc[y_full.index]
rv_tr, rv_te = rv_all.iloc[:split], rv_all.iloc[split:]
X_tr_seq, y_tr_seq = make_univariate_sequences(rv_tr, seq_len=SEQ_LEN)
# Prepend the last SEQ_LEN training values so the first test row already has a
# full input window.
rv_te_ctx = pd.concat([rv_tr.iloc[-SEQ_LEN:], rv_te])
X_te_seq, y_te_seq = make_univariate_sequences(rv_te_ctx, seq_len=SEQ_LEN)
model = build_lstm_model(input_shape=(SEQ_LEN,1), units=LSTM_UNITS, dropout=0.0)
_ = train_lstm(model, X_tr_seq, y_tr_seq, epochs=LSTM_EPOCHS, batch_size=LSTM_BATCH, validation_split=0.1, patience=5)
lstm_preds = predict_lstm(model, X_te_seq)
# Each prediction targets the value that follows its window (y_te_seq), not
# y_test, which is shifted one further row ahead. Label the targets with the
# dates they belong to and score against them.
y_align = pd.Series(y_te_seq, index=rv_te_ctx.dropna().index[SEQ_LEN:])
print("LSTM MAE:", round(mae(y_align, lstm_preds), 6))
