# Crypto Volatility Forecasting — Consolidated & Editable Notebook
This notebook centralizes **all editable functions** and a clean end-to-end workflow:

- Data loading (CoinGecko, Deribit DVOL, Dune, FRED) — plug in your keys/headers.

- TA-Lib indicators (RSI, MACD, SMA/EMA) computed **pre–feature-matrix**.

- Feature matrix construction (first differences) and 1-day ahead **target**.

- Baselines: **Naive**, **GARCH(1,1)**, **HAR-RV**.

- Deep Learning: **LSTM** baseline.

- (Optional) **Tsfresh + XGBoost** hook if you want to run the original pipeline.


Everything is defined inline so you can edit live. Mirror modules exist under `apiwrappers/` and `extended_models/` for reuse.


## 📦 Imports & Environment

In [2]:
# General Utilities 
import  random, os, pandas as pd, numpy as np
import matplotlib.pyplot as plt, datetime as dt
import dotenv, os

# Environment & Dask Client
plt.rcParams['figure.figsize'] = (10,4)
os.makedirs("OutputData", exist_ok=True)
dotenv.load_dotenv(dotenv.find_dotenv(filename=".env"))
from dask.distributed import Client, LocalCluster

# API Calls: Data Collection & Standardization
import api_wrappers as aw

# Tsfresh-Xgboost-Optuna Pipeline
import tsxg_pipeline as tsxg


2025-09-26 12:45:33,535 INFO numba.cuda.cudadrv.driver init


## 🔧 Configuration

In [3]:
# --- Core configuration  ---
TARGET_COIN = 'ethereum'        # coin to predict volatility for
TOP_N = 10                      # top N coins by market cap (CoinGecko universe)
DAYS_BACK = 365                 # history depth
TIMEZONE = 'Europe/Madrid'      # 
TRAIN_FRACTION = 0.9            # train/test split
MAX_TIMESHIFT = 5
TSFRESH_PRESET = "Minimal"
FDR_LEVEL = 0.05

# --- Dune API configuration ---
DUNE_QUERIES = {
    "economic_security": 1933076,
    "daily_dex_volume": 4388,
    "btc_etf_flows": 5795477,
    "eth_etf_flows": 5795645,
    "total_defi_users": 2972,
    "median_gas": 2981260,
}
DUNE_API_KEY = os.getenv("DUNE_API_KEY")
DUNE_CSV_PATH = "OutputData/Dune_Metrics.csv"

# --- FRED API configuration ---
FRED_API_KEY= os.getenv("FRED_API_KEY")
FRED_KNOWN = {
    "VIXCLS":   "vix_equity_vol",            # CBOE VIX (Equity market volatility index)
    "MOVE":     "move_bond_vol",             # ICE BofA MOVE Index (Bond market volatility)
    "OVXCLS":   "ovx_oil_vol",               # CBOE Crude Oil Volatility Index (Oil market volatility)
    "GVZCLS":   "gvz_gold_vol",              # CBOE Gold Volatility Index (Gold market volatility)
    "DTWEXBGS": "usd_trade_weighted_index",  # Trade-Weighted U.S. Dollar Index (Broad Goods)
    "DGS2":     "us_2y_treasury_yield",      # U.S. 2-Year Treasury Yield (constant maturity)
    "DGS10":    "us_10y_treasury_yield",     # U.S. 10-Year Treasury Yield (constant maturity)
}
FRED_START_DATE = (dt.datetime.now() - dt.timedelta(days=DAYS_BACK)).strftime("%Y-%m-%d")

# --- LSTM configuration ---
LSTM_SEQ_LEN = 7
LSTM_UNITS = 64
LSTM_EPOCHS = 25
LSTM_BATCH = 16

# --- GARCH config ---
GARCH_SCALE = 100.0             # scale returns to % for stability
GARCH_REFIT_EVERY = 5           # refit frequency in days (0 = refit daily)

# --- Flags ---
USE_SYNTHETIC_DATA = False      # switch to True to demo without hitting APIs
RUN_TSFRESH_XGB = False         # set True if you want to run original pipeline
RANDOM_SEED = 42

# Reproducibility
np.random.seed(RANDOM_SEED); random.seed(RANDOM_SEED)


## 🧰 Utilities: Metrics & Plotting

In [4]:
def mae(y_true, y_pred): 
    y_true = np.asarray(y_true); y_pred = np.asarray(y_pred); 
    return float(np.mean(np.abs(y_true - y_pred)))

def mase(y_true, y_pred, y_naive):
    y_true = np.asarray(y_true); y_pred = np.asarray(y_pred); y_naive = np.asarray(y_naive)
    denom = np.mean(np.abs(y_true - y_naive)) + 1e-12
    return float(np.mean(np.abs(y_true - y_pred)) / denom)

def r2(y_true, y_pred):
    y_true = np.asarray(y_true); y_pred = np.asarray(y_pred)
    ss_res = np.sum((y_true - y_pred)**2); ss_tot = np.sum((y_true - np.mean(y_true))**2) + 1e-12
    return float(1 - ss_res/ss_tot)

def plot_pred_vs_actual(index, y_true, y_pred, title):
    plt.figure()
    plt.plot(index, y_true, label='Actual')
    plt.plot(index, y_pred, label='Predicted')
    plt.title(title); plt.legend(); plt.show()


## 📥 Data Loading
Plug in your existing API wrapper calls here. This cell defines **editable** functions.


In [5]:
import requests
from dune_client.client import DuneClient
from dune_client.query import QueryBase
import time
# expects these globals to be defined by the notebook:
# TIMEZONE, DAYS_BACK, CG_TOP_N, CG_HEADERS,
# DUNE_CSV_PATH, FRED_API_KEY (env)

# --- CoinGecko ---
def cg_universe(n, cg_headers):
    url = "https://api.coingecko.com/api/v3/coins/markets?vs_currency=usd"
    js = requests.get(url, headers=cg_headers).json()
    df = pd.DataFrame(js)
    uni = df.head(n)['id'].values
    return uni

def cgpriceactiondaily(coins, days, timezone, cg_headers):
    end   = int(dt.datetime.now(dt.timezone.utc).timestamp()) * 1000
    start = int((dt.datetime.now(dt.timezone.utc) - dt.timedelta(days=days)).timestamp()) * 1000
    count= 0
    for c in coins:
        try:
            url = f"https://api.coingecko.com/api/v3/coins/{c}/market_chart/range?vs_currency=usd&from={start}&to={end}"
            js = requests.get(url, headers=cg_headers).json()
            p = pd.DataFrame(js["prices"],        columns=["t", f"prices_{c}"])
            m = pd.DataFrame(js["market_caps"],   columns=["t", f"marketcaps_{c}"])
            v = pd.DataFrame(js["total_volumes"], columns=["t", f"total_volumes_{c}"])
            df = p.merge(m, on="t").merge(v, on="t")
            df["t"] = pd.to_datetime(df["t"], unit="ms", utc=True)
            df = df.set_index("t")
            df.columns = [x.lower() for x in df.columns]
            df.index = df.index.tz_convert(timezone).tz_localize(None)
            df = df.resample("1D").last().dropna(how="any")
            df.index.name = "date"
            if count ==0: out = df
            else: out = out.join(df, how='inner')
            count= count+1
        except Exception as e:
            print(f"Error for {c}: {e}")
            continue
        time.sleep(2)  # Add delay to avoid rate limits
    return out
# --- Deribit DVOL ---
def deribit_dvol_daily_multi(currencies, days, timezone, resolution="1D"):
    out = None
    end   = int(dt.datetime.now(dt.timezone.utc).timestamp()) * 1000
    start = int((dt.datetime.now(dt.timezone.utc) - dt.timedelta(days=days)).timestamp()) * 1000
    count=0
    for cur in currencies:
        js = requests.post(
            "https://www.deribit.com/api/v2/",
            json={"method": "public/get_volatility_index_data",
                    "params": {"currency": cur, "resolution": resolution,
                                "end_timestamp": end, "start_timestamp": start}}
        ).json()
        data = js.get("result", {}).get("data", [])
        if not data:
            continue
        d = pd.DataFrame(data, columns=["t","open","high","low","dvol"])
        d["t"] = pd.to_datetime(d["t"], unit="ms", utc=True)
        df = d.set_index("t")[["dvol"]].rename(columns={"dvol": f"dvol_{cur.lower()}"})
        df.index = df.index.tz_convert('Europe/Madrid').tz_localize(None)
        df = df.resample("1D").last().dropna(how="any")
        df.index.name = "date"
        if count ==0: out = df
        else: out = out.join(df, how='inner')
        count= count+1
    return out

# --- Dune (CSV) ---
def dune_metrics_daily(path, timezone):
    if not os.path.exists(path):
        return pd.DataFrame()
    df = pd.read_csv(path, index_col=None)
    dt_col = None
    for c in df.columns:
        try:
            pd.to_datetime(df[c], utc=True, errors="raise")
            dt_col = c
            break
        except Exception:
            continue
    if dt_col is None and "date" in df.columns:
        dt_col = "date"
    if dt_col is None:
        return pd.DataFrame()
    df = df.rename(columns={dt_col: "date"})
    df["date"] = pd.to_datetime(df["date"], utc=True, errors="coerce")
    df = df.set_index("date")
    df.index = df.index.tz_convert(timezone).tz_localize(None)
    df.columns = [c.lower() for c in df.columns]
    df.index.name = "date"
    df = df.resample("1D").last().dropna(how="any")
    return df

# --- Dune (API) ---
def fetch_dune_queries_df(query_ids, timezone, dune_api_key=None):
    dune = DuneClient(api_key=dune_api_key or os.environ.get("DUNE_API_KEY"),
                      request_timeout=300, base_url="https://api.dune.com")
    out = None
    for qid in query_ids:
        try:
            q = QueryBase(query_id=qid)
            df = dune.run_query_dataframe(query=q, ping_frequency=2, batch_size=365)
            ok = False
            for col in list(df.columns):
                try:
                    pd.to_datetime(df[col], utc=True, errors="raise")
                    df = df.rename(columns={col: "date"}).set_index("date")
                    ok = True
                    break
                except Exception:
                    continue
            if not ok and not isinstance(df.index, pd.DatetimeIndex):
                continue
            if isinstance(df.index, pd.DatetimeIndex):
                df.index = df.index.tz_convert(timezone).tz_localize(None)
            df.columns = [c.lower() for c in df.columns]
            df.index.name = "date"
            df = df.resample("1D").last().dropna(how="any")
            out = df if out is None else out.join(df, how="inner")
        except Exception:
            continue
    return out if out is not None else pd.DataFrame()

# --- FRED ---
def fetch_fred_series_df(series_ids, start, timezone, fred_api_key=None):
    key = fred_api_key or os.getenv("FRED_API_KEY")
    if not key:
        return pd.DataFrame()
    base = "https://api.stlouisfed.org/fred/series/observations"
    out = None
    for sid in series_ids:
        try:
            js = requests.get(base, params={
                "series_id": sid, "api_key": key, "file_type": "json",
                "observation_start": start
            }).json()
            obs = js.get("observations", [])
            if not obs:
                continue
            df = pd.DataFrame(obs)[["date","value"]]
            df["date"]  = pd.to_datetime(df["date"], utc=True, errors="coerce")
            df["value"] = pd.to_numeric(df["value"].replace(".", np.nan), errors="coerce")
            df = df.set_index("date").rename(columns={"value": sid.lower()})
            df.index = df.index.tz_convert(timezone).tz_localize(None)
            df = df.resample("1D").last().dropna(how="any")
            df.index.name = "date"
            out = df if out is None else out.join(df, how="inner")
        except Exception:
            continue
    if out is not None and {"dgs10","dgs2"}.issubset(out.columns):
        out["term_spread_10y_2y"] = out["dgs10"] - out["dgs2"]
    return out if out is not None else pd.DataFrame()

## 📈 TA-Lib Indicators — Inline Functions

In [6]:
# apiwrappers/indicators_talib.py
import talib

def compute_ta_indicators(
    df: pd.DataFrame,
    price_prefix: str = "prices_",
    rsi_period: int = 14,
    macd_fast: int = 12,
    macd_slow: int = 26,
    macd_signal: int = 9,
    sma_windows: tuple[int, ...] = (10, 20, 50),
    ema_windows: tuple[int, ...] = (10, 20, 50),
) -> pd.DataFrame:
    """Compute TA-Lib indicators for each symbol column in `df` with name starting by `price_prefix`.
    Returns a DataFrame with columns like: rsi{p}_{coin}, macd_{coin}, macd_signal_{coin}, macd_hist_{coin}, sma{w}_{coin}, ema{w}_{coin}
    """
    out = pd.DataFrame(index=df.index)
    price_cols = [c for c in df.columns if c.startswith(price_prefix)]
    if not price_cols:
        return out
    coins = [c[len(price_prefix):] for c in price_cols]
    for coin in coins:
        p = pd.to_numeric(df[f"{price_prefix}{coin}"], errors="coerce")
        out[f"rsi{rsi_period}_{coin}"] = talib.RSI(p.values, timeperiod=rsi_period)
        macd, macd_sig, macd_hist = talib.MACD(p.values, fastperiod=macd_fast, slowperiod=macd_slow, signalperiod=macd_signal)
        out[f"macd_{coin}"] = macd
        out[f"macd_signal_{coin}"] = macd_sig
        out[f"macd_hist_{coin}"] = macd_hist
        for w in sma_windows:
            out[f"sma{w}_{coin}"] = talib.SMA(p.values, timeperiod=w)
        for w in ema_windows:
            out[f"ema{w}_{coin}"] = talib.EMA(p.values, timeperiod=w)
    out.index = df.index
    return out


## 📊 GARCH(1,1) & HAR-RV — Inline Functions

In [7]:
# extended_models/models_garch_har.py
from arch import arch_model
import statsmodels.api as sm

def fit_garch_11(returns: pd.Series, scale: float = 100.0):
    """Fit GARCH(1,1) to returns. Returns arch result object."""
    r = returns.dropna().astype(float) * scale
    am = arch_model(r, mean="Zero", vol="GARCH", p=1, q=1, dist="normal")
    res = am.fit(disp="off")
    return res

def forecast_garch_rolling(returns: pd.Series, train_size: int, scale: float = 100.0, refit_every: int = 0) -> pd.Series:
    """Rolling one-step-ahead volatility forecast with GARCH(1,1)."""
    r = returns.dropna().astype(float)
    idx = r.index; n = len(r)
    assert 0 < train_size < n
    preds, pred_idx = [], []
    last_refit = -1; res = None
    for t in range(train_size, n):
        if (res is None) or (refit_every == 0) or ((t - last_refit) >= refit_every):
            res = fit_garch_11(r.iloc[:t], scale=scale); last_refit = t
        fcast = res.forecast(horizon=1)
        var_next = fcast.variance.iloc[-1, 0]
        vol_next = float(np.sqrt(var_next)) / scale
        preds.append(vol_next); pred_idx.append(idx[t])
    return pd.Series(preds, index=pred_idx, name="garch11_vol_pred")

def _har_features(rv: pd.Series) -> pd.DataFrame:
    rv = rv.astype(float)
    RV1 = rv.shift(1)
    RV5 = rv.shift(1).rolling(5).mean()
    RV22 = rv.shift(1).rolling(22).mean()
    return pd.DataFrame({"RV1": RV1, "RV5": RV5, "RV22": RV22})

def fit_har_ols(rv_train: pd.Series):
    """Fit HAR-RV via OLS; returns dict of coefficients."""
    X = _har_features(rv_train)
    y = rv_train
    df = pd.concat([X, y.rename("y")], axis=1).dropna()
    X_ = sm.add_constant(df[["RV1", "RV5", "RV22"]]); y_ = df["y"]
    model = sm.OLS(y_, X_).fit()
    p = model.params.to_dict()
    return {"const": p.get("const", 0.0), "RV1": p["RV1"], "RV5": p["RV5"], "RV22": p["RV22"]}

def forecast_har(rv: pd.Series, params: Dict[str, float], start_idx: int) -> pd.Series:
    """One-step-ahead HAR predictions from rv[start_idx:]."""
    rv = rv.astype(float); idx = rv.index; n = len(rv); preds = []
    for t in range(start_idx, n):
        rv1 = rv.iloc[t-1] if t-1 >= 0 else np.nan
        rv5 = rv.iloc[max(0, t-5):t].mean()
        rv22 = rv.iloc[max(0, t-22):t].mean()
        x = np.array([1.0, rv1, rv5, rv22])
        b = np.array([params["const"], params["RV1"], params["RV5"], params["RV22"]])
        preds.append(float(np.dot(x, b)))
    return pd.Series(preds, index=idx[start_idx:], name="har_vol_pred")


NameError: name 'Dict' is not defined

## 🤖 LSTM — Inline Functions

In [None]:
# extended_models/models_lstm.py
from __future__ import annotations
from typing import Tuple
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

def make_univariate_sequences(series: pd.Series, seq_len: int) -> Tuple[np.ndarray, np.ndarray]:
    v = series.dropna().astype(float).values
    X, y = [], []
    for i in range(len(v) - seq_len):
        X.append(v[i:i+seq_len]); y.append(v[i+seq_len])
    X = np.asarray(X, dtype=float).reshape(-1, seq_len, 1)
    y = np.asarray(y, dtype=float)
    return X, y

def make_multivariate_sequences(X_df: pd.DataFrame, y: pd.Series, seq_len: int) -> Tuple[np.ndarray, np.ndarray]:
    X_df = X_df.astype(float); y = y.astype(float)
    common = X_df.index.intersection(y.index)
    X_df = X_df.loc[common]; y = y.loc[common]
    X_np, y_np = X_df.values, y.values
    n, n_feat = X_np.shape
    X_seq, y_seq = [], []
    for t in range(seq_len, n):
        X_seq.append(X_np[t-seq_len:t, :]); y_seq.append(y_np[t])
    return np.asarray(X_seq, float), np.asarray(y_seq, float)

def build_lstm_model(input_shape: Tuple[int, int], units: int = 64, dropout: float = 0.0) -> Sequential:
    model = Sequential()
    model.add(LSTM(units, input_shape=input_shape))
    if dropout > 0: model.add(Dropout(dropout))
    model.add(Dense(1))
    model.compile(optimizer="adam", loss="mse")
    return model

def train_lstm(model: Sequential, X_train: np.ndarray, y_train: np.ndarray, epochs: int = 50, batch_size: int = 32, validation_split: float = 0.1, patience: int = 5, verbose: int = 1):
    cb = [EarlyStopping(monitor="val_loss", patience=patience, restore_best_weights=True)] if validation_split and patience else []
    hist = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=validation_split, callbacks=cb, verbose=verbose, shuffle=False)
    return hist

def predict_lstm(model: Sequential, X_seq: np.ndarray) -> np.ndarray:
    return model.predict(X_seq, verbose=0).reshape(-1)


## 🧱 Assemble Unified DataFrame

In [None]:
coins

In [None]:
if USE_SYNTHETIC_DATA:
    coins = ['bitcoin','ethereum']
    O = cgpriceactiondaily(coins, DAYS_BACK, TIMEZONE, cg_headers=None)
else:
    coins = cg_universe(TOP_N, cg_headers=None)
    O = cgpriceactiondaily(coins, DAYS_BACK, TIMEZONE, cg_headers=None)

D = deribit_dvol_daily_multi(currencies=('BTC','ETH'), days= DAYS_BACK, timezone= TIMEZONE)
U = dune_metrics_daily()
M = fetch_fred_series_df({'vix_equity_vol':'VIXCLS'})  # example macro

# join on index
df = O.join([D, U, M], how='outer').sort_index().ffill().dropna()
print('Base df shape:', df.shape)
df.tail(3)


## 🧪 Compute Indicators & Build Feature Matrix

In [None]:
from apiwrappers.indicators_talib import compute_ta_indicators

ta_df = compute_ta_indicators(df, price_prefix='prices_')
df = df.join(ta_df).dropna()

# Target coin realized volatility
df[f'log_returns_{TARGET_COIN}'] = np.log(df[f'prices_{TARGET_COIN}']).diff()
df[f'realized_vol_{TARGET_COIN}'] = df[f'log_returns_{TARGET_COIN}'].abs()

# Build feature matrix as first differences
X_full = df.diff().dropna()
y_full = df[f'realized_vol_{TARGET_COIN}'].shift(-1).dropna()

# Align
common_idx = X_full.index.intersection(y_full.index)
X_full = X_full.loc[common_idx]
y_full = y_full.loc[common_idx]

# Split
n = len(X_full)
train_size = int(TRAIN_FRACTION * n)
X_train, X_test = X_full.iloc[:train_size], X_full.iloc[train_size:]
y_train, y_test = y_full.iloc[:train_size], y_full.iloc[train_size:]

X_full.shape, X_train.shape, X_test.shape, y_train.shape, y_test.shape


## 🪄 Naive Baseline

In [None]:
# Naive: predict tomorrow's vol as today's vol
y_naive = y_test.shift(1).fillna(method='bfill')
print('Naive MAE:', round(mae(y_test.values, y_naive.values), 6))


## 📊 GARCH(1,1)

In [None]:
from extended_models.models_garch_har import forecast_garch_rolling

returns = df[f'log_returns_{TARGET_COIN}'].dropna()
# Restrict to window used by X/y for fair alignment
returns = returns.loc[y_full.index.min():y_full.index.max()]
train_size_r = int(TRAIN_FRACTION * len(returns))

garch_preds = forecast_garch_rolling(returns, train_size=train_size_r, scale=GARCH_SCALE, refit_every=GARCH_REFIT_EVERY)
garch_preds = garch_preds.reindex(y_test.index).dropna()

garch_mae = mae(y_test.loc[garch_preds.index].values, garch_preds.values)
garch_mase = mase(y_test.loc[garch_preds.index].values, garch_preds.values, y_naive.loc[garch_preds.index].values)
print('GARCH MAE:', round(garch_mae,6), 'MASE:', round(garch_mase,6))
plot_pred_vs_actual(garch_preds.index, y_test.loc[garch_preds.index].values, garch_preds.values, 'GARCH(1,1)')


## 📈 HAR-RV

In [None]:
from extended_models.models_garch_har import fit_har_ols, forecast_har

rv = df[f'realized_vol_{TARGET_COIN}'].dropna()
rv = rv.loc[y_full.index.min():y_full.index.max()]

har_params = fit_har_ols(rv.iloc[:train_size])
har_preds = forecast_har(rv, har_params, start_idx=train_size).reindex(y_test.index).dropna()

har_mae = mae(y_test.loc[har_preds.index].values, har_preds.values)
har_mase = mase(y_test.loc[har_preds.index].values, har_preds.values, y_naive.loc[har_preds.index].values)
print('HAR MAE:', round(har_mae,6), 'MASE:', round(har_mase,6))
plot_pred_vs_actual(har_preds.index, y_test.loc[har_preds.index].values, har_preds.values, 'HAR-RV')


## 🤖 LSTM (Univariate on realized vol)

In [None]:
from extended_models.models_lstm import make_univariate_sequences, build_lstm_model, train_lstm, predict_lstm

rv_all = df[f'realized_vol_{TARGET_COIN}'].loc[y_full.index]
rv_train, rv_test = rv_all.iloc[:train_size], rv_all.iloc[train_size:]

X_seq_tr, y_seq_tr = make_univariate_sequences(rv_train, seq_len=LSTM_SEQ_LEN)
X_seq_te, y_seq_te = make_univariate_sequences(pd.concat([rv_train.iloc[-LSTM_SEQ_LEN:], rv_test]), seq_len=LSTM_SEQ_LEN)

model = build_lstm_model(input_shape=(LSTM_SEQ_LEN, 1), units=LSTM_UNITS, dropout=0.0)
_ = train_lstm(model, X_seq_tr, y_seq_tr, epochs=LSTM_EPOCHS, batch_size=LSTM_BATCH, validation_split=0.1, patience=5, verbose=0)

lstm_preds = predict_lstm(model, X_seq_te)
y_test_aligned = y_test.iloc[len(y_test) - len(lstm_preds):]

lstm_mae = mae(y_test_aligned.values, lstm_preds)
lstm_mase = mase(y_test_aligned.values, lstm_preds, y_naive.iloc[len(y_naive)-len(lstm_preds):].values)
print('LSTM MAE:', round(lstm_mae,6), 'MASE:', round(lstm_mase,6))
plot_pred_vs_actual(y_test_aligned.index, y_test_aligned.values, lstm_preds, 'LSTM (Univariate)')


## 🌲 (Optional) Tsfresh + XGBoost Baseline

In [None]:
if RUN_TSFRESH_XGB:
    try:
        import xgboost as xgb
        from tsfresh.feature_extraction import extract_features
        from tsfresh.utilities.dataframe_functions import make_forecasting_frame

        # Simple example: use realized vol series only for tsfresh demo
        series = df[f'realized_vol_{TARGET_COIN}'].dropna().loc[y_full.index]
        df_ts, y_ts = make_forecasting_frame(series, kind='rv', max_timeshift=7, rolling_direction=1)
        feats = extract_features(df_ts, column_id='id', column_sort='time', n_jobs=0)
        y_ts = y_ts.loc[feats.index]
        n_tr = int(TRAIN_FRACTION * len(y_ts))
        dtr = xgb.DMatrix(feats.iloc[:n_tr], label=y_ts.iloc[:n_tr])
        dte = xgb.DMatrix(feats.iloc[n_tr:], label=y_ts.iloc[n_tr:])
        params = {'objective':'reg:squarederror', 'max_depth':4, 'eta':0.1}
        bst = xgb.train(params, dtr, num_boost_round=200)
        pred = bst.predict(dte)
        print('XGB (tsfresh) MAE:', round(mae(y_ts.iloc[n_tr:].values, pred),6))
    except Exception as e:
        print('Skipped tsfresh+xgb demo:', e)


## 📋 Comparison

In [None]:
import pandas as pd
rows = []
rows.append({'Model':'Naive','MAE': mae(y_test.values, y_naive.values)})
if 'garch_mae' in globals(): rows.append({'Model':'GARCH(1,1)','MAE': garch_mae, 'MASE': garch_mase})
if 'har_mae' in globals(): rows.append({'Model':'HAR-RV','MAE': har_mae, 'MASE': har_mase})
if 'lstm_mae' in globals(): rows.append({'Model':'LSTM (uni)','MAE': lstm_mae, 'MASE': lstm_mase})
comp_df = pd.DataFrame(rows).set_index('Model')
comp_df
