Imports

In [None]:
# General Utilities 
import  random, os, pandas as pd, numpy as np
import matplotlib.pyplot as plt, datetime as dt
import dotenv, os, requests, time
import dask.dataframe as dd

# Environment & Dask Client
os.makedirs("OutputData", exist_ok=True)
dotenv.load_dotenv(dotenv.find_dotenv(filename=".env"))

# Dune Client
from dune_client.client import DuneClient
from dune_client.query import QueryBase

# My tsfresh rolled dataframe construction, feature extraction, selection, and XGBoost forecasting pipeline
import tsxg_pipeline as tsxg
from tsfresh.feature_extraction import ComprehensiveFCParameters, EfficientFCParameters, MinimalFCParameters
from tsfresh.convenience.bindings import dask_feature_extraction_on_chunk
from tsfresh.utilities.dataframe_functions import roll_time_series, impute
from tsfresh import extract_features, select_features, extract_relevant_features

# Dask Distributed Computing
from dask.distributed import Client, LocalCluster
from dask.distributed import progress

# Sklearn components
from sklearn.metrics import r2_score
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV

# Optuna XGB Dask Pipeline Module
import xgboost as xgb
from xgboost import dask as dxgb
import optuna
from optuna.integration.dask import DaskStorage

# Silence Redundant Warnings
import warnings

Key Constants and Configurations

In [None]:
# General
# Notebook-wide configuration. Several of these names are used as default
# argument values by the API helper functions defined further down, so this
# cell has to run before those definitions.
TARGET_COIN = "ethereum"
BASE_FIAT   = "usd"
TOP_N       = 10
LOOKBACK_DAYS = 365
# Both dates are evaluated once, when this cell runs; re-run the cell to refresh them.
START_DATE = (dt.datetime.now() - dt.timedelta(days=LOOKBACK_DAYS)).strftime("%Y-%m-%d")
TODAY= dt.date.today().strftime('%Y-%m-%d')
TIMEZONE = "Europe/Madrid"
plt.rcParams['figure.figsize'] = (20,8)
FREQUENCY = "1D"  
TIME= 31  # days for rolling window (passed as max_timeshift to tsfresh roll_time_series)
SLEEP_TIME= 10  # seconds to wait between API calls to avoid rate limiting
# Silence Redundant Warnings
warnings.filterwarnings('ignore', category=pd.errors.PerformanceWarning)

# --- Dune API configuration ---
# Maps a readable metric name to its Dune query id.
DUNE_QUERIES = {
    "economic_security": 1933076,   # Blockchain security metric aggregating hash rate, staking amounts, ...
    "daily_dex_volume": 4388,       # Daily trading volume across decentralized exchanges
    "btc_etf_flows": 5795477,       # Capital inflows/outflows for Bitcoin ETF products
    "eth_etf_flows": 5795645,       # Capital inflows/outflows for Ethereum ETF products
    "total_defi_users": 2972,       # Unique addresses interacting with DeFi protocols
    "median_gas": 2981260,          # Median gas price (transaction fees) on Ethereum
}
DUNE_API_KEY = os.getenv("DUNE_API_KEY")
DUNE_CSV_PATH = "OutputData/Dune_Metrics.csv"

# --- FRED API configuration ---
FRED_API_KEY= os.getenv("FRED_API_KEY")
# Maps a FRED series id to the column name used for it in the macro data frame.
FRED_KNOWN = {
    "VIXCLS":   "vix_equity_vol",            # CBOE VIX (Equity market volatility index)
    "MOVE":     "move_bond_vol",             # ICE BofA MOVE Index (Bond market volatility)
    "OVXCLS":   "ovx_oil_vol",               # CBOE Crude Oil Volatility Index (Oil market volatility)
    "GVZCLS":   "gvz_gold_vol",              # CBOE Gold Volatility Index (Gold market volatility)
    "DTWEXBGS": "usd_trade_weighted_index",  # Trade-Weighted U.S. Dollar Index (Broad Goods)
    "DGS2":     "us_2y_treasury_yield",      # U.S. 2-Year Treasury Yield (constant maturity)
    "DGS10":    "us_10y_treasury_yield",     # U.S. 10-Year Treasury Yield (constant maturity)
}
# --- Optuna XGBoost Pipeline Configuration ---
# The DEFAULT_* values are the default arguments of Optuna_XGB_Dask.
SPLITS = 5  # Time series folds (used as the Dask partition count for the train/test split)
DEFAULT_XGB_METRIC = 'mae'  # For XGBoost/Optuna
DEFAULT_TREE_METHOD = 'hist'
DEFAULT_EARLY_STOPPING = 25
DEFAULT_N_TRIALS = 100
DEFAULT_N_ROUNDS = 200

# --- TSXG Pipeline configuration ---
# NOTE(review): RANDOM_SEED is not referenced anywhere in the visible notebook code.
RANDOM_SEED = 42
EXTRACTION_SETTINGS = EfficientFCParameters()  # tsfresh feature set used by extract_dask




In [None]:
# Local Dask cluster used by the tsfresh and XGBoost steps:
# 4 worker processes x 5 threads, each worker advertising an abstract "GPU" resource of 2.
#
# A client/cluster left over from an earlier run of this cell is closed first so
# that re-running the cell does not leak workers or collide on the dashboard port.
# On a fresh kernel neither name exists yet and the lookup simply returns None.
for _stale_name in ("client", "cluster"):
    _stale = globals().get(_stale_name)
    if _stale is None:
        continue
    try:
        _stale.close()
    except Exception as exc:
        # Best effort: a half-dead cluster must not prevent starting a new one.
        print(f"Could not close previous {_stale_name}: {exc}")

cluster = LocalCluster(
    n_workers=4,
    threads_per_worker=5,
    processes=True,
    dashboard_address=':8787',
    resources={'GPU': 2},
)
client = Client(cluster)
client

Modules

In [None]:
# API Modules
# Expects these globals to be defined by the configuration cell above
# (they are used as default argument values, so that cell must run first):
# TIMEZONE, TOP_N, LOOKBACK_DAYS, START_DATE, FREQUENCY, FRED_KNOWN, FRED_API_KEY
# API keys are read from the environment: COINGECKO_API_KEY, DUNE_API_KEY, FRED_API_KEY

# --- CoinGecko Investment Universe (V1)  ---
def CoinGecko_GetUniverse(n, cg_api_key=os.getenv("COINGECKO_API_KEY"), sleep_time=6):
    """
    Return the CoinGecko ids of the first n coins of the /coins/markets listing.

    Parameters:
        n: Number of coins to keep (the request uses the API's default page,
           so n is capped by the page size).
        cg_api_key: CoinGecko demo API key. Falls back to the
            COINGECKO_API_KEY environment variable when None.
        sleep_time: Seconds to sleep after the request (rate limiting).

    Returns:
        numpy array of coin ids, or None (after printing the API payload)
        when the response is not a list of market rows.
    """
    if cg_api_key is None:
        cg_api_key = os.getenv("COINGECKO_API_KEY")
    cg_headers = {"accept": "application/json"}
    if cg_api_key:
        # The header form of the demo key uses hyphens; the underscore spelling
        # is only valid as a query-string parameter.
        cg_headers["x-cg-demo-api-key"] = cg_api_key
    url = "https://api.coingecko.com/api/v3/coins/markets?vs_currency=usd"
    js = requests.get(url, headers=cg_headers).json()
    time.sleep(sleep_time)
    # Error responses are JSON objects, successful ones are a list of rows.
    if not isinstance(js, list):
        print("Error Getting Coin Id's: ", js)
        return None
    df = pd.DataFrame(js)
    if 'id' not in df.columns:
        print("Error Getting Coin Id's: ", js)
        return None
    return df.head(n)['id'].values

# --- CoinGecko Investment Universe (V2) Returns tickers e.g. ETH, ids e.g. ethereum, or both ---
def CoinGecko_GetUniverseV2(n=TOP_N, output_format="ids", 
                            cg_api_key=os.getenv("COINGECKO_API_KEY"), sleep_time=6):
    """
    Top n cryptocurrency tickers and/or ids from the CoinGecko API by market cap.

    Parameters:
        n: Number of top coins to retrieve (capped by the API's default page size).
        output_format: Format of identifiers to return
            * "ids": CoinGecko IDs (e.g., "bitcoin", "ethereum")
            * "symbols": Upper-cased ticker symbols (e.g., "BTC", "ETH")
            * "both": dict {"ids": <array>, "ticker": <array>}
        cg_api_key: CoinGecko demo API key.
        sleep_time: Seconds to sleep after the request (rate limiting).

    Returns:
        numpy array of identifiers, a dict with both formats, or None (after
        printing the API payload) when the response is not a list of market rows.

    Raises:
        ValueError: if output_format is not one of the supported values.
    """
    # Validate before spending an API call; previously this error was swallowed
    # by a bare except and resurfaced as an unrelated KeyError.
    if output_format not in ("ids", "symbols", "both"):
        raise ValueError("output_format must be 'ids', 'symbols', or 'both'")
    if cg_api_key is None:
        print("No API Key Available")
    cg_headers = {"accept": "application/json"}
    if cg_api_key:
        # Header form of the demo key uses hyphens (underscores are the query-string form).
        cg_headers["x-cg-demo-api-key"] = cg_api_key
    url = "https://api.coingecko.com/api/v3/coins/markets?vs_currency=usd&order=market_cap_desc"
    js = requests.get(url, headers=cg_headers).json()
    time.sleep(sleep_time)
    # Error responses are JSON objects, successful ones are a list of rows.
    if not isinstance(js, list):
        print("Error Getting Coin Id's: ", js)
        return None
    df = pd.DataFrame(js)
    if 'id' not in df.columns or 'symbol' not in df.columns:
        print("Error Getting Coin Id's: ", js)
        return None
    top = df.head(n)
    ids = top['id'].values
    symbols = top['symbol'].str.upper().values
    if output_format == "ids":
        print(f"Retrieved {len(ids)} coin IDs by market cap from CoinGecko")
        return ids
    if output_format == "symbols":
        print(f"Retrieved {len(symbols)} coin symbols by market cap from CoinGecko")
        return symbols
    print(f"Retrieved {len(ids)} coins by market cap from CoinGecko")
    return {"ids": ids, "ticker": symbols}

# --- CoinGecko Price Data ---
def CoinGecko_GetPriceAction(coins, start= START_DATE, 
                             tz=TIMEZONE, cg_api_key=os.getenv("COINGECKO_API_KEY"), freq=FREQUENCY, sleep_time=6):
    """
    Prices, market caps and total volumes per coin from CoinGecko's
    market_chart/range endpoint, resampled to `freq` and inner-joined across coins.

    Only works up to past 365 days, lose intraday data if > 90 days due to API public demo limits.
    For longer history, use Binance_GetPriceAction below.

    Parameters:
        coins: Iterable of CoinGecko coin ids.
        start: Start of the range (anything pd.to_datetime accepts).
        tz: Timezone of the returned index; a falsy value keeps naive UTC timestamps.
        cg_api_key: CoinGecko demo API key.
        freq: Resampling frequency. Prices/market caps keep the last value per
            bucket, volumes are summed.
        sleep_time: Seconds to sleep after every coin (rate limiting).

    Returns:
        DataFrame with columns prices_<coin>, market_caps_<coin>,
        total_volumes_<coin>, or None when no coin could be processed.
    """
    fields = ("prices", "market_caps", "total_volumes")
    # The range endpoint takes UNIX timestamps in seconds (same unit as the
    # paginated helper in this notebook uses).
    end_timestamp   = int(dt.datetime.now().timestamp())
    start_timestamp = int(pd.to_datetime(start).timestamp())
    cg_headers = {"accept": "application/json"}
    if cg_api_key:
        # Header form of the demo key uses hyphens (underscores are the query-string form).
        cg_headers["x-cg-demo-api-key"] = cg_api_key
    outbig = None
    for c in coins:
        try:
            url = f"https://api.coingecko.com/api/v3/coins/{c}/market_chart/range?vs_currency=usd&from={start_timestamp}&to={end_timestamp}"
            js = requests.get(url, headers=cg_headers).json()
            if not isinstance(js, dict) or any(field not in js for field in fields):
                raise ValueError(f"unexpected response: {js}")
            columns = []
            for field in fields:
                points = js[field]
                # Timestamps are epoch milliseconds, i.e. UTC instants: label
                # them as UTC first and only then convert to the requested zone.
                stamps = pd.to_datetime([pt[0] for pt in points], unit='ms', utc=True)
                stamps = stamps.tz_convert(tz) if tz else stamps.tz_localize(None)
                series = pd.Series([pt[1] for pt in points], index=stamps, name=f"{field}_{c}")
                series = pd.to_numeric(series, errors='coerce')
                # Duplicate timestamps would break the column-wise concat below.
                columns.append(series[~series.index.duplicated(keep='last')])
            outsmall = pd.concat(columns, axis=1)
            outsmall.index.name = 'date'
            pricesandmc = outsmall[['prices_'+c, 'market_caps_'+c]].resample(freq).last().dropna()
            volumes = outsmall[['total_volumes_'+c]].resample(freq).sum().dropna()
            outsmall = pricesandmc.join(volumes, how='inner')
            outbig = outsmall if outbig is None else outbig.join(outsmall, how='inner')
        except Exception as e:
            print(f"Error processing data for {c}: {e}")
        finally:
            # Throttle after every request, including failed ones.
            time.sleep(sleep_time)
    return outbig

# --- CoinGecko OHLC Data ---
def CoinGecko_GetOHLC(coins, days=LOOKBACK_DAYS, vs_currency="usd"):
    """
    Fetch open/high/low/close candles from CoinGecko for every coin in `coins`.

    Each coin contributes the columns open_<coin>, high_<coin>, low_<coin> and
    close_<coin>; the per-coin frames are indexed by calendar date and
    outer-joined. Coins whose request fails or returns no list payload are
    reported and skipped. Returns None when nothing was collected.
    """
    merged = None
    for coin in coins:
        try:
            endpoint = (
                f"https://api.coingecko.com/api/v3/coins/{coin}/ohlc"
                f"?vs_currency={vs_currency}&days={days}"
            )
            # Payload rows are [timestamp_ms, open, high, low, close]
            payload = requests.get(endpoint).json()
            if not (payload and isinstance(payload, list)):
                print(f"No OHLC data for {coin}")
                continue
            labels = ["ts"] + [f"{field}_{coin}" for field in ("open", "high", "low", "close")]
            candles = pd.DataFrame(payload, columns=labels)
            candles["date"] = pd.to_datetime(candles["ts"], unit="ms").dt.date
            candles = candles.drop(columns=["ts"]).set_index("date")
            merged = candles if merged is None else merged.join(candles, how="outer")
        except Exception as e:
            print(f"Error fetching OHLC for {coin}: {e}")
            continue
    return merged

# --- CoinGecko Extended Historical Data (with pagination) ---
def CoinGecko_GetHistoricalData_Paginated(coin_ids, vs_currency="usd", max_days=365, 
                                          step_days=90, timezone=TIMEZONE, 
                                          cg_api_key=os.getenv("COINGECKO_API_KEY")):
    """
    Gets extended historical price data from CoinGecko using pagination.

    The history is fetched backwards from now in windows of `step_days` until
    `max_days` is covered; the last window is clamped so it never reaches
    further back than `max_days`.

    Parameters:
        coin_ids: Iterable of CoinGecko coin IDs (e.g., ['bitcoin']); a single
            id string is also accepted.
        vs_currency: Base currency (e.g., 'usd')
        max_days: Maximum days to fetch
        step_days: Days per request (smaller = more requests but more granular data)
        timezone: Timezone for the returned DataFrame index; a falsy value
            keeps naive UTC timestamps.
        cg_api_key: CoinGecko API key

    Returns:
        DataFrame with prices_<id>, market_caps_<id> and total_volumes_<id>
        columns (outer-joined across coins) on a datetime index named 'date'.
        Empty DataFrame when nothing could be collected.

    Raises:
        ValueError: if step_days is not positive.
    """
    if step_days <= 0:
        raise ValueError("step_days must be positive")
    if isinstance(coin_ids, str):
        # A bare string would otherwise be iterated character by character.
        coin_ids = [coin_ids]
    cg_headers = {"accept": "application/json"}
    if cg_api_key:
        # Header form of the demo key uses hyphens (underscores are the query-string form).
        cg_headers["x-cg-demo-api-key"] = cg_api_key
    output = None
    for coin_id in coin_ids:
        full_prices = []
        full_market_caps = []
        full_volumes = []
        # Start from now and work backwards
        window_end = dt.datetime.now()
        target_start_date = window_end - dt.timedelta(days=max_days)
        print(f"Fetching data for {coin_id} from {window_end.date()} back to {target_start_date.date()}")
        api_requests = 0
        url = f"https://api.coingecko.com/api/v3/coins/{coin_id}/market_chart/range"
        while window_end > target_start_date:
            # Clamp so the oldest window does not overshoot the requested history.
            window_start = max(window_end - dt.timedelta(days=step_days), target_start_date)
            params = {
                "vs_currency": vs_currency,
                "from": int(window_start.timestamp()),
                "to": int(window_end.timestamp())
            }
            data = requests.get(url, headers=cg_headers, params=params).json()
            api_requests += 1
            if not isinstance(data, dict) or 'prices' not in data:
                print(f"No more data available or error after {api_requests} requests")
                if isinstance(data, dict) and 'error' in data:
                    print(f"Error: {data['error']}")
                break
            prices = data.get('prices', [])
            if not prices:
                break
            # Older data gets added at the beginning
            full_prices = prices + full_prices
            full_market_caps = data.get('market_caps', []) + full_market_caps
            full_volumes = data.get('total_volumes', []) + full_volumes
            print(f"Request #{api_requests}: Got {len(prices)} price points")
            # Move the window back in time; the 1 s gap avoids fetching the boundary twice.
            window_end = window_start - dt.timedelta(seconds=1)
            if window_end <= target_start_date:
                print(f"Reached target date")
                break
            # Respect CoinGecko's rate limits
            time.sleep(3)
        if not full_prices:
            # Skip just this coin; returning here would throw away every coin
            # that was already collected.
            print(f"No data collected for {coin_id}")
            continue
        parts = []
        for label, points in (("prices", full_prices),
                              ("market_caps", full_market_caps),
                              ("total_volumes", full_volumes)):
            part = pd.DataFrame(points, columns=['timestamp', f'{label}_{coin_id}'])
            # Epoch milliseconds are UTC instants.
            part['timestamp'] = pd.to_datetime(part['timestamp'], unit='ms', utc=True)
            parts.append(part)
        df = parts[0].merge(parts[1], on='timestamp', how='outer')
        df = df.merge(parts[2], on='timestamp', how='outer')
        df = df.set_index('timestamp').sort_index()
        if timezone:
            df.index = df.index.tz_convert(timezone)
        else:
            df.index = df.index.tz_localize(None)
        df.index.name = 'date'
        print(f"Total data points: {len(df)}")
        print(f"Data ranges from {df.index.min().date()} to {df.index.max().date()}")
        output = df if output is None else output.join(df, how='outer')
    return output if output is not None else pd.DataFrame()
# --- Deribit DVOL ---
def Deribit_GetDVOL(currencies, days, timezone, resolution="1D"):
    """
    Daily Deribit volatility index (DVOL) close per currency.

    Parameters:
        currencies: Iterable of Deribit currency codes (e.g., ['BTC', 'ETH']).
        days: Number of days of history to request, counted back from now.
        timezone: Timezone of the returned index; a falsy value keeps UTC.
        resolution: Candle resolution passed to the API.

    Returns:
        DataFrame with one dvol_<currency> column per currency that returned
        data (inner-joined on the daily index named 'date'), or None when no
        currency returned data.
    """
    out = None
    end   = int(dt.datetime.now().timestamp()) * 1000
    start = int((dt.datetime.now() - dt.timedelta(days=days)).timestamp()) * 1000
    for cur in currencies:
        js = requests.post(
            "https://www.deribit.com/api/v2/",
            json={"method": "public/get_volatility_index_data",
                  "params": {"currency": cur, "resolution": resolution,
                             "end_timestamp": end, "start_timestamp": start}}
        ).json()
        data = js.get("result", {}).get("data", [])
        if not data:
            continue
        d = pd.DataFrame(data, columns=["t", "open", "high", "low", "dvol"])
        # Epoch milliseconds are UTC instants: label them as UTC, then convert
        # to the caller's timezone (previously a hard-coded zone was applied
        # and the `timezone` argument was ignored).
        d["t"] = pd.to_datetime(d["t"], unit="ms", utc=True)
        df = d.set_index("t")[["dvol"]].rename(columns={"dvol": f"dvol_{cur.lower()}"})
        if timezone:
            df.index = df.index.tz_convert(timezone)
        df = df.resample("1D").last().dropna(how="any")
        df.index.name = "date"
        out = df if out is None else out.join(df, how='inner')
    return out

# --- Dune (CSV) ---    
def Dune_FromCSV(path, timezone):
    """
    Load a Dune metrics CSV export as a daily, timezone-aware DataFrame.

    The date column is the column literally named "date" (case-insensitive) if
    there is one, otherwise the first non-numeric column that parses as
    datetimes. Numeric columns are never considered: pd.to_datetime happily
    interprets plain numbers as epoch offsets, which would turn e.g. a saved
    row-number column into 1970 dates.

    Parameters:
        path: Path of the CSV file.
        timezone: Timezone for the index. Naive dates are localized to it,
            timezone-aware dates are converted to it.

    Returns:
        DataFrame with lower-cased column names on a daily index named 'date'
        (last value per day, days with any missing value dropped). Empty
        DataFrame when the file does not exist or has no usable date column.
    """
    if not os.path.exists(path):
        return pd.DataFrame()
    df = pd.read_csv(path, index_col=None)
    dt_col = next((c for c in df.columns if str(c).strip().lower() == "date"), None)
    if dt_col is None:
        for c in df.columns:
            if pd.api.types.is_numeric_dtype(df[c]):
                continue
            try:
                pd.to_datetime(df[c], errors="raise")
            except Exception:
                continue
            dt_col = c
            break
    if dt_col is None:
        return pd.DataFrame()
    df = df.rename(columns={dt_col: "date"})
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    # Rows whose date could not be parsed cannot be placed on the time axis.
    df = df.dropna(subset=["date"]).set_index("date")
    if df.index.tz is None:
        df.index = df.index.tz_localize(timezone)
    else:
        df.index = df.index.tz_convert(timezone)
    df.columns = [str(c).lower() for c in df.columns]
    df.index.name = "date"
    df = df.resample("1D").last().dropna(how="any")
    return df

# --- Dune ---
def Dune_GetQueries(query_ids, timezone, dune_api_key=None):
    """
    Run Dune queries and combine their results into one daily DataFrame.

    Parameters:
        query_ids: Iterable of Dune query ids, or a {name: query_id} mapping
            (the ids are taken from the values).
        timezone: Timezone for the index. Naive dates are localized to it,
            timezone-aware dates are converted to it.
        dune_api_key: Dune API key; falls back to the DUNE_API_KEY environment variable.

    Returns:
        DataFrame with lower-cased columns on a daily index named 'date'
        (inner-joined across queries), or None (after printing a message) when
        no query produced usable data. Queries that fail or have no date
        column are reported and skipped.
    """
    dune = DuneClient(api_key=dune_api_key or os.environ.get("DUNE_API_KEY"),
                       base_url="https://api.dune.com")
    ids = list(query_ids.values()) if isinstance(query_ids, dict) else list(query_ids)
    out = None
    for qid in ids:
        try:
            q = QueryBase(query_id=qid)
            df = dune.run_query_dataframe(query=q, ping_frequency=2, batch_size=365)
            parsed = None
            date_col = None
            for col in list(df.columns):
                # Numbers always "parse" (as epoch offsets), so skip numeric columns.
                if pd.api.types.is_numeric_dtype(df[col]):
                    continue
                try:
                    parsed = pd.to_datetime(df[col], errors="raise")
                except Exception:
                    continue
                date_col = col
                break
            if date_col is not None:
                # Use the parsed values as index; setting the raw column would
                # leave a string index that cannot be resampled.
                df = df.drop(columns=[date_col])
                df.index = pd.DatetimeIndex(parsed)
            elif not isinstance(df.index, pd.DatetimeIndex):
                print(f"Dune query {qid}: no date column found, skipping")
                continue
            if df.index.tz is None:
                df.index = df.index.tz_localize(timezone)
            else:
                df.index = df.index.tz_convert(timezone)
            df.columns = [str(c).lower() for c in df.columns]
            df.index.name = "date"
            df = df.resample("1D").last().dropna(how="any")
            out = df if out is None else out.join(df, how="inner")
        except Exception as exc:
            print(f"Dune query {qid} failed: {exc}")
            continue
    if out is None:
        print('Error Fetching Dune Queries')
    return out

# --- FRED ---
def Fred_GetSeries(series_ids= FRED_KNOWN, start=START_DATE, timezone=TIMEZONE, fred_api_key=FRED_API_KEY):
    """
    Download FRED series observations and combine them into one daily frame.

    Parameters:
        series_ids: Iterable of FRED series ids, or a {series_id: column_name}
            mapping. For ids without a name in the mapping, FRED_KNOWN is
            consulted and finally the id itself is used as column name.
        start: Observation start date (YYYY-MM-DD).
        timezone: Timezone the observation dates are localized to.
        fred_api_key: FRED API key; falls back to the FRED_API_KEY environment variable.

    Returns:
        The inner-merged series, re-indexed to calendar days with forward fill
        (a Series when only one series was fetched, otherwise a DataFrame).
        None (after printing a message) when no key is available or nothing
        could be fetched.
    """
    key = fred_api_key or os.getenv("FRED_API_KEY")
    if not key:
        print("No API Key Available")
        return None
    base = "https://api.stlouisfed.org/fred/series/observations"
    df = None
    for sid in series_ids:
        label = series_ids.get(sid) if isinstance(series_ids, dict) else None
        label = label or FRED_KNOWN.get(sid, sid)
        try:
            js = requests.get(base, params={
                    # Use the resolved key so the environment fallback takes effect.
                    "series_id": sid, "api_key": key, "file_type": "json",
                    "observation_start": start
                }).json()
            obs = pd.DataFrame(js['observations'])
            index = pd.DatetimeIndex(pd.to_datetime(obs['date'])).tz_localize(timezone)
            # Non-numeric placeholders in the 'value' column become NaN.
            obs = pd.to_numeric(obs.set_index(index)['value'], errors='coerce').rename(label)
            if df is not None:
                df = pd.merge(left=df, right=obs, left_index=True, right_index=True)
            else:
                df = obs
        except Exception as exc:
            print("error fetching:", label, "-", exc)
        finally:
            # Throttle after every request, including failed ones.
            time.sleep(2)
    if df is not None:
        return df.asfreq('D', method='ffill')
    print('Error Compiling Data')
    return None

# --- Binance Price Action ---
def Binance_GetPriceAction(ids=None, tickers=None,  interval="1d", max_days=365, timezone=TIMEZONE, top_n=TOP_N):
    """
    Gets extended OHLCV data from Binance using pagination to overcome the 1000 candle limit.

    Each ticker is queried against its USDT pair. Candles are returned only
    for the requested window, i.e. candles that closed before
    now - max_days are dropped even though a page may contain them.

    Parameters:
        ids: Coin ids used to name the output columns (e.g., ['bitcoin', 'ethereum']).
        tickers: Matching trading symbols (e.g., ['BTC', 'ETH']).
            When either `ids` or `tickers` is None, both are taken from
            CoinGecko_GetUniverseV2(n=top_n, output_format="both").
        interval: Candlestick interval (1m, 3m, 5m, 15m, 30m, 1h, 2h, 4h, 6h, 8h, 12h, 1d, 3d, 1w, 1M)
        max_days: Maximum number of days of history to fetch
        timezone: Timezone for the returned DataFrame index
        top_n: Universe size used when ids/tickers are not given.

    Returns:
        DataFrame with open_/high_/low_/close_/volume_<id> columns per coin on
        a datetime index named 'date' (outer-joined across coins); empty
        DataFrame when nothing could be collected.
    """
    fields = ['open', 'high', 'low', 'close', 'volume']
    outbig = None
    if ids is None or tickers is None:
        universe = CoinGecko_GetUniverseV2(n=top_n, output_format="both",
                                           cg_api_key=os.getenv("COINGECKO_API_KEY"))
        if not universe:
            print("No data collected for any symbols.")
            return pd.DataFrame()
        ids, tickers = universe.values()
    for coin_id, ticker in zip(ids, tickers):
        ticker = ticker.upper()
        print(f"Fetching {interval} candles for {coin_id} going back {max_days} days...")
        # Pagination variables
        full_data = []
        now = dt.datetime.now()
        end_time = int(now.timestamp() * 1000)
        start_date_target = now - dt.timedelta(days=max_days)
        start_ms = int(start_date_target.timestamp() * 1000)
        api_requests = 0
        while True:
            url = "https://api.binance.com/api/v3/klines"
            params = {
                "symbol": ticker + "USDT",
                "interval": interval,
                "endTime": end_time,
                "limit": 1000
            }
            data = requests.get(url, params=params).json()
            api_requests += 1
            # A successful page is a non-empty list of candles; anything else
            # (error object, empty page) ends the pagination for this coin.
            if not isinstance(data, list) or not data:
                print(f"No more data available for {coin_id} after {api_requests} requests")
                break
            print(f"Request #{api_requests}: Got {len(data)} candles for {coin_id}")
            full_data = data + full_data
            oldest_timestamp = int(data[0][0])
            oldest_date = dt.datetime.fromtimestamp(oldest_timestamp/1000)
            if oldest_date <= start_date_target:
                print(f"Reached target date ({start_date_target.date()}) for {coin_id}")
                break
            end_time = oldest_timestamp - 1
            time.sleep(1)
        if not full_data:
            print(f"No data collected for {coin_id}")
            continue
        raw = pd.DataFrame(full_data, columns=[
            'timestamp', 'open', 'high', 'low', 'close', 'volume',
            'close_time', 'quote_asset_volume', 'number_of_trades',
            'taker_buy_base_asset_volume', 'taker_buy_quote_asset_volume', 'ignore'
        ])
        # Keep only candles that overlap the requested window.
        raw = raw[pd.to_numeric(raw['close_time']) >= start_ms]
        if raw.empty:
            print(f"No data collected for {coin_id}")
            continue
        stamps = pd.to_datetime(raw['timestamp'], unit='ms', errors='coerce', utc=True)
        df = raw[fields].apply(pd.to_numeric)
        # One naming rule for creating and selecting the columns.
        df.columns = [f"{col}_{coin_id}" for col in fields]
        df.index = pd.DatetimeIndex(stamps).tz_convert(timezone)
        df.index.name = 'date'
        print(f"Total candles collected for {coin_id}: {len(df)}")
        print(f"Data ranges from {df.index.min().date()} to {df.index.max().date()}")
        outbig = df if outbig is None else outbig.join(df, how='outer')
    if outbig is None:
        print("No data collected for any symbols.")
        return pd.DataFrame()
    outbig = outbig.sort_index()
    outbig.index.name = 'date'
    print(f"Combined data has {len(outbig)} rows")
    return outbig



In [None]:
# Ta-Lib Technical Analysis Indicators
import talib
def Compute_TAIndicators(df, price_prefix="prices_", rsi_period=14,
                          macd_fast=12, macd_slow=26, macd_signal=9,
                          sma_windows=(10,20,50), ema_windows=(10,20,50)):
    """
    Compute TA-Lib indicators for every coin that has a `<price_prefix><coin>` column.

    The price column is used as the close series. Indicators that also need
    high/low or volume read the columns high_<coin>, low_<coin> and
    volume_<coin>; when one of those columns is missing only the indicators
    depending on it are skipped. An indicator that raises is reported and
    skipped without affecting the others.

    Parameters:
        df: DataFrame holding the price (and optionally high/low/volume) columns.
        price_prefix: Prefix identifying the price columns.
        rsi_period: RSI look-back period.
        macd_fast, macd_slow, macd_signal: MACD periods.
        sma_windows, ema_windows: Window lengths for the moving averages.

    Returns:
        DataFrame on df's index with columns named <indicator>_<coin>
        (e.g. rsi14_<coin>, macd_<coin>, sma20_<coin>, mfi_<coin>).
    """
    def _as_float(name):
        # TA-Lib expects float64 arrays; None marks a column that is absent.
        return df[name].to_numpy(dtype=float) if name in df.columns else None

    cols = {}
    coins = [c[len(price_prefix):] for c in df.columns if c.startswith(price_prefix)]
    for coin in coins:
        p = _as_float(f"{price_prefix}{coin}")
        high = _as_float(f"high_{coin}")
        low = _as_float(f"low_{coin}")
        volume = _as_float(f"volume_{coin}")
        # (output column names, required inputs, computation)
        tasks = [
            ((f"rsi{rsi_period}_{coin}",), (p,),
             lambda: talib.RSI(p, timeperiod=rsi_period)),
            ((f"macd_{coin}", f"macd_signal_{coin}", f"macd_hist_{coin}"), (p,),
             lambda: talib.MACD(p, fastperiod=macd_fast, slowperiod=macd_slow, signalperiod=macd_signal)),
        ]
        for w in sma_windows:
            tasks.append(((f"sma{w}_{coin}",), (p,), lambda w=w: talib.SMA(p, timeperiod=w)))
        for w in ema_windows:
            tasks.append(((f"ema{w}_{coin}",), (p,), lambda w=w: talib.EMA(p, timeperiod=w)))
        tasks += [
            ((f"bb_upper_{coin}", f"bb_middle_{coin}", f"bb_lower_{coin}"), (p,),
             lambda: talib.BBANDS(p)),
            ((f"atr_{coin}",), (high, low, p), lambda: talib.ATR(high, low, p)),
            ((f"adx_{coin}",), (high, low, p), lambda: talib.ADX(high, low, p)),
            ((f"stoch_k_{coin}", f"stoch_d_{coin}"), (high, low, p),
             lambda: talib.STOCH(high, low, p)),
            ((f"cci_{coin}",), (high, low, p), lambda: talib.CCI(high, low, p)),
            ((f"willr_{coin}",), (high, low, p), lambda: talib.WILLR(high, low, p)),
            ((f"mom_{coin}",), (p,), lambda: talib.MOM(p)),
            ((f"roc_{coin}",), (p,), lambda: talib.ROC(p)),
            ((f"obv_{coin}",), (p, volume), lambda: talib.OBV(p, volume)),
            ((f"mfi_{coin}",), (high, low, p, volume), lambda: talib.MFI(high, low, p, volume)),
        ]
        for names, needs, compute in tasks:
            if any(series is None for series in needs):
                continue
            try:
                result = compute()
            except Exception as exc:
                print(f"Could not compute {names[0]}: {exc}")
                continue
            if len(names) == 1:
                result = (result,)
            for name, values in zip(names, result):
                cols[name] = np.asarray(values, dtype=float)
    # Build the frame once instead of inserting column by column.
    return pd.DataFrame(cols, index=df.index)


In [None]:
# TSFresh/Dask Modules
def roll_dask(df):
    """
    Roll one partition of the melted feature container with tsfresh.

    The partition is grouped by 'variable' and ordered by 'date'; window
    lengths run from min_timeshift=1 up to max_timeshift=TIME. An empty
    partition yields an empty DataFrame.
    """
    if not len(df):
        return pd.DataFrame()
    print(f"Processing partition with columns: {df.columns.tolist()}")
    partition = df.copy()
    partition['date'] = pd.to_datetime(partition['date'])
    roll_options = dict(
        column_id='variable',
        column_sort='date',
        max_timeshift=TIME,
        min_timeshift=1,
        rolling_direction=1,
        n_jobs=1,  # single job inside each Dask task
    )
    return roll_time_series(partition, **roll_options)

def extract_dask(df):
    """
    Run tsfresh feature extraction on one rolled partition.

    Rows containing NaN are dropped first; if nothing remains an empty
    DataFrame is returned. Features are computed per 'id', ordered by 'date',
    with 'variable' as the kind column and 'value' as the value column, using
    the EXTRACTION_SETTINGS feature set.
    """
    cleaned = df.copy().dropna()
    if not len(cleaned):
        return pd.DataFrame()
    print(f"Extracting features for partition with columns: {cleaned.columns.tolist()}")
    extraction_options = dict(
        column_id='id',
        column_sort='date',
        column_kind='variable',
        column_value='value',
        default_fc_parameters=EXTRACTION_SETTINGS,
        n_jobs=1,  # single job inside each Dask task
    )
    return extract_features(cleaned, **extraction_options)

def select_dask(df, y):
    """
    Select the relevant tsfresh features of one partition against the target.

    The first index level of `df` is dropped, the frame is inner-joined with
    `y` (expected to provide the 'target' column) and rows with NaN are
    removed. If no rows remain an empty DataFrame is returned; otherwise
    tsfresh's regression feature selection is applied with fdr_level=0.05.
    """
    aligned = df.reset_index(level=0, drop=True).join(y, how='inner').dropna()
    if not len(aligned):
        return pd.DataFrame()
    predictors = aligned.drop('target', axis=1)
    response = aligned['target']
    return select_features(
        predictors,
        response,
        ml_task='regression',
        fdr_level=0.05,
        hypotheses_independent=False,
        n_jobs=1,
    )

In [None]:
# Optuna XGBoost Dask Pipeline Module
def Optuna_XGB_Dask(client, dtrain, 
                    n_trials=DEFAULT_N_TRIALS, 
                    n_rounds=DEFAULT_N_ROUNDS, 
                    eval_metric=DEFAULT_XGB_METRIC,
                    tree_method=DEFAULT_TREE_METHOD, 
                    early_stopping_rounds=DEFAULT_EARLY_STOPPING,
                    dvalid=None):
    """XGBoost hyperparameter optimization with Optuna on a Dask cluster.

    Args:
        client: Dask distributed client used for training.
        dtrain: Dask DMatrix (already created with client)
        n_trials: Number of optimization trials
        n_rounds: Maximum number of boosting rounds per trial
            (early stopping may end a trial sooner).
        eval_metric: Evaluation metric ('mae', 'rmse', etc.); the study
            minimizes it, so it must be a lower-is-better metric.
        tree_method: XGBoost tree construction algorithm
        early_stopping_rounds: Number of rounds for early stopping
        dvalid: Optional Dask DMatrix with held-out data. When given, early
            stopping and the trial score use it; otherwise both use the
            training data, which rewards overfitting.
    Returns:
        optuna.Study: Optimization study results
    """
    evals = [(dtrain, "train")]
    if dvalid is not None:
        evals.append((dvalid, "valid"))
    # XGBoost applies early stopping to the last evaluation set; score on the same one.
    score_set = evals[-1][1]

    def objective(trial):
        param_grid = {
            "verbosity": 1,
            # Use the caller's settings so the metric trained on is the metric read back below.
            "tree_method": tree_method,
            "eval_metric": eval_metric,
            "lambda": trial.suggest_float("lambda", 0.01, 10.0, log=True),
            "alpha": trial.suggest_float("alpha", 0.01, 10.0, log=True),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "max_depth": trial.suggest_int("max_depth", 3, 12),
            "min_child_weight": trial.suggest_float("min_child_weight", 0.1, 10, log=True),
            "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.3, log=True),
            "gamma": trial.suggest_float("gamma", 0.0, 1.0),
            "subsample": trial.suggest_float("subsample", 0.6, 1.0)
        }
        # The number of boosting rounds is an argument of train(), not a booster
        # parameter; passed inside the params dict it has no effect and training
        # falls back to the library default.
        output = dxgb.train(
            client,
            param_grid,
            dtrain,
            num_boost_round=n_rounds,
            early_stopping_rounds=early_stopping_rounds,
            evals=evals
        )
        return output["history"][score_set][eval_metric][-1]

    # Create study with parallel optimization
    storage = DaskStorage()
    study = optuna.create_study(direction="minimize", storage=storage)
    study.optimize(
        objective,
        n_trials=n_trials,
        n_jobs=20,  # number of trials run concurrently
        gc_after_trial=True,  # Clean memory after each trial
        show_progress_bar=True
    )

    return study

Data Collection from Various APIs


In [None]:
# Investment universe: CoinGecko ids (used for column names) and tickers (used for Binance symbols).
[ids, tickers] = CoinGecko_GetUniverseV2(TOP_N, output_format="both").values()
# OHLCV candles from Binance and prices / market caps / volumes from CoinGecko for the same coins.
price_action1 = Binance_GetPriceAction(ids=ids,tickers=tickers, interval="1d", max_days=LOOKBACK_DAYS, timezone=TIMEZONE)
price_action2 = CoinGecko_GetPriceAction(ids, start=START_DATE, tz=TIMEZONE, freq='D')
# Deribit volatility index for BTC and ETH.
dvol = Deribit_GetDVOL(['BTC','ETH'], days=LOOKBACK_DAYS, timezone=TIMEZONE)
# On-chain metrics are read from a local CSV export.
# Live alternative: Dune_GetQueries(DUNE_QUERIES.values(), timezone=TIMEZONE, dune_api_key=DUNE_API_KEY)
onchainanalytics = Dune_FromCSV(path= DUNE_CSV_PATH, timezone=TIMEZONE)  
# Macro series from FRED.
macrodata= Fred_GetSeries(series_ids= FRED_KNOWN, fred_api_key=FRED_API_KEY, start=START_DATE, timezone=TIMEZONE)
# Display every source for a visual sanity check.
price_action1, price_action2, dvol, onchainanalytics, macrodata

Combine all Data Sources into one Dataframe

In [None]:
# Outer-join every data source on a plain calendar-date index (dates in TIMEZONE).
# Sources are copied, so this cell can be re-run without altering the frames
# produced by the collection cell; sources that are None or empty are skipped
# instead of aborting the whole merge.
Unified = None
_sources = {
    "price_action1": price_action1,
    "dvol": dvol,
    "onchainanalytics": onchainanalytics,
    "macrodata": macrodata,
    "price_action2": price_action2,
}
for _name, _source in _sources.items():
    if _source is None or len(_source) == 0:
        print(f"Skipping {_name}: no data")
        continue
    # A single FRED series comes back as a Series; joins need a DataFrame.
    df = _source.to_frame() if isinstance(_source, pd.Series) else _source.copy()
    _stamps = pd.DatetimeIndex(df.index)
    # Naive timestamps are taken to be in TIMEZONE already; aware ones are converted.
    _stamps = _stamps.tz_localize(TIMEZONE) if _stamps.tz is None else _stamps.tz_convert(TIMEZONE)
    df.index = _stamps.date
    Unified = df if Unified is None else Unified.join(df, how='outer')
if Unified is None:
    raise RuntimeError("No data source returned any data; nothing to combine.")
Unified.tail(365)

Feature Container Construction and Target Definition

In [None]:
# Feature matrix: last 365 rows of the unified frame, keeping columns with enough
# non-missing values and forward-filling gaps of up to 3 rows.
# NOTE(review): the thresh is computed from len(Unified), not from the 365-row slice
# it is applied to - confirm which base is intended.
X= Unified.iloc[-365 : ].dropna(axis=1, thresh=int(0.1*len(Unified))).ffill(limit=3)
# Daily log return of the target coin and its absolute value (used as the volatility proxy).
X[f'log_returns_{TARGET_COIN}']= np.log(X[f'prices_{TARGET_COIN}']) - np.log(X[f'prices_{TARGET_COIN}'].shift(1))
X[f'realized_vol_{TARGET_COIN}'] = abs(X[f'log_returns_{TARGET_COIN}'])
# First-difference every column and drop rows with any missing value.
# NOTE(review): this also differences the log-return and realized-vol columns added above,
# so the target below is the day-over-day change of the proxy, not its level - confirm intended.
X= X.diff().dropna()
# Target: next row's value of the (differenced) realized-vol column.
y = X[f'realized_vol_{TARGET_COIN}'].shift(-1).dropna().rename("target")
X.rename_axis(index='date', inplace=True)
y.rename_axis(index='date', inplace=True)
X, y

Feature Engineering: technical analysis indicators

In [None]:
# Compute the TA indicators from the feature matrix built in the previous cell
# (i.e. from its already transformed columns) and append them.
taindicators= Compute_TAIndicators(X, price_prefix="prices_")
# dropna() removes every row that has a missing value in any column.
X = X.join(taindicators, how='left').dropna()
# Keep only the dates present in both X and y so features and target stay aligned.
X= X.loc[X.join(y, how='inner').dropna().index]
y= y.loc[X.index]   
taindicators, X, y

Dask Feature Container Construction & Prep for Tsfresh, Xgboost, Optuna Pipeline: horizontal pandas.dataframe -> melted/stacked dask.dataframe

In [None]:
# Long format: one row per (date, variable) with the observation in 'value'.
FC = X.reset_index().melt(id_vars=['date']).sort_values(by='variable')
# Ask for as many partitions as there are variables.
npartitions= FC.variable.nunique()
FC_dask = dd.from_pandas(FC, npartitions= npartitions)
# Check one timeseries per partition
# (the printed array should contain only 1; the partitioning above does not enforce it by itself)
print(FC_dask.map_partitions(lambda df: df['variable'].nunique()).compute().unique())
FC

TSfresh & Dask Feature Engineering & Selection Pipeline: timeseries rolling -> feature extraction -> feature selection

In [None]:
# TSFresh/Dask Execution
# Roll the first partition eagerly; the result is only used as `meta`
# (output schema) for the lazy rolling step below.
df_test= FC_dask.partitions[0].compute()
df_test['date'] = pd.to_datetime(df_test['date'])
rolled_test = roll_time_series(
    df_test,
    column_id='variable',
    column_sort='date',
    max_timeshift=TIME,
    min_timeshift=1,
)
# Rolling - one roll_dask call per partition, persisted on the cluster
rolled_dask = FC_dask.map_partitions(roll_dask, meta=rolled_test).persist()
# Feature extraction - persisted as well (expensive step)
features_dask = rolled_dask.map_partitions(extract_dask, enforce_metadata=False).persist()
# Feature selection against the target y - also persisted
selected_dask = features_dask.map_partitions(select_dask, y=y, enforce_metadata=False).persist()
# Materialize and join results
# Each partition yields its own set of selected feature columns; they are
# outer-joined on the index into one wide frame. `out` stays None when no
# partition returned any rows.
out = None
selected_futures = client.compute(selected_dask.to_delayed())
for i, future in enumerate(selected_futures):
    df = future.result()  # Get result for this partition
    if len(df) > 0:
        if out is None:
            out = df
        else:
            out = out.join(df, how='outer')
out

Construct Final OptunaXGBDaskPipeline-ready Dmatrix with Test-Train Split and Unified Feature Container (tsfresh features + relevant base timeseries)

In [None]:
# Relevant base (non-tsfresh) series, selected with the same test as the tsfresh features.
base_selected = select_features(X, y, fdr_level=0.05, ml_task='regression', hypotheses_independent=False)
if out is None:
    # No tsfresh feature survived selection; continue with the base series only.
    print("No tsfresh features were selected; using base features only.")
    final_features = base_selected
else:
    final_features = out.join(base_selected, how='left')
final_features_dask = dd.from_pandas(final_features.join(y, how='left'), npartitions=SPLITS, sort=True)
# Separate names for the Dask collections: the pandas X / y are left untouched,
# so this cell (and the ones above it) can be re-run.
X_dask = final_features_dask.drop('target', axis=1)
y_dask = final_features_dask['target']
# Split by partition: the last of the SPLITS index-sorted partitions is the test set.
X_train = X_dask.partitions[0:-1]
X_test = X_dask.partitions[-1]
y_train = y_dask.partitions[0:-1]
y_test = y_dask.partitions[-1]
X_train, X_test, y_train, y_test = client.persist([X_train, X_test, y_train, y_test])
dtrain = dxgb.DaskDMatrix(client, X_train, y_train)
final_features, dtrain

Run Optuna Study -> Build Model With Optimal Hyperparameters -> Run Model Predictions

In [None]:
# Hyperparameter search on the training DMatrix.
study = Optuna_XGB_Dask(client, dtrain, n_trials=100, n_rounds=100, eval_metric='mae', tree_method='hist', early_stopping_rounds=20)
# study.best_params holds only the values suggested per trial. The fixed settings
# (tree method, metric) have to be added back, and a "num_boost_rounds" entry, if
# present, is not a booster parameter and must not be passed in the params dict.
best_params = dict(study.best_params)
best_params.pop("num_boost_rounds", None)
final_params = {"tree_method": 'hist', "eval_metric": 'mae', **best_params}
final_model = dxgb.train(client, final_params, dtrain, num_boost_round=100, evals=[(dtrain, "train")])
# Predict on the held-out partition, with columns in the order the booster was trained on.
model_features = final_model['booster'].feature_names
dtest = dxgb.DaskDMatrix(client, X_test[model_features])
predictions = dxgb.predict(client, final_model, dtest)

Model Results: evaluation, & visualization 

In [None]:
# Bring the held-out target and the predictions into pandas, sharing one index.
y_test_pd = y_test.compute()
predictions_pd = pd.Series(predictions.compute(), index=y_test_pd.index)

# Error metrics on the test partition
r2 = r2_score(y_true=y_test_pd, y_pred=predictions_pd)
mae = (y_test_pd - predictions_pd).abs().mean()
std = y_test_pd.std()
thresh_var = mae/std

# MASE: model MAE relative to the MAE of a "previous value" forecast on the test set
naive_forecast = y_test_pd.shift(1)
mae_naive = (y_test_pd.iloc[1:] - naive_forecast.iloc[1:]).abs().mean()
mase = np.nan if mae_naive == 0 else mae / mae_naive

# Report
for _line in (
    f'Standard Deviation: {std:.6f}',
    f"Best parameters: {study.best_params}",
    f"Best MAE: {study.best_value:.6f}",
    f"R2 Score: {r2:.6f}",
    f'MAE/StdDev: {thresh_var:.6f}',
    f'MASE: {mase:.6f}',
):
    print(_line)

# Two stacked panels: time series on top, scatter below
viz = pd.DataFrame({'Actual': y_test_pd, 'Predicted': predictions_pd})
fig, (ax_series, ax_scatter) = plt.subplots(2, 1, figsize=(20, 10))

viz.plot(ax=ax_series)
ax_series.set_title(f'Predicted vs Actual Realized Volatility for {TARGET_COIN}')
ax_series.set_ylabel('Realized Volatility')

# Scatter with the perfect-prediction diagonal
ax_scatter.scatter(viz['Actual'], viz['Predicted'], alpha=0.5)
max_val = max(viz['Actual'].max(), viz['Predicted'].max())
min_val = min(viz['Actual'].min(), viz['Predicted'].min())
ax_scatter.plot([min_val, max_val], [min_val, max_val], 'r--')
ax_scatter.set_xlabel('Actual')
ax_scatter.set_ylabel('Predicted')
ax_scatter.set_title('Predicted vs Actual (Perfect prediction = red line)')
fig.tight_layout()
plt.show()