## 🚀 Setup Instructions

**Important: Run this notebook from the `notebooks/` directory after:**

1. **Installing the package:** `pip install -e .` (from project root)
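2. **Activating virtual environment:** `venv\Scripts\activate` on Windows, or `source venv/bin/activate` on macOS/Linux (activate it *before* running the install in step 1)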
3. **Configuring API keys:** Copy `.env.example` to `.env` and add your keys
4. **Starting Jupyter:** `cd notebooks && jupyter lab`

# Cryptocurrency Volatility Forecasting - Main Pipeline

This notebook implements the complete cryptocurrency volatility forecasting pipeline, consolidating the proven methodology from the original research notebook into a production-ready workflow.

## Pipeline Architecture:
1. **Data Collection**: Multi-source cryptocurrency and macroeconomic data aggregation
2. **Feature Engineering**: TSFresh automated feature extraction with technical analysis indicators
3. **Distributed Processing**: Dask-optimized computation for large-scale time series operations
4. **Model Training**: XGBoost implementation with Optuna hyperparameter optimization
5. **Performance Evaluation**: Comprehensive model assessment using multiple metrics

## Environment Setup and Module Imports

## Configuration Parameters

In [1]:
# Tunable settings for this run - edit the values here, then re-run the notebook.

# --- Data universe ---
TARGET_COIN = "ethereum"      # Coin to forecast (bitcoin, ethereum, etc.)
TOP_N = 10                    # How many top cryptocurrencies to include
LOOKBACK_DAYS = 365           # Length of the historical window, in days
FREQUENCY = "1D"              # Sampling frequency: "1D" = daily, "1H" = hourly
TIMEZONE = "Europe/Madrid"    # Timezone for data collection

# --- Machine learning ---
N_TRIALS = 50                 # Optuna optimization trials
N_ROUNDS = 200                # XGBoost training rounds
EVAL_METRIC = 'mae'           # Metric used for optimization
RANDOM_SEED = 42

# Echo the key settings so the executed notebook records what it was run with.
config_summary = [
    "Configuration:",
    f"  Target Coin: {TARGET_COIN}",
    f"  Data Frequency: {FREQUENCY}",
    f"  Lookback Period: {LOOKBACK_DAYS} days",
    f"  Universe Size: Top {TOP_N} cryptocurrencies",
    f"  ML Trials: {N_TRIALS}",
]
print("\n".join(config_summary))

Configuration:
  Target Coin: ethereum
  Data Frequency: 1D
  Lookback Period: 365 days
  Universe Size: Top 10 cryptocurrencies
  ML Trials: 50


In [2]:
# Standard library and scientific-stack imports
import sys
import random, os, pandas as pd, numpy as np
import matplotlib.pyplot as plt, datetime as dt
import warnings
warnings.filterwarnings('ignore', category=pd.errors.PerformanceWarning)

# Change to the repository root and make `src/` importable.
# The notebook is started from `notebooks/`, so the root is normally the parent
# of the current directory. Only step up when the current directory does not
# already contain `src/`: an unconditional "go to the parent" would climb one
# more level every time this cell is re-run.
repo_root = os.getcwd()
if not os.path.isdir(os.path.join(repo_root, 'src')):
    repo_root = os.path.dirname(repo_root)
os.chdir(repo_root)

# Use an absolute path (independent of later chdir calls) and avoid adding a
# duplicate entry on re-runs.
src_path = os.path.join(repo_root, 'src')
if src_path not in sys.path:
    sys.path.append(src_path)

# Load environment variables
from dotenv import load_dotenv
load_dotenv()

# Configure environment: output folder (relative to the repo root) and default figure size
os.makedirs("OutputData", exist_ok=True)
plt.rcParams['figure.figsize'] = (20, 8)

# Import toolkit modules (resolved through the `src/` entry added above)
from data.collectors import CryptoDataCollector
from features.engineering import CryptoFeatureEngineer
from models.pipeline import CryptoVolatilityMLPipeline
from utils.dask_helpers import create_optimized_dask_client, cleanup_dask_client

print("Module imports completed successfully")
print(f"Working directory: {os.getcwd()}")
print(f"API keys loaded from environment variables")

2025-10-02 15:11:18,982 INFO numba.cuda.cudadrv.driver init


Optuna not available.
Module imports completed successfully
Working directory: c:\CryptoMarketForecasting-new\v2-volatility-forecasting
API keys loaded from environment variables


## 2. Distributed Computing Infrastructure

In [None]:
# Start the Dask client that the TSFresh and ML pipeline cells receive as `client`.
dask_cluster_options = dict(
    n_workers=4,
    threads_per_worker=2,
    memory_limit='4GB',
    dashboard_port=8787,
    processes=True,
)
client = create_optimized_dask_client(**dask_cluster_options)

print(f"Dask cluster ready at: http://localhost:8787/status")

# Keep the client as the cell's last expression so Jupyter displays it.
client

## 3. Data Collection from Multiple Sources

In [None]:
# Build the collector from the notebook-level configuration constants.
collector = CryptoDataCollector(
    timezone=TIMEZONE,
    top_n=TOP_N,
    lookback_days=LOOKBACK_DAYS,
    frequency=FREQUENCY,
)

print("Data Collector Configuration:")
print(f"  Frequency: {collector.FREQUENCY}")
print(f"  Lookback Days: {collector.LOOKBACK_DAYS}")

# Show how the collector expresses the configured frequency for each API.
# Getters are looked up and called one at a time, in display order.
print("\nAPI Frequency Resolutions:")
for api_label, getter_name in (
    ("Pandas", "get_pandas_freq"),
    ("Binance", "get_binance_interval"),
    ("Deribit", "get_deribit_resolution"),
    ("FRED", "get_fred_frequency"),
    ("Dune", "get_dune_resolution"),
):
    print(f"  {api_label}: {getattr(collector, getter_name)()}")

# Batch size comes from the collector; the "Calculation" line is this
# notebook's own days x periods-per-day arithmetic printed next to it for
# comparison (NOTE(review): confirm the collector uses the same formula).
batch_size = collector.get_batch_size_for_frequency()
periods_per_day = 24 if FREQUENCY in ['1H', '1h', 'hourly'] else 1
print(f"\nBatch Sizes (based on {LOOKBACK_DAYS} days at {FREQUENCY} frequency):")
print(f"  Dune Batch Size: {batch_size}")
print(f"  Calculation: {LOOKBACK_DAYS} days × {periods_per_day} = {batch_size}")

# Fetch every data source.
print(f"\nStarting comprehensive data collection at {FREQUENCY} frequency...")
data_sources = collector.collect_all_data()

# One summary entry per source: shape, date span and up to five column names.
print("\nData Collection Summary:")
for source, df in data_sources.items():
    if df.empty:
        print(f"  {source}: Empty DataFrame")
    else:
        first_day = df.index.min().date()
        last_day = df.index.max().date()
        print(f"  {source}: {df.shape} | {first_day} to {last_day}")
        shown_columns = list(df.columns[:5])
        more_marker = '...' if len(df.columns) > 5 else ''
        print(f"    Columns: {shown_columns}{more_marker}")
    print()

In [None]:
# Merge the per-source frames into a single dataset.
unified_data = collector.combine_data_sources(data_sources)

dataset_shape = unified_data.shape
print(f"Unified dataset shape: {dataset_shape}")

first_ts, last_ts = unified_data.index.min(), unified_data.index.max()
print(f"Date range: {first_ts} to {last_ts}")

column_preview = list(unified_data.columns[:10])  # first 10 columns only
print(f"Columns: {column_preview}...")

# Last expression: render the final 10 rows of the unified frame.
unified_data.tail(10)

## 4. Feature Engineering and Target Creation

In [None]:
# Feature engineer; `time_window` is the rolling window for TSFresh features.
engineer = CryptoFeatureEngineer(time_window=14, random_seed=RANDOM_SEED)

# Split the unified dataset into base features X and target y for the chosen coin.
print(f"Preparing features and target for {TARGET_COIN}...")
X, y = engineer.prepare_target_variable(unified_data, target_coin=TARGET_COIN)

print(f"Base features shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Summary statistics of the target, each computed right before it is printed.
print(f"Target ({TARGET_COIN} realized volatility) statistics:")
for stat_label, compute_stat in (
    ("Mean", y.mean),
    ("Std", y.std),
    ("Min", y.min),
    ("Max", y.max),
):
    print(f"   {stat_label}: {compute_stat():.6f}")

In [None]:
# Add technical analysis indicators
print("📈 Computing technical analysis indicators...")
ta_indicators = engineer.compute_ta_indicators(X, price_prefix="prices_")

if not ta_indicators.empty:
    # Join only the indicator columns X does not already have. This cell
    # rebinds X below, so on a re-run X already carries the indicators and an
    # unconditional join would fail on the overlapping column names.
    new_ta_cols = [col for col in ta_indicators.columns if col not in X.columns]
    X_with_ta = X.join(ta_indicators[new_ta_cols], how='left').dropna()

    # Keep features and target on the same index after rows were dropped
    common_idx = X_with_ta.index.intersection(y.index)
    X = X_with_ta.loc[common_idx]
    y = y.loc[common_idx]

    print(f"Added {len(new_ta_cols)} technical indicators")
    print(f"Features with TA shape: {X.shape}")
else:
    print("No technical indicators computed (TA-Lib may not be available)")

# Display some technical indicators (matched by substring in the column name)
ta_cols = [col for col in X.columns if any(indicator in col for indicator in ['rsi', 'macd', 'sma', 'ema'])]
if ta_cols:
    print(f"Technical indicators sample: {ta_cols[:5]}")

## 5. TSFresh Feature Engineering with Dask

In [None]:
# Extract TSFresh features from X / y, passing the Dask client to the pipeline.
print("🧠 Starting TSFresh feature extraction with Dask...")
tsfresh_features = engineer.run_tsfresh_pipeline(X, y, client)

# Report what came back: the frame's shape and up to five column names.
if tsfresh_features.empty:
    print("No TSFresh features extracted")
else:
    print(f"TSFresh features extracted: {tsfresh_features.shape}")
    sample_names = list(tsfresh_features.columns[:5])
    print(f"Sample TSFresh features: {sample_names}")

In [None]:
# Assemble the modelling table from base features, target and TSFresh features.
print("🎯 Creating final feature set...")
final_feature_inputs = dict(
    X_base=X,
    y=y,
    tsfresh_features=tsfresh_features,
    include_ta_indicators=True,
)
final_features = engineer.create_final_feature_set(**final_feature_inputs)

print(f"Final feature set shape: {final_features.shape}")
print("Features ready for ML pipeline")

# Feature breakdown: every column except 'target' is a feature; columns whose
# name contains '__' are counted as TSFresh features, the rest as base + TA.
feature_cols = final_features.drop(columns='target').columns
tsfresh_count = sum(1 for col in feature_cols if '__' in col)
base_count = len(feature_cols) - tsfresh_count

print("Feature breakdown:")
print(f"   Base + TA features: {base_count}")
print(f"   TSFresh features: {tsfresh_count}")
print(f"   Total features: {len(feature_cols)}")

## 6. Machine Learning Pipeline with Optuna + XGBoost

In [None]:
# Configure the ML pipeline from the notebook-level constants plus a few
# settings that are fixed in this cell.
pipeline_settings = dict(
    n_trials=N_TRIALS,
    n_rounds=N_ROUNDS,
    eval_metric=EVAL_METRIC,
    tree_method='hist',
    early_stopping_rounds=25,
    splits=5,
    random_seed=RANDOM_SEED,
)
ml_pipeline = CryptoVolatilityMLPipeline(**pipeline_settings)

# Echo the main settings (the split count is printed as a literal).
print("ML Pipeline initialized:")
print(f"   Trials: {N_TRIALS}")
print(f"   Metric: {EVAL_METRIC}")
print(f"   Training Rounds: {N_ROUNDS}")
print("   Cross-validation Splits: 5")

In [None]:
# Run the full pipeline on the final feature table with optimization enabled.
print("Starting complete ML pipeline...")
print("This will take several minutes for hyperparameter optimization...")

pipeline_run_options = dict(
    final_features=final_features,
    client=client,
    target_coin=TARGET_COIN,
    optimize=True,
)
ml_results = ml_pipeline.run_complete_pipeline(**pipeline_run_options)

print("ML Pipeline completed!")

## 7. Results Analysis and Visualization

In [None]:
# Unpack the pipeline results; these names are reused by the analysis cells below.
study = ml_results['study']
final_model = ml_results['final_model']
metrics = ml_results['metrics']
y_test_pd = ml_results['y_test_pd']
predictions_pd = ml_results['predictions_pd']

# Display optimization results
print("Hyperparameter Optimization Results:")
print("=" * 50)
print(f"Best parameters: {study.best_params}")
# Label the best value with the notebook's EVAL_METRIC constant; no `config`
# object is defined anywhere in this notebook.
print(f"Best {EVAL_METRIC}: {study.best_value:.6f}")
print(f"Number of trials: {len(study.trials)}")

# Display model performance: one line per entry of the metrics mapping
print("\nModel Performance Metrics:")
print("=" * 50)
for metric, value in metrics.items():
    print(f"{metric.upper()}: {value:.6f}")

In [None]:
# Supplementary diagnostics: feature importance, prediction spread, correlation.
print("Additional Analysis:")
print("=" * 30)

# Feature importance is best-effort: any failure is reported, not raised.
try:
    importance = final_model['booster'].get_score(importance_type='weight')
    ranked = sorted(importance.items(), key=lambda item: item[1], reverse=True)
    top_features = ranked[:10]

    print("🏆 Top 10 Most Important Features:")
    for rank, (name, weight) in enumerate(top_features, start=1):
        print(f"  {rank:2d}. {name}: {weight}")
except Exception as e:
    print(f"Could not extract feature importance: {e}")

# Mean and standard deviation of the predictions, then of the actual values.
print("\n📊 Prediction Statistics:")
for series_label, series in (("Prediction", predictions_pd), ("Actual", y_test_pd)):
    print(f"{series_label} mean: {series.mean():.6f}")
    print(f"{series_label} std: {series.std():.6f}")

# Correlation between predictions and actual values.
correlation = predictions_pd.corr(y_test_pd)
print(f"Correlation: {correlation:.6f}")

## 8. Model Interpretation and Insights

In [None]:
# Model insights
print("Model Insights:")
print("=" * 40)


def _format_param(params, name, spec=""):
    """Return ``params[name]`` formatted with ``spec``, or ``'N/A'`` if the key is absent.

    Applying a numeric format spec such as ``.4f`` directly to the ``'N/A'``
    fallback string raises ``ValueError``, so the fallback is returned as-is.
    """
    if name not in params:
        return "N/A"
    return format(params[name], spec)


# Best parameters interpretation
best_params = study.best_params
print(f"Optimal learning rate: {_format_param(best_params, 'learning_rate', '.4f')}")
print(f"Optimal max depth: {_format_param(best_params, 'max_depth')}")
print(f"Optimal subsample: {_format_param(best_params, 'subsample', '.3f')}")
print(f"Optimal colsample_bytree: {_format_param(best_params, 'colsample_bytree', '.3f')}")

# Model complexity: rounds x depth, scaled by 1000. Falls back to the
# notebook's N_ROUNDS constant (no `config` object exists in this notebook).
n_estimators = best_params.get('num_boost_rounds', N_ROUNDS)
max_depth = best_params.get('max_depth', 6)
complexity_score = n_estimators * max_depth / 1000
print(f"\nModel complexity score: {complexity_score:.3f}")

# Performance vs baseline: a MASE below 1.0 is reported as beating the naive
# forecast. A missing MASE is reported as unavailable instead of being
# treated as infinitely bad.
mase = metrics.get('mase')
if mase is None:
    print("MASE not available; cannot compare against naive forecast")
elif mase < 1.0:
    print(f"Model beats naive forecast (MASE: {mase:.3f})")
else:
    print(f"Model underperforms naive forecast (MASE: {mase:.3f})")

# R² interpretation using fixed thresholds of 0.5 and 0.2
r2 = metrics['r2_score']
if r2 > 0.5:
    print(f"Good explanatory power (R²: {r2:.3f})")
elif r2 > 0.2:
    print(f"Moderate explanatory power (R²: {r2:.3f})")
else:
    print(f"Low explanatory power (R²: {r2:.3f})")

## 9. Cleanup and Summary

In [None]:
# Final summary
print("PIPELINE EXECUTION SUMMARY")
print("=" * 60)

print(f"Target: {TARGET_COIN}")
print(f"Frequency: {FREQUENCY}")
print(f"Data points: {len(final_features)}")
print(f"Features: {final_features.shape[1] - 1}")  # every column except 'target'
print(f"Optimization trials: {len(study.trials)}")
print(f"Best {EVAL_METRIC.upper()}: {study.best_value:.6f}")
print(f"Test R²: {metrics['r2_score']:.6f}")
if 'mase' in metrics:
    print(f"Test MASE: {metrics['mase']:.6f}")

# Feature breakdown. The name lists get their own variable names on purpose:
# `tsfresh_features` is the DataFrame produced by the TSFresh cell and passed
# to create_final_feature_set, so rebinding it to a list here would break a
# re-run of that cell.
feature_cols = final_features.drop('target', axis=1).columns
tsfresh_feature_names = [col for col in feature_cols if '__' in col]
base_feature_names = [col for col in feature_cols if '__' not in col]

print(f"\nFeature Engineering:")
print(f"Base + TA features: {len(base_feature_names)}")
print(f"TSFresh features: {len(tsfresh_feature_names)}")

# NOTE(review): these two values are literals mirroring the Dask setup cell
# (n_workers=4, memory_limit='4GB'); update them if that cell changes.
print(f"\nComputational Resources:")
print(f"Dask workers: 4")
print(f"Memory per worker: 4GB")

print("\nPipeline execution completed successfully!")

In [None]:
# Hand the Dask client to the cleanup helper, then print the closing messages.
cleanup_dask_client(client)
print(
    "Dask client cleaned up",
    "\nAll done! Your cryptocurrency volatility forecasting model is ready.",
    sep="\n",
)