# In [1]:
"""
Configuration file for XGBoost pipeline
Contains all paths, parameters, and settings
"""

from pathlib import Path

# ============================================================================
# FILE PATHS
# ============================================================================
# Location of the raw input data (book snapshots, going by the directory name).
RAW_DATA_PATH: Path = Path("C:/Users/wdkal/iex_data/book_snapshots")

# Output directory. NOTE: it is created (together with any missing parent
# directories) as a side effect of executing this file; an already existing
# directory is left untouched.
OUTPUT_DIR: Path = Path("C:/Users/wdkal/Downloads/IE421_XGBOOST_DATA")
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

# ============================================================================
# DATA PARAMETERS
# ============================================================================
# Dates (YYYYMMDD strings) assigned to the train / validation / test splits.
TRAIN_DATES = ["20251020", "20251021", "20251022"]
VAL_DATES = ["20251023"]
TEST_DATES = ["20251024"]

# Every date in the data set: the three splits concatenated in order.
ALL_DATES = TRAIN_DATES + VAL_DATES + TEST_DATES

# Bounds of the trading-hours filter, written as "HH:MM".
# NOTE(review): the timezone is not stated here. If these are UTC, 14:30-21:00
# matches 09:30-16:00 US Eastern only under standard time; the configured
# dates are in October, when daylight time (13:30-20:00 UTC) would apply.
# Confirm against the code that applies this filter.
START_TIME, END_TIME = "14:30", "21:00"

# ============================================================================
# FEATURE ENGINEERING PARAMETERS
# ============================================================================
# Label creation: number of events to look ahead.
LABEL_HORIZON: int = 23

# Feature selection
MI_EVENTS_AHEAD: int = 10  # look-ahead (in events) for the Mutual Information calculation
MAX_CORRELATION: float = 0.85  # highest correlation tolerated between features

# Moving-average window lengths, counted in events
MA_WINDOW_1S: int = 1_000  # roughly a 1-second window
MA_WINDOW_5S: int = 5_000  # roughly a 5-second window

# Volatility window lengths, spanning several time horizons
VOL_WINDOWS: list = [10, 100, 1_000, 5_000]

# RSI look-back period
RSI_PERIOD: int = 14

# EMA spans: fast first, slow second
EMA_FAST_SPAN, EMA_SLOW_SPAN = 12, 26

# ============================================================================
# MODEL PARAMETERS
# ============================================================================
# Seed reused by the XGBoost base parameters below.
RANDOM_STATE: int = 42

# Base XGBoost parameters: a 3-class "multi:softmax" objective evaluated with
# multiclass log loss ("mlogloss"), plus an early-stopping setting of 20 rounds.
XGBOOST_BASE_PARAMS: dict = {
    "objective": "multi:softmax",
    "num_class": 3,
    "random_state": RANDOM_STATE,
    "eval_metric": "mlogloss",
    "early_stopping_rounds": 20,
}

# ============================================================================
# HYPERPARAMETER TUNING GRIDS
# ============================================================================

# Stage 1 - coarse search.
# Grid size: 2 * 2 * 2 * 1 * 1 * 2 * 2 = 32 combinations.
STAGE1_GRID: dict = {
    "max_depth": [4, 7],
    "learning_rate": [0.05, 0.15],
    "n_estimators": [150, 250],
    "subsample": [0.8],  # single value: not varied at this stage
    "colsample_bytree": [0.8],  # single value: not varied at this stage
    "min_child_weight": [1, 5],
    "gamma": [0, 0.1],
}

# Stage 2 - fine-tuning.
# These are not grid values themselves: the Stage 2 grid is generated
# dynamically around the best Stage 1 parameters using the settings below.
STAGE2_REFINEMENT: dict = {
    "max_depth_range": 1,  # +/- around the best value
    "learning_rate_multiplier": [0.8, 1.0, 1.2],
    "n_estimators_step": 50,  # +/- around the best value
    "subsample_options": [0.75, 0.8, 0.85],
    "colsample_bytree_options": [0.75, 0.8, 0.85],
}

# ============================================================================
# OUTPUT FILE NAMES
# ============================================================================
# Bare file names (no directory component) keyed by the artifact they hold.
# Presumably resolved against OUTPUT_DIR by the code that writes them - confirm
# at the point of use.
OUTPUT_FILES: dict = {
    # Intermediate artifacts
    "processed_data": "processed_features.pkl",
    "selected_features": "selected_features.pkl",
    "scaler": "scaler.pkl",
    # Model and diagnostics
    "model": "xgb_best_model.json",
    "training_history": "xgb_training_history.json",
    "feature_importance": "feature_importance.csv",
    # Per-split predictions
    "train_predictions": "xgb_train_predictions.csv",
    "val_predictions": "xgb_val_predictions.csv",
    "test_predictions": "xgb_test_predictions.csv",
}