# In [2]:
"""
Data preparation and feature engineering module
Loads raw order book data and creates features
"""

import pandas as pd
import numpy as np
import joblib
import warnings
warnings.filterwarnings('ignore')

from config import *


def fetch_orderbook_data(dates=None):
    """
    Read the gzipped IEX book-update CSVs for a set of dates into one frame.

    Each file is tagged with its date, restricted to the
    START_TIME..END_TIME window of the day, and the per-date frames are
    concatenated with a fresh integer index.

    Args:
        dates: List of date strings (format: 'YYYYMMDD').
               If None, uses ALL_DATES from config.

    Returns:
        DataFrame with combined order book data
    """
    sessions = ALL_DATES if dates is None else dates

    rule = "=" * 60
    print(rule)
    print("LOADING RAW ORDER BOOK DATA")
    print(rule)

    def read_session(day):
        # One file per date: <RAW_DATA_PATH>/<date>_book_updates.csv.gz
        source = RAW_DATA_PATH / f'{day}_book_updates.csv.gz'
        print(f"  Loading {day}...")

        book = pd.read_csv(source, compression='gzip')
        book['date'] = day
        book['COLLECTION_TIME'] = pd.to_datetime(book['COLLECTION_TIME'])

        # between_time needs a DatetimeIndex, so index by the timestamp just
        # for the filter and restore it as a regular column afterwards.
        return (
            book.set_index('COLLECTION_TIME')
            .between_time(START_TIME, END_TIME)
            .reset_index()
        )

    frames = [read_session(day) for day in sessions]

    combined = pd.concat(frames, ignore_index=True)
    print(f"Total events loaded: {len(combined):,}")

    return combined


def add_all_features(df, n_levels=3):
    """
    Create comprehensive order book features.

    Every feature that looks at neighbouring rows (differences, returns,
    order-flow imbalance, rolling means/volatility, RSI, EMAs and the final
    forward fill) is computed separately for each value of ``df['date']``.
    The input is a concatenation of several dates, and letting those
    operations run across a date boundary would mix one date's closing
    state into the next date's opening rows.

    Args:
        df: Raw order book DataFrame. Must contain a 'date' column and, for
            every level k in 1..n_levels, the columns BID_PRICE_k,
            BID_SIZE_k, ASK_PRICE_k and ASK_SIZE_k.
        n_levels: Number of book levels to use for the per-level columns and
            the cross-level aggregates. Defaults to 3.

    Returns:
        DataFrame with engineered features, indexed like ``df``. The first
        column is 'date'; NaN and +/-inf values are replaced by the previous
        finite value of the same date, or 0 when there is none.

    Raises:
        ValueError: If n_levels is smaller than 1.
        KeyError: If a required column is missing from ``df``.
    """
    if n_levels < 1:
        raise ValueError(f"n_levels must be >= 1, got {n_levels}")

    print("\n" + "="*60)
    print("CREATING FEATURES")
    print("="*60)

    levels = range(1, n_levels + 1)
    session = df['date']

    def by_session(series):
        # Group a row-aligned series by trading date, keeping row order.
        return series.groupby(session, sort=False)

    def session_rolling(series, window, stat):
        # Rolling statistic that restarts at the first row of every date.
        return by_session(series).transform(
            lambda s: getattr(s.rolling(window, min_periods=1), stat)()
        )

    def session_ewm(series, span):
        # Exponential moving average that restarts at every date.
        return by_session(series).transform(
            lambda s: s.ewm(span=span, adjust=False).mean()
        )

    def level_total(prefix):
        # Element-wise sum of <prefix>_1 .. <prefix>_n (NaN propagates).
        return sum(df[f"{prefix}_{level}"] for level in levels)

    features = pd.DataFrame()
    features['date'] = session

    # ========================================================================
    # BASIC LEVEL-1 FEATURES
    # ========================================================================
    features["mid_price"] = (df["BID_PRICE_1"] + df["ASK_PRICE_1"]) / 2
    # Size-weighted price: each side's price is weighted by the opposite
    # side's size. The epsilon avoids 0/0 when both sizes are zero.
    features["microprice"] = (
        df["BID_PRICE_1"] * df["ASK_SIZE_1"] +
        df["ASK_PRICE_1"] * df["BID_SIZE_1"]
    ) / (df["BID_SIZE_1"] + df["ASK_SIZE_1"] + 1e-10)

    features["spread"] = df["ASK_PRICE_1"] - df["BID_PRICE_1"]
    features["vol_imbalance"] = (
        (df["BID_SIZE_1"] - df["ASK_SIZE_1"]) /
        (df["BID_SIZE_1"] + df["ASK_SIZE_1"] + 1e-6)
    )
    features["bid_ask_spread_ratio"] = features["spread"] / features["mid_price"]

    # ========================================================================
    # ALL LEVEL PRICES AND SIZES
    # ========================================================================
    for level in levels:
        features[f"BID_PRICE_{level}"] = df[f"BID_PRICE_{level}"]
        features[f"BID_SIZE_{level}"] = df[f"BID_SIZE_{level}"]
        features[f"ASK_PRICE_{level}"] = df[f"ASK_PRICE_{level}"]
        features[f"ASK_SIZE_{level}"] = df[f"ASK_SIZE_{level}"]

    # ========================================================================
    # AGGREGATE STATISTICS ACROSS LEVELS
    # ========================================================================
    total_bid_qty = level_total("BID_SIZE")
    total_ask_qty = level_total("ASK_SIZE")

    features["bid_price_mean"] = level_total("BID_PRICE") / n_levels
    features["ask_price_mean"] = level_total("ASK_PRICE") / n_levels
    features["bid_qty_mean"] = total_bid_qty / n_levels
    features["ask_qty_mean"] = total_ask_qty / n_levels

    # Cumulative differences (ask minus bid, summed over the levels)
    features["price_cum_diff"] = sum(
        df[f"ASK_PRICE_{level}"] - df[f"BID_PRICE_{level}"] for level in levels
    )
    features["qty_cum_diff"] = sum(
        df[f"ASK_SIZE_{level}"] - df[f"BID_SIZE_{level}"] for level in levels
    )

    # ========================================================================
    # PRICE MOMENTUM
    # ========================================================================
    mid_price = features["mid_price"]
    features["mid_diff"] = by_session(mid_price).diff()
    features["mid_return"] = features["mid_diff"] / by_session(mid_price).shift(1)

    # ========================================================================
    # ORDER FLOW IMBALANCE (OFI)
    # ========================================================================
    features["OFI"] = (
        by_session(total_bid_qty).diff() - by_session(total_ask_qty).diff()
    )

    # ========================================================================
    # MOVING AVERAGES
    # ========================================================================
    features["mv_1s"] = session_rolling(mid_price, MA_WINDOW_1S, "mean")
    features["mv_5s"] = session_rolling(mid_price, MA_WINDOW_5S, "mean")

    # ========================================================================
    # VOLATILITY
    # ========================================================================
    for window in VOL_WINDOWS:
        features[f"vol_{window}"] = session_rolling(
            features["mid_return"], window, "std"
        )

    # ========================================================================
    # RSI (RELATIVE STRENGTH INDEX)
    # ========================================================================
    delta = by_session(features["microprice"]).diff()
    # NaN deltas (first row of a date) count as neither gain nor loss.
    gain = session_rolling(delta.where(delta > 0, 0), RSI_PERIOD, "mean")
    loss = session_rolling(-delta.where(delta < 0, 0), RSI_PERIOD, "mean")
    rs = gain / (loss + 1e-10)
    # Column name is kept as 'rsi_14' regardless of RSI_PERIOD so that
    # existing consumers of the feature frame keep working.
    features["rsi_14"] = 100 - (100 / (1 + rs))

    # ========================================================================
    # EXPONENTIAL MOVING AVERAGES
    # ========================================================================
    features["ema_fast"] = session_ewm(mid_price, EMA_FAST_SPAN)
    features["ema_slow"] = session_ewm(mid_price, EMA_SLOW_SPAN)
    features["ema_diff"] = features["ema_fast"] - features["ema_slow"]

    # ========================================================================
    # PLACEHOLDER FEATURES
    # ========================================================================
    features["time_delta"] = 1  # Placeholder for actual time deltas

    # ========================================================================
    # CLEAN UP
    # ========================================================================
    # Treat +/-inf (e.g. a return computed from a zero price) like a missing
    # value, then carry the last finite value forward within each date and
    # fall back to 0 where the date has no earlier value.
    features.replace([np.inf, -np.inf], np.nan, inplace=True)
    value_cols = [col for col in features.columns if col != 'date']
    cleaned = features.groupby('date', sort=False)[value_cols].ffill().fillna(0)
    cleaned.insert(0, 'date', features['date'])
    features = cleaned

    print(f"Created {len(features.columns)-1} features (excluding 'date')")

    return features


def add_labels(features, horizon=None):
    """
    Create price movement labels.

    Each row is labelled by comparing its microprice with the microprice
    ``horizon`` rows later. When the frame has a 'date' column the look-ahead
    stays inside the same date, so the last rows of one date are never
    labelled against the first rows of the next. Rows whose price change
    cannot be computed (no row ``horizon`` steps ahead, or a NaN price) have
    no defined label and are removed rather than being reported as neutral.

    Args:
        features: DataFrame with features. Needs a 'microprice' column and a
            unique index; it is modified in place.
        horizon: Number of events ahead to predict (uses LABEL_HORIZON if None)

    Returns:
        The same DataFrame object with a 'target' column added and the
        unlabelled rows dropped.
            0 = down, 1 = neutral, 2 = up

    Raises:
        ValueError: If horizon is smaller than 1.
    """
    if horizon is None:
        horizon = LABEL_HORIZON
    if horizon < 1:
        raise ValueError(f"horizon must be >= 1, got {horizon}")

    print(f"\nCreating labels (horizon = {horizon} events ahead)...")

    current_price = features['microprice']
    if 'date' in features.columns:
        future_price = current_price.groupby(
            features['date'], sort=False
        ).shift(-horizon)
    else:
        future_price = current_price.shift(-horizon)
    price_change = future_price - current_price

    # NaN changes fall through to the default here and are dropped below.
    features['target'] = np.select(
        [price_change > 0, price_change < 0],
        [2, 0],      # up, down
        default=1,   # neutral (no change)
    )

    unlabelled = features.index[price_change.isna().to_numpy()]
    features.drop(index=unlabelled, inplace=True)

    # Print label distribution
    counts = features['target'].value_counts().sort_index()
    total = len(features)
    denom = total if total else 1  # avoid 0/0 when nothing could be labelled
    print(f"Label distribution:")
    print(f"  Down (0):    {counts.get(0, 0):,} ({counts.get(0, 0)/denom*100:.2f}%)")
    print(f"  Neutral (1): {counts.get(1, 0):,} ({counts.get(1, 0)/denom*100:.2f}%)")
    print(f"  Up (2):      {counts.get(2, 0):,} ({counts.get(2, 0)/denom*100:.2f}%)")

    return features


def save_processed_data(features):
    """
    Save processed features to disk.

    The object is serialised with ``joblib.dump`` to
    ``OUTPUT_DIR / OUTPUT_FILES['processed_data']`` and the destination is
    echoed to stdout.

    Args:
        features: Object to persist (the labelled feature DataFrame in the
            pipeline).
    """
    output_path = OUTPUT_DIR / OUTPUT_FILES['processed_data']
    joblib.dump(features, output_path)
    # The check mark was previously stored as mis-decoded UTF-8 bytes.
    print(f"\n✓ Saved processed data to: {output_path}")


def main():
    """Run the pipeline end to end: load, featurize, label, save, summarize."""
    rule = "=" * 60

    def banner(title, leading_newline=False):
        # Title framed by two rules; optionally preceded by a blank line.
        print("\n" + rule if leading_newline else rule)
        print(title)
        print(rule)

    banner("DATA PREPARATION PIPELINE")

    # Load -> features -> labels -> persist
    raw_book = fetch_orderbook_data()
    features = add_labels(add_all_features(raw_book))
    save_processed_data(features)

    banner("DATA PREPARATION COMPLETE", leading_newline=True)

    destination = OUTPUT_DIR / OUTPUT_FILES['processed_data']
    # Everything except the bookkeeping columns counts as a model feature.
    n_features = sum(
        1 for column in features.columns if column not in ('date', 'target')
    )
    print(f"Output: {destination}")
    print(f"Shape: {features.shape}")
    print(f"Features: {n_features}")
    

# Run the full preparation pipeline only when this file is executed as a
# script; importing the module has no such side effect.
if __name__ == "__main__":
    main()

# Notebook cell output: ModuleNotFoundError: No module named 'config'