In [2]:
"""
STEP 2: Combine predictions into RL train/val/test splits
"""

import pandas as pd
import numpy as np
import pickle
from pathlib import Path

# ============================================================
# PATHS
# ============================================================
# Directory the per-day prediction CSVs are read from; combine_split looks
# for files named predictions_<date>.csv inside it.
# NOTE(review): both paths are absolute and machine-specific - adjust before
# running on another machine.
PRED_PATH = Path("C:/Users/wdkal/Downloads/NEW_PREDICTIONS")
# Directory the pickled rl_input_<split>.pkl files are written to.
OUTPUT_PATH = Path("C:/Users/wdkal/Downloads/RL_INPUTS")
# Create the output directory (and any missing parents) up front;
# exist_ok=True makes this a no-op when it is already there.
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

# ============================================================
# DATA SPLITS
# ============================================================
# Date stamps assigned to each split. Every entry is the <date> part of a
# predictions_<date>.csv file name.
splits = {
    'train': [
        '20251027', '20251028', '20251029',
        '20251030', '20251031', '20251103',
    ],
    'val': ['20251104', '20251105'],
    'test': ['20251106', '20251107'],
}

# Start-up banner: assemble the lines first, then emit them in one call.
_rule = "=" * 60
_banner_lines = [
    _rule,
    "COMBINING PREDICTIONS FOR RL",
    _rule,
    "\nData splits:",
    f"  Train: Days 6-11  ({len(splits['train'])} days)",
    f"  Val:   Days 12-13 ({len(splits['val'])} days)",
    f"  Test:  Days 14-15 ({len(splits['test'])} days)",
]
print("\n".join(_banner_lines))

# ============================================================
# COMBINE ONE SPLIT
# ============================================================
def combine_split(dates, split_name, pred_path=None, output_path=None):
    """Merge per-day prediction CSVs into one RL input dict and pickle it.

    For every entry in ``dates`` the file ``predictions_<date>.csv`` is read
    from the prediction directory. Files that are absent, unreadable, or that
    lack any expected column are reported and skipped; the remaining frames
    are concatenated in the order given.

    Args:
        dates: Iterable of date stamps used to build the CSV file names.
        split_name: Label for the split; used in log output and in the
            output file name ``rl_input_<split_name>.pkl``.
        pred_path: Directory holding the prediction CSVs. Defaults to the
            module-level ``PRED_PATH``.
        output_path: Directory the pickle is written to (created if needed).
            Defaults to the module-level ``OUTPUT_PATH``.

    Returns:
        A dict with keys ``'predictions'`` (model name -> array of the
        down/neutral/up probability columns), ``'actual_labels'``,
        ``'num_events'`` and ``'dates'`` (the dates actually loaded), or
        ``None`` when no file could be used.
    """
    # Only fall back to the module-level locations when no override is given.
    pred_dir = Path(PRED_PATH if pred_path is None else pred_path)
    out_dir = Path(OUTPUT_PATH if output_path is None else output_path)

    # Probability columns expected per model, in down/neutral/up order.
    model_columns = {
        model: [f"{model}_prob_{cls}" for cls in ('down', 'neutral', 'up')]
        for model in ('xgb', 'lstm', 'tcn', 'transformer')
    }
    required_columns = [
        col for cols in model_columns.values() for col in cols
    ] + ['actual']

    print(f"\n{'-'*60}")
    print(f"Combining {split_name.upper()} split...")
    print(f"{'-'*60}")

    frames = []
    loaded_dates = []

    for date in dates:
        file_path = pred_dir / f"predictions_{date}.csv"

        if not file_path.exists():
            print(f"  WARNING: {file_path} not found!")
            continue

        try:
            df = pd.read_csv(file_path)
        except Exception as e:
            # Deliberately broad: one bad file must not abort the whole
            # split, so report it and move on to the next date.
            print(f"  ERROR reading {date}: {e}")
            continue

        # Reject incomplete files here. Otherwise pd.concat would silently
        # NaN-fill the absent columns for this day's rows (or the column
        # selection below would raise KeyError after everything was read).
        absent = [col for col in required_columns if col not in df.columns]
        if absent:
            print(f"  ERROR reading {date}: missing columns {absent}")
            continue

        frames.append(df)
        loaded_dates.append(date)
        print(f"  {date}: {len(df):,} events")

    if not frames:
        print(f"\nNo valid data found for {split_name} split!")
        return None

    combined = pd.concat(frames, ignore_index=True)
    print(f"\nCombined total: {len(combined):,} events")

    rl_input = {
        'predictions': {
            model: combined[cols].values
            for model, cols in model_columns.items()
        },
        'actual_labels': combined['actual'].values,
        'num_events': len(combined),
        'dates': loaded_dates,
    }

    # Save. Make sure the target directory exists so the function also works
    # when called with an output_path that has not been created yet.
    out_dir.mkdir(parents=True, exist_ok=True)
    output_file = out_dir / f"rl_input_{split_name}.pkl"
    with open(output_file, 'wb') as f:
        pickle.dump(rl_input, f)

    print(f"\nSaved: {output_file}")
    return rl_input

# ============================================================
# GENERATE ALL SPLITS
# ============================================================
# Build and pickle every split, storing whatever combine_split returns
# (the RL input dict, or None when no file was usable) under the split name.
results = {}
for split_name, dates in splits.items():
    results[split_name] = combine_split(dates, split_name)

# Closing banner.
_closing_rule = "=" * 60
print(f"\n{_closing_rule}")
print("COMPLETE!")
print(_closing_rule)

COMBINING PREDICTIONS FOR RL

Data splits:
  Train: Days 6-11  (6 days)
  Val:   Days 12-13 (2 days)
  Test:  Days 14-15 (2 days)

------------------------------------------------------------
Combining TRAIN split...
------------------------------------------------------------
  20251027: 613,485 events
  20251028: 219,547 events
  20251029: 1,252,954 events
  20251030: 1,478,295 events
  20251031: 787,150 events
  20251103: 290,204 events

Combined total: 4,641,635 events

Saved: C:\Users\wdkal\Downloads\RL_INPUTS\rl_input_train.pkl

------------------------------------------------------------
Combining VAL split...
------------------------------------------------------------
  20251104: 483,590 events
  20251105: 916,945 events

Combined total: 1,400,535 events

Saved: C:\Users\wdkal\Downloads\RL_INPUTS\rl_input_val.pkl

------------------------------------------------------------
Combining TEST split...
------------------------------------------------------------
  20251106: 1,253,8