In [None]:
# Standard Libraries
import os
import shutil
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, Tuple, List, Optional
import warnings
import time
from datetime import datetime
warnings.filterwarnings('ignore')

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F

# Ray and RLlib
import ray
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.models import ModelCatalog
from ray.rllib.policy.policy import PolicySpec
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.registry import register_env
from gymnasium import spaces

# Data processing and ML
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import joblib
import numba as nb
from numba import jit, prange

# Set seeds for reproducibility.
# np.random.seed seeds NumPy's legacy global RNG (the one np.random.* calls such as
# np.random.choice draw from); torch.manual_seed seeds PyTorch's default generator.
# NOTE(review): Python's built-in `random` module and any Ray/RLlib workers are not
# seeded here -- confirm whether they need their own seeds.
np.random.seed(42)
torch.manual_seed(42)

# Record library versions in the cell output so a run can be traced back to its environment.
print(f"PyTorch version: {torch.__version__}")
print(f"Ray version: {ray.__version__}")
print(f"NumPy version: {np.__version__}")
print(f"Pandas version: {pd.__version__}")
print("Environment setup complete!")

PyTorch version: 2.3.1
Ray version: 2.9.3
NumPy version: 1.26.4
Pandas version: 2.3.0
Environment setup complete!


In [None]:
import pandas as pd
import numpy as np
import numba as nb
from numba import prange
import time

# --- OPTIMIZATION 1: Numba-JIT function for faster sigma estimation ---
@nb.jit(nopython=True, parallel=True)
def _compute_dists_sq_numba(data: np.ndarray) -> np.ndarray:
    """
    Compute the condensed vector of pairwise squared Euclidean distances.

    Args:
        data: 2-D array whose rows are the samples to compare.

    Returns:
        1-D float64 array of length n_samples * (n_samples - 1) // 2 holding
        ||data[i] - data[j]||^2 for every pair i < j, ordered row by row:
        (0,1), (0,2), ..., (1,2), (1,3), ...
    """
    n_samples = data.shape[0]
    # Pre-allocate one slot per unordered pair (empty for fewer than two samples).
    dists_sq = np.zeros(n_samples * (n_samples - 1) // 2, dtype=np.float64)
    for i in prange(n_samples):
        # Each output slot is derived from (i, j) alone. A shared running counter
        # must not be used here: prange iterations run concurrently, so a counter
        # incremented across iterations would make threads overwrite each other's
        # slots. Rows 0..i-1 contribute sum(n_samples - 1 - r) pairs, which is
        # i * n_samples - i * (i + 1) / 2.
        row_start = i * n_samples - (i * (i + 1)) // 2
        for j in range(i + 1, n_samples):
            diff = data[i] - data[j]
            dists_sq[row_start + (j - i - 1)] = np.sum(diff * diff)
    return dists_sq

# Core MMD math (already optimized with Numba, no changes needed)
@nb.jit(nopython=True)
def gaussian_kernel_numba(x: np.ndarray, y: np.ndarray, sigma: float) -> float:
    """
    Gaussian (RBF) kernel value exp(-||x - y||^2 / (2 * sigma^2)) for two vectors.

    Args:
        x: First vector.
        y: Second vector, same shape as ``x``.
        sigma: Kernel bandwidth; a value of 0 leads to a division by zero.

    Returns:
        The kernel value as a float.
    """
    delta = x - y
    squared_norm = np.sum(delta * delta)
    return np.exp(-squared_norm / (2.0 * sigma * sigma))

@nb.jit(nopython=True, parallel=True)
def compute_mmd_squared_numba(X: np.ndarray, Y: np.ndarray, sigma: float) -> float:
    """
    Squared Maximum Mean Discrepancy between two samples under a Gaussian kernel.

    The two within-sample terms average the kernel over all ordered pairs with
    distinct indices; the cross term averages over every (X row, Y row) pair.
    A term whose sample is too small to form a pair contributes 0.

    Args:
        X: 2-D array, one sample per row.
        Y: 2-D array, one sample per row.
        sigma: Gaussian kernel bandwidth.

    Returns:
        max(0, K_XX + K_YY - 2 * K_XY), i.e. the estimate clipped at zero.
    """
    rows_x = X.shape[0]
    rows_y = Y.shape[0]

    # Within-sample similarity of X (diagonal excluded).
    acc_xx = 0.0
    k_xx = 0.0
    if rows_x > 1:
        for a in prange(rows_x):
            for b in range(rows_x):
                if a != b:
                    acc_xx += gaussian_kernel_numba(X[a], X[b], sigma)
        k_xx = acc_xx / (rows_x * (rows_x - 1))

    # Within-sample similarity of Y (diagonal excluded).
    acc_yy = 0.0
    k_yy = 0.0
    if rows_y > 1:
        for a in prange(rows_y):
            for b in range(rows_y):
                if a != b:
                    acc_yy += gaussian_kernel_numba(Y[a], Y[b], sigma)
        k_yy = acc_yy / (rows_y * (rows_y - 1))

    # Cross-sample similarity between X and Y.
    acc_xy = 0.0
    k_xy = 0.0
    if rows_x > 0 and rows_y > 0:
        for a in prange(rows_x):
            for b in range(rows_y):
                acc_xy += gaussian_kernel_numba(X[a], Y[b], sigma)
        k_xy = acc_xy / (rows_x * rows_y)

    return max(0.0, k_xx + k_yy - 2.0 * k_xy)

def calculate_mmd_and_label_regimes(df: pd.DataFrame, reference_window: int = 500, test_window: int = 100, stride: int = 10) -> pd.DataFrame:
    """
    Engineer OHLCV features, score distribution drift with MMD, and assign regime labels.

    Steps:
      1. Copy ``df``, normalise column names (strip + lowercase) and derive
         returns, log returns, range, volatility, momentum and volume features.
      2. Drop every row that contains NaN or +/-inf.
      3. Standardise the first ``reference_window`` rows of the MMD features and
         compare each later window of ``test_window`` rows (advancing by
         ``stride``) against them; the score is the square root of the squared MMD.
      4. Store each score on the last row of its test window, forward-fill, then
         back-fill the rows that precede the first score.
      5. Delegate labeling to ``assign_regime_labels_optimized``.

    Args:
        df: Price data with 'open', 'high', 'low', 'close', 'volume' columns
            (any case, surrounding whitespace allowed). Not modified.
        reference_window: Number of leading rows forming the reference sample.
        test_window: Number of rows in each sliding test sample.
        stride: Step between consecutive test windows.

    Returns:
        The processed frame with feature columns, 'mmd_score', 'regime' and
        'regime_name'. If no rows survive feature calculation, the empty frame
        is returned as-is (without the MMD/regime columns).

    Raises:
        ValueError: If a window/stride argument is not positive or a required
            column is missing.
    """
    if reference_window < 1 or test_window < 1 or stride < 1:
        raise ValueError(
            "reference_window, test_window and stride must all be positive integers; "
            f"got reference_window={reference_window}, test_window={test_window}, stride={stride}"
        )

    df_processed = df.copy()
    df_processed.columns = [col.strip().lower() for col in df_processed.columns]

    # Feature engineering (vectorized pandas operations)
    required_cols = ['open', 'high', 'low', 'close', 'volume']
    for col in required_cols:
        if col not in df_processed.columns:
            raise ValueError(f"Missing required column: {col}")
        if not pd.api.types.is_numeric_dtype(df_processed[col]):
            # Unparseable entries become NaN and are dropped with the other NaN rows below.
            df_processed[col] = pd.to_numeric(df_processed[col], errors='coerce')

    df_processed['returns'] = df_processed['close'].pct_change()
    df_processed['log_returns'] = np.log(df_processed['close'] / df_processed['close'].shift(1))
    df_processed['range'] = (df_processed['high'] - df_processed['low']) / df_processed['close'].shift(1)
    df_processed['volume_ma'] = df_processed['volume'].rolling(window=20, min_periods=1).mean()
    # NOTE(review): 252 * 48 presumably annualises 30-minute bars (48 per day) -- confirm.
    df_processed['volatility'] = df_processed['returns'].rolling(window=20, min_periods=1).std() * np.sqrt(252 * 48)
    df_processed['momentum_20'] = df_processed['close'].pct_change(periods=20)
    df_processed['momentum_50'] = df_processed['close'].pct_change(periods=50)
    df_processed['volume_ratio'] = df_processed['volume'] / (df_processed['volume_ma'] + 1e-9)

    # A zero price produces +/-inf in the ratio features. dropna() alone would keep
    # those rows, and a single inf turns the reference mean/std (and every MMD score)
    # into NaN, so treat inf like a missing value.
    df_processed = df_processed.replace([np.inf, -np.inf], np.nan).dropna().copy()

    if df_processed.empty:
        print("Warning: DataFrame empty after feature calculation.")
        return df_processed

    features_for_mmd = ['returns', 'log_returns', 'range', 'volatility']
    data = df_processed[features_for_mmd].values
    n_rows = len(data)

    # One slot per row; NaN marks "no MMD score computed on this row".
    mmd_by_row = np.full(n_rows, np.nan, dtype=np.float64)

    if n_rows >= reference_window + test_window:
        ref_data_full = data[:reference_window]
        ref_mean, ref_std = np.mean(ref_data_full, axis=0), np.std(ref_data_full, axis=0)
        ref_std[ref_std < 1e-8] = 1e-8  # avoid dividing by (near-)zero for constant features
        ref_data_norm = (ref_data_full - ref_mean) / ref_std

        # Median heuristic for the kernel bandwidth, estimated on at most 200
        # reference rows drawn from NumPy's global RNG.
        n_samples = min(200, ref_data_norm.shape[0])
        sigma = 1.0
        if n_samples > 1:
            sample_indices = np.random.choice(ref_data_norm.shape[0], n_samples, replace=False)
            sampled_data = ref_data_norm[sample_indices]
            dists_sq = _compute_dists_sq_numba(sampled_data)
            if dists_sq.size > 0:
                median_dist_sq = np.median(dists_sq)
                if median_dist_sq > 0:
                    sigma = np.sqrt(median_dist_sq)

        print(f"Estimated kernel bandwidth (sigma): {sigma:.4f}")

        for start in range(reference_window, n_rows - test_window + 1, stride):
            test_data_norm = (data[start:start + test_window] - ref_mean) / ref_std
            # The score belongs to the last row of the window it was computed on.
            mmd_by_row[start + test_window - 1] = np.sqrt(
                compute_mmd_squared_numba(ref_data_norm, test_data_norm, sigma)
            )

    # Scores are placed by row position rather than by index label, so the result
    # does not depend on the index being a unique DatetimeIndex.
    # NOTE: bfill() gives the rows before the first scored window the value of that
    # first (later) score; when nothing was scored the column stays all-NaN.
    df_processed['mmd_score'] = pd.Series(mmd_by_row).ffill().bfill().to_numpy()

    df_processed = assign_regime_labels_optimized(df_processed)
    return df_processed

# --- OPTIMIZATION 2: Numba-JIT function for the entire regime labeling loop ---
@nb.jit(nopython=True, parallel=True)
def _assign_regime_labels_numba(mmd_scores: np.ndarray, momentum_scores: np.ndarray, lookback_window: int) -> np.ndarray:
    """
    Label every row with a regime code from trailing percentile ranks.

    For each row at position >= ``lookback_window`` the current MMD and momentum
    values are ranked against the non-NaN values of the preceding
    ``lookback_window`` rows (fraction of past values <= current value). The
    composite score is 0.4 * (1 - mmd_rank) + 0.6 * momentum_rank, bucketed into
    codes 0..6 by the thresholds 0.05, 0.20, 0.40, 0.60, 0.80 and 0.95.

    Rows that are inside the warm-up period, have a NaN current value, or have
    no valid history keep the code -1.

    Returns:
        int64 array of regime codes, same length as ``mmd_scores``.
    """
    total = len(mmd_scores)
    labels = np.full(total, -1, dtype=np.int64)

    for t in prange(lookback_window, total):
        mmd_now = mmd_scores[t]
        mom_now = momentum_scores[t]
        if np.isnan(mmd_now) or np.isnan(mom_now):
            continue

        past_mmd = mmd_scores[t - lookback_window : t]
        past_mom = momentum_scores[t - lookback_window : t]

        # Rank of the current MMD among valid past MMD values.
        mmd_seen = 0
        mmd_at_or_below = 0
        for idx in range(past_mmd.shape[0]):
            past_value = past_mmd[idx]
            if np.isnan(past_value):
                continue
            mmd_seen += 1
            if past_value <= mmd_now:
                mmd_at_or_below += 1

        # Rank of the current momentum among valid past momentum values.
        mom_seen = 0
        mom_at_or_below = 0
        for idx in range(past_mom.shape[0]):
            past_value = past_mom[idx]
            if np.isnan(past_value):
                continue
            mom_seen += 1
            if past_value <= mom_now:
                mom_at_or_below += 1

        if mmd_seen == 0 or mom_seen == 0:
            continue

        mmd_rank = mmd_at_or_below / mmd_seen
        mom_rank = mom_at_or_below / mom_seen
        composite = 0.4 * (1.0 - mmd_rank) + 0.6 * mom_rank

        # Same buckets as a descending >= chain, walked from the bottom up.
        if composite < 0.05:
            label = 0
        elif composite < 0.20:
            label = 1
        elif composite < 0.40:
            label = 2
        elif composite < 0.60:
            label = 3
        elif composite < 0.80:
            label = 4
        elif composite < 0.95:
            label = 5
        else:
            label = 6
        labels[t] = label

    return labels

def assign_regime_labels_optimized(df: pd.DataFrame, lookback_window: int = 1000) -> pd.DataFrame:
    """
    Add 'regime' (integer code) and 'regime_name' (text label) columns to a copy of ``df``.

    Args:
        df: Frame expected to contain 'mmd_score' and 'momentum_20'. Not modified.
        lookback_window: Number of trailing rows used to rank each row's values;
            passed through to ``_assign_regime_labels_numba``.

    Returns:
        A copy of ``df`` with both columns added. When either input column is
        missing, every row gets code -1 / 'Unassigned' so the output schema is
        the same on both paths.
    """
    df_labeled = df.copy()
    if 'mmd_score' not in df_labeled.columns or 'momentum_20' not in df_labeled.columns:
        # Emit the 'regime' code column here as well; otherwise consumers that
        # read 'regime' fail only on this fallback path.
        df_labeled['regime'] = -1
        df_labeled['regime_name'] = "Unassigned"
        return df_labeled

    # The JIT-compiled routine works on float arrays with NaN as the missing marker;
    # coerce explicitly so integer, object or nullable columns cannot reach nopython code.
    mmd_scores = df_labeled['mmd_score'].to_numpy(dtype=np.float64, na_value=np.nan)
    momentum_scores = df_labeled['momentum_20'].to_numpy(dtype=np.float64, na_value=np.nan)

    regimes = _assign_regime_labels_numba(mmd_scores, momentum_scores, lookback_window)

    df_labeled['regime'] = regimes

    regime_map = {-1: 'Unassigned', 0: 'Extremely Bearish', 1: 'Bearish', 2: 'Neutral Towards Bearish', 3: 'Neutral', 4: 'Neutral Towards Bullish', 5: 'Bullish', 6: 'Extremely Bullish'}
    # Any code outside the map also falls back to 'Unassigned'.
    df_labeled['regime_name'] = df_labeled['regime'].map(regime_map).fillna('Unassigned')
    return df_labeled

# --- Main script execution (no changes needed here) ---
if __name__ == '__main__':
    # Load the price CSV, index it by timestamp, then run the MMD/regime pipeline
    # and print a short summary. On a load failure the error is printed and the
    # pipeline is skipped (df_features is then not assigned by this cell).
    import os  # local import so this cell does not depend on another cell's imports

    df_prices = None
    # The data location can be overridden without editing the notebook by setting
    # the ES_DATA_PATH environment variable; the original path remains the default.
    file_path = os.environ.get('ES_DATA_PATH', r'C:\Users\Windows 11\Downloads\ES - 30 min - New.csv')

    try:
        df_prices_raw = pd.read_csv(file_path)
        print(f"Successfully loaded data from: {file_path}")
        df_prices_raw.columns = [col.strip().lower() for col in df_prices_raw.columns]
        if 'timestamp' not in df_prices_raw.columns:
            raise ValueError("'timestamp' column not found.")
        df_prices_raw['timestamp'] = pd.to_datetime(df_prices_raw['timestamp'], format='mixed')
        # Sort chronologically: the pipeline uses shift/rolling and positional windows.
        df_prices = df_prices_raw.set_index('timestamp').sort_index()
        print("Timestamp processed and set as index.")
    except Exception as e:
        # Best-effort load: report and fall through to the "skipping" branch below.
        print(f"An error occurred during data loading: {e}")

    if df_prices is not None:
        print("\nCalculating MMD scores and labeling regimes (Optimized)...")
        start_time = time.time()

        df_features = calculate_mmd_and_label_regimes(df_prices)

        end_time = time.time()
        print(f"--- Calculation finished in {end_time - start_time:.2f} seconds ---")

        print(f"\nProcessed data shape: {df_features.shape}")
        if not df_features.empty:
            print(f"Date range: {df_features.index[0]} to {df_features.index[-1]}")
            print(f"\nRegime distribution (counts):\n{df_features['regime_name'].value_counts().sort_index()}")
            print("\nSample of processed data (tail):")
            print(df_features.tail())
        else:
            print("Processed DataFrame is empty.")
    else:
        print("\nSkipping MMD calculation due to errors during data loading.")

Successfully loaded data from: C:\Users\Windows 11\Downloads\ES - 30 min - New.csv
Timestamp processed and set as index.

Calculating MMD scores and labeling regimes (Optimized)...
Estimated kernel bandwidth (sigma): 1.0546
--- Calculation finished in 76.69 seconds ---

Processed data shape: (59056, 16)
Date range: 2020-01-07 03:00:00 to 2025-12-05 23:30:00

Regime distribution (counts):
regime_name
Bearish                     4299
Bullish                     5981
Extremely Bearish            245
Extremely Bullish            745
Neutral                    17499
Neutral Towards Bearish    17358
Neutral Towards Bullish    11929
Unassigned                  1000
Name: count, dtype: int64

Sample of processed data (tail):
                        open     high      low    close  volume   returns  \
timestamp                                                                   
2025-12-05 21:30:00  5856.00  5862.25  5851.00  5851.50   56644 -0.000768   
2025-12-05 22:00:00  5851.75  5858.25  584

In [None]:
# Cell 3: Data Splitting and Feature Normalization
# -------------------------------------------------
# This cell takes the feature-engineered DataFrame (df_features),
# splits it into training, validation, and test sets,
# and then normalizes the features based on the training set statistics.

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler # Ensure this import is present

# --- Configuration ---
# Define feature columns for the model
# These are the columns that will be used as input to the ML model.
feature_columns = [
    'open', 'high', 'low', 'close', 'volume', # Basic OHLCV
    'returns', 'log_returns', 'range', 'volatility', # Derived financial indicators
    'momentum_20', 'momentum_50', # Momentum indicators
    'volume_ratio', # Volume-based feature
    'mmd_score' # MMD-based feature
]
target_column = 'regime' # The column we want to predict

# --- Input Data Check ---
# Ensure df_features is defined and not empty (should come from the previous cell).
# At the top level of a notebook cell, locals() is the kernel's global namespace.
# If the check fails, only a message is printed: none of train_df / val_df / test_df /
# scaler / normalization_params / available_features is (re)defined by this cell.
if 'df_features' not in locals() or df_features.empty:
    print("Error: df_features is not defined or is empty. Please run the previous cells to generate it.")
    # In a notebook, you might raise an error or stop execution here
    # For example: raise ValueError("df_features not available. Run previous cells.")
else:
    print(f"Input df_features shape: {df_features.shape}")

    # --- Feature Selection ---
    # Ensure all defined feature_columns actually exist in df_features
    # This also handles cases where some features might not have been calculable (e.g., due to insufficient data)
    available_features = [col for col in feature_columns if col in df_features.columns]
    missing_features = [col for col in feature_columns if col not in df_features.columns]

    if missing_features:
        print(f"\nWarning: The following defined feature_columns are missing from df_features and will be excluded:")
        for mf in missing_features:
            print(f" - {mf}")

    if not available_features:
        raise ValueError("No features available for training. Check feature_columns and df_features content.")

    print(f"\nUsing {len(available_features)} available features for modeling: {available_features}")
    if target_column not in df_features.columns:
        raise ValueError(f"Target column '{target_column}' not found in df_features.")

    # --- Data Splitting (Chronological) ---
    # Splitting data chronologically is crucial for time-series forecasting tasks
    # to prevent data leakage from the future into the training set.
    # The split is purely positional (first 70% / next 15% / last 15% of rows) and
    # relies on df_features already being in chronological order.
    # Rows are not filtered by target value here, so any rows carrying the
    # "unassigned" code in the target column remain in the splits.
    n = len(df_features)
    if n < 10: # Arbitrary small number, adjust as needed
        raise ValueError(f"Insufficient data for splitting. Only {n} samples available.")

    train_end_idx = int(n * 0.70)
    val_end_idx = int(n * 0.85)

    # Use .copy() to avoid SettingWithCopyWarning later if modifications are made
    train_df = df_features.iloc[:train_end_idx].copy()
    val_df = df_features.iloc[train_end_idx:val_end_idx].copy()
    test_df = df_features.iloc[val_end_idx:].copy()

    print(f"\nData split sizes:")
    if not train_df.empty:
        print(f"Train: {len(train_df)} samples (from {train_df.index[0]} to {train_df.index[-1]})")
    else:
        print("Train: 0 samples (Warning: Training set is empty!)")

    if not val_df.empty:
        print(f"Val:   {len(val_df)} samples (from {val_df.index[0]} to {val_df.index[-1]})")
    else:
        print("Val:   0 samples (Warning: Validation set is empty!)")

    if not test_df.empty:
        print(f"Test:  {len(test_df)} samples (from {test_df.index[0]} to {test_df.index[-1]})")
    else:
        print("Test:  0 samples (Warning: Test set is empty!)")

    # --- Feature Normalization ---
    # StandardScaler standardizes features by removing the mean and scaling to unit variance.
    # It's important to fit the scaler ONLY on the training data and then use it to transform
    # the validation and test sets to prevent data leakage.

    # Handle potential NaNs in features before scaling.
    # Using fillna(0) is a simple strategy. Consider more sophisticated imputation
    # if zeros are not appropriate (e.g., mean/median from training set).
    # Ensure this fillna strategy is consistent with how you'd handle NaNs in production.

    scaler = StandardScaler()

    if not train_df.empty:
        # Fill NaNs before fitting and transforming.
        # The scaled values overwrite the raw feature columns of train_df in place;
        # the target column is not part of available_features and is left unscaled.
        train_features_df = train_df[available_features].fillna(0)
        train_df[available_features] = scaler.fit_transform(train_features_df)

        # Keep the normalization parameters (mean, std_dev) for each feature from the training set.
        # These are crucial for transforming new data in production using the same scale.
        # NOTE: the dictionary lives in memory only -- nothing in this cell writes it to disk,
        # despite the word "saved" in the message printed below.
        normalization_params = {
            'mean': scaler.mean_.tolist(), # Convert to list for JSON serializability if needed
            'std': scaler.scale_.tolist(),   # Convert to list
            'features': available_features # List of features that were scaled
        }
        print("\nNormalization parameters calculated and saved from training data.")
        print(f"Features normalized: {len(available_features)}")

        # Transform validation and test sets using the SAME scaler fitted on the training data.
        if not val_df.empty:
            val_features_df = val_df[available_features].fillna(0)
            val_df[available_features] = scaler.transform(val_features_df)
            print("Validation set features normalized.")

        if not test_df.empty:
            test_features_df = test_df[available_features].fillna(0)
            test_df[available_features] = scaler.transform(test_features_df)
            print("Test set features normalized.")

    else:
        print("\nWarning: Training set is empty. Skipping normalization.")
        normalization_params = None # Or handle as appropriate

    # --- Display sample of processed data (optional) ---
    if not train_df.empty:
        print("\nSample of processed training data (first 5 rows):")
        print(train_df[available_features + [target_column]].head())

    # The DataFrames (train_df, val_df, test_df) and normalization_params
    # are now ready for use in subsequent model training cells.

Input df_features shape: (59056, 16)

Using 13 available features for modeling: ['open', 'high', 'low', 'close', 'volume', 'returns', 'log_returns', 'range', 'volatility', 'momentum_20', 'momentum_50', 'volume_ratio', 'mmd_score']

Data split sizes:
Train: 41339 samples (from 2020-01-07 03:00:00 to 2023-11-21 04:30:00)
Val:   8858 samples (from 2023-11-21 05:00:00 to 2024-08-23 06:30:00)
Test:  8859 samples (from 2024-08-23 07:00:00 to 2025-12-05 23:30:00)

Normalization parameters calculated and saved from training data.
Features normalized: 13
Validation set features normalized.
Test set features normalized.

Sample of processed training data (first 5 rows):
                         open      high       low     close    volume  \
timestamp                                                               
2020-01-07 03:00:00 -2.375460 -2.384414 -2.370335 -2.373894 -0.519922   
2020-01-07 03:30:00 -2.373646 -2.375340 -2.363088 -2.370267 -0.559769   
2020-01-07 04:00:00 -2.369415 -2.372920

In [None]:
# Cell 4: Market Data Encoder Definition
# ---------------------------------------
# This cell defines a PyTorch nn.Module that serves as a standalone encoder.
# It takes a window of market data (with multiple features) and transforms it
# into a fixed-size vector representation. This encoded vector can then be
# used as input for downstream reinforcement learning agents or other models.

import torch
import torch.nn as nn

# 'available_features' is expected to be defined by the data preparation cell.
# The class definition below does not need it; only the smoke test at the bottom of
# this cell does, and that test is skipped with a warning when the variable is missing.
# To fail fast instead when running cells out of order, uncomment:
# if 'available_features' not in locals():
#     raise NameError("Variable 'available_features' is not defined. Please run the data preparation cells first.")

class MarketDataEncoder(nn.Module):
    """
    Encode a window of market features into a single fixed-size vector.

    An LSTM reads the window step by step; the top layer's output at the final
    time step is then passed through a two-layer MLP (Linear -> ReLU -> Dropout
    -> Linear) that projects it to ``encoding_size`` dimensions.
    """

    def __init__(self, input_size, hidden_size=128, num_layers=2, encoding_size=64, dropout_rate=0.1):
        """
        Args:
            input_size (int): Number of features per time step.
            hidden_size (int): Width of the LSTM hidden state.
            num_layers (int): Number of stacked LSTM layers.
            encoding_size (int): Width of the returned encoding.
            dropout_rate (float): Dropout probability, used between LSTM layers
                (only when there is more than one) and inside the projection MLP.
        """
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.encoding_size = encoding_size

        # Inter-layer LSTM dropout is only meaningful for stacked LSTMs.
        inter_layer_dropout = dropout_rate if num_layers > 1 else 0
        projection_width = hidden_size // 2

        # Temporal feature extractor; tensors are laid out as (batch, seq, feature).
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )

        # Projection head: hidden_size -> hidden_size // 2 -> encoding_size.
        self.fc1 = nn.Linear(hidden_size, projection_width)
        self.fc2 = nn.Linear(projection_width, encoding_size)

        # Non-linearity and regularisation applied between the two projections.
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Run the encoder on a batch of windows.

        Args:
            x (torch.Tensor): Batch of windows shaped
                (batch_size, window_size, num_features).

        Returns:
            torch.Tensor: Encodings shaped (batch_size, encoding_size).
        """
        # sequence_out holds the top-layer output for every time step:
        # (batch_size, window_size, hidden_size). The final (h, c) state is not needed.
        sequence_out, _ = self.lstm(x)

        # The last time step summarises the whole window.
        window_summary = sequence_out[:, -1, :]

        # Linear -> ReLU -> Dropout -> Linear
        projected = self.fc1(window_summary)
        activated = self.dropout(self.relu(projected))
        return self.fc2(activated)

# --- Encoder smoke test ---
# Builds a MarketDataEncoder sized for the prepared feature list and pushes one random
# batch through it to confirm the output shape. Any failure inside the test (including
# the shape assertion) is caught and printed rather than raised.

if 'available_features' not in locals():
    # Data preparation has not run in this kernel: nothing to size the encoder with.
    print("\nWarning: 'available_features' not found. Skipping MarketDataEncoder test. "
          "Please ensure previous data preparation cells have been run.")
else:
    # Test configuration
    test_input_features = len(available_features)  # one input channel per prepared feature
    test_encoding_dims = 128  # width of the encoded vector requested for this test

    try:
        test_encoder = MarketDataEncoder(input_size=test_input_features, encoding_size=test_encoding_dims)
        print(f"\nMarketDataEncoder instantiated successfully with input_size={test_input_features}, encoding_size={test_encoding_dims}.")

        # Random stand-in batch shaped (batch_size, window_size, num_features)
        batch_size = 32
        window_size = 30
        dummy_input_tensor = torch.randn(batch_size, window_size, test_input_features)

        test_output_vector = test_encoder(dummy_input_tensor)
        print(f"Encoder test - Input shape: {dummy_input_tensor.shape}, Output shape: {test_output_vector.shape}")

        # One encoding per batch element is expected.
        expected_output_shape = (batch_size, test_encoding_dims)
        assert test_output_vector.shape == expected_output_shape, \
            f"Output shape mismatch! Expected {expected_output_shape}, got {test_output_vector.shape}"
        print("Encoder output shape is correct.")
    except Exception as err:
        print(f"Error during MarketDataEncoder test: {err}")



MarketDataEncoder instantiated successfully with input_size=13, encoding_size=128.
Encoder test - Input shape: torch.Size([32, 30, 13]), Output shape: torch.Size([32, 128])
Encoder output shape is correct.


In [None]:
# Cell 5: RLlib Custom Model with MarketDataEncoder
# -------------------------------------------------
# This cell defines a custom model for RLlib that incorporates the
# MarketDataEncoder defined previously.
# The model takes a window of market data as input, uses the encoder
# to get a fixed-size representation, and then feeds this representation
# into separate fully connected layers for policy (action selection)
# and value estimation.

import torch
import torch.nn as nn
import torch.nn.functional as F # Added for F.relu
import gymnasium as gym # <--- CORRECTED: Import gymnasium
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.utils.annotations import override
from ray.rllib.utils.typing import Dict, TensorType, List, ModelConfigDict

# Ensure MarketDataEncoder is available from the previous cell.
# If 'MarketDataEncoder' is not defined, this will raise a NameError.
# Make sure the cell defining MarketDataEncoder has been executed.
# Example:
# if 'MarketDataEncoder' not in globals():
#     raise NameError("MarketDataEncoder not found. Please run the cell defining it.")


class EncoderRLModel(TorchModelV2, nn.Module):
    """
    RLlib TorchModelV2 whose observations are encoded by a MarketDataEncoder.

    Observations are windows declared as a 2D Box (window_size, num_features).
    The encoder turns each window into a fixed-size vector; two independent
    heads (Linear -> ReLU -> Linear) map that vector to action logits and to a
    scalar value estimate.

    Recognised keys in ``model_config["custom_model_config"]`` (with defaults):
    encoder_hidden_size (128), encoder_num_layers (2), encoder_output_size (64),
    encoder_dropout_rate (0.1), policy_fc_size and value_fc_size (both default
    to encoder_output_size).
    """
    def __init__(self,
                 obs_space: gym.spaces.Space,
                 action_space: gym.spaces.Space,
                 num_outputs: int,
                 model_config: ModelConfigDict,
                 name: str):
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs, model_config, name)
        nn.Module.__init__(self)

        # Only a rank-2 Box can be interpreted as (window_size, num_features).
        is_window_box = isinstance(obs_space, gym.spaces.Box) and len(obs_space.shape) == 2
        if not is_window_box:
            raise ValueError(
                f"This model expects a 2D Box observation space (window_size, num_features). "
                f"Got {obs_space} with shape {obs_space.shape if hasattr(obs_space, 'shape') else 'N/A'}"
            )

        self.window_size, self.num_features = obs_space.shape

        # Hyper-parameters supplied through the algorithm config.
        custom_cfg = model_config.get("custom_model_config", {})
        encoder_hidden_size = custom_cfg.get("encoder_hidden_size", 128)
        encoder_num_layers = custom_cfg.get("encoder_num_layers", 2)
        self.encoder_output_size = custom_cfg.get("encoder_output_size", 64)
        encoder_dropout_rate = custom_cfg.get("encoder_dropout_rate", 0.1)
        policy_fc_size = custom_cfg.get("policy_fc_size", self.encoder_output_size)
        value_fc_size = custom_cfg.get("value_fc_size", self.encoder_output_size)

        # Shared observation encoder (MarketDataEncoder comes from an earlier cell).
        self.encoder = MarketDataEncoder(
            input_size=self.num_features,
            hidden_size=encoder_hidden_size,
            num_layers=encoder_num_layers,
            encoding_size=self.encoder_output_size,
            dropout_rate=encoder_dropout_rate,
        )

        # Policy head: encoding -> hidden -> action logits.
        self.policy_fc = nn.Linear(self.encoder_output_size, policy_fc_size)
        self.policy_logits = nn.Linear(policy_fc_size, num_outputs)

        # Value head: encoding -> hidden -> scalar.
        self.value_fc = nn.Linear(self.encoder_output_size, value_fc_size)
        self.value_output = nn.Linear(value_fc_size, 1)

        # Encoding from the most recent forward(); consumed by value_function().
        self._last_encoded_features = None

        print(f"EncoderRLModel initialized: obs_space.shape=({self.window_size}, {self.num_features}), "
              f"action_space.shape={action_space.shape if hasattr(action_space, 'shape') else 'N/A'}, num_outputs={num_outputs}")
        print(f"MarketDataEncoder params: input_size={self.num_features}, hidden_size={encoder_hidden_size}, "
              f"num_layers={encoder_num_layers}, encoding_size={self.encoder_output_size}")

    @override(ModelV2)
    def forward(self,
                input_dict: Dict[str, TensorType],
                state: List[TensorType],
                seq_lens: TensorType) -> (TensorType, List[TensorType]):
        """
        Compute action logits for a batch of observations.

        ``input_dict["obs"]`` may arrive either flattened as
        (batch, window_size * num_features) or already shaped as
        (batch, window_size, num_features). The encoding is cached for
        ``value_function``. No recurrent state is returned.
        """
        obs = input_dict["obs"]
        rank = len(obs.shape)

        if rank not in (2, 3):
            raise ValueError(f"Unexpected observation shape: {obs.shape}. "
                             f"Expected (batch, window_size, num_features) or (batch, window_size * num_features).")
        if rank == 2:
            # Flattened observations: restore the (window, feature) layout.
            obs = obs.reshape(obs.shape[0], self.window_size, self.num_features)

        encoding = self.encoder(obs)
        self._last_encoded_features = encoding

        policy_hidden = F.relu(self.policy_fc(encoding))
        return self.policy_logits(policy_hidden), []

    @override(ModelV2)
    def value_function(self) -> TensorType:
        """Return the value estimate, shape (batch,), for the batch last passed to forward()."""
        assert self._last_encoded_features is not None, "Must call forward() first"
        value_hidden = F.relu(self.value_fc(self._last_encoded_features))
        return torch.squeeze(self.value_output(value_hidden), -1)


# --- How to use this model with RLlib ---
# 1. Register the custom model:
#    from ray.rllib.models import ModelCatalog
#    ModelCatalog.register_custom_model("encoder_rl_model", EncoderRLModel)

# 2. Configure your algorithm (e.g., PPO) to use it:
#    config = PPOConfig() # Assuming PPOConfig is imported
#    config = config.training(
#        model={
#            "custom_model": "encoder_rl_model",
#            "custom_model_config": {
#                "encoder_hidden_size": 256,
#                "encoder_num_layers": 2,
#                "encoder_output_size": 128,
#                "encoder_dropout_rate": 0.1, # Ensure this matches MarketDataEncoder's param
#                "policy_fc_size": 128,
#                "value_fc_size": 128,
#            },
#        }
#    )
#    # Ensure your environment's observation_space is correctly defined:
#    # e.g., self.observation_space = gym.spaces.Box(
#    #           low=-np.inf, high=np.inf, shape=(WINDOW_SIZE, NUM_FEATURES), dtype=np.float32
#    #      )

# Confirmation message only: this cell defines the class but does not register it.
# The model must still be registered with ModelCatalog before an algorithm config can
# refer to it by name (see the usage notes above).
print("\nEncoderRLModel class defined. Ready for registration and use with RLlib.")
# Note: the PPOConfig used in the usage example comes from ray.rllib.algorithms.ppo.


EncoderRLModel class defined. Ready for registration and use with RLlib.


In [None]:
# Cell 6: Multi-Agent Market Environment Definition
# -------------------------------------------------
# This cell defines a multi-agent reinforcement learning environment for market
# regime prediction. It allows multiple agents to interact with the market data,
# receive observations (windows of market features), take actions, and receive rewards.
# The environment is designed to be compatible with RLlib's multi-agent API.

import numpy as np
import gymnasium as gym # Using gymnasium
from gymnasium import spaces # For action and observation space definitions
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.tune.registry import register_env

# Ensure 'df_features' and 'available_features' are defined from previous cells.
# These will be passed into the environment configuration.
# Example check:
# if 'df_features' not in globals() or 'available_features' not in globals():
#     raise NameError("Required DataFrames or feature lists are not defined. Please run data preparation cells.")

class MultiAgentMarketEnv(MultiAgentEnv):
    """
    Multi-agent environment for market regime prediction.

    Every agent receives the same window of market features and picks an action
    (0 = conservative, 1 = aggressive). The reward depends on how well the action
    matches the 'regime' value of the row at the current step.

    Configuration parameters expected:
    - df (pd.DataFrame): market data containing the feature columns and a 'regime' column.
      The feature values are cached as a float32 array at construction time, so the
      DataFrame should not be mutated afterwards.
    - window_size (int): number of time steps in each observation window.
    - feature_columns (list): column names used as features.
    - reward_type (str, optional): reward scheme. Defaults to "regime_prediction".
    - agent_ids (set, optional): set of agent ID strings. Defaults to {"agent_0", "agent_1"}.
    - sequential (bool, optional): default for reset(); when True every episode starts at
      the first valid step instead of a random one. Defaults to False. Can be overridden
      per call through reset(options={"sequential": ...}).
    - min_episode_len (int, optional): default minimum number of rows kept after a random
      start. Defaults to 100. Can be overridden through reset(options={"min_episode_len": ...}).
    """

    def __init__(self, config: dict):
        """Validate the configuration and build the per-agent spaces.

        Raises:
            ValueError: if agent_ids is not a set of strings, if a required column is
                missing from the DataFrame, or if the DataFrame is too short.
        """
        super().__init__()

        self.df = config["df"]  # DataFrame with features and 'regime'
        self.window_size = config["window_size"]
        self.feature_columns = config["feature_columns"]
        self.reward_type = config.get("reward_type", "regime_prediction")

        # Reset defaults. They live in the env config because a caller that resets the
        # env without `options` would otherwise have no way to request these behaviours.
        self.sequential = bool(config.get("sequential", False))
        self.min_episode_len = int(config.get("min_episode_len", 100))

        # Agent IDs must be unique strings.
        self._agent_ids = config.get("agent_ids", {"agent_0", "agent_1"})
        if not isinstance(self._agent_ids, set) or not all(isinstance(i, str) for i in self._agent_ids):
            raise ValueError("agent_ids must be a set of strings.")

        # Fail early with a clear message instead of a KeyError in the middle of an episode.
        missing_columns = [
            col for col in [*self.feature_columns, "regime"] if col not in self.df.columns
        ]
        if missing_columns:
            raise ValueError(f"DataFrame is missing required columns: {missing_columns}")

        # Step bookkeeping: the first window needs window_size rows, so the first valid
        # step is window_size - 1; end_step is the index of the last row.
        self.current_step = 0
        self.start_step = self.window_size - 1
        self.end_step = len(self.df) - 1

        if self.start_step >= self.end_step:
            # At least one step after the first full window is required, i.e. window_size + 1 rows.
            raise ValueError(
                f"DataFrame is too short for the given window_size. "
                f"Need at least {self.window_size + 1} rows, got {len(self.df)}. "
                f"Start_step ({self.start_step}) >= end_step ({self.end_step})."
            )

        # Cache the feature matrix once; selecting the columns from the DataFrame on
        # every step would copy the whole frame each time.
        self._feature_matrix = self.df[self.feature_columns].values.astype(np.float32)

        # Per-agent spaces. Action: 0 for Conservative, 1 for Aggressive.
        self.action_space = spaces.Discrete(2)

        # Observation: a (window_size, num_features) window of market data.
        self.observation_space = spaces.Box(
            low=-np.inf,
            high=np.inf,
            shape=(self.window_size, len(self.feature_columns)),
            dtype=np.float32
        )

        print(f"MultiAgentMarketEnv initialized for agents: {self._agent_ids}")
        print(f"Observation space shape: {self.observation_space.shape}, Action space: {self.action_space}")

    def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None):
        """Reset the environment and return the initial observations and infos.

        Args:
            seed: forwarded to the parent reset() to seed self.np_random.
            options: optional overrides for this call:
                - "sequential" (bool): start at the first valid step.
                - "min_episode_len" (int): rows to keep after a random start.
              Missing keys fall back to the values given in the env config.

        Returns:
            (observations, infos): dicts keyed by agent ID.
        """
        super().reset(seed=seed)  # Important for seeding self.np_random

        options = options or {}
        sequential = options.get("sequential", self.sequential)
        min_episode_len = options.get("min_episode_len", self.min_episode_len)

        self.current_step = self.start_step
        if not sequential:
            # Latest start that still leaves min_episode_len rows, never before start_step.
            latest_start = max(self.start_step, self.end_step - min_episode_len)
            if latest_start > self.start_step:
                # int(): keep current_step a plain Python int rather than a NumPy scalar.
                self.current_step = int(
                    self.np_random.integers(self.start_step, latest_start + 1)
                )

        initial_obs = self._get_observation()

        # All agents see the same window; each gets its own copy.
        observations = {agent_id: initial_obs.copy() for agent_id in self._agent_ids}
        infos = {agent_id: {} for agent_id in self._agent_ids}

        return observations, infos

    def step(self, action_dict: dict):
        """
        Executes one time step within the environment.
        Args:
            action_dict (dict): A dictionary mapping agent_id to its action.
        Returns:
            Tuple: observations, rewards, terminations, truncations, infos
        """
        # All agents in _agent_ids are assumed to be active on every step.
        current_row = self.df.iloc[self.current_step]
        current_regime = int(current_row['regime'])

        # Only actions from known agents are rewarded; unknown IDs are ignored.
        rewards = {
            agent_id: self._calculate_reward(action, current_regime, current_row)
            for agent_id, action in action_dict.items()
            if agent_id in self._agent_ids
        }

        self.current_step += 1

        # "__all__" is the special key that ends the episode for every agent.
        terminated = bool(self.current_step >= self.end_step)
        terminations = {"__all__": terminated}

        # No time-limit truncation is used; episodes only end at the end of the data.
        truncations = {"__all__": False}

        if not terminated:
            next_obs = self._get_observation()
            observations = {agent_id: next_obs.copy() for agent_id in self._agent_ids}
        else:
            # A zero-filled placeholder is returned on termination; each agent gets its
            # own array so a consumer mutating one does not affect the others.
            observations = {
                agent_id: np.zeros(self.observation_space.shape, dtype=np.float32)
                for agent_id in self._agent_ids
            }

        infos = {
            agent_id: {
                "current_regime": current_regime,
                "current_step": self.current_step,
                "is_terminated": terminated
            } for agent_id in self._agent_ids
        }

        return observations, rewards, terminations, truncations, infos

    def _get_observation(self) -> np.ndarray:
        """
        Constructs the observation window for the current_step.
        The window ends at current_step (inclusive) and is zero-padded at the front
        when fewer than window_size rows are available.

        Raises:
            ValueError: if the resulting window does not have window_size rows.
        """
        # Window: [current_step - window_size + 1, current_step]
        start_idx = self.current_step - self.window_size + 1
        end_idx = self.current_step + 1  # Slicing is exclusive at the end

        rows = self._feature_matrix[max(start_idx, 0):end_idx]
        if start_idx < 0:
            # Not enough history yet: prepend zero rows.
            padding = np.zeros((-start_idx, len(self.feature_columns)), dtype=np.float32)
            window_data = np.vstack([padding, rows])
        else:
            # Copy so callers never hold a view into the cached matrix.
            window_data = rows.copy()

        if window_data.shape[0] != self.window_size:
            raise ValueError(
                f"Observation window shape error. Expected {self.window_size} time steps, "
                f"got {window_data.shape[0]} at current_step {self.current_step} "
                f"(start_idx: {start_idx}, end_idx: {end_idx}). "
                f"DataFrame length: {len(self.df)}."
            )

        return window_data

    def _calculate_reward(self, action: int, regime: int, current_row: pd.Series) -> float:
        """
        Calculates the reward for a given agent's action based on the market regime.
        Args:
            action (int): The action taken by the agent (0 or 1).
            regime (int): The true market regime at the current step.
            current_row (pd.Series): The row of data for the current step (currently unused).
        Returns:
            float: 1.0 for a matching stance, -1.0 for a mismatching one, 0.1 in the
            neutral regime, and 0.0 for anything else (including unknown reward types).
        """
        if self.reward_type == "regime_prediction":
            # Regime mapping used here: <= 2 bearish, 3 neutral, >= 4 bullish.
            is_bullish_regime = regime >= 4
            is_bearish_regime = regime <= 2
            is_neutral_regime = regime == 3

            agent_is_aggressive = action == 1
            agent_is_conservative = action == 0

            if agent_is_aggressive and is_bullish_regime:
                return 1.0  # Correct: Aggressive in Bullish
            if agent_is_conservative and is_bearish_regime:
                return 1.0  # Correct: Conservative in Bearish
            if is_neutral_regime:
                return 0.1  # Small positive reward for any action in neutral
            if agent_is_aggressive and is_bearish_regime:
                return -1.0  # Incorrect: Aggressive in Bearish
            if agent_is_conservative and is_bullish_regime:
                return -1.0  # Incorrect: Conservative in Bullish
            return 0.0  # Only reached for actions outside {0, 1}

        # "another_scheme" is a placeholder with no logic yet; it and any unknown
        # reward type fall through to the neutral default.
        return 0.0

# --- Register the Environment with RLlib ---
# Registering under a string name lets RLlib build the environment on its workers;
# the creator receives the env config and forwards it to the constructor.
try:
    register_env("MultiAgentMarketEnv_v0", lambda env_config: MultiAgentMarketEnv(env_config))
except Exception as err:
    print(f"Error registering MultiAgentMarketEnv_v0: {err}")
else:
    print("MultiAgentMarketEnv_v0 registered successfully with RLlib.")


MultiAgentMarketEnv_v0 registered successfully with RLlib.


In [None]:
# Cell 7: Multi-Agent PPO Training with Ray Tune (Rewritten)
# -----------------------------------------------------------
# This rewritten cell uses the modern `ray.tune.Tuner` API for a more
# robust and automated training and checkpointing workflow.

import ray
import pandas as pd
from ray import tune
from ray.air.config import RunConfig, CheckpointConfig
from ray.rllib.algorithms.ppo import PPO, PPOConfig
from ray.rllib.policy.policy import PolicySpec
from ray.tune.registry import register_env
from ray.rllib.models import ModelCatalog

# --- Prerequisite Checks and Registrations ---
# Registers the custom environment and the custom model with Ray. The model cell of this
# notebook defines `EncoderRLModel`; `MultiAgentPolicy` is still preferred when it exists
# so that older kernels keep working, but it is no longer a hard requirement.
try:
    for required_name in ("train_df", "val_df", "available_features"):
        if required_name not in globals():
            raise NameError(f"`{required_name}` is not defined. Please run the data preparation cells.")

    if 'MultiAgentMarketEnv' not in globals():
        raise NameError("MultiAgentMarketEnv class not found. Please run Cell 6.")
    register_env("MultiAgentMarketEnv_v0", lambda config: MultiAgentMarketEnv(config))
    print("Custom environment 'MultiAgentMarketEnv_v0' registered successfully.")

    if 'MultiAgentPolicy' in globals():
        custom_model_name = "MultiAgentPolicy"
        custom_model_class = globals()["MultiAgentPolicy"]
        custom_model_config = {
            "encoding_size": 128,
            "hidden_size": 256,
            "num_lstm_layers": 2,
        }
    elif 'EncoderRLModel' in globals():
        custom_model_name = "encoder_rl_model"
        custom_model_class = globals()["EncoderRLModel"]
        # Keys follow the usage notes printed next to the EncoderRLModel definition.
        custom_model_config = {
            "encoder_hidden_size": 256,
            "encoder_num_layers": 2,
            "encoder_output_size": 128,
            "encoder_dropout_rate": 0.1,
            "policy_fc_size": 128,
            "value_fc_size": 128,
        }
    else:
        raise NameError(
            "No custom model class found (expected MultiAgentPolicy or EncoderRLModel). "
            "Please run Cell 5."
        )
    ModelCatalog.register_custom_model(custom_model_name, custom_model_class)
    print(f"Custom model '{custom_model_name}' registered successfully.")
except NameError as ne:
    print(f"Registration Error: {ne}")
    raise
except Exception as e:
    print(f"An unexpected error occurred during registration: {e}")
    raise

# --- Initialize Ray ---
# Restart Ray so the cell can be re-run safely from a kernel that already has a session.
if ray.is_initialized():
    print("Shutting down existing Ray instance...")
    ray.shutdown()
ray.init(num_cpus=4, ignore_reinit_error=True, include_dashboard=False, logging_level="ERROR")
print("Ray initialized successfully.")

# --- PPO Algorithm Configuration ---
print("\nConfiguring PPO algorithm...")

# Settings shared by the training and the evaluation environments; only the data
# (and the evaluation-only flags) differ between the two.
base_env_config = {
    "window_size": 30,
    "feature_columns": available_features,
    "reward_type": "regime_prediction",
    "agent_ids": {"agent_0", "agent_1"},
}

config = (
    PPOConfig()
    .environment(
        env="MultiAgentMarketEnv_v0",
        env_config={**base_env_config, "df": train_df},
    )
    .framework("torch")
    # 2 workers x 500 steps per fragment = one train batch of 1000 steps.
    .rollouts(num_rollout_workers=2, rollout_fragment_length=500)
    .training(
        model={
            "custom_model": custom_model_name,
            "custom_model_config": custom_model_config,
        },
        lr=5e-5,
        gamma=0.99,
        lambda_=0.95,
        clip_param=0.2,
        vf_loss_coeff=0.5,
        entropy_coeff=0.01,
        train_batch_size=1000,
        sgd_minibatch_size=128,
        num_sgd_iter=10
    )
    .multi_agent(
        # Both agents share a single policy.
        policies={"shared_policy": PolicySpec()},
        policy_mapping_fn=lambda agent_id, episode, worker, **kwargs: "shared_policy",
    )
    .evaluation(
        evaluation_interval=5,
        evaluation_duration=10,
        evaluation_duration_unit="episodes",
        evaluation_num_workers=1,
        evaluation_config={
            "env_config": {
                **base_env_config,
                "df": val_df,
                # NOTE(review): RLlib resets the env without `options`, so this flag only
                # has an effect if MultiAgentMarketEnv reads "sequential" from its config
                # -- verify against the environment definition.
                "sequential": True,
            },
        },
    )
    .resources(num_gpus=0)
    .debugging(log_level="WARN")
)
print("PPO configuration created successfully.")

# --- Configure and Run the Tuner ---
# This replaces the manual training loop for a more robust workflow.
print("\nSetting up Ray Tune Tuner...")

# 1. Checkpoint configuration: keep only the checkpoint with the best evaluation reward.
#    A checkpoint must be written periodically for "best" to mean anything -- with
#    checkpoint_at_end alone only the final iteration is ever saved. The frequency follows
#    the evaluation interval so every checkpointed result carries the score attribute.
checkpoint_config = CheckpointConfig(
    num_to_keep=1,
    checkpoint_score_attribute="evaluation/episode_reward_mean",
    checkpoint_score_order="max",
    checkpoint_frequency=config.evaluation_interval or 0,
    checkpoint_at_end=True
)

# 2. RunConfig: experiment name, stopping criterion and checkpointing.
run_config = RunConfig(
    name="PPO_Training_Run",
    stop={"training_iteration": 50},
    checkpoint_config=checkpoint_config,
    verbose=1, # Prints a summary table of results
)

# 3. Tuner: a single PPO trial driven by the configuration built above.
tuner = tune.Tuner(
    "PPO",
    param_space=config.to_dict(),
    run_config=run_config,
)

# 4. Run the training.
print("Starting training with Ray Tune...")
results = tuner.fit()
print("Training complete!")

# --- Post-Training: Load Best Model and History ---
# Prepares the `algo` and `history_df` objects for the next cell.
# `history_df` starts as an empty DataFrame (not None) so downstream `.empty` checks
# work even when loading fails part-way through.
print("\nLoading best model from training run for evaluation...")
algo = None
history_df = pd.DataFrame()
try:
    best_result = results.get_best_result(metric="evaluation/episode_reward_mean", mode="max")

    if not (best_result and best_result.checkpoint):
        raise ValueError("No checkpoint found in the best result. Training may have failed to save a model.")

    print(f"Best checkpoint found at: {best_result.checkpoint.path}")
    algo = PPO.from_checkpoint(best_result.checkpoint)
    print("✅ Trained `algo` object is restored and ready for evaluation.")

    # `results.get_dataframe()` holds one row per trial (the last reported result);
    # the per-iteration history needed for plotting is the trial's metrics_dataframe.
    trial_history = best_result.metrics_dataframe
    if trial_history is None or trial_history.empty:
        print("Warning: no per-iteration metrics were recorded. Training history plots will be unavailable.")
    else:
        history_df = trial_history.copy()
        # Add the short aliases as extra columns instead of renaming, so code that
        # uses the original RLlib column names keeps working.
        column_aliases = {
            "evaluation/episode_reward_mean": "eval_episode_reward_mean",
            "episode_reward_mean": "train_episode_reward_mean",
            "training_iteration": "iteration",
        }
        for source_column, alias_column in column_aliases.items():
            if source_column in history_df.columns and alias_column not in history_df.columns:
                history_df[alias_column] = history_df[source_column]
        print("✅ `history_df` is created and ready for plotting.")

except Exception as e:
    print("--- ERROR loading results ---")
    print(f"Could not load the best model from the training run: {e}")

Custom environment 'MultiAgentMarketEnv_v0' registered successfully.
Registration Error: MultiAgentPolicy class not found. Please run Cell 5.


NameError: MultiAgentPolicy class not found. Please run Cell 5.

In [None]:
# --- Final, Most Robust Cell (Cell 7.5) ---
# This version makes no assumptions about trial directory names.

import os
import pandas as pd
from ray.rllib.algorithms.ppo import PPO # IMPORTANT: Assuming PPO is your algorithm

print("Attempting to load latest training results from disk...")

# --- Configuration ---
ray_results_dir = os.path.expanduser("~/ray_results")
experiment_name = None # Let the script find the latest
# --- End of Configuration ---

algo = None
history_df = pd.DataFrame()
experiment_path = "" # Initialize to avoid reference errors


def checkpoint_number(checkpoint_dir):
    """Return the numeric suffix of a `checkpoint_<digits>` directory name, else None."""
    base_name = os.path.basename(os.path.normpath(checkpoint_dir))
    prefix, _, suffix = base_name.partition("_")
    if prefix == "checkpoint" and suffix.isdigit():
        return int(suffix)
    return None


def find_checkpoint_dirs(search_root):
    """Return all `checkpoint_<digits>` directories located anywhere below search_root.

    No assumption is made about how many directory levels separate the experiment
    folder from its checkpoints. Checkpoint folders themselves are not descended into.
    """
    found = []
    for dirpath, dirnames, _filenames in os.walk(search_root):
        hits = [d for d in dirnames if checkpoint_number(d) is not None]
        found.extend(os.path.join(dirpath, d) for d in hits)
        # Prune in place so os.walk skips the contents of the checkpoints.
        dirnames[:] = [d for d in dirnames if d not in hits]
    return found


def latest_checkpoint(search_root):
    """Return the newest checkpoint below search_root, or None when there is none.

    "Newest" means: the most recently modified parent (trial) directory first,
    then the highest checkpoint number inside it.
    """
    candidates = find_checkpoint_dirs(search_root)
    if not candidates:
        return None
    return max(
        candidates,
        key=lambda path: (os.path.getmtime(os.path.dirname(path)), checkpoint_number(path)),
    )


try:
    if not os.path.exists(ray_results_dir):
        raise FileNotFoundError(f"Ray results directory not found at: {ray_results_dir}")

    # Candidate experiment directories, newest first.
    if experiment_name:
        candidate_experiments = [os.path.join(ray_results_dir, experiment_name)]
    else:
        candidate_experiments = sorted(
            (
                os.path.join(ray_results_dir, d)
                for d in os.listdir(ray_results_dir)
                if os.path.isdir(os.path.join(ray_results_dir, d))
            ),
            key=os.path.getmtime,
            reverse=True,
        )
        if not candidate_experiments:
            raise FileNotFoundError("No experiment directories found in `ray_results`.")
        print(f"Found latest experiment: '{os.path.basename(candidate_experiments[0])}'")

    # Kept pointing at the newest experiment so the troubleshooting hint stays useful.
    experiment_path = candidate_experiments[0]

    # 1. Find the newest checkpoint. The newest directory may belong to a run that
    #    never saved a checkpoint, so fall back to older experiments when needed.
    latest_checkpoint_path = None
    for candidate_path in candidate_experiments:
        print(f"Searching for trial data in: {candidate_path}")
        latest_checkpoint_path = latest_checkpoint(candidate_path)
        if latest_checkpoint_path is not None:
            experiment_path = candidate_path
            break

    if latest_checkpoint_path is None:
        raise FileNotFoundError(
            f"No checkpoint folders (e.g., 'checkpoint_*') found under: {experiment_path}"
        )

    # 2. The directory that holds the checkpoint is treated as the trial directory.
    trial_path = os.path.dirname(latest_checkpoint_path)
    print(f"Found latest trial path: {trial_path}")
    print(f"Loading latest checkpoint from: {latest_checkpoint_path}")

    # 3. Restore the algorithm ('algo') from this checkpoint.
    algo = PPO.from_checkpoint(latest_checkpoint_path)
    print("Algorithm restored successfully.")

    # 4. Load the training history ('history_df') from 'progress.csv'.
    history_file = os.path.join(trial_path, "progress.csv")
    if os.path.exists(history_file):
        history_df = pd.read_csv(history_file)
        print("Training history DataFrame loaded.")
    else:
        print("Warning: progress.csv not found. Training history plots will be unavailable.")

except (FileNotFoundError, ValueError, IndexError) as e:
    print("--- ERROR ---")
    print(e)
    print("\nTroubleshooting:")
    print("1. Ensure your training (Cell 7) completed and created checkpoint files.")
    print(f"2. Manually check the contents of your latest experiment directory: {experiment_path}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")


Attempting to load latest training results from disk...
Found latest experiment: 'PPO_MultiAgentMarketEnv_v0_2025-06-10_19-07-12huprxdh7'
Searching for trial data in: C:\Users\Windows 11/ray_results\PPO_MultiAgentMarketEnv_v0_2025-06-10_19-07-12huprxdh7
--- ERROR ---
No trial subdirectories were found inside C:\Users\Windows 11/ray_results\PPO_MultiAgentMarketEnv_v0_2025-06-10_19-07-12huprxdh7

Troubleshooting:
1. Ensure your training (Cell 7) completed and created checkpoint files.
2. Manually check the contents of your latest experiment directory: C:\Users\Windows 11/ray_results\PPO_MultiAgentMarketEnv_v0_2025-06-10_19-07-12huprxdh7


In [None]:
# Cell 8: Evaluation and Visualization (Corrected)

# First, check if the 'algo' object was successfully created in the previous cell
def split_directional_steps(predictions, true_regimes):
    """Turn actions and regimes into binary labels, dropping neutral-regime steps.

    Labels: 0 = Bullish, 1 = Bearish. A regime >= 4 is bullish and <= 2 is bearish;
    regime 3 (neutral) has no right or wrong action, so those steps are excluded
    instead of being counted as bearish. Action 1 (aggressive) is a bullish call,
    any other action a bearish call.

    Returns:
        (y_true, y_pred): two lists of equal length.
    """
    y_true, y_pred = [], []
    for action, regime in zip(predictions, true_regimes):
        if regime == 3:
            continue
        y_true.append(0 if regime >= 4 else 1)
        y_pred.append(0 if action == 1 else 1)
    return y_true, y_pred


def first_existing_column(frame, candidates):
    """Return the first name in `candidates` that is a column of `frame`, else None."""
    return next((name for name in candidates if name in frame.columns), None)


def evaluate_multi_agent(algo, eval_df, num_episodes=10, window_size=30):
    """Evaluate the trained multi-agent system on `eval_df`.

    Args:
        algo: restored algorithm exposing compute_single_action().
        eval_df: DataFrame handed to MultiAgentMarketEnv.
        num_episodes: number of sequential episodes to run.
        window_size: observation window length; must match the one used in training.

    Returns:
        dict with 'mean_reward', 'std_reward', 'accuracy' (share of all steps with a
        matching directional call; neutral steps can never match),
        'directional_accuracy' (same, computed over non-neutral steps only),
        'predictions' and 'true_regimes' (both recorded for agent_0).
    """
    env_config = {
        "df": eval_df,
        "window_size": window_size,
        "feature_columns": available_features,
        "reward_type": "regime_prediction"
    }
    env = MultiAgentMarketEnv(env_config)

    all_rewards = []
    all_predictions = []
    all_true_regimes = []

    for _episode in range(num_episodes):
        obs, _reset_infos = env.reset(options={"sequential": True})
        episode_reward = 0
        done = {"__all__": False}
        truncated = {"__all__": False}

        # Stop on either termination or truncation.
        while not (done["__all__"] or truncated["__all__"]):
            actions = {
                agent_id: algo.compute_single_action(obs[agent_id], policy_id="shared_policy")
                for agent_id in env._agent_ids
            }
            obs, rewards, done, truncated, infos = env.step(actions)
            episode_reward += rewards["agent_0"]
            if "agent_0" in infos:
                all_predictions.append(actions["agent_0"])
                all_true_regimes.append(infos["agent_0"]["current_regime"])

        all_rewards.append(episode_reward)

    correct_predictions = sum(
        (pred == 1 and true >= 4) or (pred == 0 and true <= 2)
        for pred, true in zip(all_predictions, all_true_regimes)
    )
    accuracy = correct_predictions / len(all_predictions) if all_predictions else 0

    directional_true, directional_pred = split_directional_steps(all_predictions, all_true_regimes)
    directional_accuracy = (
        sum(t == p for t, p in zip(directional_true, directional_pred)) / len(directional_true)
        if directional_true else 0
    )

    return {
        'mean_reward': np.mean(all_rewards),
        'std_reward': np.std(all_rewards),
        'accuracy': accuracy,
        'directional_accuracy': directional_accuracy,
        'predictions': all_predictions,
        'true_regimes': all_true_regimes
    }


# Only evaluate when a previous cell managed to create the 'algo' object.
if 'algo' in locals() and algo:
    # --- Run Evaluation ---
    print("Evaluating the trained agent on the test set...")
    test_results = evaluate_multi_agent(algo, test_df, num_episodes=20)

    print(f"\n--- Test Set Results ---")
    print(f"Mean Reward: {test_results['mean_reward']:.3f} ± {test_results['std_reward']:.3f}")
    print(f"Regime Prediction Accuracy: {test_results['accuracy']:.3%}")
    print(f"Directional Accuracy (neutral steps excluded): {test_results['directional_accuracy']:.3%}")

    # --- Visualization ---
    if 'history_df' in locals() and history_df is not None and not history_df.empty:
        # Column names differ by source: progress.csv uses 'training_iteration', and an
        # earlier cell may have aliased or renamed the reward column.
        iteration_col = first_existing_column(history_df, ["iteration", "training_iteration"])
        reward_col = first_existing_column(history_df, ["episode_reward_mean", "train_episode_reward_mean"])
        length_col = first_existing_column(history_df, ["episode_len_mean"])

        if iteration_col and reward_col and length_col:
            print("\nVisualizing training history...")
            fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8), sharex=True)
            ax1.plot(history_df[iteration_col], history_df[reward_col], label='Mean Reward')
            ax1.set_ylabel('Episode Reward Mean')
            ax1.set_title('Training Progress')
            ax1.grid(True, alpha=0.3)
            ax1.legend()
            ax2.plot(history_df[iteration_col], history_df[length_col], label='Mean Episode Length')
            ax2.set_xlabel('Training Iteration')
            ax2.set_ylabel('Episode Length Mean')
            ax2.grid(True, alpha=0.3)
            ax2.legend()
            plt.tight_layout()
            plt.show()
        else:
            print("\nSkipping training history plot: expected columns are missing from history_df.")

    # --- Confusion Matrix ---
    print("\nGenerating confusion matrix...")
    from sklearn.metrics import confusion_matrix
    # 0=Bullish, 1=Bearish; neutral-regime steps are left out (see split_directional_steps).
    y_true_binary, y_pred_binary = split_directional_steps(
        test_results['predictions'], test_results['true_regimes']
    )
    if y_true_binary:
        # labels=[0, 1] keeps the matrix 2x2 even if one class never occurs, so the
        # tick labels below always line up with the cells.
        cm = confusion_matrix(y_true_binary, y_pred_binary, labels=[0, 1])
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Bullish', 'Bearish'], yticklabels=['Bullish', 'Bearish'])
        plt.title('Market Regime Prediction Confusion Matrix')
        plt.ylabel('True Regime')
        plt.xlabel('Predicted Action')
        plt.show()
    else:
        print("No bullish or bearish steps were recorded; confusion matrix skipped.")

else:
    print("SKIPPING EVALUATION: The 'algo' object was not created. Please run the cell above to load the trained model from disk.")

SKIPPING EVALUATION: The 'algo' object was not created. Please run the cell above to load the trained model from disk.
