## Define Constants

In [None]:
import torch

# Hyperparameters
# Capacity of the replay buffer (used as the deque's maxlen by TradingAgent).
MAX_MEMORY = 100_000
# Number of experiences drawn per training step.
BATCH_SIZE = 128
# Step size handed to the Adam optimizer in QTrainer.
# NOTE(review): 0.25 is far above the values normally used with Adam — confirm this is intended.
LEARNING_RATE = 0.25
# Discount factor applied to the bootstrapped next-state Q-value.
GAMMA = 0.95
# Multiplicative decay applied to epsilon by TradingAgent.update_epsilon.
EPSILON_DECAY = 0.99
# Lower bound for epsilon in the epsilon-greedy policy.
MIN_EPSILON = 0.01

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == 'cuda':
    torch.backends.cudnn.benchmark = True # Let cuDNN auto-tune its kernels for the observed input shapes

## Define Environment

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from enum import Enum
from typing import Tuple, List, Optional

class Action(Enum):
    """Discrete actions the agent can submit to the trading environment.

    The integer values double as the indices of the Q-network outputs.
    """

    # Take no trading action on this step.
    HOLD = 0
    # Open a pair trade: short asset 1, long asset 2 (only when flat).
    LONG = 1
    # Open the opposite pair trade: long asset 1, short asset 2 (only when flat).
    SHORT = 2
    # Close every open position and realize its PnL.
    FLATTEN = 3

class TradeSide(Enum):
    """Direction of a single position leg; decides the sign of its PnL."""

    # Profits when the price rises above the entry price.
    LONG = 0
    # Profits when the price falls below the entry price.
    SHORT = 1

class Position:
    """One leg of a trade: the price it was entered at, its size and its side."""

    def __init__(self, entry_price: float, size: int, side: TradeSide):
        self.entry_price, self.size, self.side = entry_price, size, side

    def get_unrealized_pnl(self, current_price: float) -> float:
        """Return the mark-to-market PnL of this leg at ``current_price``.

        The price move is signed by the side, expressed relative to the
        midpoint of entry and current price, divided by 100, and then scaled
        by the entry notional (``entry_price * size``).
        """
        if self.side == TradeSide.LONG:
            move = current_price - self.entry_price
        else:
            move = self.entry_price - current_price

        midpoint = (self.entry_price + current_price) / 2
        return self.entry_price * self.size * (move / midpoint / 100)

    def close(self, current_price: float) -> float:
        """Return the PnL realized by closing at ``current_price``.

        The position object itself is not modified; the caller is responsible
        for discarding it.
        """
        return self.get_unrealized_pnl(current_price)

class TradingEnvironment:
    """Pair-trading simulator that walks a feature table one row per step.

    Every row supplies the two asset prices, the engineered features listed in
    ``REQUIRED_FEATURES`` and (in a separate series) the hedge ratio used to
    size the second leg.  At most one pair trade, stored as a tuple of two
    ``Position`` legs, is open at any time.
    """

    # Columns that must exist in the input frame; they make up the observation.
    REQUIRED_FEATURES = [
        "Asset1_Price", "Asset2_Price", "Ratio_Price", "Spread_ZScore", "Rolling_Correlation",
        "Rolling_Cointegration_Score", "RSI1", "RSI2", "RSI3",
        "MACD1", "MACD2", "MACD3"
    ]

    # Account-state columns appended to every observation.
    PERFORMANCE_COLUMNS = ["Unrealized_PnL", "Realized_PnL", "Positioned"]

    def __init__(self, df: pd.DataFrame, plot: bool = False, debug: bool = False):
        """Create an environment over ``df``.

        Args:
            df: Frame containing every ``REQUIRED_FEATURES`` column plus
                ``Hedge_Ratio``.
            plot: Stored on the instance; not used by this class itself.
            debug: When true, diagnostic messages are printed.

        Raises:
            ValueError: If a required column is missing from ``df``.
        """
        self.step = 0
        self.window_size = 1
        self.debug = debug
        self.plot = plot
        self.balance = self.equity = 1_000_000
        self.positions: List[Tuple[Position, Position]] = []
        self.realized_pnl = self.unrealized_pnl = 0
        self.trade_history, self.reward_history = [], []
        self.buy_signals, self.sell_signals = [], []
        self.last_trade_step = 0
        self._prepare_data(df)

    def _prepare_data(self, df: pd.DataFrame):
        """Validate ``df`` and store private copies of the columns in use.

        Raises:
            ValueError: If any required feature or ``Hedge_Ratio`` is missing.
        """
        missing_features = [f for f in self.REQUIRED_FEATURES if f not in df.columns]
        if missing_features:
            raise ValueError(f"Missing features: {missing_features}")

        # Validate everything before building any state so a bad frame leaves
        # the instance untouched.
        if "Hedge_Ratio" not in df.columns:
            raise ValueError("Missing 'Hedge_Ratio' column")

        # Select first, then copy: only the needed columns are duplicated and
        # the result is independent of the caller's frame.
        self.data = df[self.REQUIRED_FEATURES].copy()
        for col in self.PERFORMANCE_COLUMNS:
            self.data[col] = 0

        # Copy so later edits to the caller's frame cannot change trade sizing.
        self.hedge_ratio = df["Hedge_Ratio"].copy()

    def reset(self) -> pd.Series:
        """Restore the initial account state and return the first observation."""
        self.step = self.realized_pnl = self.unrealized_pnl = 0
        self.balance = self.equity = 1_000_000
        self.positions.clear()
        self.trade_history.clear()
        self.reward_history.clear()
        self.buy_signals.clear()
        self.sell_signals.clear()
        self.last_trade_step = 0
        return self.current_observation()

    def step_forward(self, action: Action) -> Tuple[pd.Series, float, float, float, bool]:
        """Advance one row, apply ``action`` and report the outcome.

        Returns:
            ``(observation, reward, realized_pnl, unrealized_pnl, done)`` where
            the observation is a copy of the new row with the performance
            columns filled in from the current account state.

        Raises:
            IndexError: If called after the last row has been consumed.
        """
        # Fail with a clear message instead of pandas' out-of-bounds error
        # (and before any counters are advanced).
        if self.step + 1 >= len(self.data):
            raise IndexError(
                "step_forward called past the end of the data; call reset() to start a new episode"
            )

        self.step += 1
        self.last_trade_step += 1

        reward, profit, positioned = self.execute_trade(action)

        self.trade_history.append(self.realized_pnl)
        self.reward_history.append(reward)

        done = self.step + self.window_size >= len(self.data)

        # current_observation returns a copy, so updating it does not write
        # back into self.data.
        obs = self.current_observation()
        obs.update({"Unrealized_PnL": self.unrealized_pnl, "Realized_PnL": self.realized_pnl, "Positioned": positioned})

        return obs, reward, self.realized_pnl, self.unrealized_pnl, done

    def execute_trade(self, action: Action) -> Tuple[float, float, int]:
        """Apply ``action`` at the current row.

        Returns:
            ``(reward, profit, positioned)``; reward and profit are non-zero
            only when positions are flattened, and ``positioned`` is 1 while a
            pair trade is open.
        """
        row = self.data.iloc[self.step]
        asset1_price = row["Asset1_Price"]
        asset2_price = row["Asset2_Price"]
        hedge_ratio = self.hedge_ratio.iloc[self.step]

        profit = reward = 0.0

        if action == Action.FLATTEN and self.positions:
            pnl = sum(pos_a.close(asset1_price) + pos_b.close(asset2_price) for pos_a, pos_b in self.positions)
            self.realized_pnl += pnl
            self.unrealized_pnl = 0
            self.positions.clear()

            profit = pnl
            reward = self.calculate_reward(pnl)
            if reward is None:
                # Only reachable if calculate_reward is overridden to return None.
                if self.debug:
                    print("DEBUG: calculate_reward returned None")
                reward = pnl
            return reward, profit, 0

        if action in (Action.LONG, Action.SHORT) and not self.positions:
            if np.isfinite(hedge_ratio):
                size = 100_000
                going_long = action == Action.LONG
                # A LONG on the pair shorts asset 1 and buys asset 2; SHORT is the mirror image.
                self.positions.append((
                    Position(asset1_price, size, TradeSide.SHORT if going_long else TradeSide.LONG),
                    Position(asset2_price, int(size * hedge_ratio), TradeSide.LONG if going_long else TradeSide.SHORT),
                ))
                (self.buy_signals if going_long else self.sell_signals).append(self.step)
                self.last_trade_step = 0
            elif self.debug:
                # int(size * nan) would raise; skip the entry and treat the step as a hold.
                print(f"DEBUG: non-finite hedge ratio at step {self.step}, trade not opened")

        # Mark to market on every non-flatten path (hold, fresh entry, or an
        # entry/flatten request that could not be honoured) so the reported
        # unrealized PnL never goes stale.
        self.update_unrealized_pnl(asset1_price, asset2_price)

        return reward, profit, int(bool(self.positions))

    def update_unrealized_pnl(self, asset1_price: float, asset2_price: float):
        """Recompute ``unrealized_pnl`` from the open legs at the given prices."""
        if not self.positions:
            self.unrealized_pnl = 0
            return

        self.unrealized_pnl = sum(
            pos_a.get_unrealized_pnl(asset1_price) + pos_b.get_unrealized_pnl(asset2_price)
            for pos_a, pos_b in self.positions
        )

    def calculate_reward(self, pnl: float):
        """Return the reward for a realized ``pnl``: the PnL as a float, or 0.0 for ``None``."""
        if pnl is None:
            return 0.0
        return float(pnl)

    def current_observation(self) -> pd.Series:
        """Return a copy of the row at the current step.

        Falls back to an all-zero series when no data is loaded and to the
        last row when the step index has run past the end.
        """
        if self.data is None:
            if self.debug:
                print("DEBUG: self.data is None in current_observation")
            return pd.Series({feature: 0 for feature in self.REQUIRED_FEATURES + self.PERFORMANCE_COLUMNS})

        if self.step >= len(self.data):
            if self.debug:
                print(f"DEBUG: Step index {self.step} is out of bounds for data length {len(self.data)}")
            return self.data.iloc[-1].copy()

        # Copy so callers can annotate the observation without touching self.data.
        return self.data.iloc[self.step].copy()


## Define Neural Network

In [None]:
import os
from collections import deque

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm

class LSTM_Q_Net(nn.Module):
    """Q-value estimator: two stacked LSTM layers, LayerNorm and a two-layer MLP head."""

    def __init__(self, input_size, hidden_size, output_size, dropout=0.3, bidirectional=False):
        """Build the network.

        Args:
            input_size: Number of features per timestep.
            hidden_size: Width of each LSTM layer.
            output_size: Number of Q-values produced (one per action).
            dropout: Dropout applied between the stacked LSTM layers.
            bidirectional: Whether the LSTM reads the sequence in both directions.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.bidirectional = bidirectional

        recurrent_config = dict(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
            dropout=dropout,
            bidirectional=bidirectional,
        )
        self.lstm = nn.LSTM(**recurrent_config)

        # A bidirectional LSTM concatenates both directions, doubling the feature width.
        feature_width = hidden_size * 2 if bidirectional else hidden_size
        self.ln = nn.LayerNorm(feature_width)
        self.fc1 = nn.Linear(feature_width, 128)
        self.fc2 = nn.Linear(128, output_size)

    def forward(self, x):
        """Return Q-values for ``x``.

        A 2-D input is given a length-1 sequence axis at position 1 before it
        is fed to the batch-first LSTM; only the last timestep's output is
        passed on to the head.
        """
        sequence = x.unsqueeze(1) if x.dim() == 2 else x

        outputs, _state = self.lstm(sequence)
        final_step = outputs[:, -1, :]

        hidden = F.relu(self.fc1(self.ln(final_step)))
        return self.fc2(hidden)

    def save(self, file_name):
        """Write the state dict to ``./models/<file_name>``, creating the folder if needed."""
        target_dir = "./models"
        os.makedirs(target_dir, exist_ok=True)
        torch.save(self.state_dict(), os.path.join(target_dir, file_name))

## Define Trainer

In [None]:
class QTrainer:
    """Double-DQN trainer that pairs an online Q-network with a periodically synced target copy."""

    def __init__(self, model, lr, gamma, batch_size, target_update_freq=50):
        """Set up the target network, optimizer, LR schedule and loss.

        Args:
            model: The online ``LSTM_Q_Net`` to optimize; moved to ``device``.
            lr: Initial learning rate for Adam.
            gamma: Discount factor for the bootstrapped target.
            batch_size: Stored on the instance for reference.
            target_update_freq: Number of training steps between target syncs
                (the LR scheduler is stepped on the same cadence).
        """
        self.lr = lr
        self.gamma = gamma
        self.model = model.to(device)

        # Mirror the online network's own configuration so load_state_dict
        # works for any input width or dropout, not just the default one.
        self.target_model = LSTM_Q_Net(
            input_size=model.lstm.input_size,
            hidden_size=model.hidden_size,
            output_size=model.fc2.out_features,
            dropout=model.lstm.dropout,
            bidirectional=model.bidirectional,
        ).to(device)

        self.update_target()
        # The target network is only ever evaluated; eval mode disables
        # dropout so the bootstrap targets are deterministic.
        self.target_model.eval()

        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr, amsgrad=True)

        # Halve the learning rate every 20 scheduler steps.
        self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, step_size=20, gamma=0.5)

        # Huber loss (SmoothL1Loss).
        self.criterion = nn.SmoothL1Loss()

        self.batch_size = batch_size
        self.target_update_freq = target_update_freq
        self.train_step_count = 0

    def update_target(self):
        """Copy the online network's weights into the target network."""
        self.target_model.load_state_dict(self.model.state_dict())

    def train_step(self, state, action, reward, next_state, done):
        """Run one Double-DQN update on a batch of transitions.

        Args:
            state: Batch of states fed to the online network.
            action: Indices of the actions taken, one per transition.
            reward: Rewards, one per transition.
            next_state: Batch of successor states.
            done: Terminal flags, one per transition.

        Returns:
            The detached loss tensor for this step (callers may ignore it).
        """
        # gather() needs the action indices as a column.
        action = action.view(-1, 1)

        pred = self.model(state)

        with torch.no_grad():
            # Double Q-learning: the online network picks the next action,
            # the target network evaluates it.
            next_actions = self.model(next_state).argmax(dim=1, keepdim=True)
            next_q_values = self.target_model(next_state).gather(1, next_actions)

            # Terminal transitions contribute the reward only.
            not_done = 1 - done.float().unsqueeze(1)
            target = reward.unsqueeze(1) + not_done * self.gamma * next_q_values

        q_values = pred.gather(1, action)

        loss = self.criterion(q_values, target)
        self.optimizer.zero_grad()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)

        self.optimizer.step()

        self.train_step_count += 1
        if self.train_step_count % self.target_update_freq == 0:
            self.update_target()
            self.scheduler.step()

        return loss.detach()

## Define Training Agent

In [None]:
class TradingAgent:
    """Epsilon-greedy DQN agent that trades in a ``TradingEnvironment`` and learns from replay."""

    def __init__(self, data: pd.DataFrame, plot: bool = False, debug: bool = False):
        """Build the replay buffer, Q-network, trainer and environment.

        Args:
            data: Feature frame handed to ``TradingEnvironment``.
            plot: Forwarded to the environment.
            debug: When true, diagnostic messages are printed.
        """
        self.debug = debug
        self.epsilon = 0.25
        self.memory = deque(maxlen=MAX_MEMORY)

        # Derive the network dimensions from the environment/action definitions
        # instead of repeating the numbers here.
        n_features = len(TradingEnvironment.REQUIRED_FEATURES) + len(TradingEnvironment.PERFORMANCE_COLUMNS)
        self.model = LSTM_Q_Net(input_size=n_features, hidden_size=128, output_size=len(Action)).to(device)
        self.trainer = QTrainer(self.model, lr=LEARNING_RATE, gamma=GAMMA, batch_size=BATCH_SIZE)
        self.env = TradingEnvironment(data, plot, debug)
        self.length = len(data)
        self.equity_curves = {}

    def store_experience(self, state, action, reward, next_state, done):
        """Append a transition to the replay buffer.

        The reward is clipped to ``[-1, 1]``; a ``None`` or NaN reward is
        stored as 0.0.
        """
        if reward is None or np.isnan(reward):
            if self.debug:
                print("DEBUG: Received None or NaN reward, setting to 0.0")
            reward = 0.0
        else:
            reward = np.clip(reward, -1, 1)

        self.memory.append((state, action, reward, next_state, done))

    def sample_experiences(self):
        """Return a training batch drawn from the replay buffer.

        While the buffer holds fewer than ``BATCH_SIZE`` transitions the whole
        buffer is returned.  Otherwise ``BATCH_SIZE`` distinct transitions are
        drawn with probability proportional to ``|reward| + 1e-6``.
        """
        memory_size = len(self.memory)
        if memory_size < BATCH_SIZE:
            return list(self.memory)

        # Snapshot once: indexing into the middle of a deque is O(n) per lookup.
        experiences = list(self.memory)

        # The small constant keeps every transition selectable and makes the
        # all-zero-reward case fall out as uniform sampling.
        priorities = np.abs(np.array([exp[2] for exp in experiences], dtype=np.float64)) + 1e-6
        probabilities = priorities / priorities.sum()

        indices = np.random.choice(memory_size, BATCH_SIZE, p=probabilities, replace=False)
        return [experiences[i] for i in indices]

    def update_epsilon(self):
        """Decay epsilon by ``EPSILON_DECAY``, never going below ``MIN_EPSILON``."""
        self.epsilon = max(MIN_EPSILON, self.epsilon * EPSILON_DECAY)

    @torch.no_grad()  # inference only; no gradients needed
    def select_action(self, state: pd.Series) -> Tuple[Action, int]:
        """Pick an action epsilon-greedily and return it with its index."""
        if np.random.random() < self.epsilon:
            action_idx = int(np.random.randint(0, len(Action)))
        else:
            # NaNs in the observation are replaced with zeros before inference.
            state_tensor = torch.tensor(np.nan_to_num(state.values),
                                        dtype=torch.float32).unsqueeze(0).to(device)

            # Evaluate without dropout so the greedy choice is deterministic,
            # then restore whatever mode the model was in for training.
            was_training = self.model.training
            self.model.eval()
            try:
                action_idx = int(torch.argmax(self.model(state_tensor)).item())
            finally:
                self.model.train(was_training)

        return Action(action_idx), action_idx

    def train(self):
        """Train the model on one mini-batch sampled from experience replay."""
        batch = self.sample_experiences()
        if not batch:
            # Nothing stored yet; zip(*[]) below would fail to unpack.
            return

        states, actions, rewards, next_states, dones = zip(*batch)
        self.trainer.train_step(
            torch.tensor(np.array(states), dtype=torch.float32).to(device),
            torch.tensor(actions, dtype=torch.long).to(device),
            torch.tensor(rewards, dtype=torch.float32).to(device),
            torch.tensor(np.array(next_states), dtype=torch.float32).to(device),
            torch.tensor(dones, dtype=torch.bool).to(device)
        )

    def save_model(self, episode: int):
        """Save the model weights for ``episode``.

        Saving is best-effort: failures are reported and training continues.
        """
        try:
            if episode is not None:
                print(f"DEBUG: Saving model for episode {episode}")
                self.model.save(f"Episode-{episode}.pth")
            else:
                print("DEBUG: Episode variable is None, using fallback name")
                self.model.save("Episode-unknown.pth")
        except Exception as e:
            # Deliberately broad: a failed checkpoint must not abort training.
            print(f"DEBUG: Error in save_model - {e}")

    def run(self, episodes: int):
        """Train for ``episodes`` passes over the data, checkpointing after each one."""
        # Each episode takes len(data) - 1 environment steps.
        steps_per_episode = max(self.length - 1, 0)

        # The context manager closes the bar even if training is interrupted.
        with tqdm(total=episodes * steps_per_episode) as progress:
            for episode in range(episodes):
                state, done = self.env.reset(), False
                equity_curve = []
                while not done:
                    progress.update(1)
                    action, action_idx = self.select_action(state)
                    next_state, reward, real_profit, _, done = self.env.step_forward(action)
                    equity_curve.append(real_profit)

                    # Store plain NumPy arrays rather than Series objects.
                    self.store_experience(state.values, action_idx, reward, next_state.values, done)
                    self.train()
                    state = next_state

                self.equity_curves[episode] = equity_curve
                self.update_epsilon()
                self.save_model(episode)

## Initialize

In [None]:
# Load the pre-processed feature table used for training.
data = pd.read_csv('data/processed')

# TradingAgent requires the data frame; constructing it without one raises TypeError.
agent = TradingAgent(data)

## Run Training Algorithm