# Imports

In [None]:
# Gym & Gym_Anytrading
import gym
import gym.vector
import gym_anytrading

# Pytorch Modules
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.cuda.amp import GradScaler, autocast

# Data Structures
import numpy as np
import pandas as pd
from collections import deque

# Misc
import os

# Data Stuff

### Import data using Pandas
```python
eData = pd.read_csv("data/EURUSDp.csv")
...
```

### Format datetime
```python
eData["Gmt time"] = pd.to_datetime(eData["Gmt time"], format="%Y-%m-%d %H:%M:%S")
...
```

### Set Index
```python
eData.set_index("Gmt time", inplace=True)
...
```

In [None]:
# The timestamp column is spelled "Gmt time" (lower-case "t"). The same name
# must be used for parsing and for indexing, so it is defined once here.
TIME_COLUMN = "Gmt time"
TIME_FORMAT = "%Y-%m-%d %H:%M:%S"


def _load_price_frame(path):
    """Read one price CSV and index it by its parsed timestamp column.

    Args:
        path: Location of a CSV file containing a ``TIME_COLUMN`` column
            formatted as ``TIME_FORMAT``.

    Returns:
        The loaded ``DataFrame`` with ``TIME_COLUMN`` as a datetime index.
    """
    frame = pd.read_csv(path)
    frame[TIME_COLUMN] = pd.to_datetime(frame[TIME_COLUMN], format=TIME_FORMAT)
    frame.set_index(TIME_COLUMN, inplace=True)
    return frame


eData = _load_price_frame("data/EURUSDp.csv")  # make sure these have correct column names
pData = _load_price_frame("data/GBPUSDp.csv")  # make sure these have correct column names
xData = _load_price_frame("data/processed/EXTRAp.csv")  # make sure these have correct column names

# Environment Functions

### Reward Function

In [None]:
def reward_function():
    """Placeholder reward hook: not implemented yet, so it yields ``None``."""
    return None

### Dynamic Environment Data

In [None]:
def dynamic_feature_positioning(history):
    """Placeholder positioning feature: hands ``history`` back untouched."""
    unchanged = history
    return unchanged

def dynamic_feature_unrealized(history):
    """Placeholder unrealized-value feature: hands ``history`` back untouched."""
    unchanged = history
    return unchanged

def dynamic_feature_realized(history):
    """Placeholder realized-value feature: hands ``history`` back untouched."""
    unchanged = history
    return unchanged

### make_function(n: int)
Takes an ID number `n` and returns the environment configured for the matching dataset.

In [None]:
def make_function(n: int):
  match n:
    case 1:
      return gym.make(
        id="forex-v0",
        name="EUR/USD",
        df=eData,
        dynamic_feature_functions = [],
        reward_function=reward_function,
        verbose=True
       )
    case 2:
      return gym.make(
        id="forex-v0",
        name="EUR/USD",
        df=pData,
        dynamic_feature_functions = [],
        reward_function=reward_function,
        verbose=True
       )
    case 3:
      return gym.make(
        id="forex-v0",
        name="EXTRA",
        df=xData,
        dynamic_feature_functions = [],
        reward_function=reward_function,
        verbose=True
       )

# LSTM Deep Q Learning Model
PyTorch LSTM model for deep Q-learning

## Constructor
```python
LSTM_Q_Net(input_size: int, hidden_size: int, output_size: int)
```
### Inputs
- `input_size`: Number of input features
- `hidden_size`: Number of features in the LSTM hidden state
- `output_size`: Number of outputs (one Q-value per action)

## Overview
```python
forward(x: torch.Tensor)

save(file_name: str)
```


In [None]:
class LSTM_Q_Net(nn.Module):
    """Stacked LSTM followed by a linear head that outputs Q-values.

    Args:
        input_size: Number of features per time step.
        hidden_size: Number of features in the LSTM hidden state.
        output_size: Number of outputs of the linear head.
        num_layers: Number of stacked LSTM layers (defaults to 2).
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=2):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(
            # Callers may pass a NumPy integer (e.g. from np.prod).
            input_size=int(input_size),
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        # The attribute keeps its "fc2" name so state_dict keys stay the same.
        self.fc2 = nn.Linear(hidden_size, output_size)
        self._init_weights()

    def forward(self, x, hidden=None):
        """Run the LSTM over ``x`` and score the last time step.

        Args:
            x: Input of shape ``(batch, seq, input_size)`` (``batch_first``).
            hidden: Optional ``(h, c)`` pair to start from. When omitted,
                ``nn.LSTM`` starts from a zero state.

        Returns:
            A pair ``(q_values, hidden)``: the head's output for the final
            time step, and the LSTM's final ``(h, c)`` state.
        """
        out, hidden = self.lstm(x, hidden)
        out = self.fc2(out[:, -1, :])
        return out, hidden

    def _init_weights(self):
        """Xavier-initialize every weight matrix and zero every bias."""
        for name, param in self.named_parameters():
            if 'weight' in name:
                nn.init.xavier_uniform_(param)
            elif 'bias' in name:
                nn.init.zeros_(param)

    def save(self, file_name):
        """Write the model's ``state_dict`` to ``./models/<file_name>``."""
        model_folder_path = "./models"
        os.makedirs(model_folder_path, exist_ok=True)
        file_path = os.path.join(model_folder_path, file_name)
        torch.save(self.state_dict(), file_path)

# Training Agent

In [None]:
class TrainingAgent():
    """Epsilon-greedy deep Q-learning agent built around ``LSTM_Q_Net``.

    Each environment step is stored together with the recurrent state the
    network held *before* it saw that step's observation. Training then
    replays every stored step as its own batch element, restarting the LSTM
    from the stored state.

    NOTE(review): the episode loop steps the environment with one scalar
    action and treats ``done`` as a single flag, i.e. it assumes a single
    (non-vectorized) environment. Confirm before passing a vector env.

    Args:
        envs: Environment exposing ``reset``, ``step``, ``render``,
            ``observation_space`` and a discrete ``action_space``.
        hidden_size: LSTM hidden size.
        learning_rate: Adam learning rate.
        gamma: Discount factor.
        epsilon: Initial exploration probability.
        epsilon_min: Lower bound for ``epsilon``.
        epsilon_decay: Multiplicative decay applied after each episode.
        seq_len: Number of collected steps that triggers a training update.
        batch_size: Stored on the agent; not used by the training loop.
        num_layers: Fallback LSTM depth for the hidden state, used only when
            the network does not expose ``lstm.num_layers``.
        memory_size: Capacity of ``self.memory`` (allocated but not used by
            the training loop).
    """

    def __init__(self, envs, hidden_size=128, learning_rate=1e-3,
                 gamma=0.99, epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.995,
                 seq_len=32, batch_size=64, num_layers=1, memory_size=10000):

        self.envs = envs
        # Observations are flattened to one feature vector per step, so the
        # network input width is the product of the observation dimensions.
        self.input_size = int(np.prod(self.envs.observation_space.shape))
        self.output_size = int(self.envs.action_space.n)

        # Model parameters
        self.hidden_size = hidden_size
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Initialize models
        self.model = LSTM_Q_Net(self.input_size, hidden_size, self.output_size).to(self.device)
        self.target_model = LSTM_Q_Net(self.input_size, hidden_size, self.output_size).to(self.device)
        self.target_model.load_state_dict(self.model.state_dict())

        # The hidden state must have as many layers as the network really
        # has, so read the depth from the model and fall back to the argument.
        lstm = getattr(self.model, "lstm", None)
        self.num_layers = getattr(lstm, "num_layers", num_layers)

        # Mixed precision is only enabled on CUDA. Elsewhere autocast and
        # the gradient scaler are constructed disabled (pass-through).
        self.use_amp = self.device.type == "cuda"

        # Training parameters
        self.criterion = nn.MSELoss()
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)
        self.scaler = torch.amp.GradScaler(device=self.device.type, enabled=self.use_amp)

        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay

        # Memory and sequence parameters
        self.memory = deque(maxlen=memory_size)
        self.seq_len = seq_len
        self.batch_size = batch_size

    def _autocast(self):
        """Return the autocast context matching this agent's device."""
        amp_dtype = torch.float16 if self.use_amp else torch.bfloat16
        return torch.autocast(device_type=self.device.type, dtype=amp_dtype,
                              enabled=self.use_amp)

    def _initial_hidden(self):
        """Return a zero ``(h, c)`` state for a single sequence (batch of 1)."""
        shape = (self.num_layers, 1, self.hidden_size)
        return (torch.zeros(shape, device=self.device),
                torch.zeros(shape, device=self.device))

    @staticmethod
    def _flatten(observation):
        """Convert an observation to a flat float32 feature vector."""
        return np.asarray(observation, dtype=np.float32).reshape(-1)

    def _reset_env(self):
        """Reset the environment and return only the observation.

        Accepts both ``obs`` and ``(obs, info)`` return styles.
        """
        result = self.envs.reset()
        if isinstance(result, tuple) and len(result) == 2 and isinstance(result[1], dict):
            return result[0]
        return result

    def _step_env(self, action):
        """Step the environment and return ``(observation, reward, done)``.

        Accepts both the 4-tuple ``(obs, reward, done, info)`` and the
        5-tuple ``(obs, reward, terminated, truncated, info)`` layouts.
        """
        result = self.envs.step(action)
        if len(result) == 5:
            observation, reward, terminated, truncated, _ = result
            return observation, reward, bool(terminated or truncated)
        observation, reward, done, _ = result
        return observation, reward, done

    def train(self, num_episodes=1000, target_update_frequency=10, checkpoint_frequency=100,
              render=False, checkpoint_path="model_checkpoints"):
        """
        Train the agent for a specified number of episodes

        Args:
            num_episodes: Number of episodes to train for
            target_update_frequency: How often to update the target network
            checkpoint_frequency: How often to save model checkpoints
            render: Whether to render the environment
            checkpoint_path: Where to save model checkpoints

        Returns:
            List with the total reward of every episode, in order.
        """
        total_rewards = []

        for episode in range(1, num_episodes + 1):
            episode_reward = self.run_episode(render)
            total_rewards.append(episode_reward)

            # Update target network periodically
            if episode % target_update_frequency == 0:
                self.target_model.load_state_dict(self.model.state_dict())

            # Save checkpoint periodically
            if episode % checkpoint_frequency == 0:
                os.makedirs(checkpoint_path, exist_ok=True)
                torch.save({
                    'episode': episode,
                    'model_state_dict': self.model.state_dict(),
                    'optimizer_state_dict': self.optimizer.state_dict(),
                    'reward': episode_reward,
                    'epsilon': self.epsilon
                }, f"{checkpoint_path}/checkpoint_{episode}.pt")

            # Persist the environment's render log after every episode.
            # NOTE(review): save_for_render is environment-specific; confirm
            # the configured environment provides it.
            self.envs.save_for_render(dir = f"render_logs_{episode}")

            # Print progress
            if episode % 10 == 0:
                recent = total_rewards[-10:]
                avg_reward = sum(recent) / len(recent)
                print(f"Episode {episode}/{num_episodes}, Average Reward (Last 10): {avg_reward:.2f}, Epsilon: {self.epsilon:.2f}")

        print("Training complete!")
        return total_rewards

    def run_episode(self, render=False):
        """Run a single episode and return the total reward.

        A training update runs whenever ``seq_len`` steps have been
        collected or the episode ends. ``epsilon`` decays once per episode.
        """
        state = self._flatten(self._reset_env())
        hidden_state = self._initial_hidden()
        episode_reward = 0
        episode_memory = []
        done = False

        while not done:
            if render:
                self.envs.render()

            # (batch=1, seq=1, input_size), as the batch_first LSTM expects.
            state_tensor = torch.as_tensor(
                state, dtype=torch.float32, device=self.device
            ).reshape(1, 1, -1)

            # Advance the recurrent state on every step, including
            # exploratory ones, so it reflects every observation seen so far.
            with torch.no_grad(), self._autocast():
                q_values, next_hidden = self.model(state_tensor, hidden_state)
            next_hidden = tuple(h.float() for h in next_hidden)

            # Epsilon-greedy action selection
            if np.random.rand() <= self.epsilon:
                action = self.envs.action_space.sample()
            else:
                action = int(torch.argmax(q_values, dim=1).item())

            # Take action and observe next state
            next_observation, reward, done = self._step_env(action)
            next_state = self._flatten(next_observation)
            episode_reward += reward

            # Store the hidden state the network held *before* seeing
            # `state`, so replaying (state, hidden) reproduces this step.
            episode_memory.append((state, action, reward, next_state, done, hidden_state))

            # Update state
            state, hidden_state = next_state, next_hidden

            # Train on episode memory if enough experience is collected or episode is done
            if len(episode_memory) >= self.seq_len or done:
                self.train_on_batch(episode_memory)
                episode_memory = []

        # Decay epsilon
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

        return episode_reward

    def train_on_batch(self, memory):
        """Run one Q-learning update on the transitions in ``memory``.

        Args:
            memory: Sequence of ``(state, action, reward, next_state, done,
                hidden)`` tuples, where ``hidden`` is the ``(h, c)`` state
                held before ``state`` was observed. Empty input is a no-op.
        """
        if len(memory) == 0:
            return

        states, next_states, hidden_states, actions, rewards, dones = self.tbtt(memory)

        # Each transition is its own batch element with a sequence length
        # of 1, matching the per-transition hidden states stacked on dim 1.
        batch = states.shape[0]
        states = states.reshape(batch, 1, -1)
        next_states = next_states.reshape(batch, 1, -1)

        # Compute target Q values. The target network is unrolled over
        # [state, next_state] from the stored hidden state, so the Q-values
        # for next_state are computed with a state that has seen `state`.
        with torch.no_grad(), self._autocast():
            target_q_values, _ = self.target_model(
                torch.cat((states, next_states), dim=1), hidden_states
            )
            max_next_q_values = torch.max(target_q_values.float(), dim=1, keepdim=True)[0]
            target_q = rewards + (1 - dones) * self.gamma * max_next_q_values

        # Compute current Q values and loss
        self.optimizer.zero_grad()
        with self._autocast():
            current_q_values, _ = self.model(states, hidden_states)
            current_q_values = current_q_values.gather(1, actions)
            loss = self.criterion(current_q_values.float(), target_q)

        # Backpropagate loss with mixed precision
        self.scaler.scale(loss).backward()
        self.scaler.step(self.optimizer)
        self.scaler.update()

    @staticmethod
    def tbtt(memory):
        """Stack a list of transitions into batched tensors.

        Args:
            memory: Sequence of ``(state, action, reward, next_state, done,
                hidden)`` tuples, ``hidden`` being an ``(h, c)`` tensor pair.

        Returns:
            ``(states, next_states, h_states, actions, rewards, dones)``.
            ``actions``, ``rewards`` and ``dones`` are column tensors, and
            ``h_states`` is the pair of hidden tensors concatenated along
            dim 1. All tensors live on the device of the first stored hidden
            state. Six ``None`` values are returned for empty input.
        """
        if not memory:
            return None, None, None, None, None, None

        states, actions, rewards, next_states, dones, hidden_states = zip(*memory)
        # Tensors expose their device directly (they have no .parameters()).
        device = hidden_states[0][0].device

        states = torch.tensor(np.array(states), dtype=torch.float32).to(device)
        actions = torch.tensor(actions, dtype=torch.int64).to(device).unsqueeze(1)
        rewards = torch.tensor(rewards, dtype=torch.float32).to(device).unsqueeze(1)
        next_states = torch.tensor(np.array(next_states), dtype=torch.float32).to(device)
        dones = torch.tensor(dones, dtype=torch.float32).to(device).unsqueeze(1)

        # Process hidden states
        h_states = (torch.cat([h[0] for h in hidden_states], dim=1),
                   torch.cat([h[1] for h in hidden_states], dim=1))

        return states, next_states, h_states, actions, rewards, dones

    @staticmethod
    def expand(state):
        """Return ``state`` as two copies with a leading axis, plus the array.

        Non-array input is converted with ``np.array`` first; an ``ndarray``
        is returned as-is in the third position.
        """
        state_array = state if isinstance(state, np.ndarray) else np.array(state)
        return (np.expand_dims(state_array, axis=0),
                np.expand_dims(state_array, axis=0),
                state_array)

    def run(self, episodes):
        """Run (and train) the agent for a specified number of episodes.

        Unlike ``train``, this prints after every episode, writes no
        checkpoints or render logs, and syncs the target network whenever
        the zero-based episode index is a multiple of 10.

        Returns:
            List with the total reward of every episode, in order.
        """
        total_rewards = []

        for episode in range(episodes):
            # run_episode collects experience, trains and decays epsilon.
            episode_reward = self.run_episode()

            # Update target network periodically
            if episode % 10 == 0:
                self.target_model.load_state_dict(self.model.state_dict())

            total_rewards.append(episode_reward)
            print(f"Episode {episode+1}/{episodes}, Reward: {episode_reward:.2f}, Epsilon: {self.epsilon:.2f}")

        return total_rewards

# Hyper Parameters

### Neural Network Size

In [None]:
# Network dimensions.
# NOTE(review): only hidden_size is passed to TrainingAgent below; the agent
# computes its own input and output sizes from the environment's spaces, so
# input_size and output_size appear to be unused here. Confirm before relying on them.
input_size = 15
hidden_size = 50
output_size = 4

### Training Config

In [None]:
# seq_len: number of collected steps that triggers a training update.
# NOTE(review): batch_size and memory_size are not passed to TrainingAgent
# below, so the agent's own defaults apply for those two.
seq_len = 30
batch_size = 1024
memory_size = 100_000

### Epsilon Decay Config

In [None]:
# Exploration schedule: start at `epsilon`, multiply by `epsilon_decay` after
# each episode, and never drop below `epsilon_min`.
epsilon = 1.0
epsilon_min = 0.01
epsilon_decay = 0.995

### Misc

In [None]:
# learning_rate feeds the Adam optimizer; gamma is the discount factor.
# NOTE(review): num_episodes is not used by the train call at the end of the
# notebook, which passes its own episode count.
learning_rate = 1e-3
num_episodes = 1000
gamma = 0.99

### Device Selection

In [None]:
# Prefer CUDA when available.
# NOTE(review): TrainingAgent selects its own device internally and does not
# receive this variable.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialization

In [None]:
# One factory per dataset id; the default argument pins each id to its lambda.
# NOTE(review): TrainingAgent reads `envs.action_space.n` and steps with a
# single scalar action; confirm that holds for a vectorized environment.
env_factories = [lambda n=n: make_function(n) for n in (1, 2, 3)]
envs = gym.vector.AsyncVectorEnv(env_factories)

agent_config = dict(
    hidden_size=hidden_size,
    learning_rate=learning_rate,
    gamma=gamma,
    epsilon=epsilon,
    epsilon_min=epsilon_min,
    epsilon_decay=epsilon_decay,
    seq_len=seq_len,
)
agent = TrainingAgent(envs=envs, **agent_config)

# Train

In [None]:
# Train for 50 episodes (the `num_episodes` hyperparameter above is not used here).
agent.train(num_episodes=50)