<a href="https://colab.research.google.com/github/AglaiaVas/Building-a-Trading-Agent-with-Reinforcement-Learning/blob/main/HyperParameter_simple_testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import talib

In [None]:
from google.colab import drive
import os

# Mount Google Drive into the Colab filesystem at /content/drive
drive.mount('/content/drive')

# Work from the project folder on Drive so that relative paths (such as the
# 'results' directory created further down) resolve against it.
# NOTE(review): this absolute path is specific to one Drive layout — adjust if yours differs.
os.chdir('/content/drive/My Drive/Colab Notebooks/Project')

# Print the working directory to confirm the change took effect
print("Current Working Directory:", os.getcwd())

# Project .py files in this folder (e.g. Trading_Environment_Add, named in the
# environment registration below) are presumably importable from here — confirm.


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Current Working Directory: /content/drive/My Drive/Colab Notebooks/Project


In [None]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and runtime warnings that can point at real problems.
warnings.filterwarnings('ignore')

In [None]:
%matplotlib inline
from pathlib import Path
from time import time
from collections import deque
from random import sample


import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import seaborn as sns

import torch
import torch.nn as nn
import torch.optim as optim
import gym
from gym.envs.registration import register


In [None]:
# Seed every source of randomness used in this notebook so runs are repeatable.
import random  # stdlib RNG; imported here because only `sample` is imported above

# Python's built-in RNG: DDQNAgent uses random.random / random.choice for
# exploration and random.sample for replay minibatches, so leaving it unseeded
# makes training non-reproducible even with NumPy and PyTorch seeded.
random.seed(42)

# NumPy RNG
np.random.seed(42)

# PyTorch RNG (CPU)
torch.manual_seed(42)

# CUDA RNGs, when a GPU is present
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)
    torch.cuda.manual_seed_all(42)  # covers multi-GPU setups

In [None]:
# Use seaborn's 'whitegrid' style for the plots drawn after this cell.
sns.set_style('whitegrid')

In [None]:
# Choose the compute device once: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Report which device was selected.
print('Using GPU' if device.type == 'cuda' else 'Using CPU')

Using GPU


In [None]:
# Output directory for this experiment, relative to the current working directory.
results_path = Path('results', 'trading_agent')
# exist_ok=True replaces the separate exists() check: the cell is safe to re-run
# and there is no window between checking and creating the directory.
results_path.mkdir(parents=True, exist_ok=True)

In [None]:
def format_time(t):
    """Format a duration in seconds as a zero-padded ``HH:MM:SS`` string.

    The duration is rounded to whole seconds *before* it is split into
    hours, minutes and seconds. Rounding each field on its own (which is what
    formatting floats with ``{:02.0f}`` does) can produce invalid fields such
    as ``00:00:60`` for ``t = 59.6``.

    Args:
        t: Elapsed time in seconds (int or float).

    Returns:
        str: The duration as ``HH:MM:SS``. The hours field widens beyond two
        digits when needed.
    """
    total_seconds = int(round(t))
    minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return '{:02d}:{:02d}:{:02d}'.format(hours, minutes, seconds)

Now we create our gym environment using the classes defined in `Trading_Environment_Add.py`.

In [None]:
# Steps per episode: passed to register() as max_episode_steps and to gym.make() as trading_days.
trading_days = 252

In [None]:
# Register the custom environment under the id 'trading_agent-v1'.
# entry_point uses the "module:ClassName" form, so the Trading_Environment_Add
# module must be importable when the environment is created.
# max_episode_steps sets the per-episode step limit recorded in the env spec.
register(
    id='trading_agent-v1',
    entry_point='Trading_Environment_Add:TradingEnvironment',
    max_episode_steps=trading_days
)


Now set all the parameters of the model

In [None]:
# NOTE: despite the "_bps" suffix these values are plain fractions, not basis points:
# 1e-5 renders as 0.0010% and 1e-4 as 0.0100% in the formatted summary below.
trading_cost_bps = 1e-5
time_cost_bps = 1e-4

In [None]:
# Show both cost settings as percentages to make their magnitude explicit.
f'Trading costs: {trading_cost_bps:.4%} | Time costs: {time_cost_bps:.4%}'

'Trading costs: 0.0010% | Time costs: 0.0100%'

In [None]:
from gym.wrappers import StepAPICompatibility

# Instantiate the environment registered above. The id must match the one
# passed to register() ('trading_agent-v1'); 'trading_agent-v0' is never
# registered in this notebook, so requesting it only works when a stale kernel
# (or some other module) happens to have registered it.
trading_environment = gym.make('trading_agent-v1',
                               ticker='^GSPC',
                               trading_days=trading_days,
                               trading_cost_bps=trading_cost_bps,
                               time_cost_bps=time_cost_bps)

# Add the step-API compatibility wrapper.
# NOTE(review): presumably this keeps step() returning the
# (obs, reward, done, info) 4-tuple that the training loop unpacks; the
# wrapper's default output format depends on the installed gym version — confirm.
trading_environment = StepAPICompatibility(trading_environment)


INFO:Trading_Environment_Add:Loading data for ^GSPC starting from October 2004...
[*********************100%***********************]  1 of 1 completed
INFO:Trading_Environment_Add:Successfully retrieved data for ^GSPC.
INFO:Trading_Environment_Add:None


Raw data saved to ^GSPC_raw_data.csv
Raw data:
                   Close      Volume          Low         High
Date                                                         
2004-10-01  1131.500000  1582200000  1114.579956  1131.640015
2004-10-04  1135.170044  1534000000  1131.500000  1140.130005
2004-10-05  1134.479980  1418400000  1132.030029  1137.869995
2004-10-06  1142.050049  1416700000  1132.939941  1142.050049
2004-10-07  1130.650024  1447500000  1130.500000  1142.050049
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5004 entries, 2004-11-17 to 2024-10-04
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   returns    5004 non-null   float64
 1   ret_2      5004 non-null   float64
 2   ret_5      5004 non-null   float64
 3   ret_10     5004 non-null   float64
 4   ret_21     5004 non-null   float64
 5   rsi        5004 non-null   float64
 6   macd       5004 non-null   float64
 7   atr        5004 non-null   f

In [None]:
# Problem dimensions read from the environment; used to size the agent's networks
# and to bound the inner training loop.
state_dim = trading_environment.observation_space.shape[0]  # length of the observation vector
num_actions = trading_environment.action_space.n  # number of discrete actions
max_episode_steps = trading_environment.spec.max_episode_steps  # step limit from the env spec

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
from collections import deque
import logging

# Configure the root logger at DEBUG level. The agent and the training loop
# emit logging.debug records on every step, so this setting is very verbose
# whenever it takes effect.
# NOTE(review): basicConfig does nothing if the root logger already has
# handlers, so this call may be a no-op in hosted notebooks — confirm.
logging.basicConfig(level=logging.DEBUG)

class DDQNAgent:
    """Double DQN agent with epsilon-greedy exploration and uniform experience replay.

    Two identically shaped feed-forward networks are kept: the *online* network,
    which is trained, and the *target* network, which supplies bootstrap values
    and is overwritten with the online weights whenever ``total_steps`` is a
    multiple of ``tau``.

    Args:
        state_dim: Number of input features per state.
        num_actions: Number of discrete actions (size of the output layer).
        learning_rate: Learning rate for the Adam optimizer.
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon_start: Initial exploration probability.
        epsilon_end: Value epsilon reaches at the end of the linear decay phase.
        epsilon_decay_steps: Number of *episodes* over which epsilon decays
            linearly from ``epsilon_start`` to ``epsilon_end``.
        epsilon_exponential_decay: Per-episode multiplicative factor applied to
            epsilon once the linear phase is over.
        replay_capacity: Maximum number of transitions kept in the replay buffer.
        architecture: Iterable of hidden-layer widths.
        l2_reg: L2 penalty coefficient, applied through Adam's ``weight_decay``.
        tau: Target-network sync period, counted in calls to
            ``epsilon_greedy_policy``.
        batch_size: Number of transitions sampled per training step.
    """

    def __init__(self, state_dim,
                 num_actions,
                 learning_rate,
                 gamma,
                 epsilon_start,
                 epsilon_end,
                 epsilon_decay_steps,
                 epsilon_exponential_decay,
                 replay_capacity,
                 architecture,
                 l2_reg,
                 tau,
                 batch_size):
        # Problem dimensions and core hyperparameters
        self.state_dim = state_dim
        self.num_actions = num_actions
        self.experience = deque([], maxlen=replay_capacity)  # Replay buffer
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.architecture = architecture
        self.l2_reg = l2_reg

        # Set device to GPU if available, otherwise use CPU
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Build the online and target networks, and move them to the correct device
        self.online_network = self.build_model().to(self.device)
        self.target_network = self.build_model().to(self.device)
        self.update_target()
        # The target network is never trained directly; keep it in eval mode so
        # dropout cannot add noise to the bootstrap targets.
        self.target_network.eval()

        # Epsilon schedule: linear for the first `epsilon_decay_steps` episodes,
        # multiplicative afterwards (see memorize_transition).
        self.epsilon = epsilon_start
        self.epsilon_decay_steps = epsilon_decay_steps
        self.epsilon_decay = (epsilon_start - epsilon_end) / epsilon_decay_steps
        self.epsilon_exponential_decay = epsilon_exponential_decay
        self.epsilon_history = []

        # Track metrics for episodes, steps, rewards, and training
        self.total_steps = 0
        self.episodes = 0
        self.episode_length = 0
        self.rewards_history = []
        self.steps_per_episode = []
        self.episode_reward = 0

        # Set batch size and target network update frequency (tau)
        self.batch_size = batch_size
        self.tau = tau
        self.losses = []
        self.train = True  # when False, epsilon is no longer decayed

        # Optimizer and loss function. weight_decay applies the L2 penalty that
        # the l2_reg argument asks for (it was previously stored but unused).
        self.optimizer = optim.Adam(self.online_network.parameters(),
                                    lr=self.learning_rate,
                                    weight_decay=self.l2_reg)
        self.criterion = nn.MSELoss()

    def build_model(self):
        """
        Build a feed-forward Q-network from ``self.architecture``.

        Each hidden layer is Linear -> ReLU -> Dropout(0.1); the output layer is
        a Linear layer with one unit per action.

        Returns:
            nn.Sequential: The assembled (not yet device-placed) network.
        """
        layers = []
        input_dim = self.state_dim

        for units in self.architecture:
            layers.append(nn.Linear(input_dim, units))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.1))  # Add dropout for regularization
            input_dim = units

        # Output layer to map to num_actions
        layers.append(nn.Linear(input_dim, self.num_actions))

        return nn.Sequential(*layers)

    def update_target(self):
        """
        Copy the weights from the online network to the target network.
        This is done periodically to stabilize training.
        """
        self.target_network.load_state_dict(self.online_network.state_dict())

    def _predict(self, network, inputs):
        """Run ``network`` on ``inputs`` for inference only.

        Dropout is disabled (eval mode) and no autograd graph is built. The
        network's previous train/eval mode is restored before returning.
        """
        was_training = network.training
        network.eval()
        try:
            with torch.no_grad():
                return network(inputs)
        finally:
            network.train(was_training)

    def _to_tensor(self, values, dtype):
        """Convert a sequence of per-transition values to a tensor on ``self.device``.

        Going through a single NumPy array avoids the slow element-by-element
        path taken when a tensor is built from a tuple of ndarrays.
        """
        return torch.as_tensor(np.asarray(values), dtype=dtype, device=self.device)

    def epsilon_greedy_policy(self, state):
        """
        Select an action using an epsilon-greedy policy.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest online-network Q-value is chosen.
        Every call increments ``total_steps``, which drives target-network syncs.

        Args:
            state: Array-like observation; a leading batch dimension is added.

        Returns:
            int: Index of the selected action.
        """
        self.total_steps += 1

        # Explore: select a random action
        if random.random() <= self.epsilon:
            return random.choice(range(self.num_actions))

        # Exploit: evaluate the online network without dropout or gradients
        state_tensor = self._to_tensor(state, torch.float32).unsqueeze(0)  # Add batch dimension
        q_values = self._predict(self.online_network, state_tensor).cpu().numpy()

        # argmax over the flattened Q-values; int() so callers get a plain Python int
        return int(np.argmax(q_values))

    def memorize_transition(self, s, a, r, s_prime, not_done):
        """
        Store a transition in the replay buffer and update episode statistics.

        The reward and step of *every* transition, including the terminal one,
        are added to the running episode totals. When ``not_done`` is falsy the
        episode is closed: epsilon is decayed (only while ``self.train`` is
        true), the totals are appended to the history lists and then reset.

        Args:
            s: State before the action.
            a: Action taken.
            r: Reward received.
            s_prime: State after the action.
            not_done: Truthy while the episode continues, falsy on the final step.
        """
        self.episode_reward += r
        self.episode_length += 1

        if not not_done:
            if self.train:
                # Linear decay for the first epsilon_decay_steps episodes, then multiplicative
                if self.episodes < self.epsilon_decay_steps:
                    self.epsilon -= self.epsilon_decay
                else:
                    self.epsilon *= self.epsilon_exponential_decay

            self.episodes += 1
            self.rewards_history.append(self.episode_reward)
            self.steps_per_episode.append(self.episode_length)
            self.episode_reward, self.episode_length = 0, 0

        self.experience.append((s, a, r, s_prime, not_done))

    def experience_replay(self):
        """
        Run one Double-DQN training step on a random minibatch.

        Does nothing until the replay buffer holds at least ``batch_size``
        transitions. Otherwise it samples a minibatch and forms the targets
        ``r + not_done * gamma * Q_target(s', argmax_a Q_online(s', a))``.
        It then takes one optimizer step on the MSE between those targets and
        the online Q-values of the actions taken, and records the loss. The
        target network is synced when ``total_steps`` is a multiple of ``tau``.
        """
        if len(self.experience) < self.batch_size:
            logging.debug("Not enough experiences to sample.")
            return

        # Sample a minibatch from the replay buffer
        minibatch = random.sample(self.experience, self.batch_size)
        states, actions, rewards, next_states, not_done = zip(*minibatch)

        # Convert experience data into tensors on the correct device
        states = self._to_tensor(states, torch.float32)
        actions = self._to_tensor(actions, torch.int64)
        rewards = self._to_tensor(rewards, torch.float32).view(-1)  # Shape: [batch_size]
        next_states = self._to_tensor(next_states, torch.float32)
        not_done = self._to_tensor(not_done, torch.float32).view(-1)  # Shape: [batch_size]

        logging.debug("Shapes before target calculation:")
        logging.debug("rewards: %s, not_done: %s", rewards.shape, not_done.shape)

        # Double DQN: the online network picks the next action, the target
        # network evaluates it. Both passes are inference-only.
        best_actions = self._predict(self.online_network, next_states).argmax(dim=1)
        next_q_values_target = self._predict(self.target_network, next_states)
        target_q_values = next_q_values_target.gather(1, best_actions.unsqueeze(1)).squeeze(1)  # Shape: [batch_size]

        logging.debug("target_q_values shape: %s", target_q_values.shape)

        # Bootstrapped target; not_done zeroes the future value on terminal steps
        targets = rewards + not_done * self.gamma * target_q_values

        # Online Q-values of the actions actually taken (this pass carries gradients)
        q_values = self.online_network(states)
        q_values = q_values.gather(1, actions.unsqueeze(1)).squeeze(1)

        # Compute the loss (difference between predicted and target Q-values)
        loss = self.criterion(q_values, targets)

        # Backpropagate the loss and update the network weights
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Track the loss for analysis
        self.losses.append(loss.item())

        # Periodically update the target network based on tau
        if self.total_steps % self.tau == 0:
            self.update_target()


In [None]:
# --- Q-learning ---
gamma = .99  # discount factor
tau = 100  # target network update frequency (in agent steps)
# --- Network architecture and optimisation ---
architecture = (256, 256)  # units per hidden layer
learning_rate = 0.0001  # learning rate
l2_reg = 1e-6  # L2 regularization coefficient
# --- Experience replay ---
replay_capacity = int(1e6)  # maximum number of stored transitions
# NOTE(review): the agent skips training until the buffer holds at least
# batch_size transitions. A run of max_episodes = 10 episodes of at most 252
# steps collects at most 2,520 transitions, so with 4096 no gradient step is
# taken during the short training run below.
batch_size = 4096
# --- Epsilon-greedy exploration schedule ---
epsilon_start = 1.0  # initial exploration probability
epsilon_end = .01  # value reached at the end of the linear decay
epsilon_decay_steps = 250  # number of episodes of linear decay
epsilon_exponential_decay = .99  # per-episode factor after the linear phase

In [None]:
# Build the agent from the hyperparameters defined above, grouped by concern.
ddqn = DDQNAgent(
    # problem dimensions
    state_dim=state_dim,
    num_actions=num_actions,
    # network and optimisation
    architecture=architecture,
    learning_rate=learning_rate,
    l2_reg=l2_reg,
    # Q-learning
    gamma=gamma,
    tau=tau,
    # experience replay
    replay_capacity=replay_capacity,
    batch_size=batch_size,
    # exploration schedule
    epsilon_start=epsilon_start,
    epsilon_end=epsilon_end,
    epsilon_decay_steps=epsilon_decay_steps,
    epsilon_exponential_decay=epsilon_exponential_decay,
)

In [None]:
total_steps = 0
max_episodes = 10  # number of training episodes run by the loop below
# Per-episode bookkeeping lists filled by the training loop.
# NOTE(review): navs, market_navs and diffs are appended to in the training
# loop; episode_time and episode_eps are not appended to anywhere in the
# visible cells.
episode_time, navs, market_navs, diffs, episode_eps = [], [], [], [], []
# Progress reporting helper
def track_results(episode, nav_ma_100, nav_ma_10,
                  market_nav_100, market_nav_10,
                  win_ratio, total, epsilon):
    """Print a one-line progress summary for the training run.

    The NAV arguments are levels (1.0 = break-even); they are printed as
    returns, i.e. with 1 subtracted and formatted as percentages.

    Args:
        episode: Current episode number.
        nav_ma_100: Mean agent NAV over the longer window.
        nav_ma_10: Mean agent NAV over the shorter window (shown in parentheses).
        market_nav_100: Mean market NAV over the longer window.
        market_nav_10: Mean market NAV over the shorter window (shown in parentheses).
        win_ratio: Fraction of episodes in which the agent beat the market.
        total: Elapsed wall-clock time in seconds, rendered with ``format_time``.
        epsilon: Current exploration rate.
    """
    # The previous version also computed two unused statistics from the global
    # `episode_time` list, which nothing populates; that dead code (a mean of
    # an empty slice) and its hidden global dependency have been removed.
    template = '{:>4d} | {} | Agent: {:>6.1%} ({:>6.1%}) | '
    template += 'Market: {:>6.1%} ({:>6.1%}) | '
    template += 'Wins: {:>5.1%} | eps: {:>6.3f}'
    print(template.format(episode, format_time(total),
                          nav_ma_100-1, nav_ma_10-1,
                          market_nav_100-1, market_nav_10-1,
                          win_ratio, epsilon))

In [None]:
# Train Agent
# Runs up to max_episodes episodes. Each episode steps the environment until it
# reports done (or max_episode_steps is reached), storing every transition and
# running one replay/training step per environment step.
start = time()
results = []
for episode in range(1, max_episodes + 1):
    this_state = trading_environment.reset()
    logging.debug(f"Initial state: {this_state}")

    for episode_step in range(max_episode_steps):
        # The state is reshaped to (-1, state_dim) for action selection only;
        # the un-reshaped state is what gets stored in the replay buffer below.
        action = ddqn.epsilon_greedy_policy(this_state.reshape(-1, state_dim))
        logging.debug(f"Action taken: {action}")

        next_state, reward, done, _ = trading_environment.step(action)
        logging.debug(f"Next state: {next_state}, Reward: {reward}, Done: {done}")

        # The last argument is the agent's not_done flag: 0.0 on the final step, 1.0 otherwise
        ddqn.memorize_transition(this_state,
                                 action,
                                 reward,
                                 next_state,
                                 0.0 if done else 1.0)
        if ddqn.train:
            ddqn.experience_replay()

        if done:
            break
        this_state = next_state

    # get DataFrame with sequence of actions, returns, and nav values
    # (reached through the wrapper's .env attribute)
    result = trading_environment.env.simulator.results()

    # get results of last step
    # NOTE(review): iloc[-2] is the second-to-last row; presumably the last row
    # is not fully populated when the episode ends — confirm against the simulator.
    final = result.iloc[-2]
    logging.debug(f"Final result of episode {episode}: {final}")

    # apply return (net of cost) of last action to last starting nav
    nav = final.nav * (1 + final.strategy_return)
    navs.append(nav)

    # market nav
    market_nav = final.market_nav
    market_navs.append(market_nav)

    # track difference between agent and market NAV results
    diff = nav - market_nav
    diffs.append(diff)

    # Report every 10th episode: means over the last (up to) 100 and 10 episodes,
    # and the share of the last (up to) 100 episodes where the agent beat the market.
    if episode % 10 == 0:
        track_results(episode,
                      np.mean(navs[-100:]),
                      np.mean(navs[-10:]),
                      np.mean(market_navs[-100:]),
                      np.mean(market_navs[-10:]),
                      np.sum([s > 0 for s in diffs[-100:]])/min(len(diffs), 100),
                      time() - start, ddqn.epsilon)
    # Early stop: more than 25 episodes recorded and the agent beat the market in each of the last 25
    if len(diffs) > 25 and all([r > 0 for r in diffs[-25:]]):
        print(result.tail())
        break

trading_environment.close()




  10 | 00:00:01 | Agent:  -4.1% ( -4.1%) | Market:   6.3% (  6.3%) | Wins: 20.0% | eps:  0.960


In [None]:
# Ensure that all lists are the same length as the episode count.
# `episode` is the loop variable left over from the training cell, i.e. the
# number of the last episode that ran.
min_length = min(len(navs), len(market_navs), len(diffs), episode)

# Truncate lists to match the smallest length if necessary
# (this rebinds the module-level lists used by the training loop)
navs = navs[:min_length]
market_navs = market_navs[:min_length]
diffs = diffs[:min_length]

# Construct the DataFrame: one row per episode, indexed by 1-based episode number
final_results = pd.DataFrame({
    'Episode': list(range(1, min_length + 1)),
    'Agent': navs,
    'Market': market_navs,
    'Difference': diffs
}).set_index('Episode')

In [None]:
# Rolling statistics over up to the last 100 episodes.
# min_periods=1 lets the window grow from the first episode, matching the
# partial-window means printed during training. Without it every value is NaN
# until 100 episodes exist, which means for the whole of a short run.
# Agent / Market: mean NAV minus 1, expressed in percent.
final_results['Agent_rolling_mean'] = (final_results['Agent'].rolling(window=100, min_periods=1).mean() - 1) * 100
final_results['Market_rolling_mean'] = (final_results['Market'].rolling(window=100, min_periods=1).mean() - 1) * 100

# Rolling win ratio: share of episodes in the window where the agent outperformed the market
final_results['Wins_rolling_mean'] = (final_results['Difference'] > 0).rolling(window=100, min_periods=1).mean()

In [None]:
# Install the TA-Lib C library and its Python bindings from prebuilt conda-forge
# archives, then import the package.
# NOTE(review): this cell is live (not commented out) and `import talib` also
# appears in the first code cell, so on a fresh runtime this install has to run
# before that import can succeed. The second archive and its target directory
# are specific to Python 3.10 — confirm they match the runtime's Python version.

# 1) Shared C library, extracted into the system library directory
url = 'https://anaconda.org/conda-forge/libta-lib/0.4.0/download/linux-64/libta-lib-0.4.0-h166bdaf_1.tar.bz2'
!curl -L $url | tar xj -C /usr/lib/x86_64-linux-gnu/ lib --strip-components=1
# 2) Python package, extracted into dist-packages
url = 'https://anaconda.org/conda-forge/ta-lib/0.4.19/download/linux-64/ta-lib-0.4.19-py310hde88566_4.tar.bz2'
!curl -L $url | tar xj -C /usr/local/lib/python3.10/dist-packages/ lib/python3.10/site-packages/talib --strip-components=3
import talib

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  4035    0  4035    0     0  17306      0 --:--:-- --:--:-- --:--:-- 17317
100  517k  100  517k    0     0   823k      0 --:--:-- --:--:-- --:--:--  823k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  4031    0  4031    0     0  10214      0 --:--:-- --:--:-- --:--:-- 10230
100  392k  100  392k    0     0   504k      0 --:--:-- --:--:-- --:--:--  504k


In [None]:
from sklearn.model_selection import ParameterGrid
import numpy as np
import gym

# Placeholder environment for the search: CartPole-v1 stands in for the trading environment.
# NOTE(review): this rebinds trading_environment, state_dim, num_actions and
# max_episode_steps, which earlier cells set from the trading environment;
# re-running those earlier cells after this one will see the CartPole values.
trading_environment = gym.make('CartPole-v1')  # Replace with your trading environment

state_dim = trading_environment.observation_space.shape[0]
num_actions = trading_environment.action_space.n
max_episode_steps = trading_environment.spec.max_episode_steps
replay_capacity = int(1e6)  # Convert the replay capacity to an integer

# Define the hyperparameter grid (3 x 3 x 3 = 27 combinations)
param_grid = {
    'learning_rate': [0.0001, 0.001, 0.01],
    'gamma': [0.9, 0.95, 0.99],
    'epsilon_decay': [0.99, 0.995, 0.999]
}

# Create the grid of hyperparameter combinations
grid = ParameterGrid(param_grid)

# Function to train the agent and evaluate performance
def train_and_evaluate(env, learning_rate, gamma, epsilon_decay, episodes=100,
                       batch_size=64, epsilon_decay_steps=None):
    """Train a fresh DDQN agent on ``env`` and return its mean episode reward.

    Two settings that used to be hard-coded made the search uninformative and
    are now parameters:

    * ``batch_size`` was 4096. The agent does not train until its replay buffer
      holds ``batch_size`` transitions, so short runs never performed a
      gradient step and ``learning_rate`` / ``gamma`` had no effect.
    * ``epsilon_decay_steps`` was 250. With fewer than 250 episodes the agent
      never leaves its linear epsilon phase, so ``epsilon_decay`` was unused.

    Args:
        env: Gym-style environment. ``reset()`` must return an observation and
            ``step()`` must return ``(obs, reward, done, info)``.
        learning_rate: Optimizer learning rate.
        gamma: Discount factor.
        epsilon_decay: Multiplicative per-episode epsilon factor applied after
            the linear decay phase.
        episodes: Number of training episodes to run.
        batch_size: Replay minibatch size; also the number of stored
            transitions required before training starts.
        epsilon_decay_steps: Episodes of linear epsilon decay. ``None`` uses
            ``min(250, max(1, episodes // 2))`` so the exponential phase is
            reached within the run.

    Returns:
        float: Mean total reward per episode over all training episodes
        (exploration included).
    """
    if epsilon_decay_steps is None:
        epsilon_decay_steps = min(250, max(1, episodes // 2))

    # Size the networks from the environment actually passed in, not from notebook globals
    agent = DDQNAgent(env.observation_space.shape[0], env.action_space.n,
                      gamma=gamma, tau=100, architecture=(256, 256),
                      learning_rate=learning_rate, l2_reg=1e-6,
                      replay_capacity=replay_capacity, batch_size=batch_size,
                      epsilon_start=1.0, epsilon_end=0.01,
                      epsilon_decay_steps=epsilon_decay_steps,
                      epsilon_exponential_decay=epsilon_decay)

    # Run the agent for `episodes` episodes
    total_rewards = []
    for episode in range(episodes):
        state = env.reset()  # Reset the environment at the start of each episode
        done = False
        total_reward = 0

        while not done:
            # The agent selects an action based on the current state
            action = agent.epsilon_greedy_policy(state)

            # Take the action in the environment, observe next state, reward, and done flag
            next_state, reward, done, info = env.step(action)

            # Store the transition in memory and train the agent
            agent.memorize_transition(state, action, reward, next_state, not done)
            agent.experience_replay()

            # Update the state
            state = next_state
            total_reward += reward

        total_rewards.append(total_reward)

    # Return the average reward over all episodes
    return np.mean(total_rewards)

# Grid search: train one agent per hyperparameter combination and keep the
# combination with the highest average reward.
best_params, best_reward = None, -np.inf

for params in grid:
    print(f"Testing params: {params}")
    avg_reward = train_and_evaluate(
        trading_environment,
        learning_rate=params['learning_rate'],
        gamma=params['gamma'],
        epsilon_decay=params['epsilon_decay'],
        episodes=100,
    )

    print(f"Average Reward: {avg_reward}")

    # A strictly better score replaces the incumbent; ties keep the earlier combination
    if avg_reward > best_reward:
        best_params, best_reward = params, avg_reward

# Report the winning combination and its score
print(f"Best Hyperparameters: {best_params} with reward {best_reward}")


Testing params: {'epsilon_decay': 0.99, 'gamma': 0.9, 'learning_rate': 0.0001}
Average Reward: 23.0
Testing params: {'epsilon_decay': 0.99, 'gamma': 0.9, 'learning_rate': 0.001}
Average Reward: 22.36
Testing params: {'epsilon_decay': 0.99, 'gamma': 0.9, 'learning_rate': 0.01}
Average Reward: 18.82
Testing params: {'epsilon_decay': 0.99, 'gamma': 0.95, 'learning_rate': 0.0001}
Average Reward: 17.67
Testing params: {'epsilon_decay': 0.99, 'gamma': 0.95, 'learning_rate': 0.001}
Average Reward: 21.52
Testing params: {'epsilon_decay': 0.99, 'gamma': 0.95, 'learning_rate': 0.01}
Average Reward: 19.81
Testing params: {'epsilon_decay': 0.99, 'gamma': 0.99, 'learning_rate': 0.0001}
Average Reward: 21.58
Testing params: {'epsilon_decay': 0.99, 'gamma': 0.99, 'learning_rate': 0.001}
Average Reward: 26.03
Testing params: {'epsilon_decay': 0.99, 'gamma': 0.99, 'learning_rate': 0.01}
Average Reward: 19.78
Testing params: {'epsilon_decay': 0.995, 'gamma': 0.9, 'learning_rate': 0.0001}
Average Reward: