In [2]:
!pip install stable-baselines3


Collecting stable-baselines3
  Downloading stable_baselines3-2.3.2-py3-none-any.whl.metadata (5.1 kB)
Collecting gymnasium<0.30,>=0.28.1 (from stable-baselines3)
  Downloading gymnasium-0.29.1-py3-none-any.whl.metadata (10 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium<0.30,>=0.28.1->stable-baselines3)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.13->stable-baselines3)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.13->stable-baselines3)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.13->stable-baselines3)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.13->stable-ba

In [10]:
!pip install 'shimmy>=0.2.1'




In [3]:
import os
import pandas as pd

# Paths
folder_path = '/content/Final stocks'  # Path to the folder containing the stock CSVs

# Loop through each CSV file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.csv'):
        # Construct the full file path
        file_path = os.path.join(folder_path, filename)

        # Read the CSV file
        df = pd.read_csv(file_path)

        # Convert the 'Date' column to datetime format
        df['Date'] = pd.to_datetime(df['Date'])

        # Create training data for 2018-2019
        train_df = df[(df['Date'] >= '2018-01-01') & (df['Date'] <= '2019-12-31')]

        # Create testing data for 2020
        test_df = df[(df['Date'] >= '2020-01-01') & (df['Date'] <= '2020-12-31')]

        # Get the stock symbol from the filename (e.g., 'AA' from 'AA.csv')
        symbol = os.path.splitext(filename)[0]

        # Dynamically create variable names for train and test DataFrames
        globals()[f"train_{symbol}_df"] = train_df
        globals()[f"test_{symbol}_df"] = test_df

        print(f"DataFrames for {symbol} created: train_{symbol}_df, test_{symbol}_df")


DataFrames for ABEV created: train_ABEV_df, test_ABEV_df
DataFrames for ACAMU created: train_ACAMU_df, test_ACAMU_df
DataFrames for ACBI created: train_ACBI_df, test_ACBI_df
DataFrames for A created: train_A_df, test_A_df
DataFrames for AACG created: train_AACG_df, test_AACG_df
DataFrames for AB created: train_AB_df, test_AB_df
DataFrames for ACAM created: train_ACAM_df, test_ACAM_df
DataFrames for ACP created: train_ACP_df, test_ACP_df
DataFrames for ACLS created: train_ACLS_df, test_ACLS_df
DataFrames for ACC created: train_ACC_df, test_ACC_df


In [6]:
test_ABEV_df.head()

  and should_run_async(code)


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
5745,2020-01-02,4.67,4.73,4.67,4.73,4.73,10162700
5746,2020-01-03,4.64,4.7,4.63,4.67,4.67,8742700
5747,2020-01-06,4.61,4.69,4.59,4.62,4.62,12846500
5748,2020-01-07,4.6,4.7,4.58,4.63,4.63,13782900
5749,2020-01-08,4.63,4.69,4.59,4.64,4.64,18280200


In [5]:
import gym
from gym import spaces
import numpy as np

class SimpleTradingEnv(gym.Env):
    def __init__(self, data):
        super(SimpleTradingEnv, self).__init__()

        self.data = data
        self.current_step = 0

        # Define action space: 0 = Sell, 1 = Hold, 2 = Buy
        self.action_space = spaces.Discrete(3)

        # Define observation space (only using price data)
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(1,), dtype=np.float32)

        # Initial balance
        self.balance = 10000
        self.shares_held = 0
        self.net_worth = self.balance
        self.max_net_worth = self.balance

    def reset(self):
        self.current_step = 0
        self.balance = 10000
        self.shares_held = 0
        self.net_worth = self.balance
        self.max_net_worth = self.balance
        return self._next_observation()

    def _next_observation(self):
        # Only using the 'Close' price as observation
        frame = np.array([self.data.iloc[self.current_step]['Close']])

        # Check for NaNs and replace with zero
        frame = np.nan_to_num(frame, nan=0.0)

        # Normalize the observation (optional)
        frame = frame / self.data['Close'].max()

        return frame

    def step(self, action):
        current_price = self.data.iloc[self.current_step]['Close']

        if action == 0:  # Sell
            if self.shares_held > 0:
                self.balance += self.shares_held * current_price
                self.shares_held = 0
        elif action == 2:  # Buy
            if self.balance > 0:
                shares_to_buy = self.balance // current_price
                self.shares_held += shares_to_buy
                self.balance -= shares_to_buy * current_price

        self.current_step += 1
        done = self.current_step >= len(self.data) - 1

        self.net_worth = self.balance + self.shares_held * current_price
        self.max_net_worth = max(self.max_net_worth, self.net_worth)

        # Normalize the reward
        reward = (self.net_worth - 10000) / 10000

        # Ensure reward is not NaN
        if np.isnan(reward):
            reward = 0

        obs = self._next_observation()
        return obs, reward, done, {}

    def render(self, mode='human'):
        profit = self.net_worth - 10000
        print(f'Step: {self.current_step}')
        print(f'Balance: {self.balance}')
        print(f'Shares held: {self.shares_held}')
        print(f'Net worth: {self.net_worth}')
        print(f'Profit: {profit}')

    def close(self):
        pass



In [28]:
import gym
from stable_baselines3 import PPO
import numpy as np

# Assuming the modified SimpleTradingEnv class is already defined
# and we renamed it as SimpleTradingEnv in the previous step.

# Create the environment with training data
train_env = SimpleTradingEnv(train_ABEV_df)

# Initialize the PPO model with the training environment
model = PPO("MlpPolicy", train_env, verbose=1, learning_rate=0.0001)

# Train the model
model.learn(total_timesteps=1000)

# Save the trained model
model.save("ppo_trading_model")

# Load the model for testing
model = PPO.load("ppo_trading_model")

# Create the environment with testing data
test_env = SimpleTradingEnv(test_ABEV_df)

# Test the trained model
# Initialize variables to track total profit and rewards
total_reward = 0
total_profit = 0
n_episodes = 0

obs = test_env.reset()
for step in range(100):
    action, _states = model.predict(obs)

    # If the action logits contain NaNs, print debug information
    if np.isnan(action).any():
        print(f"NaN detected in action logits at step {step}")
        break

    obs, reward, done, info = test_env.step(action)

    # Check for NaNs in observation or reward
    if np.isnan(obs).any() or np.isnan(reward):
        print(f"NaN detected at step {step}")
        break

    total_reward += reward
    profit = test_env.net_worth - 10000  # Assuming 10000 was the initial balance
    total_profit += profit

    test_env.render()

    if done:
        n_episodes += 1
        obs = test_env.reset()

# Calculate average reward and profit
average_reward = total_reward / (step + 1)
average_profit = total_profit / n_episodes if n_episodes > 0 else 0

print(f"Average Reward: {average_reward}")
print(f"Average Profit: {average_profit}")



Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 502      |
|    ep_rew_mean     | 42.6     |
| time/              |          |
|    fps             | 531      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
Step: 1
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 2
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 3
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 4
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 5
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 6
Balance: 1.8601665496826172
Shares held: 2183.0
Net worth: 10000.0
Profit: 0.0
Step: 7
Balance: 1.8601665496826172
Shares held: 2183.0
Net worth: 10065.490458011627
Profit: 65.4904580116272
Step: 8
Balance: 1.860

In [29]:
import gym
from stable_baselines3 import PPO
import numpy as np

# Assuming the modified SimpleTradingEnv class is already defined
# and we renamed it as SimpleTradingEnv in the previous step.

# Create the environment with training data
train_env = SimpleTradingEnv(train_AB_df)

# Initialize the PPO model with the training environment
model = PPO("MlpPolicy", train_env, verbose=1, learning_rate=1e-4)

# Train the model
model.learn(total_timesteps=1000)

# Save the trained model
model.save("ppo_trading_model")

# Load the model for testing
model = PPO.load("ppo_trading_model")

# Create the environment with testing data
test_env = SimpleTradingEnv(test_AB_df)

# Test the trained model
# Initialize variables to track total profit and rewards
total_reward = 0
total_profit = 0
n_episodes = 0

obs = test_env.reset()
for step in range(100):
    action, _states = model.predict(obs)

    # If the action logits contain NaNs, print debug information
    if np.isnan(action).any():
        print(f"NaN detected in action logits at step {step}")
        break

    obs, reward, done, info = test_env.step(action)

    # Check for NaNs in observation or reward
    if np.isnan(obs).any() or np.isnan(reward):
        print(f"NaN detected at step {step}")
        break

    total_reward += reward
    profit = test_env.net_worth - 10000  # Assuming 10000 was the initial balance
    total_profit += profit

    test_env.render()

    if done:
        n_episodes += 1
        obs = test_env.reset()

# Calculate average reward and profit
average_reward = total_reward / (step + 1)
average_profit = total_profit / n_episodes if n_episodes > 0 else 0

print(f"Average Reward: {average_reward}")
print(f"Average Profit: {average_profit}")



Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 502      |
|    ep_rew_mean     | 76.2     |
| time/              |          |
|    fps             | 459      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 2048     |
---------------------------------
Step: 1
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 2
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 3
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 4
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 5
Balance: 27.359928131101697
Shares held: 314.0
Net worth: 10000.0
Profit: 0.0
Step: 6
Balance: 27.359928131101697
Shares held: 314.0
Net worth: 9984.299640655516
Profit: -15.70035934448424
Step: 7
Balance: 27.359928131101697
Shares held: 314.0
Net worth: 10037.679664611815
Profit:

In [30]:
import gym
from stable_baselines3 import PPO
import numpy as np

# Assuming the modified SimpleTradingEnv class is already defined
# and we renamed it as SimpleTradingEnv in the previous step.

# Create the environment with training data
train_env = SimpleTradingEnv(train_ACP_df)

# Initialize the PPO model with the training environment
model = PPO("MlpPolicy", train_env, verbose=1, learning_rate=1e-4)

# Train the model
model.learn(total_timesteps=1000)

# Save the trained model
model.save("ppo_trading_model")

# Load the model for testing
model = PPO.load("ppo_trading_model")

# Create the environment with testing data
test_env = SimpleTradingEnv(test_ACP_df)

# Test the trained model
# Initialize variables to track total profit and rewards
total_reward = 0
total_profit = 0
n_episodes = 0

obs = test_env.reset()
for step in range(100):
    action, _states = model.predict(obs)

    # If the action logits contain NaNs, print debug information
    if np.isnan(action).any():
        print(f"NaN detected in action logits at step {step}")
        break

    obs, reward, done, info = test_env.step(action)

    # Check for NaNs in observation or reward
    if np.isnan(obs).any() or np.isnan(reward):
        print(f"NaN detected at step {step}")
        break

    total_reward += reward
    profit = test_env.net_worth - 10000  # Assuming 10000 was the initial balance
    total_profit += profit

    test_env.render()

    if done:
        n_episodes += 1
        obs = test_env.reset()

# Calculate average reward and profit
average_reward = total_reward / (step + 1)
average_profit = total_profit / n_episodes if n_episodes > 0 else 0

print(f"Average Reward: {average_reward}")
print(f"Average Profit: {average_profit}")



Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 502      |
|    ep_rew_mean     | -20.2    |
| time/              |          |
|    fps             | 556      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
Step: 1
Balance: 0.5996370315551758
Shares held: 865.0
Net worth: 10000.0
Profit: 0.0
Step: 2
Balance: 0.5996370315551758
Shares held: 865.0
Net worth: 9913.499670028687
Profit: -86.50032997131348
Step: 3
Balance: 9948.099637031555
Shares held: 0
Net worth: 9948.099637031555
Profit: -51.900362968444824
Step: 4
Balance: 9.239867210388184
Shares held: 862.0
Net worth: 9948.099637031555
Profit: -51.900362968444824
Step: 5
Balance: 9.239867210388184
Shares held: 862.0
Net worth: 9930.86006450653
Profit: -69.13993549346924
Step: 6
Balance: 9.239867210388184
Shares

In [31]:
import gym
from stable_baselines3 import PPO
import numpy as np

# Assuming the modified SimpleTradingEnv class is already defined
# and we renamed it as SimpleTradingEnv in the previous step.

# Create the environment with training data
train_env = SimpleTradingEnv(train_ACC_df)

# Initialize the PPO model with the training environment
model = PPO("MlpPolicy", train_env, verbose=1, learning_rate=1e-4)

# Train the model
model.learn(total_timesteps=1000)

# Save the trained model
model.save("ppo_trading_model")

# Load the model for testing
model = PPO.load("ppo_trading_model")

# Create the environment with testing data
# Initialize variables to track total profit and rewards
total_reward = 0
total_profit = 0
n_episodes = 0

obs = test_env.reset()
for step in range(100):
    action, _states = model.predict(obs)

    # If the action logits contain NaNs, print debug information
    if np.isnan(action).any():
        print(f"NaN detected in action logits at step {step}")
        break

    obs, reward, done, info = test_env.step(action)

    # Check for NaNs in observation or reward
    if np.isnan(obs).any() or np.isnan(reward):
        print(f"NaN detected at step {step}")
        break

    total_reward += reward
    profit = test_env.net_worth - 10000  # Assuming 10000 was the initial balance
    total_profit += profit

    test_env.render()

    if done:
        n_episodes += 1
        obs = test_env.reset()

# Calculate average reward and profit
average_reward = total_reward / (step + 1)
average_profit = total_profit / n_episodes if n_episodes > 0 else 0

print(f"Average Reward: {average_reward}")
print(f"Average Profit: {average_profit}")


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 502      |
|    ep_rew_mean     | 18.1     |
| time/              |          |
|    fps             | 480      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 2048     |
---------------------------------
Step: 1
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 2
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 3
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 4
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 5
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 6
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 7
Balance: 6.36003303527832
Shares held: 866.0
Net worth: 10000.0
Profit: 0.0
Step: 8
Balance: 6.36003303527832
Shares held: 866.0
Net worth: 101

In [32]:
import gym
from stable_baselines3 import PPO
import numpy as np

# Assuming the modified SimpleTradingEnv class is already defined
# and we renamed it as SimpleTradingEnv in the previous step.

# Create the environment with training data
train_env = SimpleTradingEnv(train_ACAM_df)

# Initialize the PPO model with the training environment
model = PPO("MlpPolicy", train_env, verbose=1, learning_rate=1e-4)

# Train the model
model.learn(total_timesteps=1000)

# Save the trained model
model.save("ppo_trading_model")

# Load the model for testing
model = PPO.load("ppo_trading_model")

# Create the environment with testing data
# Initialize variables to track total profit and rewards
total_reward = 0
total_profit = 0
n_episodes = 0

obs = test_env.reset()
for step in range(100):
    action, _states = model.predict(obs)

    # If the action logits contain NaNs, print debug information
    if np.isnan(action).any():
        print(f"NaN detected in action logits at step {step}")
        break

    obs, reward, done, info = test_env.step(action)

    # Check for NaNs in observation or reward
    if np.isnan(obs).any() or np.isnan(reward):
        print(f"NaN detected at step {step}")
        break

    total_reward += reward
    profit = test_env.net_worth - 10000  # Assuming 10000 was the initial balance
    total_profit += profit

    test_env.render()

    if done:
        n_episodes += 1
        obs = test_env.reset()

# Calculate average reward and profit
average_reward = total_reward / (step + 1)
average_profit = total_profit / n_episodes if n_episodes > 0 else 0

print(f"Average Reward: {average_reward}")
print(f"Average Profit: {average_profit}")



Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 163      |
|    ep_rew_mean     | 0.733    |
| time/              |          |
|    fps             | 551      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
Step: 1
Balance: 0.5996370315551758
Shares held: 865.0
Net worth: 10000.0
Profit: 0.0
Step: 2
Balance: 9913.499670028687
Shares held: 0
Net worth: 9913.499670028687
Profit: -86.50032997131348
Step: 3
Balance: 0.49967002868652344
Shares held: 862.0
Net worth: 9913.499670028687
Profit: -86.50032997131348
Step: 4
Balance: 0.49967002868652344
Shares held: 862.0
Net worth: 9939.359439849854
Profit: -60.640560150146484
Step: 5
Balance: 9922.11986732483
Shares held: 0
Net worth: 9922.11986732483
Profit: -77.8801326751709
Step: 6
Balance: 9922.11986732483
Shares held

In [33]:
import gym
from stable_baselines3 import PPO
import numpy as np

# Assuming the modified SimpleTradingEnv class is already defined
# and we renamed it as SimpleTradingEnv in the previous step.

# Create the environment with training data
train_env = SimpleTradingEnv(train_AACG_df)

# Initialize the PPO model with the training environment
model = PPO("MlpPolicy", train_env, verbose=1, learning_rate=1e-4)

# Train the model
model.learn(total_timesteps=1000)

# Save the trained model
model.save("ppo_trading_model")

# Load the model for testing
model = PPO.load("ppo_trading_model")

# Create the environment with testing data
# Initialize variables to track total profit and rewards
total_reward = 0
total_profit = 0
n_episodes = 0

obs = test_env.reset()
for step in range(100):
    action, _states = model.predict(obs)

    # If the action logits contain NaNs, print debug information
    if np.isnan(action).any():
        print(f"NaN detected in action logits at step {step}")
        break

    obs, reward, done, info = test_env.step(action)

    # Check for NaNs in observation or reward
    if np.isnan(obs).any() or np.isnan(reward):
        print(f"NaN detected at step {step}")
        break

    total_reward += reward
    profit = test_env.net_worth - 10000  # Assuming 10000 was the initial balance
    total_profit += profit

    test_env.render()

    if done:
        n_episodes += 1
        obs = test_env.reset()

# Calculate average reward and profit
average_reward = total_reward / (step + 1)
average_profit = total_profit / n_episodes if n_episodes > 0 else 0

print(f"Average Reward: {average_reward}")
print(f"Average Profit: {average_profit}")



Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 197      |
|    ep_rew_mean     | 108      |
| time/              |          |
|    fps             | 510      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 2048     |
---------------------------------
Step: 1
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 2
Balance: 6.879966735839844
Shares held: 872.0
Net worth: 10000.0
Profit: 0.0
Step: 3
Balance: 6.879966735839844
Shares held: 872.0
Net worth: 10034.87996673584
Profit: 34.879966735839844
Step: 4
Balance: 6.879966735839844
Shares held: 872.0
Net worth: 10061.039733886719
Profit: 61.03973388671875
Step: 5
Balance: 6.879966735839844
Shares held: 872.0
Net worth: 10043.6001663208
Profit: 43.60016632080078
Step: 6
Balance: 10078.48013305664
Shares held: 0
Net worth: 10078.48013305664
Prof

In [34]:
import gym
from stable_baselines3 import PPO
import numpy as np

# Assuming the modified SimpleTradingEnv class is already defined
# and we renamed it as SimpleTradingEnv in the previous step.

# Create the environment with training data
train_env = SimpleTradingEnv(train_ACBI_df)

# Initialize the PPO model with the training environment
model = PPO("MlpPolicy", train_env, verbose=1, learning_rate=1e-4)

# Train the model
model.learn(total_timesteps=1000)

# Save the trained model
model.save("ppo_trading_model")

# Load the model for testing
model = PPO.load("ppo_trading_model")

# Create the environment with testing data
# Initialize variables to track total profit and rewards
total_reward = 0
total_profit = 0
n_episodes = 0

obs = test_env.reset()
for step in range(100):
    action, _states = model.predict(obs)

    # If the action logits contain NaNs, print debug information
    if np.isnan(action).any():
        print(f"NaN detected in action logits at step {step}")
        break

    obs, reward, done, info = test_env.step(action)

    # Check for NaNs in observation or reward
    if np.isnan(obs).any() or np.isnan(reward):
        print(f"NaN detected at step {step}")
        break

    total_reward += reward
    profit = test_env.net_worth - 10000  # Assuming 10000 was the initial balance
    total_profit += profit

    test_env.render()

    if done:
        n_episodes += 1
        obs = test_env.reset()

# Calculate average reward and profit
average_reward = total_reward / (step + 1)
average_profit = total_profit / n_episodes if n_episodes > 0 else 0

print(f"Average Reward: {average_reward}")
print(f"Average Profit: {average_profit}")


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 502      |
|    ep_rew_mean     | 91       |
| time/              |          |
|    fps             | 544      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
Step: 1
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 2
Balance: 6.879966735839844
Shares held: 872.0
Net worth: 10000.0
Profit: 0.0
Step: 3
Balance: 10034.87996673584
Shares held: 0
Net worth: 10034.87996673584
Profit: 34.879966735839844
Step: 4
Balance: 10034.87996673584
Shares held: 0
Net worth: 10034.87996673584
Profit: 34.879966735839844
Step: 5
Balance: 10034.87996673584
Shares held: 0
Net worth: 10034.87996673584
Profit: 34.879966735839844
Step: 6
Balance: 10034.87996673584
Shares held: 0
Net worth: 10034.87996673584
Profit: 34.879

In [35]:
import gym
from stable_baselines3 import PPO
import numpy as np

# Assuming the modified SimpleTradingEnv class is already defined
# and we renamed it as SimpleTradingEnv in the previous step.

# Create the environment with training data
train_env = SimpleTradingEnv(train_A_df)

# Initialize the PPO model with the training environment
model = PPO("MlpPolicy", train_env, verbose=1, learning_rate=1e-4)

# Train the model
model.learn(total_timesteps=1000)

# Save the trained model
model.save("ppo_trading_model")

# Load the model for testing
model = PPO.load("ppo_trading_model")

# Create the environment with testing data
# Initialize variables to track total profit and rewards
total_reward = 0
total_profit = 0
n_episodes = 0

obs = test_env.reset()
for step in range(100):
    action, _states = model.predict(obs)

    # If the action logits contain NaNs, print debug information
    if np.isnan(action).any():
        print(f"NaN detected in action logits at step {step}")
        break

    obs, reward, done, info = test_env.step(action)

    # Check for NaNs in observation or reward
    if np.isnan(obs).any() or np.isnan(reward):
        print(f"NaN detected at step {step}")
        break

    total_reward += reward
    profit = test_env.net_worth - 10000  # Assuming 10000 was the initial balance
    total_profit += profit

    test_env.render()

    if done:
        n_episodes += 1
        obs = test_env.reset()

# Calculate average reward and profit
average_reward = total_reward / (step + 1)
average_profit = total_profit / n_episodes if n_episodes > 0 else 0

print(f"Average Reward: {average_reward}")
print(f"Average Profit: {average_profit}")



Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 502      |
|    ep_rew_mean     | 14.1     |
| time/              |          |
|    fps             | 555      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
Step: 1
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 2
Balance: 6.879966735839844
Shares held: 872.0
Net worth: 10000.0
Profit: 0.0
Step: 3
Balance: 6.879966735839844
Shares held: 872.0
Net worth: 10034.87996673584
Profit: 34.879966735839844
Step: 4
Balance: 6.879966735839844
Shares held: 872.0
Net worth: 10061.039733886719
Profit: 61.03973388671875
Step: 5
Balance: 6.879966735839844
Shares held: 872.0
Net worth: 10043.6001663208
Profit: 43.60016632080078
Step: 6
Balance: 10078.48013305664
Shares held: 0
Net worth: 10078.48013305664
Prof

In [36]:
import gym
from stable_baselines3 import PPO
import numpy as np

# Assuming the modified SimpleTradingEnv class is already defined
# and we renamed it as SimpleTradingEnv in the previous step.

# Create the environment with training data
train_env = SimpleTradingEnv(train_ACLS_df)

# Initialize the PPO model with the training environment
model = PPO("MlpPolicy", train_env, verbose=1, learning_rate=1e-4)

# Train the model
model.learn(total_timesteps=1000)

# Save the trained model
model.save("ppo_trading_model")

# Load the model for testing
model = PPO.load("ppo_trading_model")

# Create the environment with testing data
# Initialize variables to track total profit and rewards
total_reward = 0
total_profit = 0
n_episodes = 0

obs = test_env.reset()
for step in range(100):
    action, _states = model.predict(obs)

    # If the action logits contain NaNs, print debug information
    if np.isnan(action).any():
        print(f"NaN detected in action logits at step {step}")
        break

    obs, reward, done, info = test_env.step(action)

    # Check for NaNs in observation or reward
    if np.isnan(obs).any() or np.isnan(reward):
        print(f"NaN detected at step {step}")
        break

    total_reward += reward
    profit = test_env.net_worth - 10000  # Assuming 10000 was the initial balance
    total_profit += profit

    test_env.render()

    if done:
        n_episodes += 1
        obs = test_env.reset()

# Calculate average reward and profit
average_reward = total_reward / (step + 1)
average_profit = total_profit / n_episodes if n_episodes > 0 else 0

print(f"Average Reward: {average_reward}")
print(f"Average Profit: {average_profit}")


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 502      |
|    ep_rew_mean     | -56.3    |
| time/              |          |
|    fps             | 508      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 2048     |
---------------------------------
Step: 1
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 2
Balance: 6.879966735839844
Shares held: 872.0
Net worth: 10000.0
Profit: 0.0
Step: 3
Balance: 6.879966735839844
Shares held: 872.0
Net worth: 10034.87996673584
Profit: 34.879966735839844
Step: 4
Balance: 6.879966735839844
Shares held: 872.0
Net worth: 10061.039733886719
Profit: 61.03973388671875
Step: 5
Balance: 6.879966735839844
Shares held: 872.0
Net worth: 10043.6001663208
Profit: 43.60016632080078
Step: 6
Balance: 10078.48013305664
Shares held: 0
Net worth: 10078.48013305664
Prof

In [37]:
import gym
from stable_baselines3 import PPO
import numpy as np

# Assuming the modified SimpleTradingEnv class is already defined
# and we renamed it as SimpleTradingEnv in the previous step.

# Create the environment with training data
train_env = SimpleTradingEnv(train_ACAMU_df)

# Initialize the PPO model with the training environment
model = PPO("MlpPolicy", train_env, verbose=1, learning_rate=1e-4)

# Train the model
model.learn(total_timesteps=1000)

# Save the trained model
model.save("ppo_trading_model")

# Load the model for testing
model = PPO.load("ppo_trading_model")

# Create the environment with testing data
# Initialize variables to track total profit and rewards
total_reward = 0
total_profit = 0
n_episodes = 0

obs = test_env.reset()
for step in range(100):
    action, _states = model.predict(obs)

    # If the action logits contain NaNs, print debug information
    if np.isnan(action).any():
        print(f"NaN detected in action logits at step {step}")
        break

    obs, reward, done, info = test_env.step(action)

    # Check for NaNs in observation or reward
    if np.isnan(obs).any() or np.isnan(reward):
        print(f"NaN detected at step {step}")
        break

    total_reward += reward
    profit = test_env.net_worth - 10000  # Assuming 10000 was the initial balance
    total_profit += profit

    test_env.render()

    if done:
        n_episodes += 1
        obs = test_env.reset()

# Calculate average reward and profit
average_reward = total_reward / (step + 1)
average_profit = total_profit / n_episodes if n_episodes > 0 else 0

print(f"Average Reward: {average_reward}")
print(f"Average Profit: {average_profit}")



Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 216      |
|    ep_rew_mean     | 0.785    |
| time/              |          |
|    fps             | 429      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 2048     |
---------------------------------
Step: 1
Balance: 0.5996370315551758
Shares held: 865.0
Net worth: 10000.0
Profit: 0.0
Step: 2
Balance: 0.5996370315551758
Shares held: 865.0
Net worth: 9913.499670028687
Profit: -86.50032997131348
Step: 3
Balance: 0.5996370315551758
Shares held: 865.0
Net worth: 9948.099637031555
Profit: -51.900362968444824
Step: 4
Balance: 0.5996370315551758
Shares held: 865.0
Net worth: 9974.049406051636
Profit: -25.950593948364258
Step: 5
Balance: 0.5996370315551758
Shares held: 865.0
Net worth: 9956.749835014343
Profit: -43.25016498565674
Step: 6
Balance: 0.59963703155517

DQN

In [38]:
import gym
from stable_baselines3 import DQN
import numpy as np

# Assuming the modified SimpleTradingEnv class is already defined
# and we renamed it as SimpleTradingEnv in the previous step.

# Create the environment with training data
train_env = SimpleTradingEnv(train_ABEV_df)

# Initialize the DQN model with the training environment
model = DQN("MlpPolicy", train_env, verbose=1, learning_rate=1e-4)

# Train the model
model.learn(total_timesteps=1000)

# Save the trained model
model.save("dqn_trading_model")

# Load the model for testing
model = DQN.load("dqn_trading_model")

# Create the environment with testing data
test_env = SimpleTradingEnv(test_ABEV_df)

# Initialize variables to track total profit and rewards
total_reward = 0
total_profit = 0
n_episodes = 0

obs = test_env.reset()
for step in range(100):
    action, _states = model.predict(obs)

    # If the action logits contain NaNs, print debug information
    if np.isnan(action).any():
        print(f"NaN detected in action logits at step {step}")
        break

    obs, reward, done, info = test_env.step(action)

    # Check for NaNs in observation or reward
    if np.isnan(obs).any() or np.isnan(reward):
        print(f"NaN detected at step {step}")
        break

    total_reward += reward
    profit = test_env.net_worth - 10000  # Assuming 10000 was the initial balance
    total_profit += profit

    test_env.render()

    if done:
        n_episodes += 1
        obs = test_env.reset()

# Calculate average reward and profit
average_reward = total_reward / (step + 1)
average_profit = total_profit / n_episodes if n_episodes > 0 else 0

print(f"Average Reward: {average_reward}")
print(f"Average Profit: {average_profit}")


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Step: 1
Balance: 0.7799596786499023
Shares held: 2114.0
Net worth: 10000.0
Profit: 0.0
Step: 2
Balance: 0.7799596786499023
Shares held: 2114.0
Net worth: 9873.16012096405
Profit: -126.8398790359497
Step: 3
Balance: 0.7799596786499023
Shares held: 2114.0
Net worth: 9767.45971775055
Profit: -232.54028224945068
Step: 4
Balance: 0.7799596786499023
Shares held: 2114.0
Net worth: 9788.60020160675
Profit: -211.3997983932495
Step: 5
Balance: 0.7799596786499023
Shares held: 2114.0
Net worth: 9809.7396774292
Profit: -190.26032257080078
Step: 6
Balance: 0.7799596786499023
Shares held: 2114.0
Net worth: 9682.89979839325
Profit: -317.1002016067505
Step: 7
Balance: 0.7799596786499023
Shares held: 2114.0
Net worth: 9746.3202419281
Profit: -253.6797580718994
Step: 8
Balance: 0.7799596786499023
Shares held: 2114.0
Net worth: 9725.1797580719
Profit: -274.8202419281006
Step: 9
Balance: 0.7799596786499023
Shares

In [39]:
import gym
from stable_baselines3 import DQN
import numpy as np

# Assuming the modified SimpleTradingEnv class is already defined
# and we renamed it as SimpleTradingEnv in the previous step.

# Create the environment with training data
train_env = SimpleTradingEnv(train_AB_df)

# Initialize the DQN model with the training environment
model = DQN("MlpPolicy", train_env, verbose=1, learning_rate=1e-4)

# Train the model
model.learn(total_timesteps=1000)

# Save the trained model
model.save("dqn_trading_model")

# Load the model for testing
model = DQN.load("dqn_trading_model")

# Create the environment with testing data
test_env = SimpleTradingEnv(test_AB_df)

# Initialize variables to track total profit and rewards
total_reward = 0
total_profit = 0
n_episodes = 0

obs = test_env.reset()
for step in range(100):
    action, _states = model.predict(obs)

    # If the action logits contain NaNs, print debug information
    if np.isnan(action).any():
        print(f"NaN detected in action logits at step {step}")
        break

    obs, reward, done, info = test_env.step(action)

    # Check for NaNs in observation or reward
    if np.isnan(obs).any() or np.isnan(reward):
        print(f"NaN detected at step {step}")
        break

    total_reward += reward
    profit = test_env.net_worth - 10000  # Assuming 10000 was the initial balance
    total_profit += profit

    test_env.render()

    if done:
        n_episodes += 1
        obs = test_env.reset()

# Calculate average reward and profit
average_reward = total_reward / (step + 1)
average_profit = total_profit / n_episodes if n_episodes > 0 else 0

print(f"Average Reward: {average_reward}")
print(f"Average Profit: {average_profit}")


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Step: 1
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 2
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 3
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 4
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 5
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 6
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 7
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 8
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 9
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 10
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 11
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 12
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 13
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step

In [40]:
import gym
from stable_baselines3 import DQN
import numpy as np

# Assuming the modified SimpleTradingEnv class is already defined
# and we renamed it as SimpleTradingEnv in the previous step.

# Create the environment with training data
train_env = SimpleTradingEnv(train_ACP_df)

# Initialize the DQN model with the training environment
model = DQN("MlpPolicy", train_env, verbose=1, learning_rate=1e-4)

# Train the model
model.learn(total_timesteps=1000)

# Save the trained model
model.save("dqn_trading_model")

# Load the model for testing
model = DQN.load("dqn_trading_model")

# Create the environment with testing data
test_env = SimpleTradingEnv(test_ACP_df)

# Initialize variables to track total profit and rewards
total_reward = 0
total_profit = 0
n_episodes = 0

obs = test_env.reset()
for step in range(100):
    action, _states = model.predict(obs)

    # If the action logits contain NaNs, print debug information
    if np.isnan(action).any():
        print(f"NaN detected in action logits at step {step}")
        break

    obs, reward, done, info = test_env.step(action)

    # Check for NaNs in observation or reward
    if np.isnan(obs).any() or np.isnan(reward):
        print(f"NaN detected at step {step}")
        break

    total_reward += reward
    profit = test_env.net_worth - 10000  # Assuming 10000 was the initial balance
    total_profit += profit

    test_env.render()

    if done:
        n_episodes += 1
        obs = test_env.reset()

# Calculate average reward and profit
average_reward = total_reward / (step + 1)
average_profit = total_profit / n_episodes if n_episodes > 0 else 0

print(f"Average Reward: {average_reward}")
print(f"Average Profit: {average_profit}")


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Step: 1
Balance: 0.5996370315551758
Shares held: 865.0
Net worth: 10000.0
Profit: 0.0
Step: 2
Balance: 0.5996370315551758
Shares held: 865.0
Net worth: 9913.499670028687
Profit: -86.50032997131348
Step: 3
Balance: 0.5996370315551758
Shares held: 865.0
Net worth: 9948.099637031555
Profit: -51.900362968444824
Step: 4
Balance: 0.5996370315551758
Shares held: 865.0
Net worth: 9974.049406051636
Profit: -25.950593948364258
Step: 5
Balance: 0.5996370315551758
Shares held: 865.0
Net worth: 9956.749835014343
Profit: -43.25016498565674
Step: 6
Balance: 0.5996370315551758
Shares held: 865.0
Net worth: 9991.349802017212
Profit: -8.650197982788086
Step: 7
Balance: 9982.699604034424
Shares held: 0
Net worth: 9982.699604034424
Profit: -17.300395965576172
Step: 8
Balance: 7.97934341430846
Shares held: 854.0
Net worth: 9982.699604034424
Profit: -17.300395965576172
Step: 9
Balance: 7.97934341430846
Shares held

In [41]:
import gym
from stable_baselines3 import DQN
import numpy as np

# Assuming the modified SimpleTradingEnv class is already defined
# and we renamed it as SimpleTradingEnv in the previous step.

# Create the environment with training data
train_env = SimpleTradingEnv(train_ACC_df)

# Initialize the DQN model with the training environment
model = DQN("MlpPolicy", train_env, verbose=1, learning_rate=1e-4)

# Train the model
model.learn(total_timesteps=1000)

# Save the trained model
model.save("dqn_trading_model")

# Load the model for testing
model = DQN.load("dqn_trading_model")

# Create the environment with testing data
test_env = SimpleTradingEnv(test_ACC_df)

# Initialize variables to track total profit and rewards
total_reward = 0
total_profit = 0
n_episodes = 0

obs = test_env.reset()
for step in range(100):
    action, _states = model.predict(obs)

    # If the action logits contain NaNs, print debug information
    if np.isnan(action).any():
        print(f"NaN detected in action logits at step {step}")
        break

    obs, reward, done, info = test_env.step(action)

    # Check for NaNs in observation or reward
    if np.isnan(obs).any() or np.isnan(reward):
        print(f"NaN detected at step {step}")
        break

    total_reward += reward
    profit = test_env.net_worth - 10000  # Assuming 10000 was the initial balance
    total_profit += profit

    test_env.render()

    if done:
        n_episodes += 1
        obs = test_env.reset()

# Calculate average reward and profit
average_reward = total_reward / (step + 1)
average_profit = total_profit / n_episodes if n_episodes > 0 else 0

print(f"Average Reward: {average_reward}")
print(f"Average Profit: {average_profit}")


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Step: 1
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 2
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 3
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 4
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 5
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 6
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 7
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 8
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 9
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 10
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 11
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 12
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 13
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step

In [42]:
import gym
from stable_baselines3 import DQN
import numpy as np

# Assuming the modified SimpleTradingEnv class is already defined
# and we renamed it as SimpleTradingEnv in the previous step.

# Create the environment with training data
train_env = SimpleTradingEnv(train_ACAM_df)

# Initialize the DQN model with the training environment
model = DQN("MlpPolicy", train_env, verbose=1, learning_rate=1e-4)

# Train the model
model.learn(total_timesteps=1000)

# Save the trained model
model.save("dqn_trading_model")

# Load the model for testing
model = DQN.load("dqn_trading_model")

# Create the environment with testing data
test_env = SimpleTradingEnv(test_ACAM_df)

# Initialize variables to track total profit and rewards
total_reward = 0
total_profit = 0
n_episodes = 0

obs = test_env.reset()
for step in range(100):
    action, _states = model.predict(obs)

    # If the action logits contain NaNs, print debug information
    if np.isnan(action).any():
        print(f"NaN detected in action logits at step {step}")
        break

    obs, reward, done, info = test_env.step(action)

    # Check for NaNs in observation or reward
    if np.isnan(obs).any() or np.isnan(reward):
        print(f"NaN detected at step {step}")
        break

    total_reward += reward
    profit = test_env.net_worth - 10000  # Assuming 10000 was the initial balance
    total_profit += profit

    test_env.render()

    if done:
        n_episodes += 1
        obs = test_env.reset()

# Calculate average reward and profit
average_reward = total_reward / (step + 1)
average_profit = total_profit / n_episodes if n_episodes > 0 else 0

print(f"Average Reward: {average_reward}")
print(f"Average Profit: {average_profit}")


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 163      |
|    ep_rew_mean      | 1.06     |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 389      |
|    time_elapsed     | 1        |
|    total_timesteps  | 652      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000121 |
|    n_updates        | 137      |
----------------------------------
Step: 1
Balance: 0.010229110717773438
Shares held: 1001.0
Net worth: 10000.0
Profit: 0.0
Step: 2
Balance: 0.010229110717773438
Shares held: 1001.0
Net worth: 10000.0
Profit: 0.0
Step: 3
Balance: 0.010229110717773438
Shares held: 1001.0
Net worth: 10000.0
Profit: 0.0
Step: 4
Balance: 0.010229110717773438
Shares held: 1001.0
Net worth: 10040.03996181488
Profit: 40.03996181488037


In [43]:
import gym
from stable_baselines3 import DQN
import numpy as np

# Assuming the modified SimpleTradingEnv class is already defined
# and we renamed it as SimpleTradingEnv in the previous step.

# Create the environment with training data
train_env = SimpleTradingEnv(train_AACG_df)

# Initialize the DQN model with the training environment
model = DQN("MlpPolicy", train_env, verbose=1, learning_rate=1e-4)

# Train the model
model.learn(total_timesteps=1000)

# Save the trained model
model.save("dqn_trading_model")

# Load the model for testing
model = DQN.load("dqn_trading_model")

# Create the environment with testing data
test_env = SimpleTradingEnv(test_AACG_df)

# Initialize variables to track total profit and rewards
total_reward = 0
total_profit = 0
n_episodes = 0

obs = test_env.reset()
for step in range(100):
    action, _states = model.predict(obs)

    # If the action logits contain NaNs, print debug information
    if np.isnan(action).any():
        print(f"NaN detected in action logits at step {step}")
        break

    obs, reward, done, info = test_env.step(action)

    # Check for NaNs in observation or reward
    if np.isnan(obs).any() or np.isnan(reward):
        print(f"NaN detected at step {step}")
        break

    total_reward += reward
    profit = test_env.net_worth - 10000  # Assuming 10000 was the initial balance
    total_profit += profit

    test_env.render()

    if done:
        n_episodes += 1
        obs = test_env.reset()

# Calculate average reward and profit
average_reward = total_reward / (step + 1)
average_profit = total_profit / n_episodes if n_episodes > 0 else 0

print(f"Average Reward: {average_reward}")
print(f"Average Profit: {average_profit}")


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 197      |
|    ep_rew_mean      | 108      |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 500      |
|    time_elapsed     | 1        |
|    total_timesteps  | 788      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0867   |
|    n_updates        | 171      |
----------------------------------
Step: 1
Balance: 0.5498234033584595
Shares held: 7407.0
Net worth: 10000.0
Profit: 0.0
Step: 2
Balance: 0.5498234033584595
Shares held: 7407.0
Net worth: 10888.840035319328
Profit: 888.8400353193283
Step: 3
Balance: 0.5498234033584595
Shares held: 7407.0
Net worth: 10592.559434890747
Profit: 592.5594348907471
Step: 4
Balance: 0.5498234033584595
Shares held: 7407.0
Net worth: 10

In [44]:
import gym
from stable_baselines3 import DQN
import numpy as np

# Assuming the modified SimpleTradingEnv class is already defined
# and we renamed it as SimpleTradingEnv in the previous step.

# Create the environment with training data
train_env = SimpleTradingEnv(train_ACBI_df)

# Initialize the DQN model with the training environment
model = DQN("MlpPolicy", train_env, verbose=1, learning_rate=1e-4)

# Train the model
model.learn(total_timesteps=1000)

# Save the trained model
model.save("dqn_trading_model")

# Load the model for testing
model = DQN.load("dqn_trading_model")

# Create the environment with testing data
test_env = SimpleTradingEnv(test_ACBI_df)

# Initialize variables to track total profit and rewards
total_reward = 0
total_profit = 0
n_episodes = 0

obs = test_env.reset()
for step in range(100):
    action, _states = model.predict(obs)

    # If the action logits contain NaNs, print debug information
    if np.isnan(action).any():
        print(f"NaN detected in action logits at step {step}")
        break

    obs, reward, done, info = test_env.step(action)

    # Check for NaNs in observation or reward
    if np.isnan(obs).any() or np.isnan(reward):
        print(f"NaN detected at step {step}")
        break

    total_reward += reward
    profit = test_env.net_worth - 10000  # Assuming 10000 was the initial balance
    total_profit += profit

    test_env.render()

    if done:
        n_episodes += 1
        obs = test_env.reset()

# Calculate average reward and profit
average_reward = total_reward / (step + 1)
average_profit = total_profit / n_episodes if n_episodes > 0 else 0

print(f"Average Reward: {average_reward}")
print(f"Average Profit: {average_profit}")


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Step: 1
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 2
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 3
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 4
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 5
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 6
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 7
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 8
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 9
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 10
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 11
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 12
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 13
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step

In [45]:
import gym
from stable_baselines3 import DQN
import numpy as np

# Assuming the modified SimpleTradingEnv class is already defined
# and we renamed it as SimpleTradingEnv in the previous step.

# Create the environment with training data
train_env = SimpleTradingEnv(train_A_df)

# Initialize the DQN model with the training environment
model = DQN("MlpPolicy", train_env, verbose=1, learning_rate=1e-4)

# Train the model
model.learn(total_timesteps=1000)

# Save the trained model
model.save("dqn_trading_model")

# Load the model for testing
model = DQN.load("dqn_trading_model")

# Create the environment with testing data
test_env = SimpleTradingEnv(test_A_df)

# Initialize variables to track total profit and rewards
total_reward = 0
total_profit = 0
n_episodes = 0

obs = test_env.reset()
for step in range(100):
    action, _states = model.predict(obs)

    # If the action logits contain NaNs, print debug information
    if np.isnan(action).any():
        print(f"NaN detected in action logits at step {step}")
        break

    obs, reward, done, info = test_env.step(action)

    # Check for NaNs in observation or reward
    if np.isnan(obs).any() or np.isnan(reward):
        print(f"NaN detected at step {step}")
        break

    total_reward += reward
    profit = test_env.net_worth - 10000  # Assuming 10000 was the initial balance
    total_profit += profit

    test_env.render()

    if done:
        n_episodes += 1
        obs = test_env.reset()

# Calculate average reward and profit
average_reward = total_reward / (step + 1)
average_profit = total_profit / n_episodes if n_episodes > 0 else 0

print(f"Average Reward: {average_reward}")
print(f"Average Profit: {average_profit}")


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Step: 1
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 2
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 3
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 4
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 5
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 6
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 7
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 8
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 9
Balance: 57.13024139404297
Shares held: 113.0
Net worth: 10000.0
Profit: 0.0
Step: 10
Balance: 57.13024139404297
Shares held: 113.0
Net worth: 10071.190551757812
Profit: 71.1905517578125
Step: 11
Balance: 57.13024139404297
Shares held: 113.0
Net worth: 10166.110137939453
Profit: 166.11013793945312
Step: 12
Balance: 57.13024139404297
Shares hel

In [46]:
import gym
from stable_baselines3 import DQN
import numpy as np

# Assuming the modified SimpleTradingEnv class is already defined
# and we renamed it as SimpleTradingEnv in the previous step.

# Create the environment with training data
train_env = SimpleTradingEnv(train_ACLS_df)

# Initialize the DQN model with the training environment
model = DQN("MlpPolicy", train_env, verbose=1, learning_rate=1e-4)

# Train the model
model.learn(total_timesteps=1000)

# Save the trained model
model.save("dqn_trading_model")

# Load the model for testing
model = DQN.load("dqn_trading_model")

# Create the environment with testing data
test_env = SimpleTradingEnv(test_ACLS_df)

# Initialize variables to track total profit and rewards
total_reward = 0
total_profit = 0
n_episodes = 0

obs = test_env.reset()
for step in range(100):
    action, _states = model.predict(obs)

    # If the action logits contain NaNs, print debug information
    if np.isnan(action).any():
        print(f"NaN detected in action logits at step {step}")
        break

    obs, reward, done, info = test_env.step(action)

    # Check for NaNs in observation or reward
    if np.isnan(obs).any() or np.isnan(reward):
        print(f"NaN detected at step {step}")
        break

    total_reward += reward
    profit = test_env.net_worth - 10000  # Assuming 10000 was the initial balance
    total_profit += profit

    test_env.render()

    if done:
        n_episodes += 1
        obs = test_env.reset()

# Calculate average reward and profit
average_reward = total_reward / (step + 1)
average_profit = total_profit / n_episodes if n_episodes > 0 else 0

print(f"Average Reward: {average_reward}")
print(f"Average Profit: {average_profit}")


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Step: 1
Balance: 20.359813690185547
Shares held: 407.0
Net worth: 10000.0
Profit: 0.0
Step: 2
Balance: 20.359813690185547
Shares held: 407.0
Net worth: 10109.890186309814
Profit: 109.89018630981445
Step: 3
Balance: 20.359813690185547
Shares held: 407.0
Net worth: 9865.690031051636
Profit: -134.30996894836426
Step: 4
Balance: 20.359813690185547
Shares held: 407.0
Net worth: 9943.019472122192
Profit: -56.98052787780762
Step: 5
Balance: 20.359813690185547
Shares held: 407.0
Net worth: 10097.679906845095
Profit: 97.67990684509459
Step: 6
Balance: 20.359813690185547
Shares held: 407.0
Net worth: 10052.909658432009
Profit: 52.909658432008655
Step: 7
Balance: 9881.969627380371
Shares held: 0
Net worth: 9881.969627380371
Profit: -118.0303726196289
Step: 8
Balance: 12.249504089355469
Shares held: 404.0
Net worth: 9881.969627380371
Profit: -118.0303726196289
Step: 9
Balance: 12.249504089355469
Shares h

In [47]:
import gym
from stable_baselines3 import DQN
import numpy as np

# Assuming the modified SimpleTradingEnv class is already defined
# and we renamed it as SimpleTradingEnv in the previous step.

# Create the environment with training data
train_env = SimpleTradingEnv(train_ACAMU_df)

# Initialize the DQN model with the training environment
model = DQN("MlpPolicy", train_env, verbose=1, learning_rate=1e-4)

# Train the model
model.learn(total_timesteps=1000)

# Save the trained model
model.save("dqn_trading_model")

# Load the model for testing
model = DQN.load("dqn_trading_model")

# Create the environment with testing data
test_env = SimpleTradingEnv(test_ACAMU_df)

# Initialize variables to track total profit and rewards
total_reward = 0
total_profit = 0
n_episodes = 0

obs = test_env.reset()
for step in range(100):
    action, _states = model.predict(obs)

    # If the action logits contain NaNs, print debug information
    if np.isnan(action).any():
        print(f"NaN detected in action logits at step {step}")
        break

    obs, reward, done, info = test_env.step(action)

    # Check for NaNs in observation or reward
    if np.isnan(obs).any() or np.isnan(reward):
        print(f"NaN detected at step {step}")
        break

    total_reward += reward
    profit = test_env.net_worth - 10000  # Assuming 10000 was the initial balance
    total_profit += profit

    test_env.render()

    if done:
        n_episodes += 1
        obs = test_env.reset()

# Calculate average reward and profit
average_reward = total_reward / (step + 1)
average_profit = total_profit / n_episodes if n_episodes > 0 else 0

print(f"Average Reward: {average_reward}")
print(f"Average Profit: {average_profit}")


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 216      |
|    ep_rew_mean      | 0.68     |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 503      |
|    time_elapsed     | 1        |
|    total_timesteps  | 864      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 5.78e-05 |
|    n_updates        | 190      |
----------------------------------
Step: 1
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 2
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 3
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 4
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 5
Balance: 10000
Shares held: 0
Net worth: 10000.0
Profit: 0.0
Step: 6
Balance: 10000
Shares held:

A2C


In [48]:
import gym
from stable_baselines3 import A2C
import numpy as np

# Assuming the modified SimpleTradingEnv class is already defined
# and we renamed it as SimpleTradingEnv in the previous step.

# Create the environment with training data
train_env = SimpleTradingEnv(train_ABEV_df)

# Initialize the A2C model with the training environment
model = A2C("MlpPolicy", train_env, verbose=1, learning_rate=1e-4)

# Train the model
model.learn(total_timesteps=1000)

# Save the trained model
model.save("a2c_trading_model")

# Load the model for testing
model = A2C.load("a2c_trading_model")

# Create the environment with testing data
test_env = SimpleTradingEnv(test_ABEV_df)

# Initialize variables to track total profit and rewards
total_reward = 0
total_profit = 0
n_episodes = 0

obs = test_env.reset()
for step in range(100):
    action, _states = model.predict(obs)

    # If the action logits contain NaNs, print debug information
    if np.isnan(action).any():
        print(f"NaN detected in action logits at step {step}")
        break

    obs, reward, done, info = test_env.step(action)

    # Check for NaNs in observation or reward
    if np.isnan(obs).any() or np.isnan(reward):
        print(f"NaN detected at step {step}")
        break

    total_reward += reward
    profit = test_env.net_worth - 10000  # Assuming 10000 was the initial balance
    total_profit += profit

    test_env.render()

    if done:
        n_episodes += 1
        obs = test_env.reset()

# Calculate average reward and profit
average_reward = total_reward / (step + 1)
average_profit = total_profit / n_episodes if n_episodes > 0 else 0

print(f"Average Reward: {average_reward}")
print(f"Average Profit: {average_profit}")


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| time/                 |          |
|    fps                | 150      |
|    iterations         | 100      |
|    time_elapsed       | 3        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.1     |
|    explained_variance | -0.121   |
|    learning_rate      | 0.0001   |
|    n_updates          | 99       |
|    policy_loss        | -0.206   |
|    value_loss         | 0.0447   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 502      |
|    ep_rew_mean        | -57.4    |
| time/                 |          |
|    fps                | 158      |
|    iterations         | 200      |
|    time_elapsed       | 6        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss

In [49]:
import gym
from stable_baselines3 import A2C
import numpy as np

# Assuming the modified SimpleTradingEnv class is already defined
# and we renamed it as SimpleTradingEnv in the previous step.

# Create the environment with training data
train_env = SimpleTradingEnv(train_AB_df)

# Initialize the A2C model with the training environment
model = A2C("MlpPolicy", train_env, verbose=1, learning_rate=1e-4)

# Train the model
model.learn(total_timesteps=1000)

# Save the trained model
model.save("a2c_trading_model")

# Load the model for testing
model = A2C.load("a2c_trading_model")

# Create the environment with testing data
test_env = SimpleTradingEnv(test_AB_df)

# Initialize variables to track total profit and rewards
total_reward = 0
total_profit = 0
n_episodes = 0

obs = test_env.reset()
for step in range(100):
    action, _states = model.predict(obs)

    # If the action logits contain NaNs, print debug information
    if np.isnan(action).any():
        print(f"NaN detected in action logits at step {step}")
        break

    obs, reward, done, info = test_env.step(action)

    # Check for NaNs in observation or reward
    if np.isnan(obs).any() or np.isnan(reward):
        print(f"NaN detected at step {step}")
        break

    total_reward += reward
    profit = test_env.net_worth - 10000  # Assuming 10000 was the initial balance
    total_profit += profit

    test_env.render()

    if done:
        n_episodes += 1
        obs = test_env.reset()

# Calculate average reward and profit
average_reward = total_reward / (step + 1)
average_profit = total_profit / n_episodes if n_episodes > 0 else 0

print(f"Average Reward: {average_reward}")
print(f"Average Profit: {average_profit}")


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| time/                 |          |
|    fps                | 164      |
|    iterations         | 100      |
|    time_elapsed       | 3        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.1     |
|    explained_variance | 0.0647   |
|    learning_rate      | 0.0001   |
|    n_updates          | 99       |
|    policy_loss        | -0.16    |
|    value_loss         | 0.026    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 502      |
|    ep_rew_mean        | -15      |
| time/                 |          |
|    fps                | 182      |
|    iterations         | 200      |
|    time_elapsed       | 5        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss

In [50]:
import gym
from stable_baselines3 import A2C
import numpy as np

# Assuming the modified SimpleTradingEnv class is already defined
# and we renamed it as SimpleTradingEnv in the previous step.

# Create the environment with training data
train_env = SimpleTradingEnv(train_ACP_df)

# Initialize the A2C model with the training environment
model = A2C("MlpPolicy", train_env, verbose=1, learning_rate=1e-4)

# Train the model
model.learn(total_timesteps=1000)

# Save the trained model
model.save("a2c_trading_model")

# Load the model for testing
model = A2C.load("a2c_trading_model")

# Create the environment with testing data
test_env = SimpleTradingEnv(test_ACP_df)

# Initialize variables to track total profit and rewards
total_reward = 0
total_profit = 0
n_episodes = 0

obs = test_env.reset()
for step in range(100):
    action, _states = model.predict(obs)

    # If the action logits contain NaNs, print debug information
    if np.isnan(action).any():
        print(f"NaN detected in action logits at step {step}")
        break

    obs, reward, done, info = test_env.step(action)

    # Check for NaNs in observation or reward
    if np.isnan(obs).any() or np.isnan(reward):
        print(f"NaN detected at step {step}")
        break

    total_reward += reward
    profit = test_env.net_worth - 10000  # Assuming 10000 was the initial balance
    total_profit += profit

    test_env.render()

    if done:
        n_episodes += 1
        obs = test_env.reset()

# Calculate average reward and profit
average_reward = total_reward / (step + 1)
average_profit = total_profit / n_episodes if n_episodes > 0 else 0

print(f"Average Reward: {average_reward}")
print(f"Average Profit: {average_profit}")


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| time/                 |          |
|    fps                | 268      |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.1     |
|    explained_variance | -0.0217  |
|    learning_rate      | 0.0001   |
|    n_updates          | 99       |
|    policy_loss        | -0.334   |
|    value_loss         | 0.115    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 502      |
|    ep_rew_mean        | -17.4    |
| time/                 |          |
|    fps                | 269      |
|    iterations         | 200      |
|    time_elapsed       | 3        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss

In [51]:
import gym
from stable_baselines3 import A2C
import numpy as np

# Assuming the modified SimpleTradingEnv class is already defined
# and we renamed it as SimpleTradingEnv in the previous step.

# Create the environment with training data
train_env = SimpleTradingEnv(train_ACC_df)

# Initialize the A2C model with the training environment
model = A2C("MlpPolicy", train_env, verbose=1, learning_rate=1e-4)

# Train the model
model.learn(total_timesteps=1000)

# Save the trained model
model.save("a2c_trading_model")

# Load the model for testing
model = A2C.load("a2c_trading_model")

# Create the environment with testing data
test_env = SimpleTradingEnv(test_ACC_df)

# Initialize variables to track total profit and rewards
total_reward = 0
total_profit = 0
n_episodes = 0

obs = test_env.reset()
for step in range(100):
    action, _states = model.predict(obs)

    # If the action logits contain NaNs, print debug information
    if np.isnan(action).any():
        print(f"NaN detected in action logits at step {step}")
        break

    obs, reward, done, info = test_env.step(action)

    # Check for NaNs in observation or reward
    if np.isnan(obs).any() or np.isnan(reward):
        print(f"NaN detected at step {step}")
        break

    total_reward += reward
    profit = test_env.net_worth - 10000  # Assuming 10000 was the initial balance
    total_profit += profit

    test_env.render()

    if done:
        n_episodes += 1
        obs = test_env.reset()

# Calculate average reward and profit
average_reward = total_reward / (step + 1)
average_profit = total_profit / n_episodes if n_episodes > 0 else 0

print(f"Average Reward: {average_reward}")
print(f"Average Profit: {average_profit}")


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| time/                 |          |
|    fps                | 264      |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.1     |
|    explained_variance | 0.00167  |
|    learning_rate      | 0.0001   |
|    n_updates          | 99       |
|    policy_loss        | 0.132    |
|    value_loss         | 0.0164   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 502      |
|    ep_rew_mean        | -37.7    |
| time/                 |          |
|    fps                | 268      |
|    iterations         | 200      |
|    time_elapsed       | 3        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss

In [52]:
import gym
from stable_baselines3 import A2C
import numpy as np

# Assuming the modified SimpleTradingEnv class is already defined
# and we renamed it as SimpleTradingEnv in the previous step.

# Create the environment with training data
train_env = SimpleTradingEnv(train_ACAM_df)

# Initialize the A2C model with the training environment
model = A2C("MlpPolicy", train_env, verbose=1, learning_rate=1e-4)

# Train the model
model.learn(total_timesteps=1000)

# Save the trained model
model.save("a2c_trading_model")

# Load the model for testing
model = A2C.load("a2c_trading_model")

# Create the environment with testing data
test_env = SimpleTradingEnv(test_ACAM_df)

# Initialize variables to track total profit and rewards
total_reward = 0
total_profit = 0
n_episodes = 0

obs = test_env.reset()
for step in range(100):
    action, _states = model.predict(obs)

    # If the action logits contain NaNs, print debug information
    if np.isnan(action).any():
        print(f"NaN detected in action logits at step {step}")
        break

    obs, reward, done, info = test_env.step(action)

    # Check for NaNs in observation or reward
    if np.isnan(obs).any() or np.isnan(reward):
        print(f"NaN detected at step {step}")
        break

    total_reward += reward
    profit = test_env.net_worth - 10000  # Assuming 10000 was the initial balance
    total_profit += profit

    test_env.render()

    if done:
        n_episodes += 1
        obs = test_env.reset()

# Calculate average reward and profit
average_reward = total_reward / (step + 1)
average_profit = total_profit / n_episodes if n_episodes > 0 else 0

print(f"Average Reward: {average_reward}")
print(f"Average Profit: {average_profit}")


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 163      |
|    ep_rew_mean        | 1.6      |
| time/                 |          |
|    fps                | 163      |
|    iterations         | 100      |
|    time_elapsed       | 3        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.1     |
|    explained_variance | 0        |
|    learning_rate      | 0.0001   |
|    n_updates          | 99       |
|    policy_loss        | -0.0245  |
|    value_loss         | 0.0006   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 163      |
|    ep_rew_mean        | 1.29     |
| time/                 |          |
|    fps                | 192      |
|    iterations         | 200      |
|    time_elapsed

In [53]:
import gym
from stable_baselines3 import A2C
import numpy as np

# Assuming the modified SimpleTradingEnv class is already defined
# and we renamed it as SimpleTradingEnv in the previous step.

# Create the environment with training data
train_env = SimpleTradingEnv(train_AACG_df)

# Initialize the A2C model with the training environment
model = A2C("MlpPolicy", train_env, verbose=1, learning_rate=1e-4)

# Train the model
model.learn(total_timesteps=1000)

# Save the trained model
model.save("a2c_trading_model")

# Load the model for testing
model = A2C.load("a2c_trading_model")

# Create the environment with testing data
test_env = SimpleTradingEnv(test_AACG_df)

# Initialize variables to track total profit and rewards
total_reward = 0
total_profit = 0
n_episodes = 0

obs = test_env.reset()
for step in range(100):
    action, _states = model.predict(obs)

    # If the action logits contain NaNs, print debug information
    if np.isnan(action).any():
        print(f"NaN detected in action logits at step {step}")
        break

    obs, reward, done, info = test_env.step(action)

    # Check for NaNs in observation or reward
    if np.isnan(obs).any() or np.isnan(reward):
        print(f"NaN detected at step {step}")
        break

    total_reward += reward
    profit = test_env.net_worth - 10000  # Assuming 10000 was the initial balance
    total_profit += profit

    test_env.render()

    if done:
        n_episodes += 1
        obs = test_env.reset()

# Calculate average reward and profit
average_reward = total_reward / (step + 1)
average_profit = total_profit / n_episodes if n_episodes > 0 else 0

print(f"Average Reward: {average_reward}")
print(f"Average Profit: {average_profit}")


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 197      |
|    ep_rew_mean        | -34.5    |
| time/                 |          |
|    fps                | 268      |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.1     |
|    explained_variance | 0.000724 |
|    learning_rate      | 0.0001   |
|    n_updates          | 99       |
|    policy_loss        | 7.21     |
|    value_loss         | 53.6     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 197      |
|    ep_rew_mean        | 105      |
| time/                 |          |
|    fps                | 271      |
|    iterations         | 200      |
|    time_elapsed

In [54]:
import gym
from stable_baselines3 import A2C
import numpy as np

# Assuming the modified SimpleTradingEnv class is already defined
# and we renamed it as SimpleTradingEnv in the previous step.

# Create the environment with training data
train_env = SimpleTradingEnv(train_ACBI_df)

# Initialize the A2C model with the training environment
model = A2C("MlpPolicy", train_env, verbose=1, learning_rate=1e-4)

# Train the model
model.learn(total_timesteps=1000)

# Save the trained model
model.save("a2c_trading_model")

# Load the model for testing
model = A2C.load("a2c_trading_model")

# Create the environment with testing data
test_env = SimpleTradingEnv(test_ACBI_df)

# Initialize variables to track total profit and rewards
total_reward = 0
total_profit = 0
n_episodes = 0

obs = test_env.reset()
for step in range(100):
    action, _states = model.predict(obs)

    # If the action logits contain NaNs, print debug information
    if np.isnan(action).any():
        print(f"NaN detected in action logits at step {step}")
        break

    obs, reward, done, info = test_env.step(action)

    # Check for NaNs in observation or reward
    if np.isnan(obs).any() or np.isnan(reward):
        print(f"NaN detected at step {step}")
        break

    total_reward += reward
    profit = test_env.net_worth - 10000  # Assuming 10000 was the initial balance
    total_profit += profit

    test_env.render()

    if done:
        n_episodes += 1
        obs = test_env.reset()

# Calculate average reward and profit
average_reward = total_reward / (step + 1)
average_profit = total_profit / n_episodes if n_episodes > 0 else 0

print(f"Average Reward: {average_reward}")
print(f"Average Profit: {average_profit}")


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| time/                 |          |
|    fps                | 268      |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.1     |
|    explained_variance | 0.0112   |
|    learning_rate      | 0.0001   |
|    n_updates          | 99       |
|    policy_loss        | 0.893    |
|    value_loss         | 0.8      |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 502      |
|    ep_rew_mean        | 57.9     |
| time/                 |          |
|    fps                | 279      |
|    iterations         | 200      |
|    time_elapsed       | 3        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss

In [55]:
import gym
from stable_baselines3 import A2C
import numpy as np

# Assuming the modified SimpleTradingEnv class is already defined
# and we renamed it as SimpleTradingEnv in the previous step.

# Create the environment with training data
train_env = SimpleTradingEnv(train_A_df)

# Initialize the A2C model with the training environment
model = A2C("MlpPolicy", train_env, verbose=1, learning_rate=1e-4)

# Train the model
model.learn(total_timesteps=1000)

# Save the trained model
model.save("a2c_trading_model")

# Load the model for testing
model = A2C.load("a2c_trading_model")

# Create the environment with testing data
test_env = SimpleTradingEnv(test_A_df)

# Initialize variables to track total profit and rewards
total_reward = 0
total_profit = 0
n_episodes = 0

obs = test_env.reset()
for step in range(100):
    action, _states = model.predict(obs)

    # If the action logits contain NaNs, print debug information
    if np.isnan(action).any():
        print(f"NaN detected in action logits at step {step}")
        break

    obs, reward, done, info = test_env.step(action)

    # Check for NaNs in observation or reward
    if np.isnan(obs).any() or np.isnan(reward):
        print(f"NaN detected at step {step}")
        break

    total_reward += reward
    profit = test_env.net_worth - 10000  # Assuming 10000 was the initial balance
    total_profit += profit

    test_env.render()

    if done:
        n_episodes += 1
        obs = test_env.reset()

# Calculate average reward and profit
average_reward = total_reward / (step + 1)
average_profit = total_profit / n_episodes if n_episodes > 0 else 0

print(f"Average Reward: {average_reward}")
print(f"Average Profit: {average_profit}")


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| time/                 |          |
|    fps                | 275      |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.1     |
|    explained_variance | -0.00103 |
|    learning_rate      | 0.0001   |
|    n_updates          | 99       |
|    policy_loss        | 0.294    |
|    value_loss         | 0.0872   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 502      |
|    ep_rew_mean        | -10.3    |
| time/                 |          |
|    fps                | 289      |
|    iterations         | 200      |
|    time_elapsed       | 3        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss

In [56]:
import gym
from stable_baselines3 import A2C
import numpy as np

# Assuming the modified SimpleTradingEnv class is already defined
# and we renamed it as SimpleTradingEnv in the previous step.

# Create the environment with training data
train_env = SimpleTradingEnv(train_ACLS_df)

# Initialize the A2C model with the training environment
model = A2C("MlpPolicy", train_env, verbose=1, learning_rate=1e-4)

# Train the model
model.learn(total_timesteps=1000)

# Save the trained model
model.save("a2c_trading_model")

# Load the model for testing
model = A2C.load("a2c_trading_model")

# Create the environment with testing data
test_env = SimpleTradingEnv(test_ACLS_df)

# Initialize variables to track total profit and rewards
total_reward = 0
total_profit = 0
n_episodes = 0

obs = test_env.reset()
for step in range(100):
    action, _states = model.predict(obs)

    # If the action logits contain NaNs, print debug information
    if np.isnan(action).any():
        print(f"NaN detected in action logits at step {step}")
        break

    obs, reward, done, info = test_env.step(action)

    # Check for NaNs in observation or reward
    if np.isnan(obs).any() or np.isnan(reward):
        print(f"NaN detected at step {step}")
        break

    total_reward += reward
    profit = test_env.net_worth - 10000  # Assuming 10000 was the initial balance
    total_profit += profit

    test_env.render()

    if done:
        n_episodes += 1
        obs = test_env.reset()

# Calculate average reward and profit
average_reward = total_reward / (step + 1)
average_profit = total_profit / n_episodes if n_episodes > 0 else 0

print(f"Average Reward: {average_reward}")
print(f"Average Profit: {average_profit}")


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| time/                 |          |
|    fps                | 380      |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.1     |
|    explained_variance | -1.02    |
|    learning_rate      | 0.0001   |
|    n_updates          | 99       |
|    policy_loss        | -0.0963  |
|    value_loss         | 0.00892  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 502      |
|    ep_rew_mean        | -91.7    |
| time/                 |          |
|    fps                | 382      |
|    iterations         | 200      |
|    time_elapsed       | 2        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss

In [57]:
import gym
from stable_baselines3 import A2C
import numpy as np

# Assuming the modified SimpleTradingEnv class is already defined
# and we renamed it as SimpleTradingEnv in the previous step.

# Create the environment with training data
train_env = SimpleTradingEnv(train_ACAMU_df)

# Initialize the A2C model with the training environment
model = A2C("MlpPolicy", train_env, verbose=1, learning_rate=1e-4)

# Train the model
model.learn(total_timesteps=1000)

# Save the trained model
model.save("a2c_trading_model")

# Load the model for testing
model = A2C.load("a2c_trading_model")

# Create the environment with testing data
test_env = SimpleTradingEnv(test_ACAMU_df)

# Initialize variables to track total profit and rewards
total_reward = 0
total_profit = 0
n_episodes = 0

obs = test_env.reset()
for step in range(100):
    action, _states = model.predict(obs)

    # If the action logits contain NaNs, print debug information
    if np.isnan(action).any():
        print(f"NaN detected in action logits at step {step}")
        break

    obs, reward, done, info = test_env.step(action)

    # Check for NaNs in observation or reward
    if np.isnan(obs).any() or np.isnan(reward):
        print(f"NaN detected at step {step}")
        break

    total_reward += reward
    profit = test_env.net_worth - 10000  # Assuming 10000 was the initial balance
    total_profit += profit

    test_env.render()

    if done:
        n_episodes += 1
        obs = test_env.reset()

# Calculate average reward and profit
average_reward = total_reward / (step + 1)
average_profit = total_profit / n_episodes if n_episodes > 0 else 0

print(f"Average Reward: {average_reward}")
print(f"Average Profit: {average_profit}")


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 216      |
|    ep_rew_mean        | 0.768    |
| time/                 |          |
|    fps                | 370      |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.1     |
|    explained_variance | 0        |
|    learning_rate      | 0.0001   |
|    n_updates          | 99       |
|    policy_loss        | 0.0195   |
|    value_loss         | 0.000382 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 216      |
|    ep_rew_mean        | 0.289    |
| time/                 |          |
|    fps                | 374      |
|    iterations         | 200      |
|    time_elapsed