In [None]:
####### observation of the 20 past rows
import os
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
import matplotlib.pyplot as plt
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import BaseCallback

# =========================
# Custom callback for printing during training
# =========================
class CustomCallback(BaseCallback):
    """Print a per-step trace of the first environment and record episode returns.

    On every training step the callback reads the first entry of the
    ``infos``, ``actions``, ``rewards`` and ``dones`` collections found in
    ``self.locals`` and prints the chosen action, the next price movement
    and the reward, followed by a running "accuracy" (share of steps whose
    reward was >= 0).  The environment's ``info`` dict must contain the keys
    ``'step'``, ``'delta_pct'`` and ``'threshold_pct'``.

    Attributes:
        episode: number of episodes completed so far.
        num_correct: number of steps whose reward was >= 0.
        total_steps: number of steps seen so far (across all episodes).
        current_ep_reward: reward accumulated in the running episode.
        episode_rewards: total reward of every completed episode, in order.
    """

    def __init__(self, verbose=0):
        super().__init__(verbose)
        self.episode = 0
        self.num_correct = 0
        self.total_steps = 0
        self.current_ep_reward = 0
        self.episode_rewards = []
        # True until the first step of an episode has been reported.  The
        # environment's step index never equals 0 (episodes start at the
        # lookback offset), so the header cannot be keyed on ``step == 0``.
        self._print_episode_header = True

    def _on_step(self) -> bool:
        """Report one training step; always returns True so training continues."""
        info = self.locals['infos'][0]
        action = self.locals['actions'][0]
        reward = self.locals['rewards'][0]
        done = self.locals['dones'][0]

        # Announce the episode before its first step line.
        if self._print_episode_header:
            print(f"Episode {self.episode + 1}")
            self._print_episode_header = False

        step = info['step']
        delta_pct = info['delta_pct'] * 100  # convert to % for readability
        action_str = ['Hold', 'Buy', 'Sell'][action]

        if abs(delta_pct) < info['threshold_pct'] * 100:
            movement = 'stable'
        elif delta_pct > 0:
            movement = 'up'
        else:
            movement = 'down'
        movement_str = f"{movement} ({delta_pct:.2f}%)"

        print(f"Step {step}, chosen action {action_str}, next price movement {movement_str}, reward {reward:.4f}")

        # Track accuracy (reward >= 0 considered correct, zero included)
        self.total_steps += 1
        if reward >= 0:
            self.num_correct += 1
        accuracy = (self.num_correct / self.total_steps) * 100
        print(f"Running accuracy: {accuracy:.2f}%")

        self.current_ep_reward += reward

        if done:
            print(f"Episode {self.episode + 1} completed.")
            self.episode_rewards.append(self.current_ep_reward)
            self.current_ep_reward = 0
            self.episode += 1
            # The next call belongs to a fresh episode.
            self._print_episode_header = True

        return True


# =========================
# Custom BTC Trading Environment
# =========================
class BTCTradingEnv(gym.Env):
    """Environment that scores one-step-ahead direction calls on the ``Close`` column.

    At each step the agent picks Hold (0), Buy (1) or Sell (2).  The reward
    is derived from the relative change between the ``Close`` of the current
    row and the ``Close`` of the next row; no position or balance is tracked.

    The observation is the ``lookback_window`` rows *preceding* the current
    row (the current row itself is not included), flattened to one float32
    vector.

    Args:
        data: DataFrame with an ``OpenTime`` column (dropped) and a ``Close``
            column; every remaining column must be castable to float32.
        lookback_window: number of past rows in each observation.
        transaction_cost: amount subtracted from the reward of Buy and Sell.

    Raises:
        KeyError: if ``data`` has no ``OpenTime`` column (raised by ``drop``).
        ValueError: if ``data`` is too short to take a single step.
    """

    def __init__(self, data, lookback_window=20, transaction_cost=0.001):
        super().__init__()
        self.data = data.drop(columns=['OpenTime'])
        self.data = self.data.astype(np.float32)

        self.lookback = lookback_window
        self.transaction_cost = transaction_cost

        # One step needs `lookback` history rows, the current row and the next row.
        if len(self.data) < self.lookback + 2:
            raise ValueError(
                f"data has {len(self.data)} rows; at least {self.lookback + 2} are required "
                f"for lookback_window={self.lookback}"
            )

        # Cache plain arrays once so step()/_get_obs() avoid per-step DataFrame indexing.
        self._features = self.data.values
        self._close = self.data['Close'].values

        obs_shape = (self.lookback * self.data.shape[1],)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=obs_shape, dtype=np.float32)
        self.action_space = spaces.Discrete(3)  # Hold, Buy, Sell

        self.current_step = 0

    def reset(self, *, seed=None, options=None):
        """Rewind to the first row that has a full lookback window; returns ``(obs, {})``."""
        super().reset(seed=seed)  # lets gymnasium seed its RNG as the Env API expects
        self.current_step = self.lookback  # start after enough history
        return self._get_obs(), {}

    def step(self, action):
        """Score ``action`` against the next close and advance one row.

        Returns ``(obs, reward, terminated, truncated, info)``; ``truncated``
        is always False and the terminal observation is all zeros.

        Raises:
            ValueError: if ``action`` is not 0, 1 or 2.
        """
        current_close = self._close[self.current_step]
        next_close = self._close[self.current_step + 1]
        delta = next_close - current_close
        delta_pct = delta / current_close

        # Threshold for "stable"
        threshold_pct = 0.001

        # Reward system
        if action == 0:  # Hold: small bonus on a flat market, otherwise pay the missed move
            if abs(delta_pct) < threshold_pct:
                reward = 0.05
            else:
                reward = -abs(delta_pct)
        elif action == 1:  # Buy: earn the move, minus cost
            reward = delta_pct - self.transaction_cost
        elif action == 2:  # Sell: earn the opposite of the move, minus cost
            reward = -delta_pct - self.transaction_cost
        else:
            raise ValueError(f"Invalid action {action!r}; expected 0 (Hold), 1 (Buy) or 2 (Sell)")

        info = {
            'step': self.current_step,
            'delta': delta,
            'delta_pct': delta_pct,
            'threshold_pct': threshold_pct,
            'reward': reward,
            'action': action
        }

        self.current_step += 1
        # Stop once there is no "next" row left to compare against.
        terminated = self.current_step >= len(self.data) - 1
        truncated = False

        obs = self._get_obs() if not terminated else np.zeros(self.observation_space.shape, dtype=np.float32)
        return obs, reward, terminated, truncated, info

    def _get_obs(self):
        """Return rows ``[current_step - lookback, current_step)`` flattened to 1-D float32."""
        start = self.current_step - self.lookback
        end = self.current_step
        return self._features[start:end].flatten().astype(np.float32)


# =========================
# Evaluation function
# =========================
def evaluate_model(model, vec_env):
    """Run one deterministic episode on a vectorized env and print a step-by-step report.

    Only the first sub-environment (index 0) is inspected.  Its ``info``
    dict must contain ``'step'``, ``'delta_pct'`` and ``'threshold_pct'``.

    Args:
        model: object exposing ``predict(obs, deterministic=True)`` that
            returns ``(actions, state)``.
        vec_env: vectorized environment whose ``reset()`` returns the
            observations and whose ``step(actions)`` returns the 4-tuple
            ``(obs, rewards, dones, infos)``.

    Returns:
        None.  Per-step lines, the final accuracy (share of steps with
        reward >= 0) and the cumulative reward are printed to stdout.
    """
    obs = vec_env.reset()
    done = False
    total_steps = 0
    num_correct = 0
    total_reward = 0

    print("Starting evaluation episode")
    while not done:
        action, _ = model.predict(obs, deterministic=True)
        # The vectorized step API yields four values; termination and
        # truncation are already merged into `dones`.
        obs, rewards, dones, infos = vec_env.step(action)
        done = bool(dones[0])

        info = infos[0]
        reward = rewards[0]
        delta_pct = info['delta_pct'] * 100  # convert to % for readability
        step = info['step']
        action_val = int(action[0])

        action_str = ['Hold', 'Buy', 'Sell'][action_val]
        if abs(delta_pct) < info['threshold_pct'] * 100:
            movement = 'stable'
        elif delta_pct > 0:
            movement = 'up'
        else:
            movement = 'down'

        print(f"Step {step}, chosen action {action_str}, next price movement {movement} ({delta_pct:.2f}%), reward {reward:.4f}")

        total_steps += 1
        total_reward += reward
        if reward >= 0:
            num_correct += 1

    accuracy = (num_correct / total_steps) * 100 if total_steps > 0 else 0
    print("Evaluation completed.")
    print(f"Final evaluation accuracy: {accuracy:.2f}%")
    print(f"Total cumulative reward: {total_reward:.4f}")


# =========================
# Main
# =========================
if __name__ == "__main__":
    # Run configuration, named once so the environment and the training
    # budget below cannot drift apart.
    lookback_window = 20
    n_training_episodes = 5

    df = pd.read_csv('BTC-USD_with_Indicators.csv')
    env = BTCTradingEnv(df, lookback_window=lookback_window, transaction_cost=0.001)
    vec_env = DummyVecEnv([lambda: env])

    model_path = 'btc_dqn_agent.zip'
    trained = False  # becomes True only when a new model is trained in this run

    if os.path.exists(model_path):
        # Reuse a previously saved agent instead of retraining.
        # NOTE(review): a file saved by a run with a different lookback window
        # has a different observation size and is not interchangeable — confirm
        # the file on disk matches this configuration.
        model = DQN.load(model_path, env=vec_env)
        print("Loaded existing model.")
    else:
        model = DQN("MlpPolicy", vec_env, verbose=1)
        callback = CustomCallback()

        # An episode starts at row `lookback_window` and ends when the step
        # index reaches len(df) - 1, so it lasts len(df) - 1 - lookback_window
        # steps.  Ignoring the offset would leave a partial extra episode.
        steps_per_episode = len(df) - 1 - lookback_window
        total_timesteps = n_training_episodes * steps_per_episode

        model.learn(total_timesteps=total_timesteps, callback=callback)
        model.save(model_path)
        print("Trained and saved new model.")
        trained = True

        # One point per completed training episode.
        plt.plot(callback.episode_rewards)
        plt.xlabel('Episode')
        plt.ylabel('Total Reward')
        plt.title('Training Reward History')
        plt.show()

    # Evaluation replays the same environment (and therefore the same data)
    # that training above uses, so the reported numbers are in-sample.
    evaluate_model(model, vec_env)


Using cpu device
Step 20, chosen action Buy, next price movement down (-1.00%), reward -0.0110
Running accuracy: 0.00%
Step 21, chosen action Buy, next price movement up (0.57%), reward 0.0047
Running accuracy: 50.00%
Step 22, chosen action Buy, next price movement up (0.58%), reward 0.0048
Running accuracy: 66.67%
Step 23, chosen action Hold, next price movement down (-0.10%), reward -0.0010
Running accuracy: 50.00%
Step 24, chosen action Buy, next price movement up (0.60%), reward 0.0050
Running accuracy: 60.00%
Step 25, chosen action Buy, next price movement down (-0.79%), reward -0.0089
Running accuracy: 50.00%
Step 26, chosen action Buy, next price movement up (0.49%), reward 0.0039
Running accuracy: 57.14%
Step 27, chosen action Buy, next price movement down (-0.41%), reward -0.0051
Running accuracy: 50.00%
Step 28, chosen action Sell, next price movement up (0.50%), reward -0.0060
Running accuracy: 44.44%
Step 29, chosen action Sell, next price movement up (1.21%), reward -0.013

In [None]:
####### observation of 12 past rows (lookback_window=12 in the code below) + stronger penalties
import os
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
import matplotlib.pyplot as plt
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import BaseCallback

# =========================
# Custom callback for printing during training
# =========================
class CustomCallback(BaseCallback):
    """Print a per-step trace of the first environment and record episode returns.

    On every training step the callback reads the first entry of the
    ``infos``, ``actions``, ``rewards`` and ``dones`` collections found in
    ``self.locals`` and prints the chosen action, the next price movement
    and the reward, followed by a running "accuracy" (share of steps whose
    reward was >= 0, so a zero reward also counts as correct).  The
    environment's ``info`` dict must contain the keys ``'step'``,
    ``'delta_pct'`` and ``'threshold_pct'``.

    Attributes:
        episode: number of episodes completed so far.
        num_correct: number of steps whose reward was >= 0.
        total_steps: number of steps seen so far (across all episodes).
        current_ep_reward: reward accumulated in the running episode.
        episode_rewards: total reward of every completed episode, in order.
    """

    def __init__(self, verbose=0):
        super().__init__(verbose)
        self.episode = 0
        self.num_correct = 0
        self.total_steps = 0
        self.current_ep_reward = 0
        self.episode_rewards = []
        # True until the first step of an episode has been reported.  The
        # environment's step index never equals 0 (episodes start at the
        # lookback offset), so the header cannot be keyed on ``step == 0``.
        self._print_episode_header = True

    def _on_step(self) -> bool:
        """Report one training step; always returns True so training continues."""
        info = self.locals['infos'][0]
        action = self.locals['actions'][0]
        reward = self.locals['rewards'][0]
        done = self.locals['dones'][0]

        # Announce the episode before its first step line.
        if self._print_episode_header:
            print(f"Episode {self.episode + 1}")
            self._print_episode_header = False

        step = info['step']
        delta_pct = info['delta_pct'] * 100  # convert to % for readability
        action_str = ['Hold', 'Buy', 'Sell'][action]

        if abs(delta_pct) < info['threshold_pct'] * 100:
            movement = 'stable'
        elif delta_pct > 0:
            movement = 'up'
        else:
            movement = 'down'
        movement_str = f"{movement} ({delta_pct:.2f}%)"

        print(f"Step {step}, chosen action {action_str}, next price movement {movement_str}, reward {reward:.4f}")

        # Track accuracy (reward >= 0 considered correct)
        self.total_steps += 1
        if reward >= 0:
            self.num_correct += 1
        accuracy = (self.num_correct / self.total_steps) * 100
        print(f"Running accuracy: {accuracy:.2f}%")

        self.current_ep_reward += reward

        if done:
            print(f"Episode {self.episode + 1} completed.")
            self.episode_rewards.append(self.current_ep_reward)
            self.current_ep_reward = 0
            self.episode += 1
            # The next call belongs to a fresh episode.
            self._print_episode_header = True

        return True


# =========================
# Custom BTC Trading Environment
# =========================
class BTCTradingEnv(gym.Env):
    """Environment that scores one-step-ahead direction calls on the ``Close`` column.

    At each step the agent picks Hold (0), Buy (1) or Sell (2) and receives
    a fixed-size reward that depends only on the sign and size of the
    relative change between the current row's ``Close`` and the next row's
    ``Close``.  No position or balance is tracked.

    The observation is the ``lookback_window`` rows *preceding* the current
    row (the current row itself is not included), flattened to one float32
    vector.

    Args:
        data: DataFrame with an ``OpenTime`` column (dropped) and a ``Close``
            column; every remaining column must be castable to float32.
        lookback_window: number of past rows in each observation.
        transaction_cost: stored on the instance; the reward scheme in
            ``step`` does not use it.

    Raises:
        KeyError: if ``data`` has no ``OpenTime`` column (raised by ``drop``).
        ValueError: if ``data`` is too short to take a single step.
    """

    def __init__(self, data, lookback_window=12, transaction_cost=0.001):
        super().__init__()
        self.data = data.drop(columns=['OpenTime'])
        self.data = self.data.astype(np.float32)

        self.lookback = lookback_window
        self.transaction_cost = transaction_cost

        # One step needs `lookback` history rows, the current row and the next row.
        if len(self.data) < self.lookback + 2:
            raise ValueError(
                f"data has {len(self.data)} rows; at least {self.lookback + 2} are required "
                f"for lookback_window={self.lookback}"
            )

        # Cache plain arrays once so step()/_get_obs() avoid per-step DataFrame indexing.
        self._features = self.data.values
        self._close = self.data['Close'].values

        obs_shape = (self.lookback * self.data.shape[1],)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=obs_shape, dtype=np.float32)
        self.action_space = spaces.Discrete(3)  # Hold, Buy, Sell

        self.current_step = 0

    def reset(self, *, seed=None, options=None):
        """Rewind to the first row that has a full lookback window; returns ``(obs, {})``."""
        super().reset(seed=seed)  # lets gymnasium seed its RNG as the Env API expects
        self.current_step = self.lookback  # start after enough history
        return self._get_obs(), {}

    def step(self, action):
        """Score ``action`` against the next close and advance one row.

        Reward table (``delta_pct`` is the relative change to the next close):
            Buy:  +1 if the price rises (+1.5 above +1%), else -1 (-1.5 below -1%).
            Sell: +1 if the price falls (+1.5 below -1%), else -1 (-1.5 above +1%).
            Hold (any other action value): +0.2 if |change| < 0.1%,
                0 if |change| < 1%, else -1.

        Returns ``(obs, reward, terminated, truncated, info)``; ``truncated``
        is always False and the terminal observation is all zeros.
        """
        current_close = self._close[self.current_step]
        next_close = self._close[self.current_step + 1]
        delta = next_close - current_close
        delta_pct = delta / current_close

        # Threshold for "stable"
        threshold_pct = 0.001
        # Moves beyond this size earn the stronger reward / penalty.
        strong_move_pct = 0.01

        if action == 1:  # Buy
            if delta_pct > 0:
                reward = 1.5 if delta_pct > strong_move_pct else 1
            else:
                reward = -1.5 if delta_pct < -strong_move_pct else -1
        elif action == 2:  # Sell
            if delta_pct < 0:
                reward = 1.5 if delta_pct < -strong_move_pct else 1
            else:
                reward = -1.5 if delta_pct > strong_move_pct else -1
        else:  # Hold
            if abs(delta_pct) < threshold_pct:  # very small move
                reward = 0.2   # slight positive for being safe
            elif abs(delta_pct) < strong_move_pct:  # small noise
                reward = 0
            else:  # missed a big move
                reward = -1

        info = {
            'step': self.current_step,
            'delta': delta,
            'delta_pct': delta_pct,
            'threshold_pct': threshold_pct,
            'reward': reward,
            'action': action
        }

        self.current_step += 1
        # Stop once there is no "next" row left to compare against.
        terminated = self.current_step >= len(self.data) - 1
        truncated = False

        obs = self._get_obs() if not terminated else np.zeros(self.observation_space.shape, dtype=np.float32)
        return obs, reward, terminated, truncated, info

    def _get_obs(self):
        """Return rows ``[current_step - lookback, current_step)`` flattened to 1-D float32."""
        start = self.current_step - self.lookback
        end = self.current_step
        return self._features[start:end].flatten().astype(np.float32)


# =========================
# Evaluation function
# =========================
def evaluate_model(model, vec_env):
    """Run one deterministic episode on a vectorized env and print a step-by-step report.

    Only the first sub-environment (index 0) is inspected.  Its ``info``
    dict must contain ``'step'``, ``'delta_pct'`` and ``'threshold_pct'``.

    Args:
        model: object exposing ``predict(obs, deterministic=True)`` that
            returns ``(actions, state)``.
        vec_env: vectorized environment whose ``reset()`` returns the
            observations and whose ``step(actions)`` returns the 4-tuple
            ``(obs, rewards, dones, infos)``.

    Returns:
        None.  Per-step lines, the final accuracy (share of steps with
        reward >= 0) and the cumulative reward are printed to stdout.
    """
    obs = vec_env.reset()
    done = False
    total_steps = 0
    num_correct = 0
    total_reward = 0

    print("Starting evaluation episode")
    while not done:
        action, _ = model.predict(obs, deterministic=True)
        # The vectorized step API yields four values; termination and
        # truncation are already merged into `dones`.
        obs, rewards, dones, infos = vec_env.step(action)
        done = bool(dones[0])

        info = infos[0]
        reward = rewards[0]
        delta_pct = info['delta_pct'] * 100  # convert to % for readability
        step = info['step']
        action_val = int(action[0])

        action_str = ['Hold', 'Buy', 'Sell'][action_val]
        if abs(delta_pct) < info['threshold_pct'] * 100:
            movement = 'stable'
        elif delta_pct > 0:
            movement = 'up'
        else:
            movement = 'down'

        print(f"Step {step}, chosen action {action_str}, next price movement {movement} ({delta_pct:.2f}%), reward {reward:.4f}")

        total_steps += 1
        total_reward += reward
        if reward >= 0:
            num_correct += 1

    accuracy = (num_correct / total_steps) * 100 if total_steps > 0 else 0
    print("Evaluation completed.")
    print(f"Final evaluation accuracy: {accuracy:.2f}%")
    print(f"Total cumulative reward: {total_reward:.4f}")


# =========================
# Main
# =========================
if __name__ == "__main__":
    # Run configuration, named once so the environment and the training
    # budget below cannot drift apart.
    lookback_window = 12
    n_training_episodes = 50

    df = pd.read_csv('BTC-USD_with_Indicators.csv')
    env = BTCTradingEnv(df, lookback_window=lookback_window, transaction_cost=0.001)
    vec_env = DummyVecEnv([lambda: env])

    model_path = 'btc_dqn_agent.zip'
    trained = False  # becomes True only when a new model is trained in this run

    if os.path.exists(model_path):
        # Reuse a previously saved agent instead of retraining.
        # NOTE(review): a file saved by a run with a different lookback window
        # has a different observation size and is not interchangeable — confirm
        # the file on disk matches this configuration.
        model = DQN.load(model_path, env=vec_env)
        print("Loaded existing model.")
    else:
        model = DQN("MlpPolicy", vec_env, verbose=1)
        callback = CustomCallback()

        # An episode starts at row `lookback_window` and ends when the step
        # index reaches len(df) - 1, so it lasts len(df) - 1 - lookback_window
        # steps.  Ignoring the offset would leave a partial extra episode.
        steps_per_episode = len(df) - 1 - lookback_window
        total_timesteps = n_training_episodes * steps_per_episode

        model.learn(total_timesteps=total_timesteps, callback=callback)
        model.save(model_path)
        print("Trained and saved new model.")
        trained = True

        # One point per completed training episode.
        plt.plot(callback.episode_rewards)
        plt.xlabel('Episode')
        plt.ylabel('Total Reward')
        plt.title('Training Reward History')
        plt.show()

    # Evaluation replays the same environment (and therefore the same data)
    # that training above uses, so the reported numbers are in-sample.
    evaluate_model(model, vec_env)


Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


Using cpu device


In [None]:

import os
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
import matplotlib.pyplot as plt
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import BaseCallback

# =========================
# Custom callback for printing during training
# =========================
class CustomCallback(BaseCallback):
    """Print a per-step trace of the first environment and record episode returns.

    On every training step the callback reads the first entry of the
    ``infos``, ``actions``, ``rewards`` and ``dones`` collections found in
    ``self.locals`` and prints the chosen action, the next price movement
    and the reward, followed by a running "accuracy" (share of steps whose
    reward was >= 0, so a zero reward also counts as correct).  The
    environment's ``info`` dict must contain the keys ``'step'``,
    ``'delta_pct'`` and ``'threshold_pct'``.

    Attributes:
        episode: number of episodes completed so far.
        num_correct: number of steps whose reward was >= 0.
        total_steps: number of steps seen so far (across all episodes).
        current_ep_reward: reward accumulated in the running episode.
        episode_rewards: total reward of every completed episode, in order.
    """

    def __init__(self, verbose=0):
        super().__init__(verbose)
        self.episode = 0
        self.num_correct = 0
        self.total_steps = 0
        self.current_ep_reward = 0
        self.episode_rewards = []
        # True until the first step of an episode has been reported.  Using a
        # flag avoids reaching into the wrapped environment for its lookback
        # offset and lets the header be printed before the step it announces.
        self._print_episode_header = True

    def _on_step(self) -> bool:
        """Report one training step; always returns True so training continues."""
        info = self.locals['infos'][0]
        action = self.locals['actions'][0]
        reward = self.locals['rewards'][0]
        done = self.locals['dones'][0]

        # Announce the episode before its first step line.
        if self._print_episode_header:
            print(f"Episode {self.episode + 1}")
            self._print_episode_header = False

        step = info['step']
        delta_pct = info['delta_pct'] * 100  # Convert to % for readability
        action_str = ['Hold', 'Buy', 'Sell'][action]

        if abs(delta_pct) < info['threshold_pct'] * 100:
            movement = 'stable'
        elif delta_pct > 0:
            movement = 'up'
        else:
            movement = 'down'
        movement_str = f"{movement} ({delta_pct:.2f}%)"

        print(f"Step {step}, chosen action {action_str}, next price movement {movement_str}, reward {reward:.4f}")

        # Track accuracy (reward >= 0 considered correct)
        self.total_steps += 1
        if reward >= 0:
            self.num_correct += 1
        accuracy = (self.num_correct / self.total_steps) * 100
        print(f"Running accuracy: {accuracy:.2f}%")

        self.current_ep_reward += reward

        if done:
            print(f"Episode {self.episode + 1} completed.")
            self.episode_rewards.append(self.current_ep_reward)
            self.current_ep_reward = 0
            self.episode += 1
            # The next call belongs to a fresh episode.
            self._print_episode_header = True

        return True

# =========================
# Custom BTC Trading Environment
# =========================
class BTCTradingEnv(gym.Env):
    """Environment that scores one-step-ahead direction calls on the ``Close`` column.

    At each step the agent picks Hold (0), Buy (1) or Sell (2) and receives
    a fixed-size reward that depends only on the sign and size of the
    relative change between the current row's ``Close`` and the next row's
    ``Close``.  No position or balance is tracked.

    The observation is the ``lookback_window`` rows *preceding* the current
    row (the current row itself is not included), flattened to one float32
    vector.

    Args:
        data: DataFrame with an ``OpenTime`` column (dropped) and a ``Close``
            column; every remaining column must be castable to float32.
        lookback_window: number of past rows in each observation; exposed as
            ``self.lookback``.
        transaction_cost: stored on the instance; the reward scheme in
            ``step`` does not use it.

    Raises:
        KeyError: if ``data`` has no ``OpenTime`` column (raised by ``drop``).
        ValueError: if ``data`` is too short to take a single step.
    """

    def __init__(self, data, lookback_window=12, transaction_cost=0.001):
        super().__init__()
        self.data = data.drop(columns=['OpenTime'])
        self.data = self.data.astype(np.float32)

        self.lookback = lookback_window
        self.transaction_cost = transaction_cost

        # One step needs `lookback` history rows, the current row and the next row.
        if len(self.data) < self.lookback + 2:
            raise ValueError(
                f"data has {len(self.data)} rows; at least {self.lookback + 2} are required "
                f"for lookback_window={self.lookback}"
            )

        # Cache plain arrays once so step()/_get_obs() avoid per-step DataFrame indexing.
        self._features = self.data.values
        self._close = self.data['Close'].values

        obs_shape = (self.lookback * self.data.shape[1],)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=obs_shape, dtype=np.float32)
        self.action_space = spaces.Discrete(3)  # Hold, Buy, Sell

        self.current_step = 0

    def reset(self, *, seed=None, options=None):
        """Rewind to the first row that has a full lookback window; returns ``(obs, {})``."""
        super().reset(seed=seed)  # lets gymnasium seed its RNG as the Env API expects
        self.current_step = self.lookback  # Start after enough history
        return self._get_obs(), {}

    def step(self, action):
        """Score ``action`` against the next close and advance one row.

        Reward table (``delta_pct`` is the relative change to the next close):
            Buy:  +1 if the price rises (+1.5 above +1%), else -1 (-1.5 below -1%).
            Sell: +1 if the price falls (+1.5 below -1%), else -1 (-1.5 above +1%).
            Hold (any other action value): +0.2 if |change| < 0.1%,
                0 if |change| < 1%, else -1.

        Returns ``(obs, reward, terminated, truncated, info)``; ``truncated``
        is always False and the terminal observation is all zeros.
        """
        current_close = self._close[self.current_step]
        next_close = self._close[self.current_step + 1]
        delta = next_close - current_close
        delta_pct = delta / current_close

        # Threshold for "stable"
        threshold_pct = 0.001
        # Moves beyond this size earn the stronger reward / penalty.
        strong_move_pct = 0.01

        if action == 1:  # Buy
            if delta_pct > 0:
                reward = 1.5 if delta_pct > strong_move_pct else 1
            else:
                reward = -1.5 if delta_pct < -strong_move_pct else -1
        elif action == 2:  # Sell
            if delta_pct < 0:
                reward = 1.5 if delta_pct < -strong_move_pct else 1
            else:
                reward = -1.5 if delta_pct > strong_move_pct else -1
        else:  # Hold
            if abs(delta_pct) < threshold_pct:  # very small move
                reward = 0.2   # slight positive for being safe
            elif abs(delta_pct) < strong_move_pct:  # small noise
                reward = 0
            else:  # missed a big move
                reward = -1

        info = {
            'step': self.current_step,
            'delta': delta,
            'delta_pct': delta_pct,
            'threshold_pct': threshold_pct,
            'reward': reward,
            'action': action
        }

        self.current_step += 1
        # NOTE(review): ending at len - 2 means the transition into the final
        # row is never played; kept as-is because the training budget in the
        # driver script is sized against this episode length.
        terminated = self.current_step >= len(self.data) - 2
        truncated = False

        obs = self._get_obs() if not terminated else np.zeros(self.observation_space.shape, dtype=np.float32)
        return obs, reward, terminated, truncated, info

    def _get_obs(self):
        """Return rows ``[current_step - lookback, current_step)`` flattened to 1-D float32."""
        start = self.current_step - self.lookback
        end = self.current_step
        return self._features[start:end].flatten().astype(np.float32)

# =========================
# Evaluation function
# =========================
def evaluate_model(model, env):
    """Play one deterministic episode on a single (non-vectorized) env and print a trace.

    ``env.reset()`` must return ``(obs, info)`` and ``env.step(action)`` the
    5-tuple ``(obs, reward, terminated, truncated, info)``, where ``info``
    carries ``'step'``, ``'delta_pct'`` and ``'threshold_pct'``.  For every
    step the chosen action, the next price movement, the reward and the
    running accuracy (share of steps with reward >= 0) are printed; a
    summary follows once the episode terminates or is truncated.
    """
    action_names = ['Hold', 'Buy', 'Sell']
    total_steps = 0
    num_correct = 0
    total_reward = 0
    obs, _ = env.reset()

    print("Starting evaluation episode")
    while True:
        action, _ = model.predict(obs, deterministic=True)
        # predict() may hand back a NumPy array; reduce it to a plain scalar.
        if isinstance(action, np.ndarray):
            action = action.item()
        obs, reward, terminated, truncated, info = env.step(action)

        step = info['step']
        delta_pct = info['delta_pct'] * 100
        action_str = action_names[action]

        # Classify the move relative to the "stable" band reported by the env.
        if abs(delta_pct) < info['threshold_pct'] * 100:
            movement = 'stable'
        else:
            movement = 'up' if delta_pct > 0 else 'down'

        print(f"Step {step}, chosen action {action_str}, next price movement {movement} ({delta_pct:.2f}%), reward {reward:.4f}")

        total_steps += 1
        total_reward += reward
        num_correct += 1 if reward >= 0 else 0
        accuracy = (num_correct / total_steps) * 100 if total_steps > 0 else 0
        print(f"Running accuracy: {accuracy:.2f}%")

        # Either flag ends the episode.
        if terminated or truncated:
            break

    print("Evaluation completed.")
    print(f"Final evaluation accuracy: {accuracy:.2f}%")
    print(f"Total cumulative reward: {total_reward:.4f}")

# =========================
# Main
# =========================
if __name__ == "__main__":
    # Run configuration, named once so the two environments and the training
    # budget below cannot drift apart.
    lookback_window = 12
    n_training_episodes = 20

    # Load the dataset
    df = pd.read_csv('BTC-USD_with_Indicators.csv')
    # `env` is the plain environment used for evaluation; training gets its
    # own instance wrapped in a DummyVecEnv.
    env = BTCTradingEnv(df, lookback_window=lookback_window, transaction_cost=0.001)
    vec_env = DummyVecEnv([lambda: BTCTradingEnv(df, lookback_window=lookback_window, transaction_cost=0.001)])

    model_path = 'btc_dqn_agent.zip'
    trained = False  # becomes True only when a new model is trained in this run

    if os.path.exists(model_path):
        # Reuse a previously saved agent instead of retraining.
        # NOTE(review): an existing file is loaded regardless of the
        # hyperparameters below — delete it to retrain with new settings.
        model = DQN.load(model_path, env=vec_env)
        print("Loaded existing model.")
    else:
        # Define hyperparameters for DQN (manually adjustable).  Every key is
        # a DQN constructor argument, so the dict is passed with ** below.
        dqn_hyperparameters = {
            'learning_rate': 0.0003,  # Step size for gradient updates (default: 0.0001, try: 0.0001 to 0.001)
            'buffer_size': 50000,    # Replay buffer size (default: 1000000, try: 50000 to 500000)
            'learning_starts': 5000, # Steps before training starts (default depends on the SB3 version: 50000 in older releases, 100 in newer ones — check the installed version)
            'batch_size': 32,         # Samples per training update (default: 32, try: 16 to 128)
            'tau': 1.0,               # Target network update coefficient (default: 1.0, try: 0.005 to 0.1 for soft updates)
            'gamma': 0.99,            # Discount factor for future rewards (default: 0.99, try: 0.95 to 0.999)
            'train_freq': 4,          # Steps between training updates (default: 4, try: 1 to 10)
            'gradient_steps': 1,      # Gradient updates per training step (default: 1, try: 1 to 10)
            'exploration_fraction': 0.2,  # Fraction of training for exploration decay (default: 0.1, try: 0.1 to 0.3)
            'exploration_initial_eps': 1.0,  # Initial exploration epsilon (default: 1.0, try: 0.5 to 1.0)
            'exploration_final_eps': 0.05,   # Final exploration epsilon (default: 0.05, try: 0.01 to 0.1)
            'max_grad_norm': 10,      # Gradient clipping value (default: 10, try: 0.5 to 10)
            'target_update_interval': 10000,  # Steps between target network updates (default: 10000, try: 1000 to 50000)
            'policy_kwargs': {'net_arch': [128, 128]},  # Neural network architecture: 2 layers of 128 units (default: [64, 64], try: [64, 64], [256, 256], [128, 128, 128])
            'seed': None,             # Random seed for reproducibility (default: None, try: any integer, e.g., 42)
            'device': 'auto'          # Device for PyTorch ('auto' selects GPU if available, else CPU; try: 'cpu' or 'cuda')
        }

        # Create and train the model with specified hyperparameters
        model = DQN(
            policy="MlpPolicy",
            env=vec_env,
            verbose=1,
            **dqn_hyperparameters,
        )

        callback = CustomCallback()

        # Compute total timesteps.  An episode starts at row `lookback_window`
        # and this environment terminates when its step index reaches
        # len(df) - 2, so it lasts len(df) - 2 - lookback_window steps.
        steps_per_episode = len(df) - 2 - lookback_window
        total_timesteps = n_training_episodes * steps_per_episode

        model.learn(total_timesteps=total_timesteps, callback=callback)
        model.save(model_path)
        print("Trained and saved new model.")
        trained = True

        # One point per completed training episode.
        plt.plot(callback.episode_rewards)
        plt.xlabel('Episode')
        plt.ylabel('Total Reward')
        plt.title('Training Reward History')
        plt.show()

    # Evaluation replays the same DataFrame that training above uses, so the
    # reported numbers are in-sample.
    evaluate_model(model, env)


Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


Using cpu device
Step 12, chosen action Hold, next price movement stable (-0.03%), reward 0.2000
Running accuracy: 100.00%
Episode 1
Step 13, chosen action Hold, next price movement down (-0.47%), reward 0.0000
Running accuracy: 100.00%
Step 14, chosen action Hold, next price movement down (-0.43%), reward 0.0000
Running accuracy: 100.00%
Step 15, chosen action Hold, next price movement up (0.30%), reward 0.0000
Running accuracy: 100.00%
Step 16, chosen action Hold, next price movement down (-0.20%), reward 0.0000
Running accuracy: 100.00%
Step 17, chosen action Sell, next price movement down (-0.73%), reward 1.0000
Running accuracy: 100.00%
Step 18, chosen action Hold, next price movement up (1.02%), reward -1.0000
Running accuracy: 85.71%
Step 19, chosen action Hold, next price movement up (0.60%), reward 0.0000
Running accuracy: 87.50%
Step 20, chosen action Hold, next price movement down (-0.29%), reward 0.0000
Running accuracy: 88.89%
Step 21, chosen action Hold, next price moveme