In [None]:
# ####### observation window of the past `lookback_window` rows (12 in this cell, not 50) + stronger penalties
import os
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
import matplotlib.pyplot as plt
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import BaseCallback

# =========================
# Custom callback for printing during training
# =========================
class CustomCallback(BaseCallback):
    """Print a per-step trade log and collect per-episode reward totals.

    After every environment step the callback prints the chosen action, the
    next price movement reported in the environment's ``info`` dict and the
    reward, followed by the running percentage of steps whose reward was
    non-negative. When an episode ends, its summed reward is appended to
    ``episode_rewards``.

    Only the first environment of the vectorised env (index 0) is inspected.

    Args:
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    # Index in this tuple == discrete action id used by the environment.
    ACTION_NAMES = ('Hold', 'Buy', 'Sell')

    def __init__(self, verbose=0):
        super().__init__(verbose)
        self.episode = 0            # number of completed episodes
        self.num_correct = 0        # steps whose reward was >= 0
        self.total_steps = 0        # steps seen so far (across episodes)
        self.current_ep_reward = 0  # reward accumulated in the running episode
        self.episode_rewards = []   # one total per completed episode

    def _on_step(self) -> bool:
        """Report the step that was just taken.

        Returns:
            Always ``True`` (this callback never requests an early stop).
        """
        info = self.locals['infos'][0]
        action = self.locals['actions'][0]
        reward = self.locals['rewards'][0]
        done = self.locals['dones'][0]

        step = info['step']
        delta_pct = info['delta_pct'] * 100  # convert to % for readability
        action_str = self.ACTION_NAMES[action]

        if abs(delta_pct) < info['threshold_pct'] * 100:
            movement = 'stable'
        elif delta_pct > 0:
            movement = 'up'
        else:
            movement = 'down'
        movement_str = f"{movement} ({delta_pct:.2f}%)"

        print(f"Step {step}, chosen action {action_str}, next price movement {movement_str}, reward {reward:.4f}")

        # Track accuracy (reward >= 0 considered correct)
        self.total_steps += 1
        if reward >= 0:
            self.num_correct += 1
        accuracy = (self.num_correct / self.total_steps) * 100
        print(f"Running accuracy: {accuracy:.2f}%")

        self.current_ep_reward += reward

        # NOTE: no per-episode header is printed here. The header print had
        # been commented out, which left an `if` with an empty body (a syntax
        # error); the empty branch is removed.

        if done:
            print(f"Episode {self.episode + 1} completed.")
            self.episode_rewards.append(self.current_ep_reward)
            self.current_ep_reward = 0
            self.episode += 1

        return True


# =========================
# Custom BTC Trading Environment for Gym
# =========================
class BTCTradingEnv(gym.Env):
    """Environment that scores Hold/Buy/Sell calls against the next close.

    At every step the agent chooses Hold (0), Buy (1) or Sell (2). The reward
    depends only on the relative change of ``Close`` between the current row
    and the following row. The observation is the flattened block of the
    ``lookback_window`` rows that precede the current row.

    Args:
        data: DataFrame containing an ``OpenTime`` column (dropped) and a
            ``Close`` column; every remaining column is cast to float32.
        lookback_window: Number of past rows packed into one observation.
        transaction_cost: Stored on the instance. The reward scheme below
            does not use it.

    Raises:
        ValueError: If ``data`` has fewer than ``lookback_window + 2`` rows,
            i.e. not enough for a single step after the warm-up window.
    """

    # |relative move| below this is reported as "stable" and rewards Hold.
    STABLE_THRESHOLD = 0.001
    # |relative move| above this triggers the stronger reward / penalty.
    BIG_MOVE_THRESHOLD = 0.01

    def __init__(self, data, lookback_window=12, transaction_cost=0.001):
        super().__init__()
        self.data = data.drop(columns=['OpenTime']).astype(np.float32)

        self.lookback = lookback_window
        self.transaction_cost = transaction_cost

        if len(self.data) < self.lookback + 2:
            raise ValueError(
                f"need at least {self.lookback + 2} rows for lookback_window="
                f"{self.lookback}, got {len(self.data)}"
            )

        # Plain arrays avoid building a pandas Series on every step.
        self._features = self.data.values
        self._close = self.data['Close'].values

        obs_shape = (self.lookback * self.data.shape[1],)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=obs_shape, dtype=np.float32)
        self.action_space = spaces.Discrete(3)  # Hold, Buy, Sell

        self.current_step = 0

    def reset(self, *, seed=None, options=None):
        """Rewind to the first row that has a full lookback window behind it.

        Returns:
            ``(observation, info)`` where ``info`` is an empty dict.
        """
        # Lets gymnasium seed ``self.np_random`` as its API expects.
        super().reset(seed=seed)
        self.current_step = self.lookback  # start after enough history
        return self._get_obs(), {}

    @classmethod
    def _reward_for(cls, action, delta_pct):
        """Map an action and the next relative price change to a float reward."""
        big = cls.BIG_MOVE_THRESHOLD
        if action == 1:  # Buy
            if delta_pct > 0:
                return 1.5 if delta_pct > big else 1.0      # bigger reward for a strong rise
            return -1.5 if delta_pct < -big else -1.0       # harsher penalty for a strong drop
        if action == 2:  # Sell
            if delta_pct < 0:
                return 1.5 if delta_pct < -big else 1.0     # bigger reward for a strong drop
            return -1.5 if delta_pct > big else -1.0        # harsher penalty for a strong rise
        # Hold (any other action value)
        magnitude = abs(delta_pct)
        if magnitude < cls.STABLE_THRESHOLD:  # very small move
            return 0.2                        # slight positive for being safe
        if magnitude < big:                   # small noise
            return 0.0
        return -1.0                           # missed a big move

    def step(self, action):
        """Score ``action`` against the next close and advance one row.

        Returns:
            ``(obs, reward, terminated, truncated, info)``. ``truncated`` is
            always ``False``; on the terminal step ``obs`` is all zeros.
            ``info`` carries ``step``, ``delta``, ``delta_pct``,
            ``threshold_pct``, ``reward`` and ``action``.
        """
        current_close = self._close[self.current_step]
        next_close = self._close[self.current_step + 1]
        delta = next_close - current_close
        delta_pct = delta / current_close

        # Threshold for "stable"
        threshold_pct = self.STABLE_THRESHOLD

        reward = self._reward_for(action, delta_pct)

        info = {
            'step': self.current_step,
            'delta': delta,
            'delta_pct': delta_pct,
            'threshold_pct': threshold_pct,
            'reward': reward,
            'action': action
        }

        self.current_step += 1
        # The next step would need row current_step + 1, so stop one row early.
        terminated = self.current_step >= len(self.data) - 1
        truncated = False

        if terminated:
            obs = np.zeros(self.observation_space.shape, dtype=np.float32)
        else:
            obs = self._get_obs()
        return obs, reward, terminated, truncated, info

    def _get_obs(self):
        """Return rows ``[current_step - lookback, current_step)`` flattened row by row."""
        # NOTE(review): the window stops before the current row, so the close
        # used as the reward's reference price is not part of the observation
        # -- confirm this is intended.
        start = self.current_step - self.lookback
        end = self.current_step
        return self._features[start:end].flatten().astype(np.float32, copy=False)


# =========================
# Evaluation function
# =========================
def evaluate_model(model, vec_env, verbose=True):
    """Play one deterministic episode and print an accuracy / reward summary.

    A step counts as "correct" when its reward is non-negative.

    Args:
        model: Object whose ``predict(obs, deterministic=True)`` returns
            ``(actions, state)``.
        vec_env: Vectorised environment whose ``reset()`` returns the first
            observations and whose ``step(actions)`` returns the 4-tuple
            ``(obs, rewards, dones, infos)``. Only index 0 is read. Each info
            dict must carry ``'delta_pct'``, ``'threshold_pct'`` and
            ``'step'``.
        verbose: When true (default) print one line per step. The final
            summary is always printed.

    Returns:
        float: Sum of the rewards collected during the episode.
    """
    action_names = ('Hold', 'Buy', 'Sell')

    obs = vec_env.reset()
    done = False
    total_steps = 0
    num_correct = 0
    total_reward = 0.0

    print("Starting evaluation episode")
    while not done:
        action, _ = model.predict(obs, deterministic=True)
        # Vectorised envs return a single `dones` array, not the
        # terminated/truncated pair of a raw gymnasium env.
        obs, rewards, dones, infos = vec_env.step(action)
        done = bool(dones[0])

        info = infos[0]
        # Convert to a Python float so the running total is not accumulated
        # in float32.
        reward = float(rewards[0])

        if verbose:
            delta_pct = info['delta_pct'] * 100
            step = info['step']
            action_str = action_names[action[0]]
            if abs(delta_pct) < info['threshold_pct'] * 100:
                movement = 'stable'
            elif delta_pct > 0:
                movement = 'up'
            else:
                movement = 'down'

            print(f"Step {step}, chosen action {action_str}, next price movement {movement} ({delta_pct:.2f}%), reward {reward:.4f}")

        total_steps += 1
        total_reward += reward
        if reward >= 0:
            num_correct += 1

    accuracy = (num_correct / total_steps) * 100 if total_steps > 0 else 0
    print("Evaluation completed.")
    print(f"Final evaluation accuracy: {accuracy:.2f}%")
    print(f"Total cumulative reward: {total_reward:.4f}")
    return total_reward


# =========================
# Main
# =========================
if __name__ == "__main__":
    # Rows per observation. A model saved under `model_path` only fits an
    # environment built with the same window, since the window sets the
    # observation size.
    lookback_window = 12
    n_episodes = 50

    df = pd.read_csv('BTC-USD_with_Indicators.csv')
    env = BTCTradingEnv(df, lookback_window=lookback_window, transaction_cost=0.001)
    vec_env = DummyVecEnv([lambda: env])

    model_path = 'btc_dqn_agent.zip'
    trained = False

    if os.path.exists(model_path):
        # Reuse a previously saved agent instead of retraining.
        model = DQN.load(model_path, env=vec_env)
        print("Loaded existing model.")
    else:
        model = DQN("MlpPolicy", vec_env, verbose=1)
        callback = CustomCallback()

        # One episode starts at row `lookback_window` and ends one row before
        # the last, so it is this many steps long.
        steps_per_episode = len(df) - 1 - lookback_window
        total_timesteps = n_episodes * steps_per_episode

        model.learn(total_timesteps=total_timesteps, callback=callback)
        model.save(model_path)
        print("Trained and saved new model.")
        trained = True

        # One point per completed training episode.
        plt.plot(callback.episode_rewards)
        plt.xlabel('Episode')
        plt.ylabel('Total Reward')
        plt.title('Training Reward History')
        plt.show()

    evaluate_model(model, vec_env)


Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


Using cpu device


In [None]:
import os
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
import matplotlib.pyplot as plt
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import BaseCallback
import optuna

# =========================
# Custom callback for printing during training
# =========================
class CustomCallback(BaseCallback):
    """Print a per-step trade log and collect per-episode reward totals.

    After every environment step the callback prints the chosen action, the
    next price movement reported in the environment's ``info`` dict and the
    reward, followed by the running percentage of steps whose reward was
    non-negative. An ``Episode N`` header is printed on the first step of
    each episode, and when an episode ends its summed reward is appended to
    ``episode_rewards``.

    Only the first environment of the vectorised env (index 0) is inspected.

    Args:
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    # Index in this tuple == discrete action id used by the environment.
    ACTION_NAMES = ('Hold', 'Buy', 'Sell')

    def __init__(self, verbose=0):
        super().__init__(verbose)
        self.episode = 0            # number of completed episodes
        self.num_correct = 0        # steps whose reward was >= 0
        self.total_steps = 0        # steps seen so far (across episodes)
        self.current_ep_reward = 0  # reward accumulated in the running episode
        self.episode_rewards = []   # one total per completed episode
        # True until the header for the running episode has been printed.
        self._episode_header_pending = True

    def _on_step(self) -> bool:
        """Report the step that was just taken.

        Returns:
            Always ``True`` (this callback never requests an early stop).
        """
        info = self.locals['infos'][0]
        action = self.locals['actions'][0]
        reward = self.locals['rewards'][0]
        done = self.locals['dones'][0]

        # The step index in `info` starts at the environment's lookback
        # offset, never at 0, so episode starts are tracked with a flag
        # rather than by testing `step == 0`.
        if self._episode_header_pending:
            print(f"Episode {self.episode + 1}")
            self._episode_header_pending = False

        step = info['step']
        delta_pct = info['delta_pct'] * 100  # convert to % for readability
        action_str = self.ACTION_NAMES[action]

        if abs(delta_pct) < info['threshold_pct'] * 100:
            movement = 'stable'
        elif delta_pct > 0:
            movement = 'up'
        else:
            movement = 'down'
        movement_str = f"{movement} ({delta_pct:.2f}%)"

        print(f"Step {step}, chosen action {action_str}, next price movement {movement_str}, reward {reward:.4f}")

        # Track accuracy (reward >= 0 considered correct)
        self.total_steps += 1
        if reward >= 0:
            self.num_correct += 1
        accuracy = (self.num_correct / self.total_steps) * 100
        print(f"Running accuracy: {accuracy:.2f}%")

        self.current_ep_reward += reward

        if done:
            print(f"Episode {self.episode + 1} completed.")
            self.episode_rewards.append(self.current_ep_reward)
            self.current_ep_reward = 0
            self.episode += 1
            self._episode_header_pending = True

        return True

# =========================
# Custom BTC Trading Environment
# =========================
class BTCTradingEnv(gym.Env):
    """Environment that scores Hold/Buy/Sell calls against the next close.

    At every step the agent chooses Hold (0), Buy (1) or Sell (2). The reward
    depends only on the relative change of ``Close`` between the current row
    and the following row. The observation is the flattened block of the
    ``lookback_window`` rows that precede the current row.

    Args:
        data: DataFrame containing an ``OpenTime`` column (dropped) and a
            ``Close`` column; every remaining column is cast to float32.
        lookback_window: Number of past rows packed into one observation.
        transaction_cost: Stored on the instance. The reward scheme below
            does not use it.

    Raises:
        ValueError: If ``data`` has fewer than ``lookback_window + 2`` rows,
            i.e. not enough for a single step after the warm-up window.
    """

    # |relative move| below this is reported as "stable" and rewards Hold.
    STABLE_THRESHOLD = 0.001
    # |relative move| above this triggers the stronger reward / penalty.
    BIG_MOVE_THRESHOLD = 0.01

    def __init__(self, data, lookback_window=12, transaction_cost=0.001):
        super().__init__()
        self.data = data.drop(columns=['OpenTime']).astype(np.float32)

        self.lookback = lookback_window
        self.transaction_cost = transaction_cost

        if len(self.data) < self.lookback + 2:
            raise ValueError(
                f"need at least {self.lookback + 2} rows for lookback_window="
                f"{self.lookback}, got {len(self.data)}"
            )

        # Plain arrays avoid building a pandas Series on every step.
        self._features = self.data.values
        self._close = self.data['Close'].values

        obs_shape = (self.lookback * self.data.shape[1],)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=obs_shape, dtype=np.float32)
        self.action_space = spaces.Discrete(3)  # Hold, Buy, Sell

        self.current_step = 0

    def reset(self, *, seed=None, options=None):
        """Rewind to the first row that has a full lookback window behind it.

        Returns:
            ``(observation, info)`` where ``info`` is an empty dict.
        """
        # Lets gymnasium seed ``self.np_random`` as its API expects.
        super().reset(seed=seed)
        self.current_step = self.lookback  # start after enough history
        return self._get_obs(), {}

    @classmethod
    def _reward_for(cls, action, delta_pct):
        """Map an action and the next relative price change to a float reward."""
        big = cls.BIG_MOVE_THRESHOLD
        if action == 1:  # Buy
            if delta_pct > 0:
                return 1.5 if delta_pct > big else 1.0      # bigger reward for a strong rise
            return -1.5 if delta_pct < -big else -1.0       # harsher penalty for a strong drop
        if action == 2:  # Sell
            if delta_pct < 0:
                return 1.5 if delta_pct < -big else 1.0     # bigger reward for a strong drop
            return -1.5 if delta_pct > big else -1.0        # harsher penalty for a strong rise
        # Hold (any other action value)
        magnitude = abs(delta_pct)
        if magnitude < cls.STABLE_THRESHOLD:  # very small move
            return 0.2                        # slight positive for being safe
        if magnitude < big:                   # small noise
            return 0.0
        return -1.0                           # missed a big move

    def step(self, action):
        """Score ``action`` against the next close and advance one row.

        Returns:
            ``(obs, reward, terminated, truncated, info)``. ``truncated`` is
            always ``False``; on the terminal step ``obs`` is all zeros.
            ``info`` carries ``step``, ``delta``, ``delta_pct``,
            ``threshold_pct``, ``reward`` and ``action``.
        """
        current_close = self._close[self.current_step]
        next_close = self._close[self.current_step + 1]
        delta = next_close - current_close
        delta_pct = delta / current_close

        # Threshold for "stable"
        threshold_pct = self.STABLE_THRESHOLD

        reward = self._reward_for(action, delta_pct)

        info = {
            'step': self.current_step,
            'delta': delta,
            'delta_pct': delta_pct,
            'threshold_pct': threshold_pct,
            'reward': reward,
            'action': action
        }

        self.current_step += 1
        # The next step would need row current_step + 1, so stop one row early.
        terminated = self.current_step >= len(self.data) - 1
        truncated = False

        if terminated:
            obs = np.zeros(self.observation_space.shape, dtype=np.float32)
        else:
            obs = self._get_obs()
        return obs, reward, terminated, truncated, info  # Return 5 values

    def _get_obs(self):
        """Return rows ``[current_step - lookback, current_step)`` flattened row by row."""
        # NOTE(review): the window stops before the current row, so the close
        # used as the reward's reference price is not part of the observation
        # -- confirm this is intended.
        start = self.current_step - self.lookback
        end = self.current_step
        return self._features[start:end].flatten().astype(np.float32, copy=False)

# =========================
# Evaluation function
# =========================
def evaluate_model(model, vec_env, verbose=True):
    """Play one deterministic episode and print an accuracy / reward summary.

    A step counts as "correct" when its reward is non-negative.

    Args:
        model: Object whose ``predict(obs, deterministic=True)`` returns
            ``(actions, state)``.
        vec_env: Vectorised environment whose ``reset()`` returns the first
            observations and whose ``step(actions)`` returns the 4-tuple
            ``(obs, rewards, dones, infos)``. Only index 0 is read. Each info
            dict must carry ``'delta_pct'``, ``'threshold_pct'`` and
            ``'step'``.
        verbose: When true (default) print one line per step. The final
            summary is always printed.

    Returns:
        float: Sum of the rewards collected during the episode.
    """
    action_names = ('Hold', 'Buy', 'Sell')

    obs = vec_env.reset()
    done = False
    total_steps = 0
    num_correct = 0
    total_reward = 0.0

    print("Starting evaluation episode")
    while not done:
        action, _ = model.predict(obs, deterministic=True)
        obs, rewards, dones, infos = vec_env.step(action)  # vectorised envs return 4 values
        done = bool(dones[0])  # arrays with one entry per sub-environment

        info = infos[0]
        # Convert to a Python float so the running total is not accumulated
        # in float32 (which drifts, e.g. -148.0005... instead of -148.0).
        reward = float(rewards[0])

        if verbose:
            delta_pct = info['delta_pct'] * 100
            step = info['step']
            action_str = action_names[action[0]]
            if abs(delta_pct) < info['threshold_pct'] * 100:
                movement = 'stable'
            elif delta_pct > 0:
                movement = 'up'
            else:
                movement = 'down'

            print(f"Step {step}, chosen action {action_str}, next price movement {movement} ({delta_pct:.2f}%), reward {reward:.4f}")

        total_steps += 1
        total_reward += reward
        if reward >= 0:
            num_correct += 1

    accuracy = (num_correct / total_steps) * 100 if total_steps > 0 else 0
    print("Evaluation completed.")
    print(f"Final evaluation accuracy: {accuracy:.2f}%")
    print(f"Total cumulative reward: {total_reward:.4f}")
    return total_reward

# =========================
# Main
# =========================
if __name__ == "__main__":
    # Rows per observation. A model saved under `model_path` only fits an
    # environment built with the same window, since the window sets the
    # observation size.
    lookback_window = 12

    df = pd.read_csv('BTC-USD_with_Indicators.csv')

    # Positional split: first 80% of the rows for training, the rest held out.
    train_size = int(0.8 * len(df))
    train_df = df[:train_size]
    val_df = df[train_size:]

    env = BTCTradingEnv(train_df, lookback_window=lookback_window, transaction_cost=0.001)
    vec_env = DummyVecEnv([lambda: env])

    val_env = DummyVecEnv([lambda: BTCTradingEnv(val_df, lookback_window=lookback_window, transaction_cost=0.001)])

    model_path = 'btc_dqn_agent.zip'
    trained = False

    # One training episode starts at row `lookback_window` and ends one row
    # before the last, so it is this many steps long.
    steps_per_episode = len(train_df) - 1 - lookback_window

    if os.path.exists(model_path):
        # Reuse a previously saved agent instead of searching/retraining.
        model = DQN.load(model_path, env=vec_env)
        print("Loaded existing model.")
    else:
        def objective(trial):
            """Train briefly with sampled hyperparameters; score on the held-out env."""
            learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
            buffer_size = trial.suggest_int("buffer_size", 10000, 1000000, log=True)

            model = DQN(
                "MlpPolicy",
                vec_env,
                learning_rate=learning_rate,
                buffer_size=buffer_size,
                verbose=0
            )

            # Three episodes' worth of steps keeps each trial short.
            model.learn(total_timesteps=3 * steps_per_episode)

            total_reward = evaluate_model(model, val_env)
            return total_reward

        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=4)  # 4 trials

        best_params = study.best_params

        # Retrain from scratch with the best hyperparameters found.
        model = DQN(
            "MlpPolicy",
            vec_env,
            **best_params,
            verbose=1
        )
        callback = CustomCallback()

        total_timesteps = 50 * steps_per_episode

        model.learn(total_timesteps=total_timesteps, callback=callback)
        model.save(model_path)
        print("Trained and saved new model.")
        trained = True

        # One point per completed training episode.
        plt.plot(callback.episode_rewards)
        plt.xlabel('Episode')
        plt.ylabel('Total Reward')
        plt.title('Training Reward history')
        plt.show()

    # Report on the held-out rows; scoring on the rows the agent was trained
    # on would overstate its accuracy.
    evaluate_model(model, val_env)


[I 2025-09-29 11:01:50,027] A new study created in memory with name: no-name-1c395fc7-51a2-42c8-a113-f01810cd04ba


Starting evaluation episode
Step 12, chosen action Buy, next price movement up (0.42%), reward 1.0000
Step 13, chosen action Buy, next price movement up (0.13%), reward 1.0000
Step 14, chosen action Buy, next price movement up (0.28%), reward 1.0000
Step 15, chosen action Buy, next price movement stable (-0.08%), reward -1.0000
Step 16, chosen action Buy, next price movement stable (0.05%), reward 1.0000
Step 17, chosen action Buy, next price movement stable (0.06%), reward 1.0000
Step 18, chosen action Buy, next price movement up (0.37%), reward 1.0000
Step 19, chosen action Buy, next price movement stable (-0.08%), reward -1.0000
Step 20, chosen action Buy, next price movement up (0.81%), reward 1.0000
Step 21, chosen action Buy, next price movement stable (0.04%), reward 1.0000
Step 22, chosen action Buy, next price movement down (-0.33%), reward -1.0000
Step 23, chosen action Buy, next price movement up (2.07%), reward 1.5000
Step 24, chosen action Buy, next price movement up (0.61

[I 2025-09-29 11:05:55,021] Trial 0 finished with value: 165.5 and parameters: {'learning_rate': 0.0004008478033363527, 'buffer_size': 10185}. Best is trial 0 with value: 165.5.


Step 11722, chosen action Buy, next price movement down (-0.64%), reward -1.0000
Step 11723, chosen action Buy, next price movement down (-0.17%), reward -1.0000
Step 11724, chosen action Buy, next price movement up (0.14%), reward 1.0000
Step 11725, chosen action Buy, next price movement up (0.11%), reward 1.0000
Step 11726, chosen action Buy, next price movement stable (0.07%), reward 1.0000
Step 11727, chosen action Buy, next price movement up (0.53%), reward 1.0000
Step 11728, chosen action Buy, next price movement up (0.40%), reward 1.0000
Step 11729, chosen action Buy, next price movement down (-0.33%), reward -1.0000
Step 11730, chosen action Buy, next price movement down (-0.28%), reward -1.0000
Step 11731, chosen action Buy, next price movement down (-0.92%), reward -1.0000
Step 11732, chosen action Buy, next price movement up (0.11%), reward 1.0000
Step 11733, chosen action Buy, next price movement stable (-0.09%), reward -1.0000
Step 11734, chosen action Buy, next price move

[I 2025-09-29 11:10:48,819] Trial 1 finished with value: -148.00054931640625 and parameters: {'learning_rate': 1.3289685424051776e-05, 'buffer_size': 325007}. Best is trial 0 with value: 165.5.


Step 11728, chosen action Sell, next price movement up (0.40%), reward -1.0000
Step 11729, chosen action Sell, next price movement down (-0.33%), reward 1.0000
Step 11730, chosen action Sell, next price movement down (-0.28%), reward 1.0000
Step 11731, chosen action Sell, next price movement down (-0.92%), reward 1.0000
Step 11732, chosen action Hold, next price movement up (0.11%), reward 0.0000
Step 11733, chosen action Hold, next price movement stable (-0.09%), reward 0.2000
Step 11734, chosen action Hold, next price movement up (0.26%), reward 0.0000
Step 11735, chosen action Sell, next price movement down (-0.30%), reward 1.0000
Step 11736, chosen action Sell, next price movement stable (-0.05%), reward 1.0000
Step 11737, chosen action Sell, next price movement stable (0.01%), reward -1.0000
Step 11738, chosen action Sell, next price movement up (0.14%), reward -1.0000
Step 11739, chosen action Sell, next price movement down (-0.16%), reward 1.0000
Step 11740, chosen action Sell, 

[I 2025-09-29 11:15:18,296] Trial 2 finished with value: -158.60012817382812 and parameters: {'learning_rate': 3.418652629271945e-05, 'buffer_size': 159067}. Best is trial 0 with value: 165.5.


Step 11664, chosen action Sell, next price movement stable (0.07%), reward -1.0000
Step 11665, chosen action Sell, next price movement stable (0.08%), reward -1.0000
Step 11666, chosen action Sell, next price movement down (-0.12%), reward 1.0000
Step 11667, chosen action Sell, next price movement up (0.33%), reward -1.0000
Step 11668, chosen action Sell, next price movement up (0.28%), reward -1.0000
Step 11669, chosen action Sell, next price movement up (0.58%), reward -1.0000
Step 11670, chosen action Sell, next price movement up (0.13%), reward -1.0000
Step 11671, chosen action Sell, next price movement down (-0.41%), reward 1.0000
Step 11672, chosen action Sell, next price movement down (-0.25%), reward 1.0000
Step 11673, chosen action Sell, next price movement up (0.14%), reward -1.0000
Step 11674, chosen action Sell, next price movement stable (0.00%), reward -1.0000
Step 11675, chosen action Sell, next price movement up (0.16%), reward -1.0000
Step 11676, chosen action Sell, ne