# In [None]:
import os
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
import matplotlib.pyplot as plt
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import BaseCallback

# Custom callback for printing during training and collecting reward history
class CustomCallback(BaseCallback):
    """Print per-step training details and record each episode's total reward.

    Only the entries at index 0 of the ``infos``, ``actions``, ``rewards`` and
    ``dones`` training locals are read, so a single environment is tracked.
    Each ``info`` dict must contain the keys ``'step'``, ``'delta'`` and
    ``'threshold'``.
    """

    def __init__(self, verbose=0):
        super().__init__(verbose)
        # Number of episodes completed so far
        self.episode = 0
        # Number of steps whose reward was exactly 1 (across all episodes)
        self.num_correct = 0
        # Number of steps observed (across all episodes)
        self.total_steps = 0
        # Reward accumulated in the episode currently in progress
        self.current_ep_reward = 0
        # Total reward of every finished episode, in completion order
        self.episode_rewards = []

    def _on_step(self) -> bool:
        """Log the latest environment step and update the running statistics.

        Returns:
            Always True. Per the Stable-Baselines3 callback API, returning
            False would stop training early.
        """
        # Pull the first environment's values out of the training locals and
        # convert numpy scalars to plain Python values
        info = self.locals['infos'][0]
        action = int(self.locals['actions'][0])
        reward = float(self.locals['rewards'][0])
        done = bool(self.locals['dones'][0])

        # Extract info from environment step
        step = info['step']
        delta = info['delta']

        # Announce the episode before its first step is logged, so the header
        # precedes the step lines it belongs to
        if step == 0:
            print(f"Episode {self.episode + 1}")

        # Map action to string
        action_str = ['Hold', 'Buy', 'Sell'][action]

        # Classify the price movement: changes smaller than the threshold
        # supplied by the environment count as 'stable'
        if abs(delta) < info['threshold']:
            movement = 'stable'
        elif delta > 0:
            movement = 'up'
        else:
            movement = 'down'
        movement_str = f"{movement} ({delta:.2f})"

        # Print per step information
        print(f"Step {step}, chosen action {action_str}, next price movement {movement_str}, reward {reward}")

        # Update accuracy counters; a step counts as correct when reward == 1
        self.total_steps += 1
        if reward == 1:
            self.num_correct += 1
        accuracy = (self.num_correct / self.total_steps) * 100
        print(f"Running accuracy: {accuracy:.2f}%")

        # Update current episode reward
        self.current_ep_reward += reward

        # On episode end, store the episode total and start a fresh tally
        if done:
            print(f"Episode {self.episode + 1} completed.")
            self.episode_rewards.append(self.current_ep_reward)
            self.current_ep_reward = 0
            self.episode += 1

        return True

# Custom Gymnasium environment for BTC/USD trading
class BTCTradingEnv(gym.Env):
    """Environment that rewards correct calls on the next close-price move.

    Each observation is one row of the feature table (every column except
    ``OpenTime``, cast to float32). At each step the agent picks Hold (0),
    Buy (1) or Sell (2) and receives +1 or -1 depending on how the next
    ``Close`` compares with the current one. An episode walks through the
    table once, from the first row to the second-to-last.
    """

    def __init__(self, data):
        """Build the environment from a DataFrame.

        Args:
            data: DataFrame containing an ``OpenTime`` column (dropped), a
                ``Close`` column, and otherwise only columns castable to
                float32.
        """
        super().__init__()
        # Store the data (dropping OpenTime) with every column as float32
        self.data = data.drop(columns=['OpenTime']).astype(np.float32)
        # Observation space: one unbounded float per remaining column. The
        # width is taken from the data so it always matches what _get_obs
        # returns instead of assuming a fixed column count.
        self.observation_space = spaces.Box(
            low=-np.inf,
            high=np.inf,
            shape=(self.data.shape[1],),
            dtype=np.float32,
        )
        # Action space: 0=Hold, 1=Buy, 2=Sell
        self.action_space = spaces.Discrete(3)
        # Current step in the episode
        self.current_step = 0

    def reset(self, *, seed=None, options=None):
        """Rewind to the first row and return ``(observation, info)``.

        Args:
            seed: Forwarded to ``gym.Env.reset`` so the environment's random
                generator is seeded as the Gymnasium API expects.
            options: Accepted for API compatibility; unused.
        """
        super().reset(seed=seed)
        # Reset to the beginning of the dataset
        self.current_step = 0
        obs = self._get_obs()
        return obs, {}

    def step(self, action):
        """Score ``action`` against the next price move and advance one row.

        Args:
            action: 0 (Hold), 1 (Buy) or 2 (Sell).

        Returns:
            ``(obs, reward, terminated, truncated, info)``. ``reward`` is 1 or
            -1, ``truncated`` is always False, and ``obs`` is all zeros once
            the episode has terminated. ``info`` carries the step index, the
            price delta, the threshold, the reward and the action.

        Raises:
            IndexError: If called when no next row is available (the episode
                already ended, or the table has fewer than two rows).
            ValueError: If ``action`` is not 0, 1 or 2.
        """
        last_valid_step = len(self.data) - 2
        if self.current_step > last_valid_step:
            # Same exception type the row lookup would raise, with a clearer message
            raise IndexError(
                "step() called with no next row available; call reset() first"
            )

        # Get current and next close prices
        current_close = self.data.iloc[self.current_step]['Close']
        next_close = self.data.iloc[self.current_step + 1]['Close']
        delta = next_close - current_close

        # Define relative threshold for 'small' change (0.1% of current price)
        threshold = 0.001 * current_close

        # Compute reward based on action and price movement
        choice = int(action)
        if choice == 0:  # Hold is right when the price barely moves
            reward = 1 if abs(delta) < threshold else -1
        elif choice == 1:  # Buy is right when the price rises
            reward = 1 if delta > 0 else -1
        elif choice == 2:  # Sell is right when the price falls
            reward = 1 if delta < 0 else -1
        else:
            # Without this branch an unknown action would leave reward unbound
            raise ValueError(f"Invalid action {action!r}; expected 0, 1 or 2")

        # Prepare info dict for callback/evaluation; 'step' is the index of
        # the row the action was taken on (before advancing)
        info = {
            'step': self.current_step,
            'delta': delta,
            'threshold': threshold,
            'reward': reward,
            'action': action
        }

        # Advance to the next step
        self.current_step += 1

        # The episode ends when there is no further row to compare against
        terminated = self.current_step >= len(self.data) - 1
        truncated = False

        # Get next observation if not done; otherwise return a zero placeholder
        if terminated:
            obs = np.zeros(self.observation_space.shape, dtype=np.float32)
        else:
            obs = self._get_obs()

        return obs, reward, terminated, truncated, info

    def _get_obs(self):
        """Return the current row as a float32 array."""
        return self.data.iloc[self.current_step].to_numpy(dtype=np.float32)

# Function to run evaluation with prints
def evaluate_model(model, vec_env):
    """Run one deterministic evaluation episode, printing every step.

    Args:
        model: Object exposing ``predict(obs, deterministic=True)`` that
            returns ``(actions, state)``; ``actions[0]`` must be 0, 1 or 2.
        vec_env: Vectorized environment following the Stable-Baselines3
            VecEnv convention: ``reset()`` returns observations and
            ``step(actions)`` returns ``(obs, rewards, dones, infos)``. Only
            index 0 is read. Each info dict must contain ``'delta'``,
            ``'step'`` and ``'threshold'``.

    Returns:
        The final accuracy in percent, i.e. the share of steps whose reward
        was exactly 1.
    """
    action_names = ('Hold', 'Buy', 'Sell')

    obs = vec_env.reset()
    done = False
    total_steps = 0
    num_correct = 0
    accuracy = 0.0
    print("Starting evaluation episode")
    while not done:
        action, _ = model.predict(obs, deterministic=True)
        # VecEnv.step returns four values: terminated/truncated are already
        # merged into a single `dones` array (unlike a raw Gymnasium env)
        obs, rewards, dones, infos = vec_env.step(action)
        done = bool(dones[0])  # Since single env
        info = infos[0]
        reward = rewards[0]
        delta = info['delta']
        step = info['step']

        # Map action to string (action is an array; take the single env's entry)
        action_str = action_names[int(action[0])]

        # Classify the price movement: changes smaller than the threshold
        # supplied by the environment count as 'stable'
        if abs(delta) < info['threshold']:
            movement = 'stable'
        elif delta > 0:
            movement = 'up'
        else:
            movement = 'down'
        movement_str = f"{movement} ({delta:.2f})"

        # Print per step
        print(f"Step {step}, chosen action {action_str}, next price movement {movement_str}, reward {reward}")

        # Update accuracy; a step counts as correct when reward == 1
        total_steps += 1
        if reward == 1:
            num_correct += 1
        accuracy = (num_correct / total_steps) * 100
        print(f"Running accuracy: {accuracy:.2f}%")

    print("Evaluation completed.")
    print(f"Final evaluation accuracy: {accuracy:.2f}%")
    return accuracy

# Main script
if __name__ == "__main__":
    # Read the price/indicator table from disk
    df = pd.read_csv('BTC-USD_with_Indicators.csv')

    # Build one trading environment and wrap it in a single-env vectorized wrapper
    env = BTCTradingEnv(df)
    vec_env = DummyVecEnv([lambda: env])

    model_path = 'btc_dqn_agent.zip'
    trained = False

    if not os.path.exists(model_path):
        # No saved agent on disk: build a fresh DQN and train it
        callback = CustomCallback()
        model = DQN("MlpPolicy", vec_env, verbose=1)

        # Training budget: five passes over the dataset, where one pass
        # (episode) takes one step per row except the last
        steps_per_episode = len(df) - 1
        total_timesteps = steps_per_episode * 5

        model.learn(total_timesteps=total_timesteps, callback=callback)
        model.save(model_path)
        print("Trained and saved new model.")
        trained = True

        # Show how the per-episode total reward evolved during training
        plt.plot(callback.episode_rewards)
        plt.xlabel('Episode')
        plt.ylabel('Total Reward')
        plt.title('Training Reward History')
        plt.show()
    else:
        # A saved agent exists: reuse it instead of training again
        model = DQN.load(model_path, env=vec_env)
        print("Loaded existing model.")

    # Finish with one verbose evaluation episode
    evaluate_model(model, vec_env)