In [3]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from gym import spaces
import requests
import math
import os

# Stable-Baselines3 for production-grade RL algorithms
from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.monitor import Monitor

# -------------------------------
# Data Fetching (Historical Prices)
# -------------------------------
def fetch_historical_data(quote="KRW", target="ETH", hours=5000):
    """
    Download hourly candlesticks from Bithumb's public API and return one
    price per candle.

    Args:
        quote: Quote currency symbol used in the market pair (e.g. "KRW").
        target: Asset symbol used in the market pair (e.g. "ETH").
        hours: Number of most recent candles to keep (applied as a
            ``[-hours:]`` slice on the returned candle list).

    Returns:
        A 1-D ``np.float32`` array built from field index 2 of each candle
        (presumably the close price -- confirm against the Bithumb API docs),
        or ``None`` when the request fails, the API reports a non-"0000"
        status, or no candles are returned. Failures are printed, not raised.
    """
    endpoint = f"https://api.bithumb.com/public/candlestick/{target}_{quote}/1h"
    try:
        resp = requests.get(endpoint, timeout=10)
        resp.raise_for_status()
        payload = resp.json()

        # The API signals success in the JSON body via status "0000".
        status_ok = payload.get("status") == "0000"
        if not status_ok:
            print(f"API Error: {payload.get('message', 'Unknown error')}")
            return None

        rows = payload.get("data", [])
        if not rows:
            print("No hourly data found in response.")
            return None

        # Keep only the trailing `hours` candles, then pull one field from each.
        recent_rows = rows[-hours:]
        series = [float(row[2]) for row in recent_rows]
        return np.array(series, dtype=np.float32)

    except Exception as exc:
        # Best-effort fetch: any failure (network, JSON, malformed rows) is
        # reported and turned into a None result for the caller to check.
        print(f"Error fetching hourly data: {str(exc)}")
        return None

# -------------------------------
# Custom Trading Environment
# -------------------------------
class TradingEnv(gym.Env):
    """
    Single-asset, long-only trading environment with a continuous action.

    Action (Box, shape (1,), in [-1, 1]):
        > 0: buy, spending that fraction of the current cash balance
        < 0: sell that fraction of the current position
        = 0: hold

    Observation (float32 vector of length ``window_size + 3``):
        * the previous ``window_size`` prices, z-scored within the window
        * cash balance divided by the initial balance
        * current position in asset units (not normalized)
        * standard deviation of simple returns inside the window

    Reward:
        Log change in net worth since the previous step, minus a linear
        penalty for drawdown beyond ``max_drawdown``, plus (on the final step)
        ``longterm_weight`` times the overall fractional profit.

    The class follows the classic Gym API used by the rest of this file:
    ``reset()`` returns only the observation and ``step()`` returns the
    4-tuple ``(obs, reward, done, info)``.
    """
    metadata = {"render.modes": ["human"]}

    def __init__(self, prices, window_size=30, initial_balance=1e6, longterm_weight=0.1, max_drawdown=0.3):
        """
        Args:
            prices: 1-D sequence of prices; must contain more than
                ``window_size`` entries.
            window_size: Number of past prices included in each observation.
            initial_balance: Starting cash; must be positive because it is
                used as a divisor for normalization.
            longterm_weight: Weight of the end-of-episode profit bonus.
            max_drawdown: Drawdown fraction above which the reward is
                penalized.

        Raises:
            ValueError: If ``window_size`` < 1, ``prices`` is not 1-D or is
                too short for one full window plus one tradable step, or
                ``initial_balance`` is not positive.
        """
        super().__init__()

        # Coerce once so list/tuple inputs support the array arithmetic used
        # in _get_observation; an ndarray passes through unchanged.
        price_array = np.asarray(prices)
        if window_size < 1:
            raise ValueError(f"window_size must be >= 1, got {window_size}")
        if price_array.ndim != 1:
            raise ValueError("prices must be a 1-D sequence")
        if len(price_array) <= window_size:
            # reset() starts at index window_size and step() reads the price
            # at that index, so at least window_size + 1 prices are needed.
            raise ValueError(
                f"prices must contain more than window_size={window_size} "
                f"entries, got {len(price_array)}"
            )
        if not initial_balance > 0:
            raise ValueError(f"initial_balance must be positive, got {initial_balance}")

        self.prices = price_array
        self.window_size = window_size
        self.initial_balance = initial_balance
        self.longterm_weight = longterm_weight  # bonus weight at the end of the episode
        self.max_drawdown = max_drawdown        # risk constraint: maximum allowed drawdown (as fraction)

        # Continuous action: one number in [-1, 1]
        self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32)

        # Observation: window_size normalized prices + 3 extra features:
        # [window_size prices, normalized balance, current position, volatility]
        obs_low = -np.inf * np.ones(window_size + 3, dtype=np.float32)
        obs_high = np.inf * np.ones(window_size + 3, dtype=np.float32)
        self.observation_space = spaces.Box(low=obs_low, high=obs_high, dtype=np.float32)

        self.reset()

    def reset(self):
        """Restore the starting portfolio and return the first observation."""
        # Start far enough in that a full look-back window is available.
        self.current_step = self.window_size
        self.balance = self.initial_balance
        self.position = 0.0  # asset units held
        self.net_worth = self.initial_balance
        self.max_net_worth = self.initial_balance  # track peak net worth for drawdown
        self.trades = []  # log trades for evaluation
        return self._get_observation()

    def _get_observation(self):
        """Build the float32 observation from prices strictly before current_step."""
        # The window ends at current_step - 1, so the price the next action
        # trades at is not part of the observation.
        window = self.prices[self.current_step - self.window_size:self.current_step]
        mean_p = np.mean(window)
        std_p = np.std(window)
        if not std_p > 0:
            # Flat (or NaN) window: avoid dividing by zero when z-scoring.
            std_p = 1.0
        normalized_prices = (window - mean_p) / std_p

        # Additional features:
        norm_balance = self.balance / self.initial_balance
        current_position = self.position  # could be normalized by a position limit
        # Estimate volatility from returns in the window
        returns = np.diff(window) / window[:-1] if len(window) > 1 else [0.0]
        volatility = np.std(returns) if len(returns) > 1 else 0.0

        obs = np.concatenate([normalized_prices, [norm_balance, current_position, volatility]])
        return obs.astype(np.float32)

    def step(self, action):
        """
        Execute one timestep of trading at the price of the current step.

        Args:
            action: Value in [-1, 1], given as a length-1 array/sequence or a
                scalar; values outside the range are clipped.
                If > 0: buy that fraction of the maximum affordable units.
                If < 0: sell that fraction of the current position.

        Returns:
            ``(obs, reward, done, info)`` where ``info`` carries
            ``net_worth``, ``balance``, ``position`` and ``drawdown``.
        """
        # Flatten first so scalars and 0-d arrays work as well as shape-(1,) arrays.
        action_val = np.clip(np.asarray(action).reshape(-1)[0], -1.0, 1.0)
        current_price = self.prices[self.current_step]
        prev_net_worth = self.net_worth

        # Execute action:
        if action_val > 0:  # BUY
            # Determine maximum units affordable
            max_units = self.balance / current_price
            # Buy a fraction equal to action_val (e.g., 0.5 means spend 50% of cash)
            units_bought = action_val * max_units
            self.position += units_bought
            self.balance -= units_bought * current_price
            self.trades.append({"step": self.current_step, "type": "buy", "units": units_bought, "price": current_price})
        elif action_val < 0:  # SELL
            # Sell a fraction of current holdings
            units_sold = abs(action_val) * self.position
            self.position -= units_sold
            self.balance += units_sold * current_price
            self.trades.append({"step": self.current_step, "type": "sell", "units": units_sold, "price": current_price})
        # Else, action_val == 0, so HOLD

        # Update net worth and track peak value (for drawdown risk management).
        # Trades are free of fees here, so net worth only moves with the price.
        self.net_worth = self.balance + self.position * current_price
        self.max_net_worth = max(self.max_net_worth, self.net_worth)
        drawdown = (self.max_net_worth - self.net_worth) / self.max_net_worth

        # Reward: log return (if possible), penalized by excessive drawdown
        reward = 0.0
        if prev_net_worth > 0:
            reward = np.log(self.net_worth / prev_net_worth)

        # Apply risk penalty if drawdown exceeds threshold
        if drawdown > self.max_drawdown:
            reward -= 1.0 * (drawdown - self.max_drawdown)  # risk penalty

        self.current_step += 1
        # The episode ends once current_step reaches the last price index.
        done = (self.current_step >= len(self.prices) - 1)

        # At episode end, add a bonus based on overall profit (long-term PnL)
        if done:
            overall_profit = (self.net_worth - self.initial_balance) / self.initial_balance
            reward += self.longterm_weight * overall_profit

        obs = self._get_observation()
        info = {"net_worth": self.net_worth, "balance": self.balance, "position": self.position, "drawdown": drawdown}
        return obs, reward, done, info

    def render(self, mode="human"):
        """Print a one-line summary of the current portfolio state."""
        print(f"Step: {self.current_step} | Net Worth: {self.net_worth:.2f} | Balance: {self.balance:.2f} | Position: {self.position:.4f}")

    def close(self):
        """No resources to release."""
        pass

# -------------------------------
# (Optional) Callback for Monitoring Training Progress
# -------------------------------
class NetWorthCallback(BaseCallback):
    """
    Callback that records the environment's reported net worth whenever an
    episode finishes.

    Episode ends are detected inside ``_on_step`` because ``BaseCallback``
    does not dispatch an episode-end hook of its own; a method named
    ``_on_episode_end`` is only ever run if something calls it explicitly.

    Attributes are plain Python values:
        episode_net_worths: list of the ``net_worth`` value taken from the
            ``info`` dict of each finished episode, in order of completion.
    """
    def __init__(self, verbose=0):
        super().__init__(verbose)
        self.episode_net_worths = []

    def _on_step(self) -> bool:
        """Check the latest step for finished episodes; always continue training."""
        # The training loop publishes its local variables through self.locals;
        # "dones"/"infos" hold one entry per (vectorized) environment.
        # Fall back to doing nothing if either is absent.
        dones = self.locals.get("dones")
        infos = self.locals.get("infos")
        if dones is not None and infos is not None:
            for done, info in zip(dones, infos):
                if done:
                    self._on_episode_end(info)
        return True

    def _on_rollout_end(self) -> None:
        # You can extend this callback to log extra metrics
        pass

    def _on_episode_end(self, info=None):
        """
        Record the final net worth of one finished episode.

        Args:
            info: The ``info`` dict of the terminal step. When omitted, the
                last entry of ``self.locals["infos"]`` is used, which keeps
                the method callable without arguments.
        """
        if info is None:
            info = self.locals.get("infos", [{}])[-1]
        if "net_worth" in info:
            nw = info["net_worth"]
            self.episode_net_worths.append(nw)
            if self.verbose > 0:
                print(f"Episode finished with net worth: {nw:.2f}")

# -------------------------------
# Main: Training and Evaluation Pipeline
# -------------------------------
if __name__ == "__main__":
    # End-to-end pipeline: fetch prices -> train SAC on the first 75% ->
    # roll the trained policy over the remaining 25% -> plot net worth.

    # 1. Data Management: Fetch historical data
    coin = "ETH"
    quote = "KRW"
    total_hours = 2000  # adjust as needed (production systems will use much larger datasets)
    prices = fetch_historical_data(quote=quote, target=coin, hours=total_hours)
    if prices is None or len(prices) < total_hours:
        print("Insufficient price data. Exiting.")
        # Raise SystemExit directly instead of calling the `exit()` helper:
        # that helper is injected by `site` for interactive use and may be
        # missing or overridden by the host shell, in which case execution
        # could continue with `prices` unusable.
        raise SystemExit(1)

    # Optionally, split data into training and test sets.
    # Here, we use the first 75% for training and the remaining 25% for evaluation.
    # The split is chronological, so evaluation only sees later prices.
    split_idx = int(0.75 * len(prices))
    train_prices = prices[:split_idx]
    test_prices  = prices[split_idx:]

    # 2. Create the Gym environment (wrap with Monitor for logging)
    window_size = 30
    initial_balance = 1e6  # for example, 1,000,000 KRW
    env = TradingEnv(train_prices, window_size=window_size, initial_balance=initial_balance)
    env = Monitor(env)  # wrap for logging and evaluation

    # 3. Offline Pre-training or Imitation Learning could be added here
    # For brevity, we proceed directly to RL fine-tuning.

    # 4. On-line Fine-Tuning with a production-grade RL algorithm (SAC in this example)
    model = SAC("MlpPolicy", env, verbose=1, tensorboard_log="./sac_trading_tensorboard/")

    # Total timesteps should be chosen based on data size and experimentation.
    total_timesteps = 100_000
    networth_callback = NetWorthCallback(verbose=1)
    model.learn(total_timesteps=total_timesteps, callback=networth_callback)

    # Save the trained model for production deployment.
    model_path = "sac_trading_model"
    model.save(model_path)
    print(f"Model saved to {model_path}.zip")

    # 5. Evaluation & Monitoring on Test Data
    # The evaluation env is used directly (no Monitor wrapper) and stepped
    # with the deterministic policy output until it reports done.
    eval_env = TradingEnv(test_prices, window_size=window_size, initial_balance=initial_balance)
    obs = eval_env.reset()
    done = False
    networth_history = [eval_env.net_worth]
    while not done:
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = eval_env.step(action)
        networth_history.append(eval_env.net_worth)

    print(f"Final net worth on evaluation data: {eval_env.net_worth:.2f}")

    # 6. Plot the net worth over the evaluation episode
    plt.figure(figsize=(12, 6))
    plt.plot(networth_history, label="Net Worth")
    plt.title("Evaluation: Net Worth Over Time")
    plt.xlabel("Time Step")
    plt.ylabel("Net Worth")
    plt.legend()
    plt.grid(True)
    plt.show()

    # In production, further steps would include:
    #   - Integrating real-time data feeds.
    #   - Risk management overlays (e.g., position limits, max drawdown stops).
    #   - Order execution logic (e.g., VWAP slicing, limit orders).
    #   - Extensive backtesting across multiple market regimes.
    #   - Real-time performance dashboards and alerts.


ModuleNotFoundError: No module named 'gym'