In [4]:
import pandas as pd
import numpy as np
from stable_baselines3 import DQN
from stable_baselines3.common.env_checker import check_env
import gymnasium as gym
from gymnasium import spaces

In [40]:
import numpy as np
import gymnasium as gym
from gymnasium import spaces

class MultiAssetTradingEnv(gym.Env):
    metadata = {"render.modes": []}

    def __init__(self, prices_dict, predictions_dict=None,
                 window=24, initial_balance=10000, fee=0.001):
        """
        prices_dict: {'MSFT': np.array, 'AAPL': np.array, ...}
        predictions_dict: {'MSFT': {'t1': ..., 't3': ...}, 'AAPL': {...}, ...}
        """
        super().__init__()

        self.assets = list(prices_dict.keys())
        self.num_assets = len(self.assets)

        self.prices_dict = {k: np.array(v, dtype=np.float32).flatten() for k, v in prices_dict.items()}
        self.predictions_dict = {}
        if predictions_dict is not None:
            for asset, preds in predictions_dict.items():
                self.predictions_dict[asset] = {k: np.array(v, dtype=np.float32).flatten() for k,v in preds.items()}

        self.window = window
        self.initial_balance = initial_balance
        self.fee = fee

        self.balance = {asset: float(initial_balance) for asset in self.assets}
        self.positions = {asset: 0.0 for asset in self.assets}
        self.current_step = window
        self.equity_curve = []

        # MultiDiscrete: –¥–ª—è –∫–∞–∂–¥–æ–≥–æ –∞–∫—Ç–∏–≤–∞ 3 –¥–µ–π—Å—Ç–≤–∏—è (0=HOLD,1=BUY,2=SELL)
        self.action_space = spaces.MultiDiscrete([3]*self.num_assets)

        # –†–∞–∑–º–µ—Ä –Ω–∞–±–ª—é–¥–µ–Ω–∏—è: –¥–ª—è –∫–∞–∂–¥–æ–≥–æ –∞–∫—Ç–∏–≤–∞ window+2+–∫–æ–ª-–≤–æ –ø—Ä–æ–≥–Ω–æ–∑–æ–≤
        n_preds = len(next(iter(self.predictions_dict[self.assets[0]]))) if self.predictions_dict else 0
        obs_dim = self.num_assets * (window + 2 + n_preds)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(195,), dtype=np.float32)

        self.done = False
        self.max_steps = min(len(p) for p in self.prices_dict.values())

    def reset(self, *, seed=None, options=None):
        super().reset(seed=seed)
        self.current_step = self.window
        self.balance = {asset: float(self.initial_balance) for asset in self.assets}
        self.positions = {asset: 0.0 for asset in self.assets}
        self.equity_curve = []
        self.done = False
        obs = self._get_obs()
        return obs, {}

    def _get_obs(self):
        obs = []
        for asset in self.assets:
            start = self.current_step - self.window
            price_window = self.prices_dict[asset][start:self.current_step]
            local_obs = [*price_window, self.balance[asset], self.positions[asset]]
            for arr in self.predictions_dict.get(asset, {}).values():
                val = arr[self.current_step] if self.current_step < len(arr) else 0.0
                local_obs.append(val if np.isfinite(val) else 0.0)
            obs.extend(local_obs)
        return np.array(obs, dtype=np.float32)

    def step(self, actions):
        total_equity = 0.0
        for i, asset in enumerate(self.assets):
            act = actions[i]
            price = self.prices_dict[asset][self.current_step]
            if act == 1:  # BUY
                if self.balance[asset] > 0:
                    shares = self.balance[asset] / (price * (1 + self.fee))
                    self.positions[asset] += shares
                    self.balance[asset] -= shares * price * (1 + self.fee)
            elif act == 2:  # SELL
                if self.positions[asset] > 0:
                    self.balance[asset] += self.positions[asset] * price * (1 - self.fee)
                    self.positions[asset] = 0.0
            total_equity += self.balance[asset] + self.positions[asset] * price

        self.equity_curve.append(total_equity)

        reward = 0.0
        if len(self.equity_curve) > 1:
            reward = (self.equity_curve[-1] - self.equity_curve[-2]) / (self.equity_curve[-2] + 1e-9)

        self.current_step += 1
        self.done = self.current_step >= self.max_steps
        return self._get_obs(), reward, self.done, False, {}

    def get_metrics(self):
        curve = np.array(self.equity_curve, dtype=np.float32)
        if len(curve) == 0:
            return {}
        final_balance = float(curve[-1])
        total_return = (final_balance - self.initial_balance) / self.initial_balance
        returns = np.diff(curve) / curve[:-1]
        sharpe = np.mean(returns) / (np.std(returns) + 1e-9) if len(returns) > 1 else 0.0
        running_max = np.maximum.accumulate(curve)
        max_dd = float(np.min((curve - running_max)/running_max))
        return {"final_balance": final_balance, "total_return": total_return, "sharpe": sharpe, "max_drawdown": max_dd}

    def get_action_stats(self):
        buys = sum([1 for a in self.equity_curve if a > 0])
        sells = sum([1 for a in self.equity_curve if a < 0])
        return {"buys": buys, "sells": sells}


In [41]:
# =====================================
# 2Ô∏è‚É£ –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö –ø–æ –Ω–µ—Å–∫–æ–ª—å–∫–∏–º –∞–∫—Ü–∏—è–º
# =====================================
tickers = ["MSFT", "AAPL", "TSLA"]

prices_dict_train = {}
prices_dict_test = {}
predictions_train = {}
predictions_test = {}

test_size = 30 * 24
offset = 300
window = 60

for ticker in tickers:
    df = pd.read_csv(f"models/{ticker}/new_data.csv")
    prices = df["Close"].to_numpy()
    preds = {
        "t1": df["predict_t1"].to_numpy(),
        "t3": df["predict_t3"].to_numpy(),
        "t8": df["predict_t8"].to_numpy(),
    }

    prices_dict_train[ticker] = prices[offset:-test_size]
    prices_dict_test[ticker] = prices[-test_size:]

    predictions_train[ticker] = {k: v[offset:-test_size] for k, v in preds.items()}
    predictions_test[ticker] = {k: v[-test_size:] for k, v in preds.items()}

# =====================================
# 3Ô∏è‚É£ –°–æ–∑–¥–∞—ë–º –æ–∫—Ä—É–∂–µ–Ω–∏—è
# =====================================
env_train = MultiAssetTradingEnv(
    prices_dict=prices_dict_train,
    predictions_dict=predictions_train,
    window=window,
    fee=0.000
)
env_test = MultiAssetTradingEnv(
    prices_dict=prices_dict_test,
    predictions_dict=predictions_test,
    window=window,
    fee=0.001
)

check_env(env_train, warn=True)
check_env(env_test, warn=True)

# =====================================
# 4Ô∏è‚É£ –û–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏
# =====================================
from stable_baselines3 import PPO
model = PPO("MlpPolicy", env_train, verbose=1)


print("üöÄ –ù–∞—á–∏–Ω–∞–µ–º –æ–±—É—á–µ–Ω–∏–µ –º—É–ª—å—Ç–∏–∞–∫—Ç–∏–≤–Ω–æ–π –º–æ–¥–µ–ª–∏...")
model.learn(total_timesteps=500_000)
print("‚úÖ –û–±—É—á–µ–Ω–∏–µ –∑–∞–≤–µ—Ä—à–µ–Ω–æ.")

# =====================================
# 5Ô∏è‚É£ –¢–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ —Å –æ–Ω–ª–∞–π–Ω-–¥–æ–ø–æ–¥—É—á–µ–Ω–∏–µ–º
# =====================================
obs, info = env_test.reset()
done = False
step = 0

while not done:
    action, _ = model.predict(obs, deterministic=False)
    next_obs, reward, done, truncated, info = env_test.step(action)

    model.replay_buffer.add(obs, next_obs, action, reward, done, infos=[info])
    model.train(batch_size=64, gradient_steps=1)

    obs = next_obs
    step += 1

    if step % 500 == 0:
        print(f"–û–Ω–ª–∞–π–Ω –¥–æ–æ–±—É—á–µ–Ω–∏–µ ‚Äî —à–∞–≥ {step}")

print("‚úÖ –¢–µ—Å—Ç –∑–∞–≤–µ—Ä—à—ë–Ω.")

# =====================================
# 6Ô∏è‚É£ –ú–µ—Ç—Ä–∏–∫–∏
# =====================================
metrics = env_test.get_metrics()
print("\n=== üìä TEST METRICS ===")
for k, v in metrics.items():
    print(f"{k}: {v:.4f}")

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
üöÄ –ù–∞—á–∏–Ω–∞–µ–º –æ–±—É—á–µ–Ω–∏–µ –º—É–ª—å—Ç–∏–∞–∫—Ç–∏–≤–Ω–æ–π –º–æ–¥–µ–ª–∏...




-----------------------------
| time/              |      |
|    fps             | 140  |
|    iterations      | 1    |
|    time_elapsed    | 14   |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.4e+03     |
|    ep_rew_mean          | 0.121       |
| time/                   |             |
|    fps                  | 122         |
|    iterations           | 2           |
|    time_elapsed         | 33          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.012688924 |
|    clip_fraction        | 0.134       |
|    clip_range           | 0.2         |
|    entropy_loss         | -3.29       |
|    explained_variance   | -25.5       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0116     |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.012

AttributeError: 'PPO' object has no attribute 'replay_buffer'

In [42]:
obs, info = env_test.reset()
done = False
step = 0

while not done:
    # –ü–æ–ª—É—á–∞–µ–º –¥–µ–π—Å—Ç–≤–∏–µ –æ—Ç –æ–±—É—á–µ–Ω–Ω–æ–π –º–æ–¥–µ–ª–∏
    action, _ = model.predict(obs, deterministic=True)  # deterministic=True –¥–ª—è —Å—Ç–∞–±–∏–ª—å–Ω–æ–≥–æ —Ç–µ—Å—Ç–∞
    # –î–µ–ª–∞–µ–º —à–∞–≥ –≤ —Å—Ä–µ–¥–µ
    obs, reward, done, truncated, info = env_test.step(action)

    step += 1
    if step % 500 == 0:
        print(f"–¢–µ—Å—Ç ‚Äî —à–∞–≥ {step}")

print("‚úÖ –¢–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ –∑–∞–≤–µ—Ä—à–µ–Ω–æ.")

# =====================================
# 6Ô∏è‚É£ –ú–µ—Ç—Ä–∏–∫–∏
# =====================================
metrics = env_test.get_metrics()
print("\n=== üìä TEST METRICS ===")
for k, v in metrics.items():
    print(f"{k}: {v:.4f}")

# –ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –ø–æ–∫—É–ø–æ–∫ –∏ –ø—Ä–æ–¥–∞–∂ –ø–æ –∫–∞–∂–¥–æ–º—É –∞–∫—Ç–∏–≤—É
print("\nüìà –î–µ–π—Å—Ç–≤–∏—è:", env_test.get_action_stats())

–¢–µ—Å—Ç ‚Äî —à–∞–≥ 500
‚úÖ –¢–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ –∑–∞–≤–µ—Ä—à–µ–Ω–æ.

=== üìä TEST METRICS ===
final_balance: 33394.1719
total_return: 2.3394
sharpe: 0.0400
max_drawdown: -0.0528

üìà –î–µ–π—Å—Ç–≤–∏—è: {'buys': 660, 'sells': 0}
