In [1]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np

import yfinance as yf
from stable_baselines3.common.env_checker import check_env
from stable_baselines3 import PPO
import pandas as pd

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


In [12]:
class TradingEnv(gym.Env):
    metadata = {"render.modes": []}

    def __init__(self, prices, window=24, initial_balance=10000, fee=0.001):
        super().__init__()
        self.prices = np.array(prices, dtype=np.float32).flatten()
        self.window = window
        self.initial_balance = initial_balance
        self.fee = fee

        self.balance = float(initial_balance)
        self.position = 0.0
        self.current_step = window
        self.equity_curve = []

        # 0=HOLD, 1=BUY, 2=SELL
        self.action_space = spaces.Discrete(3)

        self.observation_space = spaces.Box(
            low=-np.inf,
            high=np.inf,
            shape=(window + 2,),
            dtype=np.float32
        )

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.balance = float(self.initial_balance)
        self.position = 0.0
        self.current_step = self.window
        self.equity_curve = []

        self.buy_count = 0
        self.sell_count = 0

        obs = self._get_obs()
        info = {}
        return obs, info

    def _get_obs(self):
        window_data = self.prices[self.current_step - self.window:self.current_step]
        obs = np.concatenate([
            np.array(window_data, dtype=np.float32),
            np.array([self.balance, self.position], dtype=np.float32)
        ])
        return obs

    def step(self, action):
        price = self.prices[self.current_step]

        # BUY (100% баланса)
        if action == 1:
            self.buy_count += 1
            if self.balance > 0:
                shares = self.balance / (price * (1 + self.fee))
                self.position += shares
                self.balance -= shares * price * (1 + self.fee)

        # SELL
        elif action == 2:
            self.sell_count += 1
            if self.position > 0:
                self.balance += self.position * price * (1 - self.fee)
                self.position = 0.0

        # Equity
        equity = self.balance + self.position * price
        self.equity_curve.append(equity)

        # Reward: процент изменения капитала
        reward = 0.0
        if len(self.equity_curve) > 1:
            reward = (equity - self.equity_curve[-2]) / self.equity_curve[-2]

        self.current_step += 1
        terminated = self.current_step >= len(self.prices) - 1
        truncated = False

        obs = self._get_obs()
        info = {}
        return obs, reward, terminated, truncated, info

    def get_metrics(self):
        curve = np.array(self.equity_curve, dtype=np.float32)
        final_balance = float(curve[-1])
        total_return = (final_balance - self.initial_balance) / self.initial_balance

        returns = np.diff(curve) / curve[:-1]
        sharpe = (np.mean(returns) / (np.std(returns) + 1e-9)
                  if len(returns) > 1 else 0.0)

        running_max = np.maximum.accumulate(curve)
        dd = (curve - running_max) / running_max
        max_dd = float(np.min(dd))

        return {
            "final_balance": final_balance,
            "total_return": float(total_return),
            "sharpe": float(sharpe),
            "max_drawdown": max_dd
        }

    def get_action_stats(self):
        return {
            "buys": self.buy_count,
            "sells": self.sell_count
        }



In [13]:
class TradingEnvWithPrediction(gym.Env):
    metadata = {"render.modes": []}

    def __init__(self, prices, predicted_prices=None, window=24, initial_balance=10000, fee=0.001):
        super().__init__()
        self.prices = np.array(prices, dtype=np.float32).flatten()
        self.predicted_prices = (np.array(predicted_prices, dtype=np.float32).flatten()
                                 if predicted_prices is not None else None)
        self.window = window
        self.initial_balance = initial_balance
        self.fee = fee

        self.balance = float(initial_balance)
        self.position = 0.0
        self.current_step = window
        self.equity_curve = []

        self.buy_count = 0
        self.sell_count = 0

        # 0=HOLD, 1=BUY, 2=SELL
        self.action_space = spaces.Discrete(3)

        # Добавляем 1 элемент для предсказанной цены
        obs_dim = window + 2 + (1 if self.predicted_prices is not None else 0)
        self.observation_space = spaces.Box(
            low=-np.inf,
            high=np.inf,
            shape=(obs_dim,),
            dtype=np.float32
        )

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.balance = float(self.initial_balance)
        self.position = 0.0
        self.current_step = self.window
        self.equity_curve = []
        self.buy_count = 0
        self.sell_count = 0

        obs = self._get_obs()
        info = {}
        return obs, info

    def _get_obs(self):
        window_data = self.prices[self.current_step - self.window:self.current_step]
        obs = [*window_data, self.balance, self.position]

        # Добавляем предсказанную цену текущего шага
        if self.predicted_prices is not None:
            pred_price = self.predicted_prices[self.current_step]
            obs.append(pred_price)

        return np.array(obs, dtype=np.float32)

    def step(self, action):
        price = self.prices[self.current_step]

        if action == 1:  # BUY
            self.buy_count += 1
            if self.balance > 0:
                shares = self.balance / (price * (1 + self.fee))
                self.position += shares
                self.balance -= shares * price * (1 + self.fee)

        elif action == 2:  # SELL
            self.sell_count += 1
            if self.position > 0:
                self.balance += self.position * price * (1 - self.fee)
                self.position = 0.0

        equity = self.balance + self.position * price
        self.equity_curve.append(equity)

        reward = 0.0
        if len(self.equity_curve) > 1:
            reward = (equity - self.equity_curve[-2]) / self.equity_curve[-2]

        self.current_step += 1
        terminated = self.current_step >= len(self.prices) - 1
        truncated = False

        obs = self._get_obs()
        info = {}
        return obs, reward, terminated, truncated, info

    def get_metrics(self):
        curve = np.array(self.equity_curve, dtype=np.float32)
        final_balance = float(curve[-1])
        total_return = (final_balance - self.initial_balance) / self.initial_balance

        returns = np.diff(curve) / curve[:-1]
        sharpe = (np.mean(returns) / (np.std(returns) + 1e-9)
                  if len(returns) > 1 else 0.0)

        running_max = np.maximum.accumulate(curve)
        dd = (curve - running_max) / running_max
        max_dd = float(np.min(dd))

        return {
            "final_balance": final_balance,
            "total_return": float(total_return),
            "sharpe": float(sharpe),
            "max_drawdown": max_dd
        }

    def get_action_stats(self):
        return {
            "buys": self.buy_count,
            "sells": self.sell_count
        }


In [14]:
ticker = "BTC-USD"
data = yf.download(ticker, period="2y", interval="1h")
prices = data["Close"].to_numpy().flatten()
print("Размер массива цен:", prices.shape)

# --- Train/Test Split ---
test_size = 60 * 24  # последний месяц
train_prices = prices[500:-test_size]
test_prices = prices[-test_size:]

# --- Окружения ---
window = 24
env_train = TradingEnv(train_prices, window=window, fee=0.000)  # обучение без комиссии
env_test  = TradingEnv(test_prices, window=window, fee=0.001)    # тест с реальной комиссией

check_env(env_train, warn=True)
check_env(env_test, warn=True)

# --- Обучение ---
model = PPO(
    "MlpPolicy",
    env_train,
    verbose=1,
    learning_rate=3e-4,
    batch_size=64,
    n_steps=2048,
    gamma=0.99
)
model.learn(total_timesteps=200_000)

# --- Тест ---
obs, info = env_test.reset()
done = False
while not done:
    action, _ = model.predict(obs)
    obs, reward, done, truncated, info = env_test.step(action)

metrics = env_test.get_metrics()
stats = env_test.get_action_stats()

print("\n=== TEST METRICS ===")
for k, v in metrics.items():
    print(f"{k}: {v}")

print("\n=== ACTION STATS ===")
print(f"Buys:  {stats['buys']}")
print(f"Sells: {stats['sells']}")

  data = yf.download(ticker, period="2y", interval="1h")
[*********************100%***********************]  1 of 1 completed


Размер массива цен: (17395,)
Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-----------------------------
| time/              |      |
|    fps             | 260  |
|    iterations      | 1    |
|    time_elapsed    | 7    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 213         |
|    iterations           | 2           |
|    time_elapsed         | 19          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.011934064 |
|    clip_fraction        | 0.102       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | -8.16       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.00718    |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0057 

In [37]:
ticker = "TSLA"

In [38]:
data = yf.download(ticker, period="2y", interval="1h")
prices = data["Close"].to_numpy().flatten()
print("Размер массива цен:", prices.shape)

  data = yf.download(ticker, period="2y", interval="1h")
[*********************100%***********************]  1 of 1 completed

Размер массива цен: (3494,)





In [21]:
data = pd.read_csv('data.csv')

In [None]:
import yfinance as yf
import numpy as np
from stable_baselines3 import DQN
from stable_baselines3.common.env_checker import check_env

# --- Загрузка данных ---
# ticker = "APPL"
data = yf.download(ticker, period="2y", interval="1h")
prices = data["Close"].to_numpy().flatten()
print("Размер массива цен:", prices.shape)

# --- Train/Test Split ---
test_size = 30 * 24  # последний месяц
train_prices = prices[500:-test_size]
test_prices = prices[-test_size:]

# --- Окружения ---
window = 40
env_train = TradingEnv(train_prices, window=window, fee=0.000)
env_test  = TradingEnv(test_prices, window=window, fee=0.001)

check_env(env_train, warn=True)
check_env(env_test, warn=True)

# --- Обучение основной модели ---
model = DQN(
    "MlpPolicy",
    env_train,
    verbose=1,
    learning_rate=1e-4,
    buffer_size=50_000,
    batch_size=64,
    gamma=0.99,
    train_freq=4,
    target_update_interval=1000,
)
model.learn(total_timesteps=200_000)

# --- Тест с онлайн-дообучением ---
obs, info = env_test.reset()
done = False
step = 0

while not done:
    # выбираем действие
    action, _ = model.predict(obs, deterministic=False)
    next_obs, reward, done, truncated, info = env_test.step(action)

    # добавляем новый опыт в буфер
    model.replay_buffer.add(obs, next_obs, action, reward, done, infos=[info])

    # дообучаем на новых данных
    model.train(batch_size=64, gradient_steps=1)

    obs = next_obs
    step += 1

    if step % 500 == 0:
        print(f"Онлайн-дообучение — шаг {step}")

# --- Метрики ---
metrics = env_test.get_metrics()
print("\n=== TEST METRICS ===")
for k, v in metrics.items():
    print(f"{k}: {v}")


  data = yf.download(ticker, period="2y", interval="1h")
[*********************100%***********************]  1 of 1 completed


Размер массива цен: (17395,)
Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.61e+04 |
|    ep_rew_mean      | 0.454    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 477      |
|    time_elapsed     | 135      |
|    total_timesteps  | 64536    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 27.4     |
|    n_updates        | 16108    |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.61e+04 |
|    ep_rew_mean      | 0.535    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 463      |
|    time_elapsed     | 278      |
|    total_timesteps  | 129072   |
|

: 

In [16]:
import yfinance as yf
import numpy as np
import pandas as pd
from stable_baselines3 import DQN
from stable_baselines3.common.env_checker import check_env

# --- Загружаем данные ---
data = pd.read_csv('data_.csv')  # предполагается, что есть колонки Close, predicted_close
prices = data['Close'].to_numpy()
pred_prices = data['Predicted_Close'].to_numpy()  # предсказанные цены

# --- Train/Test Split ---
test_size = 30 * 24  # последний месяц
train_prices = prices[500:-test_size]
train_pred   = pred_prices[500:-test_size]

test_prices = prices[-test_size:]
test_pred   = pred_prices[-test_size:]

# --- Окружения ---
window = 60
env_train = TradingEnvWithPrediction(train_prices, predicted_prices=train_pred, window=window, fee=0.000)
env_test  = TradingEnvWithPrediction(test_prices, predicted_prices=test_pred, window=window, fee=0.001)

check_env(env_train, warn=True)
check_env(env_test, warn=True)

# --- Обучение модели ---
model = DQN(
    "MlpPolicy",
    env_train,
    verbose=1,
    learning_rate=1e-4,
    buffer_size=50_000,
    batch_size=64,
    gamma=0.99,
    train_freq=4,
    target_update_interval=1000,
)
model.learn(total_timesteps=200_000)

# --- Тест с онлайн-дообучением ---
obs, info = env_test.reset()
done = False
step = 0

while not done:
    # выбираем действие
    action, _ = model.predict(obs, deterministic=False)
    next_obs, reward, done, truncated, info = env_test.step(action)

    # добавляем новый опыт в буфер
    model.replay_buffer.add(obs, next_obs, action, reward, done, infos=[info])

    # дообучаем на новых данных
    model.train(batch_size=64, gradient_steps=1)

    obs = next_obs
    step += 1

    if step % 500 == 0:
        print(f"Онлайн-дообучение — шаг {step}")

# --- Метрики ---
metrics = env_test.get_metrics()
print("\n=== TEST METRICS ===")
for k, v in metrics.items():
    print(f"{k}: {v}")


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.21e+03 |
|    ep_rew_mean      | 0.114    |
|    exploration_rate | 0.58     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 599      |
|    time_elapsed     | 14       |
|    total_timesteps  | 8848     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.234    |
|    n_updates        | 2186     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.21e+03 |
|    ep_rew_mean      | 0.0892   |
|    exploration_rate | 0.159    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 469      |
|    time_elapsed     | 37       |
|    total_timesteps  | 17696    |
| train/              |       

In [14]:
data

Unnamed: 0.1,Unnamed: 0,Ticker,Open,High,Low,Close,Volume,Next_Close,Predicted_Close,price_t_plus_3,Predicted_Close_t_3
0,0,TSLA,223.149994,224.149994,217.860001,218.259995,30510256.0,220.460297,,219.990005,
1,1,TSLA,218.289902,220.660004,218.000000,220.460297,18283175.0,218.380005,,220.949997,
2,2,TSLA,220.449997,221.237503,218.050003,218.380005,14518568.0,219.990005,,221.940002,
3,3,TSLA,218.410004,220.149994,217.639999,219.990005,11122870.0,220.949997,,222.100006,
4,4,TSLA,220.000000,221.460007,219.880005,220.949997,10069584.0,221.940002,,213.980103,
...,...,...,...,...,...,...,...,...,...,...,...
3488,3488,TSLA,434.225006,434.859985,426.290009,428.239990,10041263.0,428.743500,438.955597,432.405914,433.686157
3489,3489,TSLA,428.190002,429.279999,421.880005,428.743500,9926006.0,429.807587,439.668732,431.076599,431.694794
3490,3490,TSLA,428.755005,429.890015,424.440002,429.807587,6341342.0,432.405914,431.881012,,425.598785
3491,3491,TSLA,429.809906,433.890015,429.100006,432.405914,6470793.0,431.076599,432.385529,,426.487488


In [8]:
test_size

1440

In [13]:
print("\n=== ACTION STATS ===")
stats = env_test.get_action_stats()
print(f"Buys:  {stats['buys']}")
print(f"Sells: {stats['sells']}")


=== ACTION STATS ===
Buys:  64
Sells: 312


In [11]:
class TradingEnvWithPrediction(gym.Env):
    metadata = {"render.modes": []}

    def __init__(self, prices, predicted_prices_1h=None, predicted_prices_3h=None,
                 window=24, initial_balance=10000, fee=0.001):
        super().__init__()
        self.prices = np.array(prices, dtype=np.float32).flatten()
        self.predicted_prices_1h = (np.array(predicted_prices_1h, dtype=np.float32).flatten()
                                    if predicted_prices_1h is not None else None)
        self.predicted_prices_3h = (np.array(predicted_prices_3h, dtype=np.float32).flatten()
                                    if predicted_prices_3h is not None else None)
        self.window = window
        self.initial_balance = initial_balance
        self.fee = fee

        self.balance = float(initial_balance)
        self.position = 0.0
        self.current_step = window
        self.equity_curve = []

        self.buy_count = 0
        self.sell_count = 0

        # 0=HOLD, 1=BUY, 2=SELL
        self.action_space = spaces.Discrete(3)

        # размер наблюдения: окно + баланс + позиция + 2 прогноза (если есть)
        obs_dim = window + 2
        if self.predicted_prices_1h is not None:
            obs_dim += 1
        if self.predicted_prices_3h is not None:
            obs_dim += 1

        self.observation_space = spaces.Box(
            low=-np.inf,
            high=np.inf,
            shape=(obs_dim,),
            dtype=np.float32
        )

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.balance = float(self.initial_balance)
        self.position = 0.0
        self.current_step = self.window
        self.equity_curve = []
        self.buy_count = 0
        self.sell_count = 0

        obs = self._get_obs()
        info = {}
        return obs, info

    def _get_obs(self):
        window_data = self.prices[self.current_step - self.window:self.current_step]
        obs = [*window_data, self.balance, self.position]

        if self.predicted_prices_1h is not None:
            obs.append(self.predicted_prices_1h[self.current_step])
        if self.predicted_prices_3h is not None:
            obs.append(self.predicted_prices_3h[self.current_step])

        return np.array(obs, dtype=np.float32)

    def step(self, action):
        price = self.prices[self.current_step]

        if action == 1:  # BUY
            self.buy_count += 1
            if self.balance > 0:
                shares = self.balance / (price * (1 + self.fee))
                self.position += shares
                self.balance -= shares * price * (1 + self.fee)

        elif action == 2:  # SELL
            self.sell_count += 1
            if self.position > 0:
                self.balance += self.position * price * (1 - self.fee)
                self.position = 0.0

        equity = self.balance + self.position * price
        self.equity_curve.append(equity)

        reward = 0.0
        if len(self.equity_curve) > 1:
            reward = (equity - self.equity_curve[-2]) / (self.equity_curve[-2] + 1e-9)

        self.current_step += 1
        terminated = self.current_step >= len(self.prices) - 1
        truncated = False

        return self._get_obs(), reward, terminated, truncated, {}

    def get_metrics(self):
        curve = np.array(self.equity_curve, dtype=np.float32)

        final_balance = float(curve[-1])
        total_return = (final_balance - self.initial_balance) / self.initial_balance

        returns = np.diff(curve) / curve[:-1]
        sharpe = (
            np.mean(returns) / (np.std(returns) + 1e-9)
            if len(returns) > 1 else 0.0
        )

        running_max = np.maximum.accumulate(curve)
        max_dd = float(np.min((curve - running_max) / running_max))

        return {
            "final_balance": final_balance,
            "total_return": total_return,
            "sharpe": sharpe,
            "max_drawdown": max_dd
        }

    def get_action_stats(self):
        return {
            "buys": self.buy_count,
            "sells": self.sell_count
        }


In [12]:
import yfinance as yf
import numpy as np
import pandas as pd
from stable_baselines3 import DQN
from stable_baselines3.common.env_checker import check_env

# --- Загружаем данные ---
data = pd.read_csv('data_.csv')  # предполагается, что есть колонки Close, Predicted_Close, Predicted_Close_3h
prices = data['Close'].to_numpy()
pred_prices_1h = data['Predicted_Close'].to_numpy()       # предсказание на 1 час вперед
pred_prices_3h = data['Predicted_Close_t_3'].to_numpy()    # предсказание на 3 часа вперед

# --- Train/Test Split ---
test_size = 30 * 24  # последний месяц
train_prices      = prices[500:-test_size]
train_pred_1h     = pred_prices_1h[500:-test_size]
train_pred_3h     = pred_prices_3h[500:-test_size]

test_prices       = prices[-test_size:]
test_pred_1h      = pred_prices_1h[-test_size:]
test_pred_3h      = pred_prices_3h[-test_size:]

# --- Окружения ---
window = 60
env_train = TradingEnvWithPrediction(
    prices=train_prices,
    predicted_prices_1h=train_pred_1h,
    predicted_prices_3h=train_pred_3h,
    window=window,
    fee=0.000
)
env_test = TradingEnvWithPrediction(
    prices=test_prices,
    predicted_prices_1h=test_pred_1h,
    predicted_prices_3h=test_pred_3h,
    window=window,
    fee=0.001
)

check_env(env_train, warn=True)
check_env(env_test, warn=True)

# --- Обучение модели ---
model = DQN(
    "MlpPolicy",
    env_train,
    verbose=1,
    learning_rate=1e-4,
    buffer_size=50_000,
    batch_size=64,
    gamma=0.99,
    train_freq=4,
    target_update_interval=1000,
)
model.learn(total_timesteps=200_000)

# --- Тест с онлайн-дообучением ---
obs, info = env_test.reset()
done = False
step = 0

while not done:
    action, _ = model.predict(obs, deterministic=False)
    next_obs, reward, done, truncated, info = env_test.step(action)

    # добавляем новый опыт в буфер
    model.replay_buffer.add(obs, next_obs, action, reward, done, infos=[info])

    # дообучаем на новых данных
    model.train(batch_size=64, gradient_steps=1)

    obs = next_obs
    step += 1

    if step % 500 == 0:
        print(f"Онлайн-дообучение — шаг {step}")

# --- Метрики ---
metrics = env_test.get_metrics()
print("\n=== TEST METRICS ===")
for k, v in metrics.items():
    print(f"{k}: {v}")


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.21e+03 |
|    ep_rew_mean      | 0.492    |
|    exploration_rate | 0.58     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 639      |
|    time_elapsed     | 13       |
|    total_timesteps  | 8848     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 1.48     |
|    n_updates        | 2186     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2.21e+03 |
|    ep_rew_mean      | 0.134    |
|    exploration_rate | 0.159    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 581      |
|    time_elapsed     | 30       |
|    total_timesteps  | 17696    |
| train/              |       

In [2]:
class TradingEnvWithPrediction(gym.Env):
    metadata = {"render.modes": []}

    def __init__(self, prices, predictions=None,
                 window=24, initial_balance=10000, fee=0.001):
        """
        prices: np.array или список цен
        predictions: dict, где ключ — название прогноза, значение — массив предсказаний
                     пример: {"t1": predict_t1, "t3": predict_t3, "t8": predict_t8}
        """
        super().__init__()

        self.prices = np.array(prices, dtype=np.float32).flatten()
        self.predictions = {}

        if predictions is not None:
            for name, arr in predictions.items():
                self.predictions[name] = np.array(arr, dtype=np.float32).flatten()

        self.window = window
        self.initial_balance = initial_balance
        self.fee = fee

        self.balance = float(initial_balance)
        self.position = 0.0
        self.current_step = window
        self.equity_curve = []

        self.buy_count = 0
        self.sell_count = 0

        # --- Actions: 0 = HOLD, 1 = BUY, 2 = SELL ---
        self.action_space = spaces.Discrete(3)

        # --- Observation size ---
        obs_dim = window + 2 + len(self.predictions)
        self.observation_space = spaces.Box(
            low=-np.inf,
            high=np.inf,
            shape=(obs_dim,),
            dtype=np.float32
        )
        

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.balance = float(self.initial_balance)
        self.position = 0.0
        self.current_step = self.window
        self.equity_curve = []
        self.buy_count = 0
        self.sell_count = 0

        obs = self._get_obs()
        info = {}
        return obs, info

    def _get_obs(self):
        window_data = self.prices[self.current_step - self.window:self.current_step]
        obs = [*window_data, self.balance, self.position]

        # Добавляем все прогнозы
        for arr in self.predictions.values():
            if self.current_step < len(arr):
                obs.append(arr[self.current_step])
            else:
                obs.append(np.nan)  # если вышли за пределы массива

        return np.array(obs, dtype=np.float32)

    def step(self, action):
        price = self.prices[self.current_step]
        prev_equity = self.balance + self.position * price
        
    
        # print(f"Step {self.current_step}: Price={price:.4f}, Balance={self.balance:.2f}, "
        #   f"Position={self.position:.4f}, Equity={prev_equity:.2f}")
    
        # --- Торговая логика с проверками ---
        trade_executed = False
        
        if action == 1:  # BUY
            if self.balance > 0:
                shares = self.balance / (price * (1 + self.fee))
                self.position += shares
                self.balance -= shares * price * (1 + self.fee)
                self.buy_count += 1
                trade_executed = True
            # else: можно добавить логирование попытки покупки без средств
        
        elif action == 2:  # SELL
            if self.position > 0:
                self.balance += self.position * price * (1 - self.fee)
                self.position = 0.0
                self.sell_count += 1
                trade_executed = True
            # else: можно добавить штраф за попытку продажи без позиции
        
        # --- Награда и метрики ---
        current_equity = self.balance + self.position * price
        # print(f"After action {action}: Equity={current_equity:.2f}, Change={current_equity-prev_equity:.2f}")
        self.equity_curve.append(current_equity)
        
        reward = 0.0
        if len(self.equity_curve) > 1:
            reward = (current_equity - prev_equity) / (prev_equity + 1e-9)
            
        price = self.prices[self.current_step]

        prev_price = self.prices[self.current_step - 1]

        reward = 0

        if self.position > 0:
            reward = (price - prev_price) / prev_price
        # Штраф за бесполезные действия
        # if not trade_executed and action != 0:  # действие не выполнено и это не HOLD
        #     reward -= 0.001  # небольшой штраф
        
        self.current_step += 1
        terminated = self.current_step >= len(self.prices) - 1
        
        return self._get_obs(), reward, terminated, False, {}

    def get_metrics(self):
        curve = np.array(self.equity_curve, dtype=np.float32)
        if len(curve) == 0:
            return {}

        final_balance = float(curve[-1])
        total_return = (final_balance - self.initial_balance) / self.initial_balance

        returns = np.diff(curve) / curve[:-1]
        sharpe = (
            np.mean(returns) / (np.std(returns) + 1e-9)
            if len(returns) > 1 else 0.0
        )

        running_max = np.maximum.accumulate(curve)
        max_dd = float(np.min((curve - running_max) / running_max))

        return {
            "final_balance": final_balance,
            "total_return": total_return,
            "sharpe": sharpe,
            "max_drawdown": max_dd
        }

    def get_action_stats(self):
        return {
            "buys": self.buy_count,
            "sells": self.sell_count
        }

In [3]:
import numpy as np
import pandas as pd
import yfinance as yf
from stable_baselines3 import DQN
from stable_baselines3.common.env_checker import check_env
from stable_baselines3 import SAC


In [None]:




# === 2. Загружаем данные ===
data = pd.read_csv("models/models/new_data.csv")  # должен содержать колонки: Close, predict_t1, predict_t3, predict_t8

prices = data["Close"].to_numpy()
pred_t1 = data["predict_t1"].to_numpy()
pred_t3 = data["predict_t3"].to_numpy()
pred_t8 = data["predict_t8"].to_numpy()

# === 3. Train/Test Split ===
test_size = 90 * 24  # например, 30 дней по 24 часа

train_prices = prices[500:-test_size]
train_preds = {
    "t1": pred_t1[500:-test_size],
    "t3": pred_t3[500:-test_size],
    "t8": pred_t8[500:-test_size],
}

test_prices = prices[-test_size:]
test_preds = {
    "t1": pred_t1[-test_size:],
    "t3": pred_t3[-test_size:],
    "t8": pred_t8[-test_size:],
}

# === 4. Создаём окружения ===
window = 60

env_train = TradingEnvWithPrediction(
    prices=train_prices,
    predictions=train_preds,
    window=window,
    fee=0.000
)

env_test = TradingEnvWithPrediction(
    prices=test_prices,
    predictions=test_preds,
    window=window,
    fee=0.001
)

# Проверим корректность среды
check_env(env_train, warn=True)
check_env(env_test, warn=True)

# === 5. Обучаем DQN ===
model = DQN(
    "MlpPolicy",
    env_train,
    verbose=1,
    learning_rate=1e-4,
    buffer_size=50_000,
    batch_size=64,
    gamma=0.99,
    train_freq=4,
    target_update_interval=1000,
)

# model = DQN.load("models/AAPL/dqn_trading_model", env=env_train)
print("🚀 Начинаем обучение модели...")
model.learn(total_timesteps=400_000)
print("✅ Обучение завершено.")

# === 6. Тест с онлайн-доподучением ===
obs, info = env_test.reset()
done = False
step = 0

while not done:
    action, _ = model.predict(obs, deterministic=False)
    next_obs, reward, done, truncated, info = env_test.step(action)

    # Добавляем опыт в буфер
    model.replay_buffer.add(obs, next_obs, action, reward, done, infos=[info])

    # Дообучаем на новых данных (онлайн)
    model.train(batch_size=64, gradient_steps=1)

    obs = next_obs
    step += 1

    if step % 500 == 0:
        print(f"Онлайн дообучение — шаг {step}")

print("✅ Тест завершён.")

# === 7. Метрики ===
metrics = env_test.get_metrics()
print("\n=== 📊 TEST METRICS ===")
for k, v in metrics.items():
    print(f"{k}: {v:.4f}")

print("\n📈 Действия:", env_test.get_action_stats())


# obs, info = env_test.reset()
# done = False
# step = 0

# # Инициализация логгера
# from stable_baselines3.common.logger import configure
# model._logger = configure("logs/", ["stdout"])
# model._current_progress_remaining = 1.0

# while not done:
#     action, _ = model.predict(obs, deterministic=False)
#     next_obs, reward, done, truncated, info = env_test.step(action)

#     # Добавляем новый опыт в буфер
#     model.replay_buffer.add(obs, next_obs, action, reward, done, infos=[info])

#     # Дообучаем на новых данных
#     model.train(batch_size=64, gradient_steps=1)

#     obs = next_obs
#     step += 1

#     if step % 500 == 0:
#         print(f"Онлайн-дообучение — шаг {step}")


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
🚀 Начинаем обучение модели...
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.47e+04 |
|    ep_rew_mean      | 0.369    |
|    exploration_rate | 0.0713   |
| time/               |          |
|    episodes         | 4        |
|    fps              | 453      |
|    time_elapsed     | 129      |
|    total_timesteps  | 58652    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 23.8     |
|    n_updates        | 14637    |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.47e+04 |
|    ep_rew_mean      | 0.157    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 405      |
|    time_elapsed     | 289      |
|    total_timesteps  | 117304   |


In [9]:
import pandas as pd
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env

# === 2. Загружаем данные ===
data = pd.read_csv("models/models/new_data.csv")  # должен содержать колонки: Close, predict_t1, predict_t3, predict_t8

prices = data["Close"].to_numpy()
pred_t1 = data["predict_t1"].to_numpy()
pred_t3 = data["predict_t3"].to_numpy()
pred_t8 = data["predict_t8"].to_numpy()

# === 3. Train/Test Split ===
test_size = 60 * 24  # например, 30 дней по 24 часа

train_prices = prices[500:-test_size]
train_preds = {
    "t1": pred_t1[500:-test_size],
    "t3": pred_t3[500:-test_size],
    "t8": pred_t8[500:-test_size],
}

test_prices = prices[-test_size:]
test_preds = {
    "t1": pred_t1[-test_size:],
    "t3": pred_t3[-test_size:],
    "t8": pred_t8[-test_size:],
}

# === 4. Создаём окружения ===
window = 60

env_train = TradingEnvSAC(
    prices=train_prices,
    predictions=train_preds,
    window=window,
    fee=0.000
)

env_test = TradingEnvSAC(
    prices=test_prices,
    predictions=test_preds,
    window=window,
    fee=0.001
)

# Проверим корректность среды
check_env(env_train, warn=True)
check_env(env_test, warn=True)

# === 5. Обучаем PPO ===
model = PPO(
    "MlpPolicy",
    env_train,
    verbose=1,
    learning_rate=3e-4,
    n_steps=2048,       # длина траектории
    batch_size=64,
    n_epochs=10,
    gamma=0.99,
    clip_range=0.2
)

print("🚀 Начинаем обучение модели PPO...")
model.learn(total_timesteps=400_000)
print("✅ Обучение завершено.")

# === 6. Тестирование ===
obs, info = env_test.reset()
done = False
step = 0

while not done:
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, truncated, info = env_test.step(action)
    step += 1

print("✅ Тест завершён.")

# === 7. Метрики ===
metrics = env_test.get_metrics()
print("\n=== 📊 TEST METRICS ===")
for k, v in metrics.items():
    print(f"{k}: {v:.4f}")




Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
🚀 Начинаем обучение модели PPO...




-----------------------------
| time/              |      |
|    fps             | 310  |
|    iterations      | 1    |
|    time_elapsed    | 6    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 250          |
|    iterations           | 2            |
|    time_elapsed         | 16           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0035340018 |
|    clip_fraction        | 0.0282       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.42        |
|    explained_variance   | -3.13        |
|    learning_rate        | 0.0003       |
|    loss                 | -0.0111      |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.00144     |
|    std                  | 1            |
|    value_loss           | 0.00333      |
----------------

In [11]:

# === 7. Метрики ===
metrics = env_test.get_metrics()
print("\n=== 📊 TEST METRICS ===")
for k, v in metrics.items():
    print(f"{k}: {v:.4f}")

# print("\n📈 Действия:", env_test.get_action_stats())


=== 📊 TEST METRICS ===
final_equity: 0.7158
final_position: 1.0000


In [7]:
import gymnasium as gym
import numpy as np


class TradingEnvSAC(gym.Env):
    """
    Continuous-action trading environment for SAC.
    Action = [-1 ... +1]  → position size (fraction of equity).
    """

    metadata = {"render.modes": ["human"]}

    def __init__(self, prices, predictions, window=60, fee=0.001):
        super().__init__()

        # ====== SANITIZE INPUT DATA ======
        prices = np.nan_to_num(prices, nan=1.0, posinf=1.0, neginf=1.0)
        self.prices = prices.astype(np.float32)

        self.pred_t1 = np.nan_to_num(predictions["t1"], nan=0.0).astype(np.float32)
        self.pred_t3 = np.nan_to_num(predictions["t3"], nan=0.0).astype(np.float32)
        self.pred_t8 = np.nan_to_num(predictions["t8"], nan=0.0).astype(np.float32)

        self.window = window
        self.fee = fee

        # === Action space: one continuous [-1..1]
        self.action_space = gym.spaces.Box(
            low=np.array([-1.0], dtype=np.float32),
            high=np.array([1.0], dtype=np.float32),
            shape=(1,),
            dtype=np.float32
        )

        # === Observation size ===
        obs_size = window * 4 + 2
        self.observation_space = gym.spaces.Box(
            low=-np.inf, high=np.inf,
            shape=(obs_size,), dtype=np.float32
        )

        self.reset()

    # =====================================================================
    #                           RESET
    # =====================================================================
    def reset(self, seed=None, options=None):
        super().reset(seed=seed)

        self.step_index = self.window
        self.position = 0.0
        self.equity = 1.0
        self.prev_equity = 1.0

        obs = self._get_obs()
        return obs, {}

    # =====================================================================
    #                           STEP
    # =====================================================================
    def step(self, action):
        # ========= VALIDATE ACTION =========
        try:
            action = float(action[0])
        except:
            raise ValueError(f"Action has wrong format: {action}")

        if np.isnan(action):
            raise ValueError("ACTION IS NaN!")

        action = np.clip(action, -1, 1)
        new_position = action

        # ========= PRICE =========
        price = self.prices[self.step_index]
        prev_price = self.prices[self.step_index - 1]

        if prev_price == 0:
            prev_price = 1e-6

        # ========= POSITION CHANGE COST =========
        pos_change = abs(new_position - self.position)
        fee_cost = pos_change * self.fee
        self.equity *= (1 - min(fee_cost, 0.99))

        # ========= PRICE CHANGE =========
        price_change = (price - prev_price) / prev_price
        price_change = float(np.nan_to_num(price_change, nan=0.0))

        # ========= EQUITY UPDATE =========
        ret = 1 + self.position * price_change
        ret = float(np.nan_to_num(ret, nan=1.0))

        # prevents collapse
        ret = max(ret, 1e-6)

        self.equity *= ret
        self.equity = float(np.nan_to_num(self.equity, nan=self.prev_equity))

        # ========= REWARD =========
        reward = self.equity - self.prev_equity
        reward = float(np.nan_to_num(reward, nan=0.0))
        if np.isnan(reward) or np.isinf(reward):
            raise ValueError("Reward became NaN/Inf")

        self.prev_equity = self.equity
        self.position = new_position

        # ========= NEXT STEP =========
        self.step_index += 1
        terminated = self.step_index >= len(self.prices) - 1

        obs = self._get_obs()

        # ========= VALIDATE OBS =========
        if np.any(np.isnan(obs)):
            print("\n\n===== NaN FOUND IN OBSERVATION =====")
            print("Step:", self.step_index)
            print("prices slice:", self.prices[self.step_index - self.window:self.step_index])
            print("pred_t1 slice:", self.pred_t1[self.step_index - self.window:self.step_index])
            print("equity:", self.equity)
            print("position:", self.position)
            raise ValueError("OBS contains NaN!")

        return obs, reward, terminated, False, {}

    # =====================================================================
    #                           OBSERVATION
    # =====================================================================
    def _get_obs(self):
        i = self.step_index

        prices = self.prices[i - self.window:i]
        t1 = self.pred_t1[i - self.window:i]
        t3 = self.pred_t3[i - self.window:i]
        t8 = self.pred_t8[i - self.window:i]

        obs = np.concatenate([
            prices,
            t1,
            t3,
            t8,
            np.array([self.position], dtype=np.float32),
            np.array([self.equity], dtype=np.float32),
        ])

        # ABSOLUTE PROTECTION
        obs = np.nan_to_num(obs, nan=0.0, posinf=1e6, neginf=-1e6)

        return obs.astype(np.float32)

    # =====================================================================
    #                           METRICS
    # =====================================================================
    def get_metrics(self):
        return {
            "final_equity": float(self.equity),
            "final_position": float(self.position),
        }


In [10]:

# === 2. Загружаем данные ===
data = pd.read_csv("models/models/new_data.csv")

prices = data["Close"].to_numpy()
pred_t1 = data["predict_t1"].to_numpy()
pred_t3 = data["predict_t3"].to_numpy()
pred_t8 = data["predict_t8"].to_numpy()

# === 3. Train/Test Split ===
test_size = 60 * 24

train_prices = prices[500:-test_size]
train_preds = {
    "t1": pred_t1[500:-test_size],
    "t3": pred_t3[500:-test_size],
    "t8": pred_t8[500:-test_size],
}

test_prices = prices[-test_size:]
test_preds = {
    "t1": pred_t1[-test_size:],
    "t3": pred_t3[-test_size:],
    "t8": pred_t8[-test_size:],
}

# === 4. Создаём окружения ===
window = 60

env_train = TradingEnvSAC(
    prices=train_prices,
    predictions=train_preds,
    window=window,
    fee=0.000
)

env_test = TradingEnvSAC(
    prices=test_prices,
    predictions=test_preds,
    window=window,
    fee=0.001
)

check_env(env_train)
check_env(env_test)

# === 5. Обучаем SAC ===
model = SAC(
    "MlpPolicy",
    env_train,
    verbose=1,
    learning_rate=3e-4,
    gamma=0.99,
    buffer_size=200_000,
    batch_size=256,
    train_freq=1,
    gradient_steps=1,
    tau=0.02,
    ent_coef="auto",   
)

print("🚀 Начинаем обучение SAC...")
model.learn(total_timesteps=400_000)
print("✅ Обучение завершено.")



Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
🚀 Начинаем обучение SAC...
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.54e+04 |
|    ep_rew_mean     | -0.761   |
| time/              |          |
|    episodes        | 4        |
|    fps             | 34       |
|    time_elapsed    | 1768     |
|    total_timesteps | 61772    |
| train/             |          |
|    actor_loss      | 1.39e+11 |
|    critic_loss     | 8.68e+17 |
|    ent_coef        | 1.09e+08 |
|    ent_coef_loss   | -590     |
|    learning_rate   | 0.0003   |
|    n_updates       | 61671    |
---------------------------------
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 1.54e+04  |
|    ep_rew_mean     | -0.76     |
| time/              |           |
|    episodes        | 8         |
|    fps             | 35        |
|    time_elapsed    | 3528      |
|    total_timesteps

ValueError: Expected parameter loc (Tensor of shape (1, 1)) of distribution Normal(loc: tensor([[nan]], device='cuda:0'), scale: tensor([[nan]], device='cuda:0')) to satisfy the constraint Real(), but found invalid values:
tensor([[nan]], device='cuda:0')

In [8]:
# model.save("models/sac_trading_model")

# === 6. Тестирование (без онлайн-дообучения) ===
obs, info = env_test.reset()
done = False

while not done:
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, truncated, info = env_test.step(action)

print("✅ Тестирование завершено.")

# === 7. Метрики ===
metrics = env_test.get_metrics()
print("\n=== 📊 TEST METRICS ===")
for k, v in metrics.items():
    print(f"{k}: {v:.4f}")

print("\n📈 Действия:", env_test.get_action_stats())


ValueError: Expected parameter loc (Tensor of shape (1, 1)) of distribution Normal(loc: tensor([[nan]], device='cuda:0'), scale: tensor([[nan]], device='cuda:0')) to satisfy the constraint Real(), but found invalid values:
tensor([[nan]], device='cuda:0')

In [29]:
metrics

{'final_balance': 9176.0419921875,
 'total_return': -0.08239580078125,
 'sharpe': np.float32(-0.17734689),
 'max_drawdown': -0.08794286847114563}

In [23]:
model.save("models/dqn_trading_model")

In [14]:

# === 6. Тестирование (без онлайн-дообучения) ===
obs, info = env_test.reset()
done = False

while not done:
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, truncated, info = env_test.step(action)

print("✅ Тестирование завершено.")

# === 7. Метрики ===
metrics = env_test.get_metrics()
print("\n=== 📊 TEST METRICS ===")
for k, v in metrics.items():
    print(f"{k}: {v:.4f}")

print("\n📈 Действия:", env_test.get_action_stats())

ValueError: Expected parameter loc (Tensor of shape (1, 1)) of distribution Normal(loc: tensor([[nan]], device='cuda:0'), scale: tensor([[nan]], device='cuda:0')) to satisfy the constraint Real(), but found invalid values:
tensor([[nan]], device='cuda:0')

In [21]:
data["Close"].isna().sum()

data["predict_t1"].isna().sum()
data["predict_t3"].isna().sum()
data["predict_t8"].isna().sum()

np.int64(61)