In [1]:
import gymnasium as gym
import numpy as np
import pandas as pd
from gymnasium import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
import talib
import logging

Xây Dựng Môi Trường

In [2]:
# Route root-logger records at INFO level and above to a log file;
# every record is written as "<timestamp> - <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    filename='train_trading_DDPG.log',
)

In [3]:
class AVGOTradingEnv(gym.Env):
    """Single-asset trading environment stepped one price row at a time.

    The agent emits one continuous action in [-1, 1]: a positive value is a
    buy signal, a negative value is a sell signal, and zero means "hold".
    The action only selects the direction; the trade size is drawn at random
    from a fixed fraction range of the available cash (buy) or of the shares
    held (sell).

    Observation (10 float32 values, in this order):
        Close, cash, shares_held, total_assets, days_without_trade,
        SMA_10, SMA_50, SMA_200, RSI, MACD

    Args:
        df: Price table with at least a "Close" column; ``render`` also
            reads a "Date" column. Rows are consumed top to bottom.
        initial_cash: Cash balance at the start of every episode.
        penalty: Cash deducted on each step once the agent has gone
            ``MAX_IDLE_DAYS`` or more consecutive steps without trading.
    """

    # Fraction range of available cash spent on a buy.
    BUY_FRACTION_RANGE = (0.39, 0.75)
    # Fraction range of held shares liquidated on a sell.
    SELL_FRACTION_RANGE = (0.23, 0.75)
    # Number of consecutive idle steps after which the penalty applies.
    MAX_IDLE_DAYS = 35

    def __init__(self, df, initial_cash=100000, penalty=3072):
        super().__init__()

        # reset_index() returns a new frame, so the caller's df is untouched.
        self.df = df.reset_index()
        self.initial_cash = initial_cash
        self.cash = initial_cash
        self.shares_held = 0
        self.total_assets = initial_cash
        self.current_step = 0
        self.days_without_trade = 0
        self.penalty = penalty

        # Technical indicators derived from the closing price.
        close = self.df["Close"]
        self.df["SMA_10"] = talib.SMA(close, timeperiod=10)
        self.df["SMA_50"] = talib.SMA(close, timeperiod=50)
        self.df["SMA_200"] = talib.SMA(close, timeperiod=200)
        self.df["RSI"] = talib.RSI(close, timeperiod=14)
        macd, signal, _ = talib.MACD(close, fastperiod=12, slowperiod=26, signalperiod=9)
        # Stored value is the MACD line minus its signal line.
        self.df["MACD"] = macd - signal
        upper, _, lower = talib.BBANDS(close, timeperiod=20)
        self.df["BB_Upper"] = upper
        self.df["BB_Lower"] = lower

        # Back-fill missing values (e.g. indicator warm-up rows) from the next
        # valid row. DataFrame.bfill() replaces the deprecated
        # fillna(method="bfill", inplace=True).
        self.df = self.df.bfill()

        # Action space: continuous value between -1 (sell) and 1 (buy).
        self.action_space = spaces.Box(low=-1, high=1, shape=(1,), dtype=np.float32)

        # Observation space: raw (unnormalised) account and indicator values.
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(10,), dtype=np.float32
        )

    def reset(self, seed=None, options=None):
        """Start a new episode at the first row with a fresh account.

        Args:
            seed: Optional seed forwarded to ``gym.Env.reset`` so that
                ``self.np_random`` (used for trade sizing) is reproducible.
            options: Accepted for API compatibility; not used.

        Returns:
            Tuple ``(observation, info)`` where ``info`` is an empty dict.
        """
        # Required by the Gymnasium API: seeds self.np_random when seed is given.
        super().reset(seed=seed)

        self.current_step = 0
        self.cash = self.initial_cash
        self.shares_held = 0
        self.total_assets = self.initial_cash
        self.days_without_trade = 0

        logging.info(f"Starting environment with {self.initial_cash} cash.")

        return self._next_observation(), {}

    def _next_observation(self):
        """Build the 10-element float32 observation for ``current_step``."""
        row = self.df.iloc[self.current_step]
        return np.array([
            row["Close"], self.cash, self.shares_held, self.total_assets, self.days_without_trade,
            row["SMA_10"], row["SMA_50"], row["SMA_200"], row["RSI"], row["MACD"]
        ], dtype=np.float32)

    def step(self, action):
        """Apply one action at the current row and advance to the next row.

        Args:
            action: Scalar or shape-(1,) array; clipped to [-1, 1].

        Returns:
            Tuple ``(observation, reward, terminated, truncated, info)``.
            ``terminated`` is set by the account thresholds below;
            ``truncated`` is set when the returned observation is the last
            row of the data.
        """
        last_step = len(self.df) - 1
        if self.current_step >= last_step:
            # Fallback for callers that keep stepping after the data ran out;
            # reports terminated=True so loops that only check that flag stop.
            return self._next_observation(), 0, True, False, {}

        # Accept a bare scalar as well as a shape-(1,) array, then bound it.
        action = np.clip(np.asarray(action).reshape(-1), -1, 1)[0]
        current_price = self.df.loc[self.current_step, "Close"]
        rsi = self.df.loc[self.current_step, "RSI"]

        if rsi < 30 and action < 0:  # Do not sell while RSI < 30
            action = 0
        elif rsi > 70 and action > 0:  # Do not buy while RSI > 70
            action = 0

        if action > 0:  # Buy
            # Clamp at zero: once penalties push cash negative, the original
            # bounds became negative and produced a negative share count.
            available_cash = max(self.cash, 0)
            low_frac, high_frac = self.BUY_FRACTION_RANGE
            # self.np_random (not the global np.random) honours reset(seed=...).
            buy_amount = self.np_random.uniform(low_frac * available_cash,
                                                high_frac * available_cash)
            # Only whole shares are bought.
            shares_bought = buy_amount // current_price
            self.cash -= shares_bought * current_price
            self.shares_held += shares_bought
            self.days_without_trade = 0
        elif action < 0 and self.shares_held > 0:  # Sell
            low_frac, high_frac = self.SELL_FRACTION_RANGE
            # NOTE(review): the quantity sold is a float, so sells can leave a
            # fractional position even though buys are whole shares.
            sell_amount = self.np_random.uniform(low_frac * self.shares_held,
                                                 high_frac * self.shares_held)
            self.cash += sell_amount * current_price
            self.shares_held -= sell_amount
            self.days_without_trade = 0
        else:  # No trade
            self.days_without_trade += 1
            if self.days_without_trade >= self.MAX_IDLE_DAYS:
                self.cash -= self.penalty  # Penalise staying idle for too long

        self.total_assets = self.cash + self.shares_held * current_price
        # NOTE(review): the reward is the cumulative profit since the episode
        # start, emitted on every step (not the per-step change in value).
        reward = self.total_assets - self.initial_cash

        # Terminate on deep overdraft, on reaching the asset target, or on
        # near-total loss.
        done = bool(
            self.cash <= -5000
            or self.total_assets >= 1000000
            or self.total_assets < 1000
        )

        logging.info(f"Step: {self.current_step}, Action: {action}, Cash: {self.cash}, "
                     f"Shares: {self.shares_held}, Total Assets: {self.total_assets}, "
                     f"Reward: {reward}, RSI: {rsi}")

        self.current_step += 1
        # Evaluate after the increment: checking before it could never be
        # True because the guard above already excluded the last row.
        truncated = self.current_step >= last_step
        return self._next_observation(), reward, done, truncated, {}

    def render(self, mode="human"):
        """Print the date at ``current_step`` together with the account state."""
        current_date = self.df.loc[self.current_step, "Date"]
        print(f"Day: {current_date}, Cash: {self.cash}, Shares: {self.shares_held}, Total Assets: {self.total_assets}")

Train Agent

In [4]:
from stable_baselines3 import DDPG
from stable_baselines3.common.env_util import make_vec_env

In [5]:
# Fixed seed so that a Restart & Run All reproduces the same training run.
TRAIN_SEED = 42

# Load the OHLC training data.
df = pd.read_csv("data/data_train.csv")
# Reverse the row order and renumber the index.
# NOTE(review): presumably the CSV is stored newest-first; confirm.
df = df.iloc[::-1].reset_index(drop=True)

# Wrap a single environment instance in a vectorised env for SB3.
env = make_vec_env(lambda: AVGOTradingEnv(df), n_envs=1, seed=TRAIN_SEED)

# Train DDPG
model = DDPG("MlpPolicy", env, verbose=1, seed=TRAIN_SEED)
model.learn(total_timesteps=100000)

# Save model
model.save("ddpg_trading_model")

Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.96e+03 |
|    ep_rew_mean     | 6.31e+08 |
| time/              |          |
|    episodes        | 4        |
|    fps             | 58       |
|    time_elapsed    | 134      |
|    total_timesteps | 7821     |
| train/             |          |
|    actor_loss      | -1.1e+07 |
|    critic_loss     | 1.22e+11 |
|    learning_rate   | 0.001    |
|    n_updates       | 7720     |
---------------------------------
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 1.95e+03  |
|    ep_rew_mean     | 6.38e+08  |
| time/              |           |
|    episodes        | 8         |
|    fps             | 59        |
|    time_elapsed    | 264       |
|    total_timesteps | 15617     |
| train/             |           |
|    actor_loss      | -1.79e+07 |
|    critic_loss     | 1.84e+11  |
|    learning_rate   | 0.001     |
|    n_updates    

Test Agent

In [6]:
# Load the held-out test data and reverse the row order, mirroring the
# preprocessing applied to the training data.
df = pd.read_csv("data/data_test.csv")
df = df.iloc[::-1].reset_index(drop=True)

model = DDPG.load("ddpg_trading_model")

env = AVGOTradingEnv(df)
obs, _ = env.reset()
done = False
truncated = False

# A Gymnasium episode is over when it is either terminated or truncated;
# checking only `done` relies on the env's fallback guard to stop the loop.
while not (done or truncated):
    # Request the deterministic action for evaluation.
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, truncated, info = env.step(action)
    env.render()


Day: 2023-02-07, Cash: 56044.97, Shares: 731.0, Total Assets: 100000.0
Day: 2023-02-08, Cash: 16597.28, Shares: 1373.0, Total Assets: 100961.26500000001
Day: 2023-02-09, Cash: 9737.785999999998, Shares: 1487.0, Total Assets: 99212.06300000001
Day: 2023-02-10, Cash: 3443.980999999998, Shares: 1592.0, Total Assets: 98870.053
Day: 2023-02-13, Cash: 1842.2059999999979, Shares: 1619.0, Total Assets: 97889.381
Day: 2023-02-14, Cash: 880.4139999999978, Shares: 1635.0, Total Assets: 99163.53400000001
Day: 2023-02-15, Cash: 519.0279999999977, Shares: 1641.0, Total Assets: 99358.09899999999
Day: 2023-02-16, Cash: 275.9519999999977, Shares: 1645.0, Total Assets: 100240.95700000001
Day: 2023-02-17, Cash: 95.78999999999772, Shares: 1648.0, Total Assets: 99064.78199999999
Day: 2023-02-21, Cash: 36.230999999997714, Shares: 1649.0, Total Assets: 98249.02200000001
Day: 2023-02-22, Cash: 36.230999999997714, Shares: 1649.0, Total Assets: 95932.17700000001
Day: 2023-02-23, Cash: 36.230999999997714, Shares