In [1]:
import gymnasium as gym
import numpy as np
import pandas as pd
from gymnasium import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
import talib
import logging

Xây Dựng Môi Trường

In [2]:
logging.basicConfig(filename='train_trading_PPO.log', level=logging.INFO, 
                    format='%(asctime)s - %(message)s')

In [3]:
class AVGOTradingEnv(gym.Env):
    def __init__(self, df, initial_cash = 100000, buy_min = 0.39, buy_max = 0.75,
                sell_min = 0.23, sell_max = 0.75, penalty = 3072):
        super(AVGOTradingEnv, self).__init__()
        self.df = df.reset_index()
        self.initial_cash = initial_cash
        self.cash = initial_cash
        self.current_step = 0
        self.shares_held = 0
        self.total_assets = initial_cash
        self.days_without_trade = 0
        
        # Add Indicator
        self.df["SMA_10"] = talib.SMA(self.df["Close"], timeperiod=10)
        self.df["SMA_50"] = talib.SMA(self.df["Close"], timeperiod=50)
        self.df["SMA_200"] = talib.SMA(self.df["Close"], timeperiod=200)
        self.df["RSI"] = talib.RSI(self.df["Close"], timeperiod=14)
        macd, signal, _ = talib.MACD(self.df["Close"], fastperiod=12, slowperiod=26, signalperiod=9)
        self.df["MACD"] = macd - signal
        upper, middle, lower = talib.BBANDS(self.df["Close"], timeperiod=20)
        self.df["BB_Upper"] = upper
        self.df["BB_Lower"] = lower
        
        self.df.fillna(method="bfill", inplace=True)
        
        # Action space (0: Hold, 1: Buy, 2: Sell)
        self.action_space = spaces.Discrete(3)
        
        # Observation space (giá đóng cửa, tiền mặt, số cổ phiếu, tổng tài sản, số ngày không giao dịch)
        self.observation_space = spaces.Box(
            low=0, high=np.inf, shape=(10,), dtype=np.float32
        )
    
    def reset(self, seed = None, options = None):
        self.current_step = 0
        self.cash = self.initial_cash
        self.shares_held = 0
        self.total_assets = self.initial_cash
        obs = self._next_observation()  # Quan sát đầu vào

        logging.info(f"Starting environment with {self.initial_cash} cash.")

        return obs, {}
    
    def _next_observation(self):
        row = self.df.loc[self.current_step]
        return np.array([
            row["Close"], self.cash, self.shares_held, self.total_assets, self.days_without_trade,
            row["SMA_10"], row["SMA_50"], row["SMA_200"], row["RSI"], row["MACD"]
        ], dtype=np.float32)

    def step(self, action):
        if self.current_step >= len(self.df) - 1:
            truncated = True
            return self._next_observation(), 0, True, truncated, {}

        current_price = self.df.loc[self.current_step, "Close"]
        rsi = self.df.loc[self.current_step, "RSI"]
        
        #RSI Strategy
        if action == 1 and rsi > 70:
            action = 0
        
        elif action == 2 and rsi < 30:
            action = 0
        
        if action == 1:  # Mua
            buy_amount = np.random.uniform(0.39, 0.75) * self.cash
            shares_bought = buy_amount // current_price
            self.cash -= shares_bought * current_price
            self.shares_held += shares_bought
            self.days_without_trade = 0
        elif action == 2 and self.shares_held > 0:  # Bán
            sell_amount = np.random.uniform(0.23, 0.75) * self.shares_held
            self.cash += sell_amount * current_price
            self.shares_held -= sell_amount
            self.days_without_trade = 0
        else:
            self.days_without_trade += 1
            if self.days_without_trade >= 35:
                self.cash -= 3072

        self.total_assets = self.cash + self.shares_held * current_price
        reward = self.total_assets - self.initial_cash
        done = self.cash <= -5000 or self.total_assets >= 1000000 or self.total_assets < 1000
        truncated = self.current_step >= len(self.df) - 1

        logging.info(f"Step: {self.current_step}, Action: {action}, Cash: {self.cash}, "
                     f"Shares: {self.shares_held}, Total Assets: {self.total_assets}, "
                     f"Reward: {reward}, RSI: {rsi}")
        
        self.current_step += 1
        obs = self._next_observation()
        
        return obs, reward, done, truncated, {}
    
    def render(self, mode="human"):
        current_date = self.df.loc[self.current_step, "Date"]
        print(f"Day: {current_date}, Cash: {self.cash}, Shares: {self.shares_held}, Total Assets: {self.total_assets}")
    

Train Agent

In [4]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

In [5]:
# Load dữ liệu OHLC
df = pd.read_csv("data/data_train.csv")
df = df.iloc[::-1].reset_index(drop=True)

# Khởi tạo môi trường
env = make_vec_env(lambda: AVGOTradingEnv(df), n_envs=1)

# Huấn luyện mô hình PPO
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=500000)

# Lưu mô hình
model.save("ppo_stock_trading")

Using cpu device
-----------------------------
| time/              |      |
|    fps             | 609  |
|    iterations      | 1    |
|    time_elapsed    | 3    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 2.26e+03     |
|    ep_rew_mean          | 4.61e+08     |
| time/                   |              |
|    fps                  | 386          |
|    iterations           | 2            |
|    time_elapsed         | 10           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 8.440111e-10 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.1         |
|    explained_variance   | 0            |
|    learning_rate        | 0.0003       |
|    loss                 | 6.37e+12     |
|    n_updates            | 10           |

Test Agent

In [6]:
df = pd.read_csv("data/data_test.csv")
df = df.iloc[::-1].reset_index(drop=True)

model = PPO.load("ppo_stock_trading")

# Tạo môi trường với tập test
env_test = AVGOTradingEnv(df)
obs, _ = env_test.reset()

# Danh sách lưu lại toàn bộ states
states_history = []

# Chạy mô hình trên tập test
for _ in range(len(df)):
    action, _states = model.predict(obs)
    obs, reward, done, truncated, _ = env_test.step(action)
    
    # ✅ Lưu lại trạng thái vào danh sách
    total_assets = env_test.total_assets
    states_history.append((obs, total_assets))  # Lưu cả state và total_assets
    
    env_test.render()
    if done:
        break

Day: 2023-02-07, Cash: 100000, Shares: 0, Total Assets: 100000.0
Day: 2023-02-08, Cash: 100000, Shares: 0, Total Assets: 100000.0
Day: 2023-02-09, Cash: 100000, Shares: 0, Total Assets: 100000.0
Day: 2023-02-10, Cash: 100000, Shares: 0, Total Assets: 100000.0
Day: 2023-02-13, Cash: 100000, Shares: 0, Total Assets: 100000.0
Day: 2023-02-14, Cash: 36762.176, Shares: 1052.0, Total Assets: 100000.0
Day: 2023-02-15, Cash: 36762.176, Shares: 1052.0, Total Assets: 100125.188
Day: 2023-02-16, Cash: 36762.176, Shares: 1052.0, Total Assets: 100691.164
Day: 2023-02-17, Cash: 15142.735999999997, Shares: 1412.0, Total Assets: 99938.984
Day: 2023-02-21, Cash: 15142.735999999997, Shares: 1412.0, Total Assets: 99240.044
Day: 2023-02-22, Cash: 7175.637999999996, Shares: 1549.0, Total Assets: 97256.184
Day: 2023-02-23, Cash: 28400.438680200674, Shares: 1180.6225995765021, Total Assets: 96424.371
Day: 2023-02-24, Cash: 16490.918680200673, Shares: 1384.6225995765021, Total Assets: 97325.18604347688
Day: 2