In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
prices_df = pd.read_hdf('data/binance_BTC_USDT.h5')

In [6]:
env = SimulationEnv(prices=prices_df['price'].to_numpy()[:1_000], initial_balance=10_000, fee=1e-3)

In [7]:
state, done = env.reset()

In [8]:
env.step(1)

((0, 4280.5599999999995, 2.3442559861832044, 4261.48, 4261.48), 0, False)

In [9]:
env.step(2)

((10024.69367581216, 4261.48, 0, 4271.0199999999995, 4261.857821782178),
 0.0,
 False)

In [10]:
import torch
import torch.nn as nn

class Network:
    def __init__(self) -> None:
        # Use GPU if available.
        self._device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self._model = nn.Sequential(
            nn.Linear(5, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 3)
        ).to(self._device)

        self._optimizer = torch.optim.Adam(self._model.parameters(), lr=1e-3)
        self._loss = nn.MSELoss()

    def train(self, states: np.ndarray, actions: np.ndarray, q_values: np.ndarray) -> None:
        states = torch.from_numpy(states).float().to(self._device)
        q_values = torch.from_numpy(q_values).float().to(self._device)
        actions = torch.from_numpy(actions).long().to(self._device)

        self._model.train()
        self._optimizer.zero_grad()
        predictions = self._model(states)
        predictions = torch.gather(predictions, dim=1, index=actions)
        loss = self._loss(predictions, q_values)
        loss.backward()
        #nn.utils.clip_grad_norm_(self._model.parameters(), 10)
        self._optimizer.step()

    def predict(self, states: np.ndarray) -> np.ndarray:
        states = torch.from_numpy(states).float().to(self._device)
        self._model.eval()
        with torch.no_grad():
            return self._model(states).cpu().numpy()

    def copy_weights_from(self, other) -> None:
        params = dict(self._model.named_parameters())
        params_other = dict(other._model.named_parameters())
        with torch.no_grad():
            for name, value in params_other.items():
                params[name].data.copy_(value.data)

In [11]:
import collections
import random

network = Network()

epsilon = 0.5
gamma = 0.99

replay_buffer = collections.deque()

Transition = collections.namedtuple("Transition", ["state", "action", "reward", "done", "next_state"])

for episode in range(20):
    state, done = env.reset()
    episode_return = 0

    while not done:
        with torch.no_grad():
            q_values = network.predict(np.array([state], dtype=np.float32))[0]
        if np.random.uniform() >= epsilon:
            action = np.argmax(q_values)
        else:
            action = np.random.randint(0, 3)

        next_state, reward, done = env.step(action)
        episode_return += reward
        replay_buffer.append(Transition(state, action, reward, done, next_state))


        if len(replay_buffer) > 512:
            minibatch = random.sample(replay_buffer, 512)
            states = np.vstack([t.state for t in minibatch])
            actions = np.vstack([t.action for t in minibatch])
            rewards = np.vstack([t.reward for t in minibatch])
            next_states = np.vstack([t.next_state for t in minibatch])
            dones = np.vstack([t.done for t in minibatch]).astype(np.uint8)

            with torch.no_grad():
                q_values_next = network.predict(next_states)
                q_values_next = q_values_next.max(axis=1).reshape(-1, 1)
                target_q_values = rewards + (1 - dones) * gamma * q_values_next
            network.train(states, actions, target_q_values)

        state = next_state
    print(f'Episode: {episode} return: {episode_return}')

Episode: 0 return: 224.99744838047303
Episode: 1 return: 264.531597508022
Episode: 2 return: 10.526600750194191
Episode: 3 return: 463.344433536383
Episode: 4 return: 107.36902379122189
Episode: 5 return: 659.0909868725554
Episode: 6 return: 93.25250433980881
Episode: 7 return: -754.3228877029144
Episode: 8 return: -12.09655519360639
Episode: 9 return: -4.448079854905141
Episode: 10 return: 404.5836446362279
Episode: 11 return: 202.75678159449467
Episode: 12 return: -403.27718192251444
Episode: 13 return: -319.655870002426
Episode: 14 return: -126.48002786972344
Episode: 15 return: -267.77574004643776
Episode: 16 return: 175.35113259349407
Episode: 17 return: 712.920821829259
Episode: 18 return: 370.2768818837727
Episode: 19 return: -240.6320419818552


In [12]:
4261.48 - 4280.56

-19.080000000000837

In [34]:
env.reset()

(array([ 3829.47615283,   872.362006  ,  3829.47615283,  3829.47615283,
            0.        ,    50.        ,  3829.47615283, 10000.        ,
            0.        ]),
 {})

In [89]:
env.step(1)

(array([ 3.80494806e+03,  3.85813469e+02,  3.80876260e+03,  3.81634181e+03,
        -6.61621262e-01,  4.21757695e+01,  3.81111808e+03,  0.00000000e+00,
         2.62047935e+00]),
 -13.047519166573693,
 False,
 False,
 {})

In [1]:
import pandas as pd

prices_df = pd.read_hdf('data/binance_BTC_USDT.h5')


Unnamed: 0_level_0,amount,price
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
1502942460,1.775183,4261.480000
1502942580,0.261074,4280.560000
1502942640,0.012008,4261.480000
1502942700,0.140796,4261.480000
1502943480,0.075455,4262.187216
...,...,...
1670479020,140.372990,16822.509019
1670479080,135.652550,16823.593779
1670479140,106.761210,16823.667239
1670479200,160.929330,16820.593849


In [22]:
prices_df.index = pd.to_datetime(prices_df.index * 1e9)
prices_df

Unnamed: 0_level_0,amount,price
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-08-17 04:01:00,1.775183,4261.480000
2017-08-17 04:03:00,0.261074,4280.560000
2017-08-17 04:04:00,0.012008,4261.480000
2017-08-17 04:05:00,0.140796,4261.480000
2017-08-17 04:18:00,0.075455,4262.187216
...,...,...
2022-12-08 05:57:00,140.372990,16822.509019
2022-12-08 05:58:00,135.652550,16823.593779
2022-12-08 05:59:00,106.761210,16823.667239
2022-12-08 06:00:00,160.929330,16820.593849


In [28]:
hourly_prices = prices_df.groupby(pd.Grouper(freq='H')).agg({'amount': 'sum', 'price': 'last'})

In [99]:
hourly_prices['price'] = hourly_prices['price'].ffill()

In [91]:
import numpy as np
import gymnasium as gym
from gymnasium import spaces
import talib
import matplotlib.pyplot as plt

import time
from IPython import display
%matplotlib

class StockExchangeEnv(gym.Env):
    def __init__(self, price_data: np.ndarray, volume_data: np.ndarray, initial_cash: int, max_steps: int = 720, trading_fee: float = 0.0):
        super(StockExchangeEnv, self).__init__()

        self.price_data = price_data
        self.volume_data = volume_data
        self.initial_cash = initial_cash
        self.trading_fee = trading_fee
        self.max_steps = max_steps
        self.balance_history = []
        self.action_history = []
        self.reward_history = []
        self.net_worth_changes = []
        self.price_line = None
        self.start_step = 0

        # Define action space: 0 - hold, 1 - buy, 2 - sell
        self.action_space = spaces.Discrete(3)

        # Define observation space: normalized price, volume, cash balance, asset holdings
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(9,), dtype=np.float32)

        self.reset()

    def reset(self, *, seed=None, options=None):
        if seed is not None:
            rng = np.random.default_rng(seed=seed)
            self.current_step = rng.integers(0, len(self.price_data) - self.max_steps)
            self.start_step = self.current_step
        else:
            self.current_step = np.random.randint(len(self.price_data) - self.max_steps)
            self.start_step = self.current_step
        self.cash_balance = self.initial_cash
        self.asset_holdings = 0
        return self._get_observation(), {}

    def step(self, action):
        assert self.action_space.contains(action)

        current_price = self.price_data[self.current_step]
        current_volume = self.volume_data[self.current_step]

        self.balance_history.append(self.cash_balance + self.asset_holdings * current_price)

        if action == 1:  # Buy
            amount_to_buy = (1 * self.cash_balance) / current_price
            cost = amount_to_buy * current_price
            self.cash_balance -= cost
            self.asset_holdings += amount_to_buy * (1 - self.trading_fee)
            self.action_history.append(1)
        elif action == 2:  # Sell
            amount_to_sell = self.asset_holdings
            revenue = amount_to_sell * current_price * (1 - self.trading_fee)
            self.cash_balance += revenue
            self.asset_holdings = 0
            self.action_history.append(-1)
        else:
            self.action_history.append(0)

        self.current_step += 1
        done = self.current_step == self.start_step + self.max_steps
        reward = self._get_reward()
        self.reward_history.append(reward)

        return self._get_observation(), reward, done, False, {}

    def _get_observation(self):
        current_price = self.price_data[self.current_step]
        current_volume = self.volume_data[self.current_step]

        short_mavg = talib.SMA(self.price_data[self.start_step:self.current_step + 1], timeperiod=5)[-1] if self.current_step - self.start_step >= 4 else current_price
        long_mavg = talib.SMA(self.price_data[self.start_step:self.current_step + 1], timeperiod=20)[-1] if self.current_step - self.start_step >= 19 else current_price

        if self.current_step - self.start_step >= 26:
            macd, macd_signal, _ = talib.MACD(self.price_data[self.start_step:self.current_step + 1], fastperiod=12, slowperiod=26, signalperiod=9)
            macd_diff = macd[-1] - macd_signal[-1]
            if np.isnan(macd_diff):
                macd_diff = 0
        else:
            macd_diff = 0

        rsi = talib.RSI(self.price_data[self.start_step:self.current_step + 1], timeperiod=14)[-1] if self.current_step - self.start_step >= 14 else 50

        ema = talib.EMA(self.price_data[self.start_step:self.current_step + 1], timeperiod=12)[-1] if self.current_step - self.start_step >= 11 else current_price

        observation = np.array([current_price, current_volume, short_mavg, long_mavg, macd_diff, rsi, ema, self.cash_balance, self.asset_holdings])
        if any(np.isnan(observation)):
            print(observation)
        return observation

    def _get_reward(self):
        current_net_worth_change = self.cash_balance + self.asset_holdings * self.price_data[self.current_step] - self.initial_cash
        if len(self.net_worth_changes) > 0:
            previous_net_worth_change = self.net_worth_changes[-1]
        else:
            previous_net_worth_change = 0
        self.net_worth_changes.append(current_net_worth_change)  # Add the net worth change to the list
        return current_net_worth_change - previous_net_worth_change

    def render(self, mode='human'):
        if self.current_step == self.start_step:
            self.fig, (self.ax1, self.ax2, self.ax3, self.ax4) = plt.subplots(4, 1, figsize=(10, 12), gridspec_kw={'height_ratios': [3, 3, 3, 1]}, sharex=True)
            self.ax1.set_title('Stock Price')
            self.ax1.set_xlabel('Step')
            self.ax1.set_ylabel('Price')

            self.ax2.set_title('Portfolio Value')
            self.ax2.set_xlabel('Step')
            self.ax2.set_ylabel('Amount')

            self.ax3.set_title('Rewards')
            self.ax3.set_xlabel('Step')
            self.ax3.set_ylabel('Sharpe Ratio')

            self.ax4.set_title('Action History')
            self.ax4.set_xlabel('Step')
            self.ax4.set_ylabel('Action')
        else:
            x_data = np.arange(self.current_step - self.start_step)
            y_data = self.price_data[self.start_step:self.current_step]
            self.ax1.plot(x_data, y_data, color='C0')
            self.ax2.plot(x_data, self.balance_history[:self.current_step - self.start_step], color='C1')
            self.ax3.plot(x_data, self.reward_history[:self.current_step - self.start_step], color='C1')
            self.ax4.plot(x_data, self.action_history[:self.current_step - self.start_step], color='C1')
            plt.pause(0.01)
            #time.sleep(0.01)

env = StockExchangeEnv(hourly_prices['price'].to_numpy(), hourly_prices['amount'].to_numpy(), 10_000)

env.reset()
for i in range(100):
    env.render()
    current_step = env.current_step
    current_price = env.price_data[current_step]
    next_price = env.price_data[current_step + 1]
    if next_price > current_price:
        env.step(1)
    elif next_price < current_price:
        env.step(2)
    else:
        env.step(0)

# for i in range(100):
#     env.render()
#     current_step = env.current_step
#     current_price = env.price_data[current_step]
#     next_price = env.price_data[current_step + 1]
#     try:
#         next_next_price = env.price_data[current_step + 2]
#     except IndexError:
#         next_next_price = 0
#     print(f'{current_price=:.2f}\t{next_price=:.2f}\t{next_next_price=:.2f}')
#     print('Enter next action: 0 HOLD, 1 BUY, 2 SELL\n')
#     time.sleep(0.1)
#     action = int(input())
#     env.step(action)
plt.show()

Using matplotlib backend: TkAgg


In [30]:
import ray
from ray import tune

from ray.tune.registry import register_env

In [101]:
ray.shutdown()
ray.init()

register_env('StockExchangeEnv-v0', lambda config: StockExchangeEnv(**config))

2023-05-05 23:20:54,623	INFO worker.py:1616 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


In [102]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print

dqn = (
    PPOConfig()
    .rollouts(num_rollout_workers=0)
    .resources(num_gpus=1)
    .environment(env='StockExchangeEnv-v0', env_config={'price_data': hourly_prices["price"].to_numpy(), 'volume_data': hourly_prices["amount"].to_numpy(), 'initial_cash': 10_000})
    .build()
)

for i in range(500):
    result = dqn.train()
    print(f'Step: {result["training_iteration"]}\tMean return: {result["episode_reward_mean"]}')



Step: 1	Mean return: -42.865930799546184
Step: 2	Mean return: -185.89766800128746
Step: 3	Mean return: 169.17498209446853
Step: 4	Mean return: -88.3142230583735
Step: 5	Mean return: 19.19788326543157
Step: 6	Mean return: -16.45068610735584
Step: 7	Mean return: 7.063555817371994
Step: 8	Mean return: -38.02530962088903
Step: 9	Mean return: -41.522897312310235
Step: 10	Mean return: -17.97407142151989
Step: 11	Mean return: -19.678367718046115
Step: 12	Mean return: 11.896916841168824
Step: 13	Mean return: -2.235516732331765
Step: 14	Mean return: 12.66614309286765
Step: 15	Mean return: -4.661710404750021
Step: 16	Mean return: -16.956172854057762
Step: 17	Mean return: -11.008859576698496
Step: 18	Mean return: 2.480511230930242
Step: 19	Mean return: -0.4883398345264868
Step: 20	Mean return: 26.459753117181773
Step: 21	Mean return: -44.16181603506087
Step: 22	Mean return: 20.183706555358384
Step: 23	Mean return: -3.228635785139504
Step: 24	Mean return: 24.918146317321316
Step: 25	Mean return: 8

Exception ignored in: <function Variable.__del__ at 0x7f17eafc2950>
Traceback (most recent call last):
  File "/home/fassty/anaconda3/envs/diploma_thesis/lib/python3.10/tkinter/__init__.py", line 388, in __del__
    if self._tk.getboolean(self._tk.call("info", "exists", self._name)):
RuntimeError: main thread is not in main loop


Collecting mujoco-py<2.2,>=2.1
  Downloading mujoco_py-2.1.2.14-py3-none-any.whl (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting mujoco==2.2
  Downloading mujoco-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting gym==0.26.2
  Using cached gym-0.26.2.tar.gz (721 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting box2d-py==2.3.5
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25h

In [100]:
hourly_prices['price'].isna().any()

False