In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import talib
import gymnasium as gym
import ray

from rl_trading.simulation.env import StockExchangeEnv

Using matplotlib backend: <object object at 0x7fab59682960>


In [2]:
exchange_data = pd.read_hdf('../data/binance_BTC_USDT.h5')
exchange_data.sort_index(inplace=True)
exchange_data = exchange_data[~exchange_data.index.duplicated(keep='first')]
exchange_data = exchange_data.reindex(np.arange(exchange_data.index[0], exchange_data.index[-1] + 1, 60))
exchange_data['price'] = exchange_data['price'].ffill()
exchange_data['amount'] = exchange_data['amount'].fillna(value=0)

display(exchange_data.head(5))

exchange_data.index = pd.to_datetime(exchange_data.index * 1e9)
price_data_1m = exchange_data['price'].to_numpy()
volume_data_1m = exchange_data['amount'].to_numpy()

agg_mapping = {'price': 'last', 'amount': 'sum'}
hourly_data = exchange_data.groupby(pd.Grouper(freq='1h')).agg(agg_mapping)
daily_data = exchange_data.groupby(pd.Grouper(freq='1d')).agg(agg_mapping)
weekly_data = exchange_data.groupby(pd.Grouper(freq='1w')).agg(agg_mapping)
biweekly_data = exchange_data.groupby(pd.Grouper(freq='2w')).agg(agg_mapping)
monthly_data = exchange_data.groupby(pd.Grouper(freq='1M')).agg(agg_mapping)

display(hourly_data.head(5))
display(daily_data.head(5))
display(weekly_data.head(5))
display(biweekly_data.head(5))
display(monthly_data.head(5))

Unnamed: 0_level_0,amount,price
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
1502942460,1.775183,4261.48
1502942520,0.0,4261.48
1502942580,0.261074,4280.56
1502942640,0.012008,4261.48
1502942700,0.140796,4261.48


Unnamed: 0_level_0,price,amount
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-08-17 04:00:00,4311.749242,46.620743
2017-08-17 05:00:00,4315.32,23.795182
2017-08-17 06:00:00,4324.35,7.229691
2017-08-17 07:00:00,4334.0,2.482438
2017-08-17 08:00:00,4360.69,2.933618


Unnamed: 0_level_0,price,amount
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-08-17,4285.08,795.150377
2017-08-18,4115.4,1199.699723
2017-08-19,4139.98,381.498304
2017-08-20,4069.13,466.704213
2017-08-21,4016.0,691.925435


Unnamed: 0_level_0,price,amount
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-08-20,4069.13,2843.052617
2017-08-27,4346.972375,4599.745731
2017-09-03,4505.581286,4753.873083
2017-09-10,4150.230425,6381.632307
2017-09-17,3699.015885,8107.831437


Unnamed: 0_level_0,price,amount
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-08-20,4069.13,2843.052617
2017-09-03,4505.581286,9353.618814
2017-09-17,3699.015885,14489.463744
2017-10-01,4378.48,11690.657902
2017-10-15,5699.844347,12620.142176


Unnamed: 0_level_0,price,amount
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-08-31,4724.885015,10015.640272
2017-09-30,4347.630427,27634.043533
2017-10-31,6476.467415,41625.312848
2017-11-30,9846.140933,108483.8836
2017-12-31,13735.11887,408477.194296


In [None]:
import torch
import ray
from ray.rllib.algorithms.ppo import PPOConfig

ray.shutdown()
ray.init()

from rl_trading.simulation.env import StockExchangeEnv
from ray.tune.registry import register_env
register_env('StockExchangeEnv-v0', lambda config: StockExchangeEnv(**config))

dqn = (
    PPOConfig()
    .rollouts(num_rollout_workers=0)
    .training(use_gae=True, entropy_coeff=0.01, model={'fcnet_hiddens': [512, 512, 512], 'lstm_cell_size': 512, 'use_lstm': True, 'max_seq_len': 30, 'lstm_use_prev_action': True, 'lstm_use_prev_reward': True})
    .resources(num_gpus=1)
    .environment(env='StockExchangeEnv-v0', env_config={'market_data': exchange_data})
    .build()
)

render_env = StockExchangeEnv(market_data=exchange_data)

for i in range(10_000):
    # if i % 10 == 0:
    #     done = False
    #     obs, _ = render_env.reset(seed=42)
    #     init_state = state = [np.zeros([512], np.float32) for _ in range(2)]
    #     prev_a = 0
    #     prev_r = 0.0
    #     for _ in range(100):
    #         render_env.render()
    #         action, state, _ = dqn.compute_single_action(obs, state, prev_action=prev_a, prev_reward=prev_r)
    #         obs, reward, terminated, truncated, _ = render_env.step(action)
    #         done = terminated or truncated
    #     plt.savefig(f'epoch_{i + 1}.png')
    #     plt.close()
    result = dqn.train()
    print(f'Step: {result["training_iteration"]}\t\tMean return: {result["episode_reward_mean"]}')


2023-05-07 02:46:53,380	INFO worker.py:1616 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


Step: 1		Mean return: 181.46408259921736
Step: 2		Mean return: 167.08627415659467
Step: 3		Mean return: 132.92288695711272
Step: 4		Mean return: 97.21124122700827
Step: 5		Mean return: 103.8725222495282
Step: 6		Mean return: 82.32537358767684
Step: 7		Mean return: 33.731817433939796
Step: 8		Mean return: 1.7799985922600337
Step: 9		Mean return: 9.441827125126801
Step: 10		Mean return: 9.221799949570359
Step: 11		Mean return: -28.579124735001884
Step: 12		Mean return: -44.3399082733087
Step: 13		Mean return: -38.127305779426834
Step: 14		Mean return: -33.40065113868671
Step: 15		Mean return: -21.07738372941542
Step: 16		Mean return: 6.432485019608934
Step: 17		Mean return: -4.053340399387688
Step: 18		Mean return: -18.225644727861873
Step: 19		Mean return: -16.728707937237896
Step: 20		Mean return: -24.493023825081103
Step: 21		Mean return: 0.9143914248411891
Step: 22		Mean return: 0.24354719195333496
Step: 23		Mean return: -5.233707857418844
Step: 24		Mean return: -4.562744412874688
St

In [None]:
env = StockExchangeEnv(market_data=exchange_data)

In [None]:
env.reset()
done = False
while not done:
    env.render()
    current_idx = env.current_idx
    current_price = env.price_data[current_idx]
    next_price = env.price_data[current_idx + 1]
    if next_price > current_price:
        action = 1
    elif next_price < current_price:
        action = 2
    else:
        action = 0
    state, reward, done, _, _ = env.step(action)

Exception in Tkinter callback
Traceback (most recent call last):
  File "/home/fassty/anaconda3/envs/diploma_thesis/lib/python3.10/tkinter/__init__.py", line 1921, in __call__
    return self.func(*args)
  File "/home/fassty/anaconda3/envs/diploma_thesis/lib/python3.10/tkinter/__init__.py", line 839, in callit
    func(*args)
  File "/home/fassty/anaconda3/envs/diploma_thesis/lib/python3.10/site-packages/matplotlib/backends/_backend_tk.py", line 271, in idle_draw
    self.draw()
  File "/home/fassty/anaconda3/envs/diploma_thesis/lib/python3.10/site-packages/matplotlib/backends/backend_tkagg.py", line 10, in draw
    super().draw()
  File "/home/fassty/anaconda3/envs/diploma_thesis/lib/python3.10/site-packages/matplotlib/backends/backend_agg.py", line 400, in draw
    self.figure.draw(self.renderer)
  File "/home/fassty/anaconda3/envs/diploma_thesis/lib/python3.10/site-packages/matplotlib/artist.py", line 95, in draw_wrapper
    result = draw(artist, renderer, *args, **kwargs)
  File "/

In [54]:
from rl_trading.simulation.env import SimulationConfig
from dataclasses import asdict
asdict(SimulationConfig())

{'granularity': '1m', 'max_steps': 1440, 'inital_cash': 10000}

In [8]:
env.step(1)

((0, 4280.5599999999995, 2.3442559861832044, 4261.48, 4261.48), 0, False)

In [9]:
env.step(2)

((10024.69367581216, 4261.48, 0, 4271.0199999999995, 4261.857821782178),
 0.0,
 False)

In [10]:
import torch
import torch.nn as nn

class Network:
    def __init__(self) -> None:
        # Use GPU if available.
        self._device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self._model = nn.Sequential(
            nn.Linear(5, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 3)
        ).to(self._device)

        self._optimizer = torch.optim.Adam(self._model.parameters(), lr=1e-3)
        self._loss = nn.MSELoss()

    def train(self, states: np.ndarray, actions: np.ndarray, q_values: np.ndarray) -> None:
        states = torch.from_numpy(states).float().to(self._device)
        q_values = torch.from_numpy(q_values).float().to(self._device)
        actions = torch.from_numpy(actions).long().to(self._device)

        self._model.train()
        self._optimizer.zero_grad()
        predictions = self._model(states)
        predictions = torch.gather(predictions, dim=1, index=actions)
        loss = self._loss(predictions, q_values)
        loss.backward()
        #nn.utils.clip_grad_norm_(self._model.parameters(), 10)
        self._optimizer.step()

    def predict(self, states: np.ndarray) -> np.ndarray:
        states = torch.from_numpy(states).float().to(self._device)
        self._model.eval()
        with torch.no_grad():
            return self._model(states).cpu().numpy()

    def copy_weights_from(self, other) -> None:
        params = dict(self._model.named_parameters())
        params_other = dict(other._model.named_parameters())
        with torch.no_grad():
            for name, value in params_other.items():
                params[name].data.copy_(value.data)

In [11]:
import collections
import random

network = Network()

epsilon = 0.5
gamma = 0.99

replay_buffer = collections.deque()

Transition = collections.namedtuple("Transition", ["state", "action", "reward", "done", "next_state"])

for episode in range(20):
    state, done = env.reset()
    episode_return = 0

    while not done:
        with torch.no_grad():
            q_values = network.predict(np.array([state], dtype=np.float32))[0]
        if np.random.uniform() >= epsilon:
            action = np.argmax(q_values)
        else:
            action = np.random.randint(0, 3)

        next_state, reward, done = env.step(action)
        episode_return += reward
        replay_buffer.append(Transition(state, action, reward, done, next_state))


        if len(replay_buffer) > 512:
            minibatch = random.sample(replay_buffer, 512)
            states = np.vstack([t.state for t in minibatch])
            actions = np.vstack([t.action for t in minibatch])
            rewards = np.vstack([t.reward for t in minibatch])
            next_states = np.vstack([t.next_state for t in minibatch])
            dones = np.vstack([t.done for t in minibatch]).astype(np.uint8)

            with torch.no_grad():
                q_values_next = network.predict(next_states)
                q_values_next = q_values_next.max(axis=1).reshape(-1, 1)
                target_q_values = rewards + (1 - dones) * gamma * q_values_next
            network.train(states, actions, target_q_values)

        state = next_state
    print(f'Episode: {episode} return: {episode_return}')

Episode: 0 return: 224.99744838047303
Episode: 1 return: 264.531597508022
Episode: 2 return: 10.526600750194191
Episode: 3 return: 463.344433536383
Episode: 4 return: 107.36902379122189
Episode: 5 return: 659.0909868725554
Episode: 6 return: 93.25250433980881
Episode: 7 return: -754.3228877029144
Episode: 8 return: -12.09655519360639
Episode: 9 return: -4.448079854905141
Episode: 10 return: 404.5836446362279
Episode: 11 return: 202.75678159449467
Episode: 12 return: -403.27718192251444
Episode: 13 return: -319.655870002426
Episode: 14 return: -126.48002786972344
Episode: 15 return: -267.77574004643776
Episode: 16 return: 175.35113259349407
Episode: 17 return: 712.920821829259
Episode: 18 return: 370.2768818837727
Episode: 19 return: -240.6320419818552


In [12]:
4261.48 - 4280.56

-19.080000000000837

In [34]:
env.reset()

(array([ 3829.47615283,   872.362006  ,  3829.47615283,  3829.47615283,
            0.        ,    50.        ,  3829.47615283, 10000.        ,
            0.        ]),
 {})

In [89]:
env.step(1)

(array([ 3.80494806e+03,  3.85813469e+02,  3.80876260e+03,  3.81634181e+03,
        -6.61621262e-01,  4.21757695e+01,  3.81111808e+03,  0.00000000e+00,
         2.62047935e+00]),
 -13.047519166573693,
 False,
 False,
 {})

In [1]:
import pandas as pd

prices_df = pd.read_hdf('data/binance_BTC_USDT.h5')


Unnamed: 0_level_0,amount,price
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
1502942460,1.775183,4261.480000
1502942580,0.261074,4280.560000
1502942640,0.012008,4261.480000
1502942700,0.140796,4261.480000
1502943480,0.075455,4262.187216
...,...,...
1670479020,140.372990,16822.509019
1670479080,135.652550,16823.593779
1670479140,106.761210,16823.667239
1670479200,160.929330,16820.593849


In [22]:
prices_df.index = pd.to_datetime(prices_df.index * 1e9)
prices_df

Unnamed: 0_level_0,amount,price
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-08-17 04:01:00,1.775183,4261.480000
2017-08-17 04:03:00,0.261074,4280.560000
2017-08-17 04:04:00,0.012008,4261.480000
2017-08-17 04:05:00,0.140796,4261.480000
2017-08-17 04:18:00,0.075455,4262.187216
...,...,...
2022-12-08 05:57:00,140.372990,16822.509019
2022-12-08 05:58:00,135.652550,16823.593779
2022-12-08 05:59:00,106.761210,16823.667239
2022-12-08 06:00:00,160.929330,16820.593849


In [28]:
hourly_prices = prices_df.groupby(pd.Grouper(freq='H')).agg({'amount': 'sum', 'price': 'last'})

In [99]:
hourly_prices['price'] = hourly_prices['price'].ffill()

In [91]:

            #time.sleep(0.01)

env = StockExchangeEnv(hourly_prices['price'].to_numpy(), hourly_prices['amount'].to_numpy(), 10_000)

env.reset()
for i in range(100):
    env.render()
    current_step = env.current_step
    current_price = env.price_data[current_step]
    next_price = env.price_data[current_step + 1]
    if next_price > current_price:
        env.step(1)
    elif next_price < current_price:
        env.step(2)
    else:
        env.step(0)

# for i in range(100):
#     env.render()
#     current_step = env.current_step
#     current_price = env.price_data[current_step]
#     next_price = env.price_data[current_step + 1]
#     try:
#         next_next_price = env.price_data[current_step + 2]
#     except IndexError:
#         next_next_price = 0
#     print(f'{current_price=:.2f}\t{next_price=:.2f}\t{next_next_price=:.2f}')
#     print('Enter next action: 0 HOLD, 1 BUY, 2 SELL\n')
#     time.sleep(0.1)
#     action = int(input())
#     env.step(action)
plt.show()

Using matplotlib backend: TkAgg


2023-05-06 16:36:43,057	INFO worker.py:1616 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


NameError: name 'hourly_prices' is not defined

2023-05-05 23:20:54,623	INFO worker.py:1616 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m




Step: 1	Mean return: -42.865930799546184
Step: 2	Mean return: -185.89766800128746
Step: 3	Mean return: 169.17498209446853
Step: 4	Mean return: -88.3142230583735
Step: 5	Mean return: 19.19788326543157
Step: 6	Mean return: -16.45068610735584
Step: 7	Mean return: 7.063555817371994
Step: 8	Mean return: -38.02530962088903
Step: 9	Mean return: -41.522897312310235
Step: 10	Mean return: -17.97407142151989
Step: 11	Mean return: -19.678367718046115
Step: 12	Mean return: 11.896916841168824
Step: 13	Mean return: -2.235516732331765
Step: 14	Mean return: 12.66614309286765
Step: 15	Mean return: -4.661710404750021
Step: 16	Mean return: -16.956172854057762
Step: 17	Mean return: -11.008859576698496
Step: 18	Mean return: 2.480511230930242
Step: 19	Mean return: -0.4883398345264868
Step: 20	Mean return: 26.459753117181773
Step: 21	Mean return: -44.16181603506087
Step: 22	Mean return: 20.183706555358384
Step: 23	Mean return: -3.228635785139504
Step: 24	Mean return: 24.918146317321316
Step: 25	Mean return: 8

Exception ignored in: <function Variable.__del__ at 0x7f17eafc2950>
Traceback (most recent call last):
  File "/home/fassty/anaconda3/envs/diploma_thesis/lib/python3.10/tkinter/__init__.py", line 388, in __del__
    if self._tk.getboolean(self._tk.call("info", "exists", self._name)):
RuntimeError: main thread is not in main loop


Collecting mujoco-py<2.2,>=2.1
  Downloading mujoco_py-2.1.2.14-py3-none-any.whl (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting mujoco==2.2
  Downloading mujoco-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting gym==0.26.2
  Using cached gym-0.26.2.tar.gz (721 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting box2d-py==2.3.5
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25h

In [100]:
hourly_prices['price'].isna().any()

False