In [1]:
import shutil, os
if os.path.isdir('rainbow') : shutil.rmtree("rainbow")
!git clone https://github.com/ClementPerroud/Rainbow-Agent rainbow

Cloning into 'rainbow'...
remote: Enumerating objects: 121, done.[K
remote: Counting objects: 100% (121/121), done.[K
remote: Compressing objects: 100% (87/87), done.[K
remote: Total 121 (delta 79), reused 69 (delta 33), pack-reused 0[K
Receiving objects: 100% (121/121), 43.74 KiB | 2.30 MiB/s, done.
Resolving deltas: 100% (79/79), done.


In [2]:
!pip install --upgrade gym-trading-env tensorflow-addons scikit-learn

Collecting gym-trading-env
  Downloading gym_trading_env-0.2.3-py3-none-any.whl (16 kB)
Collecting tensorflow-addons
  Downloading tensorflow_addons-0.20.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (591 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m591.0/591.0 kB[0m [31m54.5 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-learn
  Downloading scikit_learn-1.2.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m102.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting pandas>=1.5.3
  Downloading pandas-2.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m87.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting numpy>=1.24.2
  Downloading numpy-1.24.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K     [90

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import gymnasium as gym
from sklearn.preprocessing import robust_scale

from rainbow.agent import Rainbow

import sys
import gym_trading_env
import nest_asyncio



TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 

 The versions of TensorFlow you are currently using is 2.9.2 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


# Create environments

In [4]:
def add_features(df):
    """Add robust-scaled ``feature_*`` columns to ``df`` in place and return it.

    Columns written (in this order): ``feature_close``, ``feature_open``,
    ``feature_high``, ``feature_low``, ``feature_volume``. The frame must
    provide ``open``, ``high``, ``low``, ``close`` and ``volume`` columns.

    Rows containing any NaN are dropped from ``df`` itself (in place); this
    includes the leading rows where ``pct_change`` and the rolling maximum
    are not yet defined.
    """
    close = df["close"]
    volume = df["volume"]

    # Relative change of the close from one row to the next.
    df["feature_close"] = robust_scale(close.pct_change())

    # Open / high / low, each expressed as a ratio to the close of the same row.
    for column in ("open", "high", "low"):
        df[f"feature_{column}"] = robust_scale(df[column] / close)

    # Volume relative to its maximum over the trailing 7*24 rows
    # (one week if rows are hourly -- TODO confirm the data frequency).
    df["feature_volume"] = robust_scale(volume / volume.rolling(7*24).max())

    df.dropna(inplace= True) # Clean your data !
    return df


def reward_function(history):
    """Return 800 * ln(p_t / p_{t-1}) for the two latest portfolio valuations.

    ``history`` is indexed with ``(column, position)`` pairs; positions -1 and
    -2 of ``"portfolio_valuation"`` are read.
    """
    latest_value = history["portfolio_valuation", -1]
    previous_value = history["portfolio_valuation", -2]
    return 800*np.log(latest_value / previous_value)

def max_drawdown(history):
    """Return the largest peak-to-trough loss of the portfolio as a percent string.

    Walks ``history['portfolio_valuation']`` once, tracking the running peak
    and the most negative relative distance from that peak. The result is
    formatted with ``5.2f`` and a trailing ``%`` (e.g. ``"-50.00%"``); it is
    ``" 0.00%"`` when the valuation never falls below its running peak.
    """
    valuations = history['portfolio_valuation']
    peak = valuations[0]
    worst = 0
    for value in valuations:
        # max/min keep the current value on ties, like a strict comparison would.
        peak = max(peak, value)
        worst = min(worst, (value - peak) / peak)
    return f"{worst*100:5.2f}%"

def make_env(dir):
    """Create one "MultiDatasetTradingEnv" reading datasets from ``dir``.

    ``dir`` is passed through as ``dataset_dir`` (the notebook passes a glob
    pattern). Two extra metrics, 'Position Changes' and 'Max Drawdown', are
    registered on the environment before it is returned.
    """
    def position_changes(history):
        # Share of steps on which the position differs from the previous step.
        positions = history['position']
        changes = np.sum(np.diff(positions) != 0)
        return f"{ 100*changes/len(positions):5.2f}%"

    settings = dict(
        dataset_dir=dir,
        preprocess=add_features,
        windows=15,
        # Allowed positions: -1 (=SHORT), 0, +1 (=LONG); 2 is presumably a
        # 2x leveraged long -- confirm against the gym-trading-env docs.
        # NOTE(review): 5 positions are listed here while the agent is built
        # with nb_actions = 4 -- confirm which one is intended.
        positions=[-1, -0.5, 0, 1, 2],
        initial_position=0,
        trading_fees=0.01/100,  # 0.01% per stock buy / sell (Binance fees)
        borrow_interest_rate=0.0003/100,  # 0.0003% per timestep (= 1h here)
        reward_function=reward_function,
        portfolio_initial_value=1000,  # here, in USDT
        verbose=1,
    )
    env = gym.make("MultiDatasetTradingEnv", **settings)

    env.add_metric('Position Changes', position_changes)
    env.add_metric('Max Drawdown', max_drawdown)
    return env


# Five environment factories per split, wrapped in a gymnasium SyncVectorEnv.
# The count (5) is the same value given to simultaneous_training_env for the agent.
training_envs = gym.vector.SyncVectorEnv(
    [lambda: make_env("/notebooks/data/processed/training/*.pkl") for _ in range(5)]
)
validation_envs = gym.vector.SyncVectorEnv(
    [lambda: make_env("/notebooks/data/processed/validation/*.pkl") for _ in range(5)]
)


In [5]:
# Build the Rainbow agent. The meaning of each keyword is inferred from its
# name only -- confirm against rainbow.agent.Rainbow before tuning.
agent = Rainbow(
    # Same value as the number of sub-environments in training_envs (5).
    simultaneous_training_env = 5,
    
    #Distributional
    # presumably 51 atoms spanning [v_min, v_max] = [-200, 250] -- TODO confirm
    distributional= True,
    v_min= -200,
    v_max = 250,
    nb_atoms= 51, 
    # Prioritized Replay
    # Disabled here; the beta function is still called by the plotting cells
    # when sampling from agent.replay_memory.
    prioritized_replay = False,
    prioritized_replay_alpha= 0.5,
    # Rises linearly from 0.5 at step 0 to 1 at step 150_000, then stays at 1.
    prioritized_replay_beta_function = lambda episode, step : min(1, 0.5 + 0.5*step/150_000),
    
    # General
    multi_steps = 3,
    nb_states = 6,
    # NOTE(review): make_env configures 5 positions while nb_actions is 4 --
    # confirm which one is intended.
    nb_actions = 4,
    gamma = 0.99,
    # Float literal (1E8 == 100000000.0).
    replay_capacity = 1E8,
    tau = 2000,
    
    # Model
    # Same value as `windows= 15` in make_env.
    window= 15,
    units = [16,16, 16],
    dropout= 0.2,
    adversarial= True,
    noisy= False,
    # Evaluates to 7.5E-4.
    learning_rate = 3*2.5E-4,

    batch_size= 128,
    train_every = 10,
    # Geometric decay (1 - 5E-5)**step, floored at 0.001; `episode` is unused.
    epsilon_function = lambda episode, step : max(0.001, (1 - 5E-5)** step),
    name = "Rainbow",
)

In [6]:
def train(steps = 100_000):
    """Run ``steps`` interaction steps on ``training_envs`` and train the agent.

    The current observation is kept in the module-level ``obs`` so that
    successive calls resume where the previous call stopped; the environments
    are reset only when ``obs`` does not exist yet.
    """
    # Declared at the top: a `global` statement applies to the whole function body.
    global obs
    print("___________________________________________ TRAINING ___________________________________________")
    if 'obs' not in globals():
        obs, _info = training_envs.reset()

    for _step in range(steps):
        actions = agent.e_greedy_pick_actions_or_random(obs)
        transition = training_envs.step(actions)
        next_obs, rewards, dones, truncateds, _infos = transition

        # Record the transition, let the agent train, then advance.
        agent.store_replays(obs, actions, rewards, next_obs, dones, truncateds)
        agent.train()
        obs = next_obs

def evaluation():
    """Step ``validation_envs`` until every sub-environment has ended once.

    Actions come from ``agent.e_greedy_pick_actions_or_random``, the same
    picker used by ``train``.
    NOTE(review): this may include exploratory actions during validation --
    confirm whether a purely greedy picker is intended.
    Nothing is returned and nothing is stored in the replay memory.
    """
    print("___________________________________________ VALIDATION ___________________________________________")
    val_obs, _info = validation_envs.reset()

    # One flag per sub-environment, set once it reports done or truncated.
    finished = np.zeros(val_obs.shape[0], dtype=bool)
    while not finished.all():
        actions = agent.e_greedy_pick_actions_or_random(val_obs)
        val_obs, _rewards, dones, truncateds, _infos = validation_envs.step(actions)
        # assumes dones/truncateds are boolean arrays, for which + acts as OR -- TODO confirm
        finished += dones + truncateds

In [None]:
# Alternate 30_000 training steps with one validation pass, forever.
# The loop has no exit condition: interrupt the kernel to stop it.
while True:
    train(steps = 30_000)
    evaluation()

___________________________________________ TRAINING ___________________________________________
___________________________________________ VALIDATION ___________________________________________
Market Return : -36.93%   |   Portfolio Return : -29.94%   |   Position Changes : 32.31%   |   Max Drawdown : -53.06%
Market Return : -24.83%   |   Portfolio Return : -37.88%   |   Position Changes : 32.38%   |   Max Drawdown : -48.07%
Market Return : -24.83%   |   Portfolio Return : -27.37%   |   Position Changes : 32.37%   |   Max Drawdown : -45.06%
Market Return : -36.93%   |   Portfolio Return : -46.70%   |   Position Changes : 32.15%   |   Max Drawdown : -59.95%
Market Return : -24.98%   |   Portfolio Return : -42.31%   |   Position Changes : 31.96%   |   Max Drawdown : -50.17%
___________________________________________ TRAINING ___________________________________________
Market Return : 1896.93%   |   Portfolio Return : -73.74%   |   Position Changes : 39.31%   |   Max Drawdown : -88.44

In [None]:
import dill, pickle
# Optional: uncomment to clear these attributes before serializing
# (presumably to shrink the file or avoid objects dill cannot handle -- confirm).
#agent.model = None
#agent.target_model = None
#agent.replay_memory = None

# Serialize the whole agent object to test.pkl with dill.
with open("test.pkl", "wb") as file:
    dill.dump(agent, file)

In [None]:
# Sample 256 transitions from the replay memory and overlay the model output
# for every sampled state, one panel per action.
# (presumably each curve is the predicted value distribution over agent.zs -- confirm)
beta = agent.prioritized_replay_beta_function(agent.episode_count, agent.steps)
(batch_indexes, states, actions, rewards,
 states_prime, dones, importance_weights) = agent.replay_memory.sample(256, beta)
results = agent.model(states)

action_colors = ["blue", "orange", "purple", "red"]
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(16, 9), dpi=300)
for action, color in enumerate(action_colors):
    # Panels are filled column by column: (0,0), (1,0), (0,1), (1,1).
    panel = axes[action % 2, action // 2 % 2]
    for i in range(256):
        panel.plot(agent.zs, results[i, action, :], color=color, alpha=0.2)

In [None]:
# Same figure as the 256-sample overlay, but only the first sampled state is
# drawn, so each panel shows a single curve for its action.
beta = agent.prioritized_replay_beta_function(agent.episode_count, agent.steps)
(batch_indexes, states, actions, rewards,
 states_prime, dones, importance_weights) = agent.replay_memory.sample(256, beta)
results = agent.model(states)

action_colors = ["blue", "orange", "purple", "red"]
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(16, 9), dpi=300)
for action, color in enumerate(action_colors):
    # Panels are filled column by column: (0,0), (1,0), (0,1), (1,1).
    panel = axes[action % 2, action // 2 % 2]
    for i in range(1):
        panel.plot(agent.zs, results[i, action, :], color=color, alpha=0.2)