In [1]:
%load_ext autoreload
%autoreload 2
import ray
ray.init()

2023-05-08 22:44:30,313	INFO worker.py:1616 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8266 [39m[22m


0,1
Python version:,3.10.11
Ray version:,2.4.0
Dashboard:,http://127.0.0.1:8266


In [2]:
ray.available_resources()

{'accelerator_type:G': 1.0,
 'node:192.168.0.222': 1.0,
 'object_store_memory': 10524375859.0,
 'CPU': 6.0,
 'GPU': 1.0,
 'memory': 21048751719.0}

In [2]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print

from rl_trading.simulation.env import StockExchangeEnv

import pandas as pd
import numpy as np
exchange_data = pd.read_hdf('../data/binance_BTC_USDT.h5')
exchange_data.sort_index(inplace=True)
exchange_data = exchange_data[~exchange_data.index.duplicated(keep='first')]
exchange_data = exchange_data.reindex(np.arange(exchange_data.index[0], exchange_data.index[-1] + 1, 60))
exchange_data['price'] = exchange_data['price'].ffill()
exchange_data['amount'] = exchange_data['amount'].fillna(value=0)
exchange_data.index = pd.to_datetime(exchange_data.index * 1e9)

algo = (
    PPOConfig()
    .rollouts(num_rollout_workers=0)
    .resources(num_gpus=1)
    .environment(env='StockExchangeEnv-v0', is_atari=False)
    .build()
)
import time

start_time = time.time()

for i in range(10):
    result = algo.train()
    print(pretty_print(result))
print(f'Took: {time.time() - start_time} seconds and done {result["timesteps_total"]} steps')

KeyboardInterrupt: 

In [4]:
from ray import tune, air

config = (
    PPOConfig()
    .rollouts(num_rollout_workers=1, num_envs_per_worker=4)
    .resources(num_gpus=0.25)
    .training(entropy_coeff=tune.grid_search([0.5, 0.1, 0.01, 0.001]))
    .environment(env='StockExchangeEnv-v0')
)

tuner = tune.Tuner(
    "PPO",
    run_config=air.RunConfig(
        name='PPO_default_5samples',
        local_dir='../exp_results/PPO/entropy_coeff',
        stop={"training_iteration": 250},
    ),
    tune_config=tune.TuneConfig(
        num_samples=5
    ),
    param_space=config,
)

tuner.fit()

[2m[36m(PPO pid=2185768)[0m 2023-05-07 21:28:57,515	INFO algorithm.py:527 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(PPO pid=2186231)[0m 2023-05-07 21:29:06,150	INFO algorithm.py:527 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(PPO pid=2186625)[0m 2023-05-07 21:29:14,626	INFO algorithm.py:527 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


Trial name,agent_timesteps_total,connector_metrics,counters,custom_metrics,date,done,episode_len_mean,episode_media,episode_reward_max,episode_reward_mean,episode_reward_min,episodes_this_iter,episodes_total,hostname,info,iterations_since_restore,node_ip,num_agent_steps_sampled,num_agent_steps_trained,num_env_steps_sampled,num_env_steps_sampled_this_iter,num_env_steps_trained,num_env_steps_trained_this_iter,num_faulty_episodes,num_healthy_workers,num_in_flight_async_reqs,num_remote_worker_restarts,num_steps_trained_this_iter,perf,pid,policy_reward_max,policy_reward_mean,policy_reward_min,sampler_perf,sampler_results,time_since_restore,time_this_iter_s,time_total_s,timers,timestamp,timesteps_total,training_iteration,trial_id
PPO_StockExchangeEnv-v0_66e97_00000,4000,{},"{'num_env_steps_sampled': 4000, 'num_env_steps_trained': 4000, 'num_agent_steps_sampled': 4000, 'num_agent_steps_trained': 4000}",{},2023-05-07_21-29-12,False,,{},,,,0,0,seymour,"{'learner': {'default_policy': {'custom_metrics': {}, 'learner_stats': {'cur_kl_coeff': 0.20000000000000004, 'cur_lr': 5.0000000000000016e-05, 'total_loss': 8.917150435909148, 'policy_loss': -0.001974853827187451, 'vf_loss': 9.467514173446164, 'vf_explained_var': 5.3026727450791225e-05, 'kl': 0.0013243350053721036, 'entropy': 1.097307566545343, 'entropy_coeff': 0.5}, 'model': {}, 'num_grad_updates_lifetime': 465.5, 'diff_num_grad_updates_vs_sampler_policy': 464.5}}, 'num_env_steps_sampled': 4000, 'num_env_steps_trained': 4000, 'num_agent_steps_sampled': 4000, 'num_agent_steps_trained': 4000}",1,192.168.0.222,4000,4000,4000,4000,4000,4000,0,1,0,0,4000,"{'cpu_util_percent': 59.00714285714286, 'ram_util_percent': 43.800000000000004, 'gpu_util_percent0': 0.6042857142857143, 'vram_util_percent0': 0.10804385230654764}",2185768,{},{},{},{},"{'episode_reward_max': nan, 'episode_reward_min': nan, 'episode_reward_mean': nan, 'episode_len_mean': nan, 'episode_media': {}, 'episodes_this_iter': 0, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [], 'episode_lengths': []}, 'sampler_perf': {}, 'num_faulty_episodes': 0, 'connector_metrics': {}}",9.78512,9.78512,9.78512,"{'training_iteration_time_ms': 9781.087, 'sample_time_ms': 2820.071, 'load_time_ms': 1.235, 'load_throughput': 3238219.649, 'learn_time_ms': 6957.131, 'learn_throughput': 574.95, 'synch_weights_time_ms': 2.069}",1683487752,4000,1,66e97_00000


[2m[36m(PPO pid=2289250)[0m 2023-05-07 22:12:32,340	INFO algorithm.py:527 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(PPO pid=2290270)[0m 2023-05-07 22:12:53,323	INFO algorithm.py:527 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(PPO pid=2391741)[0m 2023-05-07 22:55:48,113	INFO algorithm.py:527 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(PPO pid=2392242)[0m 2023-05-07 22:55:58,011	INFO algorithm.py:527 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(PPO pid=2393379)[0m 2023-05-07 22:56:23,137	INFO algorithm.py:527 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(PPO pid=2498759)[0m 2023-05-0

ResultGrid<[
  Result(
    metrics={'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'default_policy': {'custom_metrics': {}, 'learner_stats': {'cur_kl_coeff': 3.5873240686715324e-44, 'cur_lr': 5.0000000000000016e-05, 'total_loss': 8.792393037324311, 'policy_loss': -0.004107534697139135, 'vf_loss': 9.345290321944862, 'vf_explained_var': -0.0008485437721334478, 'kl': 0.004602599609764487, 'entropy': 1.0975795298494317, 'entropy_coeff': 0.5}, 'model': {}, 'num_grad_updates_lifetime': 232035.5, 'diff_num_grad_updates_vs_sampler_policy': 464.5}}, 'num_env_steps_sampled': 1000000, 'num_env_steps_trained': 1000000, 'num_agent_steps_sampled': 1000000, 'num_agent_steps_trained': 1000000}, 'sampler_results': {'episode_reward_max': 857.6332999966362, 'episode_reward_min': -774.1790741283694, 'episode_reward_mean': -8.017134437299992, 'episode_len_mean': 1440.0, 'episode_media': {}, 'episodes_this_iter': 4, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {},

In [3]:
%matplotlib
import numpy as np
from rl_trading.simulation.env import StockExchangeEnv1

env = StockExchangeEnv1(sim_config={'granularity': '1d', 'max_steps': 30})
done = False
env.reset()
while not done:
    env.render()
    state, reward, done, _, _ = env.step(np.random.randint(3))

Using matplotlib backend: TkAgg


invalid command name "139821056204480delayed_destroy"
    while executing
"139821056204480delayed_destroy"
    ("after" script)
invalid command name "139820310905664delayed_destroy"
    while executing
"139820310905664delayed_destroy"
    ("after" script)
invalid command name "139820286114816delayed_destroy"
    while executing
"139820286114816delayed_destroy"
    ("after" script)
invalid command name "139820468975616delayed_destroy"
    while executing
"139820468975616delayed_destroy"
    ("after" script)
invalid command name "139821116533952delayed_destroy"
    while executing
"139821116533952delayed_destroy"
    ("after" script)
invalid command name "139820468189696delayed_destroy"
    while executing
"139820468189696delayed_destroy"
    ("after" script)
invalid command name "139820468290112delayed_destroy"
    while executing
"139820468290112delayed_destroy"
    ("after" script)


In [12]:
!pip install "gymnasium[classic_control]"

Collecting pygame==2.1.0
  Downloading pygame-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pygame
Successfully installed pygame-2.1.0
