Merge pull request #189 from AI4Finance-Foundation/Fix-issues-#169,-#184,-#186,-#188

Fix issues #169, #184, #186, #188
supersglzc committed Jul 21, 2022
2 parents 68e1174 + 3eec288 commit ebb9a59
Showing 12 changed files with 122 additions and 79 deletions.
2 changes: 1 addition & 1 deletion docs/source/algorithms/a2c.rst
@@ -24,7 +24,7 @@ Code Snippet
import torch
from elegantrl.run import train_and_evaluate
from elegantrl.config import Arguments
-from elegantrl.envs.gym import build_env
+from elegantrl.train.config import build_env
from elegantrl.agents.AgentA2C import AgentA2C
# train and save
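The same one-line fix — build_env now imported from elegantrl.train.config rather than the old elegantrl.envs.gym path — is applied to each algorithm page below (ddpg, double_dqn, dqn, ppo, redq, sac, td3). As a rough sketch of the corrected snippet end to end, adapted from the config.rst example later in this same commit; both the Arguments and agent constructors changed around this release, so treat the call order as illustrative rather than canonical:

from elegantrl.run import train_and_evaluate
from elegantrl.train.config import Arguments, build_env   # build_env now lives here
from elegantrl.agents.AgentA2C import AgentA2C

# Adapted from the config.rst snippet in this commit; check the Arguments and
# agent signatures in your ElegantRL checkout before running.
args = Arguments(build_env('Pendulum-v1'), AgentA2C())
train_and_evaluate(args)   # train, periodically evaluate, and save actor.pth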
2 changes: 1 addition & 1 deletion docs/source/algorithms/ddpg.rst
@@ -25,7 +25,7 @@ Code Snippet
import torch
from elegantrl.run import train_and_evaluate
from elegantrl.config import Arguments
-from elegantrl.envs.gym import build_env
+from elegantrl.train.config import build_env
from elegantrl.agents.AgentDDPG import AgentDDPG
# train and save
2 changes: 1 addition & 1 deletion docs/source/algorithms/double_dqn.rst
@@ -24,7 +24,7 @@ Code Snippet
import torch
from elegantrl.run import train_and_evaluate
from elegantrl.config import Arguments
-from elegantrl.envs.gym import build_env
+from elegantrl.train.config import build_env
from elegantrl.agents.AgentDoubleDQN import AgentDoubleDQN
# train and save
2 changes: 1 addition & 1 deletion docs/source/algorithms/dqn.rst
@@ -29,7 +29,7 @@ Code Snippet
import torch
from elegantrl.run import train_and_evaluate
from elegantrl.config import Arguments
-from elegantrl.envs.gym import build_env
+from elegantrl.train.config import build_env
from elegantrl.agents.AgentDQN import AgentDQN
# train and save
2 changes: 1 addition & 1 deletion docs/source/algorithms/ppo.rst
@@ -25,7 +25,7 @@ Code Snippet
import torch
from elegantrl.run import train_and_evaluate
from elegantrl.config import Arguments
-from elegantrl.envs.gym import build_env
+from elegantrl.train.config import build_env
from elegantrl.agents.AgentPPO import AgentPPO
# train and save
2 changes: 1 addition & 1 deletion docs/source/algorithms/redq.rst
@@ -22,7 +22,7 @@ Code Snippet
import torch
from elegantrl.run import train_and_evaluate
from elegantrl.config import Arguments
-from elegantrl.envs.gym import build_env
+from elegantrl.train.config import build_env
from elegantrl.agents.AgentREDQ import AgentREDQ
# train and save
2 changes: 1 addition & 1 deletion docs/source/algorithms/sac.rst
@@ -24,7 +24,7 @@ Code Snippet
import torch
from elegantrl.run import train_and_evaluate
from elegantrl.config import Arguments
-from elegantrl.envs.gym import build_env
+from elegantrl.train.config import build_env
from elegantrl.agents.AgentSAC import AgentSAC
# train and save
2 changes: 1 addition & 1 deletion docs/source/algorithms/td3.rst
@@ -26,7 +26,7 @@ Code Snippet
import torch
from elegantrl.run import train_and_evaluate
from elegantrl.config import Arguments
-from elegantrl.envs.gym import build_env
+from elegantrl.train.config import build_env
from elegantrl.agents.AgentTD3 import AgentTD3
# train and save
2 changes: 1 addition & 1 deletion docs/source/api/config.rst
@@ -13,7 +13,7 @@ The class should be initialized at the start of the training process. For exampl
from elegantrl.train.config import Arguments
from elegantrl.agents.AgentPPO import AgentPPO
-from elegantrl.envs.Gym import build_env
+from elegantrl.train.config import build_env
import gym
args = Arguments(build_env('Pendulum-v1'), AgentPPO())
91 changes: 46 additions & 45 deletions elegantrl/agents/AgentPPO.py
@@ -1,14 +1,14 @@
from typing import Tuple

import numpy as np
import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_
from torch import Tensor
from typing import List, Tuple
from elegantrl.agents.net import ActorPPO, ActorDiscretePPO, CriticPPO, SharePPO

from elegantrl.agents.AgentBase import AgentBase
from elegantrl.train.replay_buffer import ReplayBufferList, ReplayBuffer
from elegantrl.train.config import Arguments #bug fix:NameError: name 'Arguments' is not defined def __init__(self, net_dim: int, state_dim: int, action_dim: int, gpu_id: int = 0, args: Arguments = None):
from typing import Tuple
from elegantrl.agents.net import ActorPPO, ActorDiscretePPO, CriticPPO, SharePPO
from elegantrl.train.config import \
Arguments # bug fix:NameError: name 'Arguments' is not defined def __init__(self, net_dim: int, state_dim: int, action_dim: int, gpu_id: int = 0, args: Arguments = None):
from elegantrl.train.replay_buffer import ReplayBufferList

"""[ElegantRL.2021.12.12](github.com/AI4Fiance-Foundation/ElegantRL)"""

@@ -29,7 +29,7 @@ class AgentPPO(AgentBase):
"""

def __init__(
self, net_dim: int, state_dim: int, action_dim: int, gpu_id=0, args=None
self, net_dim: int, state_dim: int, action_dim: int, gpu_id=0, args=None
):
self.if_off_policy = False
self.act_class = getattr(self, "act_class", ActorPPO)
@@ -48,7 +48,7 @@ def __init__(
) # could be 0.95~0.99, GAE (ICLR.2016.)

if getattr(
args, "if_use_gae", False
args, "if_use_gae", False
): # GAE (Generalized Advantage Estimation) for sparse reward
self.get_reward_sum = self.get_reward_sum_gae
else:
@@ -87,7 +87,7 @@ def explore_one_env(self, env, target_step, random_exploration=None) -> list:
last_done[0] = step_i
return self.convert_trajectory(traj_list, last_done) # traj_list

def explore_vec_env(self, env, target_step, random_exploration = None) -> list:
def explore_vec_env(self, env, target_step, random_exploration=None) -> list:
"""
Collect trajectories through the actor-environment interaction for a **vectorized** environment instance.
@@ -138,9 +138,9 @@ def update_net(self, buffer):
buf_len = buf_state.shape[0]

"""get buf_r_sum, buf_logprob"""
bs = 2**10 # set a smaller 'BatchSize' when out of GPU memory.
bs = 2 ** 10 # set a smaller 'BatchSize' when out of GPU memory.
buf_value = [
self.cri_target(buf_state[i : i + bs]) for i in range(0, buf_len, bs)
self.cri_target(buf_state[i: i + bs]) for i in range(0, buf_len, bs)
]
buf_value = torch.cat(buf_value, dim=0)
buf_logprob = self.act.get_old_logprob(buf_action, buf_noise)
@@ -193,7 +193,7 @@ def update_net(self, buffer):
return obj_critic.item(), -obj_actor.item(), a_std_log.item() # logging_tuple

def get_reward_sum_raw(
self, buf_len, buf_reward, buf_mask, buf_value
self, buf_len, buf_reward, buf_mask, buf_value
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Calculate the **reward-to-go** and **advantage estimation**.
@@ -216,7 +216,7 @@ def get_reward_sum_raw(
return buf_r_sum, buf_adv_v

def get_reward_sum_gae(
self, buf_len, ten_reward, ten_mask, ten_value
self, buf_len, ten_reward, ten_mask, ten_value
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Calculate the **reward-to-go** and **advantage estimation** using GAE.
@@ -260,7 +260,7 @@ class AgentDiscretePPO(AgentPPO):
"""

def __init__(
self, net_dim: int, state_dim: int, action_dim: int, gpu_id=0, args=None
self, net_dim: int, state_dim: int, action_dim: int, gpu_id=0, args=None
):
self.act_class = getattr(self, "act_class", ActorDiscretePPO)
self.cri_class = getattr(self, "cri_class", CriticPPO)
@@ -274,16 +274,16 @@ def __init__(self):
self.obj_c = (-np.log(0.5)) ** 0.5 # for reliable_lambda

def init(
self,
net_dim=256,
state_dim=8,
action_dim=2,
reward_scale=1.0,
gamma=0.99,
learning_rate=1e-4,
if_per_or_gae=False,
env_num=1,
gpu_id=0,
self,
net_dim=256,
state_dim=8,
action_dim=2,
reward_scale=1.0,
gamma=0.99,
learning_rate=1e-4,
if_per_or_gae=False,
env_num=1,
gpu_id=0,
):
self.device = torch.device(
f"cuda:{gpu_id}" if torch.cuda.is_available() else "cpu"
@@ -324,9 +324,9 @@ def update_net(self, buffer, batch_size, repeat_times, soft_update_tau):
# (ten_state, ten_action, ten_noise, ten_reward, ten_mask) = buffer

"""get buf_r_sum, buf_logprob"""
bs = 2**10 # set a smaller 'BatchSize' when out of GPU memory.
bs = 2 ** 10 # set a smaller 'BatchSize' when out of GPU memory.
buf_value = [
self.cri_target(buf_state[i : i + bs]) for i in range(0, buf_len, bs)
self.cri_target(buf_state[i: i + bs]) for i in range(0, buf_len, bs)
]
buf_value = torch.cat(buf_value, dim=0)
buf_logprob = self.act.get_old_logprob(buf_action, buf_noise)
@@ -335,7 +335,7 @@ def update_net(self, buffer, batch_size, repeat_times, soft_update_tau):
buf_len, buf_reward, buf_mask, buf_value
) # detach()
buf_adv_v = (buf_adv_v - buf_adv_v.mean()) * (
self.lambda_a_value / torch.std(buf_adv_v) + 1e-5
self.lambda_a_value / torch.std(buf_adv_v) + 1e-5
)
# buf_adv_v: buffer data of adv_v value
del buf_noise, buffer[:]
@@ -540,7 +540,7 @@ class AgentDiscretePPO(AgentPPO):
"""

def __init__(
self, net_dim: int, state_dim: int, action_dim: int, gpu_id=0, args=None
self, net_dim: int, state_dim: int, action_dim: int, gpu_id=0, args=None
):
self.act_class = getattr(self, "act_class", ActorDiscretePPO)
self.cri_class = getattr(self, "cri_class", CriticPPO)
@@ -554,16 +554,16 @@ def __init__(self):
self.obj_c = (-np.log(0.5)) ** 0.5 # for reliable_lambda

def init(
self,
net_dim=256,
state_dim=8,
action_dim=2,
reward_scale=1.0,
gamma=0.99,
learning_rate=1e-4,
if_per_or_gae=False,
env_num=1,
gpu_id=0,
self,
net_dim=256,
state_dim=8,
action_dim=2,
reward_scale=1.0,
gamma=0.99,
learning_rate=1e-4,
if_per_or_gae=False,
env_num=1,
gpu_id=0,
):
self.device = torch.device(
f"cuda:{gpu_id}" if torch.cuda.is_available() else "cpu"
@@ -604,9 +604,9 @@ def update_net(self, buffer, batch_size, repeat_times, soft_update_tau):
# (ten_state, ten_action, ten_noise, ten_reward, ten_mask) = buffer

"""get buf_r_sum, buf_logprob"""
bs = 2**10 # set a smaller 'BatchSize' when out of GPU memory.
bs = 2 ** 10 # set a smaller 'BatchSize' when out of GPU memory.
buf_value = [
self.cri_target(buf_state[i : i + bs]) for i in range(0, buf_len, bs)
self.cri_target(buf_state[i: i + bs]) for i in range(0, buf_len, bs)
]
buf_value = torch.cat(buf_value, dim=0)
buf_logprob = self.act.get_old_logprob(buf_action, buf_noise)
@@ -615,7 +615,7 @@ def update_net(self, buffer, batch_size, repeat_times, soft_update_tau):
buf_len, buf_reward, buf_mask, buf_value
) # detach()
buf_adv_v = (buf_adv_v - buf_adv_v.mean()) * (
self.lambda_a_value / torch.std(buf_adv_v) + 1e-5
self.lambda_a_value / torch.std(buf_adv_v) + 1e-5
)
# buf_adv_v: buffer data of adv_v value
del buf_noise, buffer[:]
@@ -654,11 +654,13 @@ def update_net(self, buffer, batch_size, repeat_times, soft_update_tau):
a_std_log = getattr(self.act, "a_std_log", torch.zeros(1)).mean()
return obj_critic.item(), obj_actor.item(), a_std_log.item() # logging_tuple


class AgentPPOHterm(AgentPPO):
def __init__(self, net_dim: int, state_dim: int, action_dim: int, gpu_id: int = 0, args: Arguments = None):
def __init__(self, net_dim: int, state_dim: int, action_dim: int, gpu_id: int = 0, args=None):
AgentPPO.__init__(self, net_dim, state_dim, action_dim, gpu_id, args)

def update_net(self, buffer: ReplayBufferList):#bug fix:ImportError: cannot import name 'ReplayBufferList' from 'elegantrl.train.replay_buffer'
def update_net(self,
buffer: ReplayBufferList): # bug fix:ImportError: cannot import name 'ReplayBufferList' from 'elegantrl.train.replay_buffer'
with torch.no_grad():
buf_state, buf_reward, buf_mask, buf_action, buf_noise = [ten.to(self.device) for ten in buffer]
buf_len = buf_state.shape[0]
@@ -708,4 +710,3 @@ def update_net(self, buffer: ReplayBufferList):#bug fix:ImportError: cannot impo

action_std_log = getattr(self.act, 'action_std_log', torch.zeros(1)).mean()
return obj_critic.item(), -obj_actor.item(), action_std_log.item() # logging_tuple
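Several hunks above touch get_reward_sum_raw and get_reward_sum_gae, the routines that turn the collected rewards, masks, and critic values into the reward-to-go targets and advantages consumed by update_net. Since the hunks are truncated here, a self-contained textbook sketch of the GAE recursion in plain PyTorch (generic form with explicit gamma and lambda; ElegantRL folds gamma into its mask tensor and normalizes the advantages afterwards, as the buf_adv_v lines above show):

import torch

def gae_advantages(rewards, masks, values, gamma=0.99, lam=0.95):
    # rewards, masks, values: 1-D tensors of length T; masks[t] is 0.0 where the
    # episode ended at step t, else 1.0.  Returns (reward-to-go, advantages).
    returns = torch.empty_like(rewards)
    advantages = torch.empty_like(rewards)
    next_value = torch.zeros((), dtype=rewards.dtype)      # bootstrap value after the last step (zero here for simplicity)
    next_advantage = torch.zeros((), dtype=rewards.dtype)
    for t in reversed(range(rewards.shape[0])):
        # TD residual: delta_t = r_t + gamma * V(s_{t+1}) * mask_t - V(s_t)
        delta = rewards[t] + gamma * next_value * masks[t] - values[t]
        # GAE recursion: A_t = delta_t + gamma * lambda * mask_t * A_{t+1}
        next_advantage = delta + gamma * lam * masks[t] * next_advantage
        advantages[t] = next_advantage
        returns[t] = next_advantage + values[t]            # critic target (reward-to-go)
        next_value = values[t]
    return returns, advantages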

56 changes: 31 additions & 25 deletions elegantrl/train/config.py
@@ -7,12 +7,13 @@
'''config for agent'''



class Arguments:
def __init__(self, agent_class=None, env=None, env_func=None, env_args: dict = None):
def __init__(self, agent, env=None, env_func=None, env_args=None):
self.env = env # the environment for training
self.eval_env = None
self.env_func = env_func # env = env_func(*env_args)
self.env_args = env_args # env = env_func(*env_args)
self.if_Isaac = True

self.env_num = self.update_attr('env_num') # env_num = 1. In vector env, env_num > 1.
self.max_step = self.update_attr('max_step') # the max step of an episode
@@ -22,34 +22,40 @@ def __init__(self, agent_class=None, env=None, env_func=None, env_args: dict = N
self.if_discrete = self.update_attr('if_discrete') # discrete or continuous action space
self.target_return = self.update_attr('target_return') # target average episode return

self.agent_class = agent_class # the class of DRL algorithm
self.net_dim = 2 ** 8 # the network width
self.num_layer = 3 # layer number of MLP (Multi-layer perception, `assert layer_num>=2`)
self.agent = agent # DRL algorithm
self.net_dim = 2 ** 7 # the network width
self.num_layer = 2 # layer number of MLP (Multi-layer perception, `assert layer_num>=2`)
self.if_off_policy = self.get_if_off_policy() # agent is on-policy or off-policy
self.if_use_old_traj = True # continue the last exploration
self.obs_norm = False
self.value_norm = False
self.if_act_target = False
self.if_cri_target = False
if self.if_off_policy: # off-policy
self.max_memo = 2 ** 21 # capacity of replay buffer, 2 ** 21 ~= 2e6
self.target_step = 2 ** 10 # repeatedly update network to keep critic's loss small
self.replay_buffer_size = 1e6 # capacity of replay buffer
self.horizon_len = 1 # number of steps per exploration
self.batch_size = self.net_dim # num of transitions sampled from replay buffer.
self.repeat_times = 2 ** 0 # collect target_step, then update network
self.repeat_times = 2 ** 0 # epoch num
self.if_use_per = False # use PER (Prioritized Experience Replay) for sparse reward
self.num_seed_steps = 2 # the total samples for warm-up is num_seed_steps * env_num * num_steps_per_episode
self.num_steps_per_episode = 128
self.n_step = 1 # multi-step TD learning
else: # on-policy
self.max_memo = 2 ** 12 # capacity of replay buffer
self.target_step = self.max_memo # repeatedly update network to keep critic's loss small
self.batch_size = self.net_dim * 2 # num of transitions sampled from replay buffer.
self.repeat_times = 2 ** 4 # collect target_step, then update network
self.if_use_gae = False # use PER: GAE (Generalized Advantage Estimation) for sparse reward
self.horizon_len = 8 # number of steps per exploration
self.batch_size = 2048 # batch size
self.repeat_times = 5 # epoch num
self.if_use_gae = True # use GAE (Generalized Advantage Estimation) for sparse reward
self.lambda_gae_adv = 0.95
self.lambda_entropy = 0.0

'''Arguments for training'''
self.gamma = 0.99 # discount factor of future rewards
self.reward_scale = 2 ** 0 # an approximate target reward usually be closed to 256
self.lambda_critic = 2 ** 0 # the objective coefficient of critic network
self.learning_rate = 2 ** -15 # 2 ** -15 ~= 3e-5
self.soft_update_tau = 2 ** -8 # 2 ** -8 ~= 5e-3
self.clip_grad_norm = 3.0 # 0.1 ~ 4.0, clip the gradient after normalization
self.if_off_policy = self.if_off_policy() # agent is on-policy or off-policy
self.if_use_old_traj = False # save old data to splice and get a complete trajectory (for vector env)
self.learning_rate = 2 ** -12 # 2 ** -15 ~= 3e-5
self.soft_update_tau = 0.1 # 2 ** -8 ~= 5e-3

'''Arguments for device'''
self.worker_num = 2 # rollout workers number pre GPU (adjust it to get high GPU usage)
self.worker_num = 1 # rollout workers number pre GPU (adjust it to get high GPU usage)
self.thread_num = 8 # cpu_num for pytorch, `torch.set_num_threads(self.num_threads)`
self.random_seed = 0 # initialize random seed in self.init_before_training()
self.learner_gpus = 0 # `int` means the ID of single GPU, -1 means CPU
@@ -58,15 +65,14 @@ def __init__(self, agent_class=None, env=None, env_func=None, env_args: dict = N
self.cwd = None # current working directory to save model. None means set automatically
self.if_remove = True # remove the cwd folder? (True, False, None:ask me)
self.break_step = +np.inf # break training if 'total_step > break_step'
self.if_over_write = False # overwrite the best policy network (actor.pth)
self.if_over_write = False # over write the best policy network (actor.pth)
self.if_allow_break = True # allow break training when reach goal (early termination)

'''Arguments for evaluate'''
self.save_gap = 2 # save the policy network (actor.pth) for learning curve, +np.inf means don't save
self.eval_gap = 2 ** 7 # evaluate the agent per eval_gap seconds
self.tracker_len = 20
self.eval_gap = 1e6 # evaluate the agent per eval_gap seconds
self.reevaluate = False
self.eval_times = 2 ** 4 # number of times that get episode return
self.eval_env_func = None # eval_env = eval_env_func(*eval_env_args)
self.eval_env_args = None # eval_env = eval_env_func(*eval_env_args)

def init_before_training(self):
np.random.seed(self.random_seed)
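The reworked Arguments above branches its exploration and update defaults on if_off_policy. Which side of the merge each literal lands on is hard to read from this flattened rendering, so as a comparison aid only, a standalone sketch (not the library's class) of that branching with values taken from the diff:

from dataclasses import dataclass

@dataclass
class ArgsSketch:
    # Stand-in for the if_off_policy branching in Arguments.__init__; values are
    # the ones visible in the diff above and may not match your ElegantRL version.
    if_off_policy: bool = False
    net_dim: int = 2 ** 7            # network width
    num_layer: int = 2               # MLP depth

    def __post_init__(self):
        if self.if_off_policy:                    # e.g. DDPG / TD3 / SAC
            self.replay_buffer_size = int(1e6)    # replay-buffer capacity
            self.horizon_len = 1                  # env steps per exploration round
            self.batch_size = self.net_dim        # transitions per gradient step
            self.repeat_times = 1                 # update epochs per collection
        else:                                     # on-policy, e.g. A2C / PPO
            self.horizon_len = 8
            self.batch_size = 2048
            self.repeat_times = 5
            self.if_use_gae = True                # GAE on by default
            self.lambda_gae_adv = 0.95
            self.lambda_entropy = 0.0

print(ArgsSketch(if_off_policy=True).batch_size)  # 128: off-policy batch size tracks net_dim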
