## Demo 1: Discrete action space

In [34]:
# algorithm
from AgentZoo import AgentDoubleDQN

In [35]:
# training
from AgentRun import Arguments, train_and_evaluate

In [36]:
# environment
from Env import decorate_env
import gym  # gym of OpenAI is not necessary for ElegantRL (even RL)
# Raise the gym logger level to 40 so that messages such as
# 'WARN: Box bound precision lowered by casting to float32' are not printed.
gym.logger.set_level(40)

# Create CartPole and pass it straight through decorate_env in one expression.
env = decorate_env(gym.make('CartPole-v0'))

| env_name: CartPole-v0, action space: if_discrete
| state_dim: 4, action_dim: 2, action_max: 1, target_reward: 195.0


In [38]:
# Arguments bundles the agent class, the environment and the GPU index,
# together with the hyper-parameters used by train_and_evaluate().
args = Arguments(
    env=env,
    agent_rl=AgentDoubleDQN,
    gpu_id=0,
)

# Override two hyper-parameters before training.
args.net_dim = 128  # same value as 2 ** 7
args.break_step = 8 * 1000  # original note: UsedTime ~20s to reach target_reward 195

# Run training and evaluation with this configuration.
train_and_evaluate(args)

| GPU id: 0, cwd: ./AgentDoubleDQN/CartPole-v0_0
| Remove history
ID      Step      MaxR |    avgR      stdR       objA      objC
0   0.00e+00    200.00 |
ID      Step   TargetR |    avgR      stdR   UsedTime  ########
0   1.02e+03    195.00 |  200.00      0.00         13  ########


In [39]:
# Train the same agent class in another env.
#
# A fresh Arguments object is built here instead of mutating the one used for CartPole.
# The recorded output of the mutating version shows LunarLander being trained with
# cwd './AgentDoubleDQN/CartPole-v0_0' followed by '| Remove history', i.e. the reused
# object kept (and wiped) the CartPole working directory.
# NOTE(review): this assumes a new Arguments derives cwd from its own env, as the
# CartPole run did -- confirm against AgentRun.Arguments.
args = Arguments(
    agent_rl=AgentDoubleDQN,
    env=decorate_env(env=gym.make('LunarLander-v2')),
    gpu_id=0,
)

# Hyper-parameters that differ from the defaults for this env.
args.net_dim = 2 ** 8
args.break_step = int(1e5 * 8)  # original note: UsedTime ~200s to reach target_reward 200

train_and_evaluate(args)

| env_name: LunarLander-v2, action space: if_discrete
| state_dim: 8, action_dim: 4, action_max: 1, target_reward: 200
| GPU id: 0, cwd: ./AgentDoubleDQN/CartPole-v0_0
| Remove history
ID      Step      MaxR |    avgR      stdR       objA      objC
0   0.00e+00   -229.44 |
0   1.02e+03   -157.52 |
0   5.12e+03   -114.14 |
0   6.14e+03    -51.93 |
0   8.19e+03    106.39 |
0   1.33e+04    215.38 |
ID      Step   TargetR |    avgR      stdR   UsedTime  ########
0   1.43e+04    200.00 |  215.38    103.27        194  ########


## Demo 2: Continuous action space

In [40]:
import gym  # don't worry about 'WARN: Box bound precision lowered by casting to float32'
from AgentRun import Arguments, train_and_evaluate
from AgentZoo import AgentTD3, AgentSAC, AgentPPO

### Demo 2.1: Off-policy TD3 and SAC

In [47]:
# 'Pendulum-v0' gets its target_reward attached by hand before it is passed to decorate_env().
env = gym.make('Pendulum-v0')
env.target_reward = -200

# Off-policy configuration (if_on_policy=False is also the default).
args = Arguments(if_on_policy=False)
args.agent_rl = AgentSAC  # AgentTD3 is the other off-policy agent imported for this demo
args.env = decorate_env(env)
args.reward_scale = 1 / 8  # identical to 2 ** -3

# Alternative environments -- uncomment one group to use it instead of Pendulum:
# args.env = decorate_env(env=gym.make('LunarLanderContinuous-v2'))
# args.reward_scale = 2 ** -2
#
# args.env = decorate_env(env=gym.make('BipedalWalker-v3'))
# args.gamma = 0.96  # NOTE(review): the original note recommended args.gamma = 0.95 -- confirm

train_and_evaluate(args)

| env.spec.reward_threshold is None, so I set target_reward=-200
| env_name: Pendulum-v0, action space: Continuous
| state_dim: 3, action_dim: 1, action_max: 2.0, target_reward: -200
| GPU id: j, cwd: ./AgentSAC/Pendulum-v0_j
| Remove history
ID      Step      MaxR |    avgR      stdR       objA      objC
j   0.00e+00  -1340.63 |
j   1.02e+03   -970.71 |
j   4.10e+03   -817.50 |
j   5.12e+03   -357.35 |
j   6.14e+03   -259.11 |
j   7.17e+03   -228.45 |
j   8.19e+03   -158.32 |
ID      Step   TargetR |    avgR      stdR   UsedTime  ########
j   9.22e+03   -200.00 | -158.32    131.10        155  ########


### Demo 2.2: On-policy PPO

In [48]:
# 'Pendulum-v0' gets its target_reward attached by hand before it is passed to decorate_env().
env = gym.make('Pendulum-v0')
env.target_reward = -200

# On-policy agents use a different set of default hyper-parameters than off-policy ones,
# hence if_on_policy=True.
args = Arguments(if_on_policy=True)
args.agent_rl = AgentPPO  # DRL algorithm for this demo (on-policy)
args.env = decorate_env(env)
args.reward_scale = 1 / 8  # identical to 2 ** -3

# Alternative environments -- uncomment one group to use it instead of Pendulum:
# args.env = decorate_env(env=gym.make('LunarLanderContinuous-v2'))
# args.reward_scale = 2 ** -2
#
# args.env = decorate_env(env=gym.make('BipedalWalker-v3'))
# args.gamma = 0.96  # NOTE(review): the original note recommended args.gamma = 0.95 -- confirm

train_and_evaluate(args)

| env.spec.reward_threshold is None, so I set target_reward=-200
| env_name: Pendulum-v0, action space: Continuous
| state_dim: 3, action_dim: 1, action_max: 2.0, target_reward: -200
| GPU id: j, cwd: ./AgentPPO/Pendulum-v0_j
| Remove history
ID      Step      MaxR |    avgR      stdR       objA      objC
j   0.00e+00  -1375.25 |
j   9.60e+03  -1216.81 |
j   1.28e+04  -1196.24 |
j   1.60e+04  -1129.99 |
j   4.80e+04  -1051.96 |
j   8.32e+04   -999.29 |
j   8.64e+04   -876.76 |
j   9.92e+04   -797.66 |
j   1.06e+05   -737.22 |
j   1.12e+05   -732.56 |
j   1.15e+05   -673.28 |
j   1.22e+05   -649.79 |
j   1.28e+05   -606.30 |
j   1.73e+05   -562.12 |
j   1.76e+05   -559.34 |
j   1.79e+05   -552.79 |
j   1.82e+05   -468.47 |
j   1.86e+05   -401.66 |
j   1.89e+05   -153.41 |
ID      Step   TargetR |    avgR      stdR   UsedTime  ########
j   1.92e+05   -200.00 | -153.41    101.49        228  ########


## Demo 3: FinanceMultiStockEnv from FinRL

In [49]:
from AgentEnv import FinanceMultiStockEnv
from AgentRun import Arguments, train_and_evaluate
from AgentZoo import AgentTD3, AgentSAC, AgentPPO

In [50]:
# PPO is configured through the on-policy variant of Arguments.
args = Arguments(if_on_policy=True)
args.agent_rl = AgentPPO

# FinanceMultiStockEnv is used directly; it does not need decorate_env().
args.env = FinanceMultiStockEnv()
args.env.target_reward = 10

# max_memo is computed from max_step, so max_step has to be assigned first.
args.max_step = 1699
args.max_memo = 16 * (args.max_step - 1)

# Remaining hyper-parameter overrides.
args.net_dim = 256  # 2 ** 8
args.batch_size = 2048  # 2 ** 11
args.repeat_times = 16  # 2 ** 4
args.break_step = int(2e7)  # same value as int(5e6 * 4)

In [None]:
# Start training/evaluation with the AgentPPO + FinanceMultiStockEnv settings held in `args`.
train_and_evaluate(args)

| GPU id: j, cwd: ./AgentPPO/FinanceStock-v1_j
| Remove history
ID      Step      MaxR |    avgR      stdR       objA      objC
j   0.00e+00      1.96 |
j   2.55e+04      2.08 |
j   5.09e+04      2.16 |
j   7.64e+04      2.26 |
j   1.02e+05      2.36 |
j   1.27e+05      2.48 |


## Demo 4: PyBullet (a free alternative to MuJoCo) (work in progress)

In [35]:
import gym  # don't worry about 'WARN: Box bound precision lowered by casting to float32'
import pybullet_envs  # PyBullet is free, but MuJoCo is paid
from AgentRun import Arguments, train_and_evaluate
from AgentZoo import AgentTD3, AgentSAC, AgentPPO

In [36]:
# Pick one PyBullet environment id; the assert guards against ids this demo does not list.
env_name = 'AntBulletEnv-v0'
assert env_name in (
    "AntBulletEnv-v0",
    "Walker2DBulletEnv-v0",
    "HalfCheetahBulletEnv-v0",
    "HumanoidBulletEnv-v0",
    "HumanoidFlagrunBulletEnv-v0",
    "HumanoidFlagrunHarderBulletEnv-v0",
)

# decorate_env comes from the Demo 1 environment cell (`from Env import decorate_env`);
# this demo's own import cell does not import it.
env = decorate_env(gym.make(env_name), if_print=True)

In [None]:
# Arguments() without if_on_policy uses the off-policy defaults, which is what SAC needs.
args = Arguments()

# Fixed typo: the attribute was spelled 'agnent_rl', which created an unused attribute
# and left args.agent_rl unset.
args.agent_rl = AgentSAC

# Hand over the PyBullet env created in the previous cell (it has already been passed
# through decorate_env there); without this, args carries no environment.
args.env = env

In [None]:
# Start training/evaluation with the Demo 4 settings held in `args`.
train_and_evaluate(args)

## Demo 5: Atari games (work in progress)

In [None]:
# Gym environment ids are case-sensitive: the Atari id is 'Breakout-v0', not 'breakout-v0'.
# Alternative id for this demo: 'SpaceInvaders-v0'.
env_name = 'Breakout-v0'