## Demo 1~3
You can find a simple version in `elegantrl/tutorial/run.py`
You can also find demo 1~3 in `elegantrl/run.py` (formal version)

elegantrl/tutorial <1000 lines 
```
net.py   # 160 lines
agent.py # 530 lines
run.py   # 320 lines
env.py   # 160 lines (not necessary)
```
The structure of the formal version is similar to that of the tutorial version.

In [1]:
# Notebook setup: the tutorial's Arguments / train_and_evaluate entry points,
# the PreprocessEnv wrapper used on gym envs in the cells below, and gym itself.
from elegantrl.tutorial.run import Arguments, train_and_evaluate
from elegantrl.tutorial.env import PreprocessEnv
import gym
# Level 40 presumably corresponds to gym.logger.ERROR (confirm against the gym
# version in use), so warnings are not printed.
gym.logger.set_level(40)  # Block warning

## Demo 1: Discrete action space

In [2]:
'''choose an DRL algorithm'''
from elegantrl.tutorial.agent import AgentDoubleDQN  # AgentDQN

# Build Arguments with agent, env and gpu_id all passed as None, then attach
# the agent instance here; the env is assigned in the following cell.
args = Arguments(agent=None, env=None, gpu_id=None)
args.agent = AgentDoubleDQN()

In [3]:
'''choose environment'''
# Wrap gym's CartPole-v0 in PreprocessEnv before handing it to args.
args.env = PreprocessEnv(env=gym.make('CartPole-v0'))
args.net_dim = 2 ** 7  # change a default hyper-parameters
args.batch_size = 2 ** 7
# Bare string: as the last expression in the cell it is echoed as the cell output.
# NOTE(review): the TargetReward field is blank; the env printout reports
# target_reward 195.0 for CartPole-v0 — confirm and fill in.
"TotalStep: 2e3, TargetReward: , UsedTime: 10s"

# Alternative setup kept for reference: LunarLander-v2 with net_dim and
# batch_size both set to 2 ** 8.
# args.env = PreprocessEnv(env=gym.make('LunarLander-v2'))
# args.net_dim = 2 ** 8
# args.batch_size = 2 ** 8


| env_name: CartPole-v0, action space if_discrete: True
| state_dim: 4, action_dim: 2, action_max: 1
| max_step: 200 target_reward: 195.0


'TotalStep: 2e3, TargetReward: , UsedTime: 10s'

In [4]:
'''train and evaluate'''
# Launch train_and_evaluate with the args configured in the previous cells
# (AgentDoubleDQN on the wrapped CartPole-v0 env).
train_and_evaluate(args)

| GPU id: 0, cwd: ./CartPole-v0_0
| Remove history
ID      Step      MaxR |    avgR      stdR       objA      objC
0   0.00e+00    200.00 |
ID      Step   TargetR |    avgR      stdR   UsedTime  ########
0   1.02e+03    195.00 |  200.00      0.00         12  ########


## Demo 2: Continuous action space

In [5]:
'''DEMO 2.1: choose an off-policy DRL algorithm'''
# NOTE(review): this cell imports from elegantrl.agent, whereas the other cells
# import agents from elegantrl.tutorial.agent — confirm that mixing it with the
# tutorial's Arguments / train_and_evaluate is intended.
from elegantrl.agent import AgentSAC  # AgentTD3, AgentDDPG
# Fresh Arguments flagged as off-policy; this replaces any earlier `args`.
args = Arguments(if_on_policy=False)
args.agent = AgentSAC()

In [6]:
'''DEMO 2.2: choose an on-policy DRL algorithm'''
# This cell rebinds `args`, so running it after DEMO 2.1 replaces the SAC
# configuration with a PPO one; run only the cell for the algorithm you want.
from elegantrl.tutorial.agent import AgentPPO
args = Arguments(if_on_policy=True)  # hyper-parameters of on-policy is different from off-policy
args.agent = AgentPPO()

In [7]:
'''choose environment'''
# Active configuration: Pendulum-v0.
env = gym.make('Pendulum-v0')
env.target_reward = -200  # set target_reward manually for env 'Pendulum-v0'
args.env = PreprocessEnv(env=env)
args.reward_scale = 2 ** -3  # RewardRange: -1800 < -200 < -50 < 0
args.net_dim = 2 ** 7
args.batch_size = 2 ** 7
# Bare string: the last expression of the cell, so it is echoed as the cell output.
"TotalStep: 3e5, TargetReward: -200, UsedTime: 300s"
# Alternative configuration (commented out): LunarLanderContinuous-v2.
# args.env = PreprocessEnv(env=gym.make('LunarLanderContinuous-v2'))
# args.reward_scale = 2 ** 0  # RewardRange: -800 < -200 < 200 < 302
# "TotalStep: 9e4, TargetReward: 200, UsedTime: 2500s"
# Alternative configuration (commented out): BipedalWalker-v3, with an explicit
# break_step and if_allow_break turned off.
# args.env = PreprocessEnv(env=gym.make('BipedalWalker-v3'))
# args.reward_scale = 2 ** 0  # RewardRange: -200 < -150 < 300 < 334
# args.break_step = int(2e5)
# args.if_allow_break = False
# "TotalStep: 2e5, TargetReward: 300, UsedTime: 5000s"


| env_name: Pendulum-v0, action space if_discrete: False
| state_dim: 3, action_dim: 1, action_max: 2.0
| max_step: 200 target_reward: -200


'TotalStep: 3e5, TargetReward: -200, UsedTime: 300s'

In [8]:
'''train and evaluate'''
# Launch train_and_evaluate with the agent and Pendulum-v0 env configured above.
train_and_evaluate(args)
# The multiprocessing entry point below is not imported in this notebook; per
# the author's note it belongs to the formal (non-tutorial) version.
# train_and_evaluate__multiprocessing(args)  # try multiprocessing in formal version

| GPU id: 0, cwd: ./Pendulum-v0_0
| Remove history
ID      Step      MaxR |    avgR      stdR       objA      objC
0   0.00e+00  -1520.74 |
0   4.20e+03  -1432.98 |
0   8.40e+03  -1227.07 |
0   1.68e+04  -1185.30 |
0   2.10e+04  -1124.60 |
0   2.94e+04   -940.84 |
0   4.20e+04   -877.70 |
0   5.46e+04   -862.52 |
0   7.14e+04   -746.69 |
0   9.24e+04   -620.06 |
0   9.66e+04   -564.93 |
0   1.09e+05   -559.21 |
0   1.13e+05   -396.94 |
0   1.22e+05   -250.43 |
0   1.39e+05   -116.70 |
ID      Step   TargetR |    avgR      stdR   UsedTime  ########
0   1.43e+05   -200.00 | -116.70    116.00        225  ########


## Demo 3: Custom Env from AI4Finance

In [12]:
# Fresh on-policy Arguments for the finance demo (replaces any earlier `args`).
args = Arguments(if_on_policy=True)
'''choose an DRL algorithm'''
from elegantrl.tutorial.agent import AgentPPO
args.agent = AgentPPO()

from elegantrl.tutorial.env import FinanceMultiStockEnv  # a standard env for ElegantRL, not need PreprocessEnv()
# Two separate env instances, selected by if_train: one for training and one
# assigned to env_eval.
args.env = FinanceMultiStockEnv(if_train=True)
args.env_eval = FinanceMultiStockEnv(if_train=False)  # eva_len = 1699 - train_len
args.reward_scale = 2 ** 0  # RewardRange: 0 < 1.0 < 1.25 <
args.break_step = int(5e6)
# Copy max_step from the env, then set max_memo to 8 * (max_step - 1).
args.max_step = args.env.max_step
args.max_memo = (args.max_step - 1) * 8
args.batch_size = 2 ** 11
# Bare string: the last expression of the cell, so it is echoed as the cell output.
"TotalStep:  2e5, TargetReward: 1.25, UsedTime:  200s"

'TotalStep: 10e5, TargetReward: 1.62, UsedTime: 1000s'

In [13]:
'''train and evaluate'''
# Launch train_and_evaluate with the PPO agent and finance envs configured above.
train_and_evaluate(args)
# Commented-out multiprocessing variant; per the author's note it belongs to
# the formal (non-tutorial) version and is not imported in this notebook.
# args.rollout_num = 8
# train_and_evaluate__multiprocessing(args)  # try multiprocessing in formal version

| GPU id: 0, cwd: ./FinanceStock-v2_0
| Remove history
ID      Step      MaxR |    avgR      stdR       objA      objC
0   0.00e+00      1.06 |
0   5.12e+03      1.10 |
0   1.02e+04      1.22 |
0   2.56e+04      1.29 |
ID      Step   TargetR |    avgR      stdR   UsedTime  ########
0   3.07e+04      1.25 |    1.29      0.03         29  ########


## Demo 4: train in PyBullet (MuJoCo) (to be added)

In [None]:
import gym  # don't worry about 'WARN: Box bound precision lowered by casting to float32'
import pybullet_envs  # PyBullet is free, but MuJoCo is paid
from AgentEnv import decorate_env
from AgentRun import Arguments, train_and_evaluate
from AgentZoo import AgentTD3, AgentSAC, AgentPPO

In [None]:
# Pick one PyBullet environment id; the assert below restricts the choice to
# the ids listed in the set.
env_name = 'AntBulletEnv-v0'
assert env_name in {
    "AntBulletEnv-v0", 
    "Walker2DBulletEnv-v0", 
    "HalfCheetahBulletEnv-v0",
    "HumanoidBulletEnv-v0", 
    "HumanoidFlagrunBulletEnv-v0", 
    "HumanoidFlagrunHarderBulletEnv-v0",
}
# Create the gym env, then pass it through decorate_env (imported from AgentEnv
# in the previous cell) with if_print=True.
env = gym.make(env_name)
env = decorate_env(env, if_print=True)

In [None]:
# Arguments here comes from AgentRun (imported earlier in this demo), not from
# elegantrl.tutorial.run.
args = Arguments()
# NOTE(review): unlike Demos 1-3 (args.agent = Agent()), this assigns the class
# itself to `agent_rl` — presumably the API of the AgentRun module; confirm.
args.agent_rl = AgentSAC  # AgentSAC can't reach target_reward=2500, try AgentModSAC
args.env = env
args.reward_scale = 2 ** -3
args.break_step = int(1e6 * 8)
args.eval_times = 2

## Demo 5: Atari game (to be added)

In [None]:
# Gym environment ids are case-sensitive: the Atari Breakout id is
# 'Breakout-v0' (CamelCase, like the 'SpaceInvaders-v0' alternative).
env_name = 'Breakout-v0'  # 'SpaceInvaders-v0'