In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to local source files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import random
import pdb
import torch
from torch.optim import Adam
import gym
import time
import wandb
from spinup import models
core = models

In [None]:
from spinup.utils.mpi_pytorch import setup_pytorch_for_mpi, sync_params, mpi_avg_grads
from spinup.utils.mpi_tools import mpi_fork, mpi_avg, proc_id, mpi_statistics_scalar, num_procs
from spinup.algos.ppo.ppo import ppo
from spinup.algos.sac.sac import sac

# Pick an Environment
CartPole, the MNIST of RL, discrete action, parameterized state

Breakout, discrete action, 0 fire, 1 stay, 2 right, 3 left
    Notice that breakout-ram is harder than visual input because the state is 128 bytes from the ram
    and the bytes do not have an intuitive meaning

MountainCar, discrete or continuous action, parameterized state
Gets reward for climbing up a hill that costs energy,
painful exploration is essential

Walker, penalty -100 for falling. 
The initial greedy strategy may make the agent stand still to avoid falling.
As a result, initial test reward = 0 while initial train reward = 100

For environments with a large penalty, we should use a large batch when updating Q, in order to compensate for the variance

In [None]:
# Environment id read by the PPO, Test, Visualization and Human Control cells.
env_name="Breakout-ram-v0" # discrete action (Breakout has 4 discrete actions)
#env_name="Breakout-ram-v0" # discrete action
#env_name="CartPole-v1" #discrete action

# Run
    DQN, tested on CartPole
    PPO, tested on Breakout
    SAC-continuous, run on Walker, alpha = 0.2 is too large, 0.05 still large
    SAC-discrete, tested on CartPole, run on Breakout
        1. must use eps to prevent nan because the probability of some action becomes 0; this happens when Q is large (e.g. a few hundred)
        2. If n_update or lr is much too high, entropy may collapse for a reasonable alpha, with a stable high policy regret
        3. Sometimes Q stays much lower than ground truth, and learns very slowly. I believe there is a trick I need to incorporate
        4. when alpha is too large, maximum entropy, test is significantly better than train. when too small, zero entropy. 
        A heuristic: alpha leads to an additional reward about alpha*entropy, which should be smaller than the reward per step.
        5. on Breakout, 3 happens for Q, pi learns never firing the ball!? randomly picking an action from the other three...

In general, when an algo does not work, try a large batch and a low lr with few updates

## DQN

In [None]:
# Train the DQN variant (model_args.dqn=True) on Breakout-ram and log the run
# to Weights & Biases.
# NOTE(review): `Config` and `Logger` are not imported anywhere in this
# notebook; they must already exist in the kernel for this cell to run.

# Run-level settings; the whole object is handed to wandb.init as the config.
args = Config()
args.env="Breakout-ram-v0" #discrete action
args.algorithm="dqn"
args.name=f"{args.env}_{args.algorithm}"
args.gpu=1
args.seed=0

# Keyword arguments forwarded unchanged to sac(...).
algo_args = Config()
algo_args.n_step=4096
algo_args.max_ep_len=2000
algo_args.n_update=100
algo_args.batch_size=512
algo_args.epochs=9999
algo_args.start_steps=5000
algo_args.update_after=5000

# NOTE(review): `model_args` is rebound to a new Config() two lines after
# hidden_sizes/activation are set, so the object carrying those two values is
# dropped before anything reads it. Left as-is because the constructor
# signature of MLPDQActorCritic is not visible here; either delete the second
# `Config()` (if the model accepts these kwargs) or delete the two dead lines.
model_args=Config()
model_args.hidden_sizes=[256]*4
model_args.activation=torch.nn.ReLU
model_args = Config()
model_args.gamma=0.99
model_args.polyak=0.995
model_args.pi_lr=1e-5
model_args.q_lr=1e-4
model_args.alpha=0
model_args.eps=1e-2
model_args.dqn=True

args.algo_args = algo_args.toDict()
args.model_args = model_args.toDict()

# Apply args.seed to every RNG in scope before the model is built; previously
# the seed was recorded in the run config but never used in this cell.
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)

run=wandb.init(
    project="RL",
    config=args,
    name=args.name,
    group=args.env,
)
try:
    logger = Logger(run)
    # This env instance is only used to read the observation/action spaces;
    # sac(...) builds its own environments from the lambda.
    env = gym.make(args.env)
    model = core.MLPDQActorCritic(env.observation_space, env.action_space, logger=logger, **(model_args.toDict()))
    device=args.gpu
    model.to(device)
    result = sac(lambda : gym.make(args.env), model=model, logger=logger, device=device, **(algo_args.toDict()))
finally:
    # Close the wandb run even when training raises or is interrupted.
    run.finish()

## SAC

In [None]:
# Train discrete SAC (model_args.dqn=False, alpha=0.1) on Breakout-ram and log
# the run to Weights & Biases.
# NOTE(review): `Config` and `Logger` are not imported anywhere in this
# notebook; they must already exist in the kernel for this cell to run.

# Run-level settings; the whole object is handed to wandb.init as the config.
args = Config()
args.env="Breakout-ram-v0" #discrete action
args.algorithm="sac"
args.name=f"{args.env}_{args.algorithm}"
args.gpu=1
args.seed=0

# Keyword arguments forwarded unchanged to sac(...).
algo_args = Config()
algo_args.n_step=4096
algo_args.max_ep_len=500
algo_args.n_update=30
algo_args.batch_size=512
algo_args.epochs=9999
algo_args.start_steps=20000
algo_args.update_after=20000

# NOTE(review): `model_args` is rebound to a new Config() two lines after
# hidden_sizes/activation are set, so the object carrying those two values is
# dropped before anything reads it. Left as-is because the constructor
# signature of MLPDQActorCritic is not visible here; either delete the second
# `Config()` (if the model accepts these kwargs) or delete the two dead lines.
model_args=Config()
model_args.hidden_sizes=[256]*4
model_args.activation=torch.nn.ReLU
model_args = Config()
model_args.gamma=0.99
model_args.polyak=0.995
model_args.pi_lr=1e-5
model_args.q_lr=1e-4
model_args.alpha=0.1
model_args.eps=1e-5
model_args.dqn=False

args.algo_args = algo_args.toDict()
args.model_args = model_args.toDict()

# Apply args.seed to every RNG in scope before the model is built; previously
# the seed was recorded in the run config but never used in this cell.
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)

run=wandb.init(
    project="RL",
    config=args,
    name=args.name,
    group=args.env,
)
try:
    logger = Logger(run)
    # This env instance is only used to read the observation/action spaces;
    # sac(...) builds its own environments from the lambda.
    env = gym.make(args.env)
    model = core.MLPDQActorCritic(env.observation_space, env.action_space, logger=logger, **(model_args.toDict()))
    device=args.gpu
    model.to(device)
    result = sac(lambda : gym.make(args.env), model=model, logger=logger, device=device, **(algo_args.toDict()))
finally:
    # Close the wandb run even when training raises or is interrupted.
    run.finish()

## PPO

In [None]:
# Train PPO on `env_name` and log the run to Weights & Biases.
# NOTE(review): `Config` and `Logger` are not imported anywhere in this
# notebook; they must already exist in the kernel for this cell to run.

# Run-level settings; the whole object is handed to wandb.init as the config.
args = Config()
args.env=env_name #discrete action
args.algorithm="ppo"
args.name=f"{args.env}_{args.algorithm}"
args.gpu=0
args.seed=0
args.cpu=4
args.steps_per_epoch=5000
args.epochs=500
# The ppo(...) call below reads these three fields, which were never set on
# `args` before. The values mirror the [256]*4 layout and gamma=0.99 written
# for model_args in this cell.
args.hid=256
args.l=4
args.gamma=0.99

# model_args is only stored on `args` for logging; it is not passed to ppo(...).
# NOTE(review): the second `Config()` rebinding drops the object that carried
# hidden_sizes/activation, so those two values are not part of the logged dict.
model_args=Config()
model_args.hidden_sizes=[256]*4
model_args.activation=torch.nn.ReLU
model_args = Config()
model_args.gamma=0.99
model_args.polyak=0.995
model_args.lr=3e-5
model_args.alpha=0
model_args.eps=0.01
model_args.dqn=True
args.model_args = model_args.toDict()


#mpi_fork(args.cpu)  # run parallel code with mpi
run=wandb.init(
    project="RL",
    config=args,
    name=args.name,
    group=args.env,
)
try:
    logger = Logger(run)
    # The original call was missing a closing parenthesis and did not parse.
    # The dict is closed right after hidden_sizes, so logger/gamma/seed/
    # steps_per_epoch/epochs are keywords of ppo itself (logger is passed the
    # same way to sac(...) in the other training cells).
    # NOTE(review): confirm against ppo's signature whether `logger` belongs
    # here or inside ac_kwargs.
    result = ppo(lambda : gym.make(args.env), actor_critic=core.MLPVActorCritic,
        ac_kwargs=dict(hidden_sizes=(args.hid,)*args.l), logger=logger, gamma=args.gamma,
        seed=args.seed, steps_per_epoch=args.steps_per_epoch, epochs=args.epochs)
finally:
    # Close the wandb run even when training raises or is interrupted.
    run.finish()

# Test

In [None]:
# Roll out `model` for one episode (capped at 2000 steps) without rendering
# and print the accumulated reward. Relies on `model` and `device` left in the
# kernel by one of the training cells.
import gym

env = gym.make(env_name)
try:
    state = env.reset()

    total = 0
    for step in range(2000):
        # The training cells move the model to `device`, so the observation has
        # to be moved there as well (the Visualization cell already does this).
        obs = torch.as_tensor(state, dtype=torch.float).to(device)
        action = model.act(obs)
       # action = env.action_space.sample()
        state, reward, done, info  = env.step(action)
        total += reward
        if done:
            print(f"episode len {step}, reward {total}")
            break
    else:
        # The loop ran out of steps before the environment reported `done`.
        print(f"stopped after 2000 steps without done, reward {total}")
finally:
    # Release the emulator even if the rollout raises.
    env.close()

# Visualization

In [None]:
import gym
from IPython import display
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

env = gym.make(env_name)
state = env.reset()
img = plt.imshow(env.render(mode='rgb_array')) # only call this once
state, reward, done, info  = env.step(1)
total = 0
for _ in range(2000):
    img.set_data(env.render(mode='rgb_array')) # just update the data
    display.display(plt.gcf())
    display.clear_output(wait=True)
    tmp = torch.as_tensor(state,  dtype=torch.float).to(device)
    action = model.act(tmp, deterministic=False)
   # action = env.action_space.sample()
    state, reward, done, info  = env.step(action)
    total += reward
    if done:
        print(f"episode len {_}, reward {total}")
        break

## Human Control

In [None]:
import gym
from IPython import display
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

env = gym.make(env_name)
state = env.reset()
img = plt.imshow(env.render(mode='rgb_array')) # only call this once
state, reward, done, info  = env.step(1)
total = 0
for _ in range(2000):
    img.set_data(env.render(mode='rgb_array')) # just update the data
    display.display(plt.gcf())
    display.clear_output(wait=True)
    tmp = input()
    if len(tmp) == 0:
        tmp = "0"
    action = int(tmp)
    state, reward, done, info  = env.step(action)
    total += reward
    print(f"this: {reward}, total: {total}")
    if done:
        print(f"episode len {_}, reward {total}")
        break

In [31]:
# Scratch: keep a handle on the observation space of the current `env` for inspection.
x = env.observation_space

In [35]:
# Scratch: show the attributes stored on the action space object (output below).
vars(env.action_space)

{'n': 4,
 'shape': (),
 'dtype': dtype('int64'),
 'np_random': RandomState(MT19937) at 0x7FCA30899990}

In [37]:
# NOTE(review): leftover scratch cell. Conv2d is called with no arguments here,
# and the saved AttributeError below mentions 'Convolution', which this line
# does not reference, so the output is stale. Supply the constructor arguments
# or delete the cell.
torch.nn.Conv2d()

AttributeError: module 'torch.nn' has no attribute 'Convolution'