In [None]:
# Enable the autoreload extension; mode 2 reloads all imported modules before
# each cell executes, so edits to the project packages are picked up live.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [None]:
from complex_envs.reacher import FixedReacherEnv

import matplotlib.pyplot as plt
import torch.nn as nn
import numpy as np
import pickle
import os

from lyapunov_reachability.speculation_ddpg import DefaultDDPG, ExplorerDDPG, DefaultBCQ, ExplorerBCQ, LyapunovDDPG, LyapExpDDPG
from lyapunov_reachability.common.networks import Mlp, Cnn

# Reacher

In [None]:
# --- Shared experiment settings, read by the training cells below ---

# Passed to every `run(...)` call and embedded in the output directory name.
episode_length = 300
# Second positional argument of every agent constructor; also used to size the
# virtual replay buffer in the auxiliary cells.
confidence = 0.8
gamma = 1.0 - 1e-3
strict_done = True
grid_points = 21
n = 11  # not referenced anywhere else in the visible notebook

# Batch and replay-buffer sizes handed to the agent constructors.
batch_size = 256
replay_size = 4 * 10 ** 5

# Root directory name under which every run stores its files ('300-reacher').
name = '{}-reacher'.format(int(episode_length))
# A single environment instance is shared (and re-seeded) by all runs.
env = FixedReacherEnv()

In [None]:
# Values handed to every `run(...)` call: the step budget plus the logging
# and saving intervals. Kept as exact integers.
steps = 2 * 10 ** 6
log_interval = 10 ** 4
save_interval = 10 ** 5

In [None]:
# Baseline location handed to every agent constructor below as
# `baseline_dir` / `baseline_step`.
# To run without a baseline, use these two lines instead:
# baseline_dir = None
# baseline_step = None

# ----
baseline_dir = os.path.join(name, 'ddpg-initial')
baseline_step = int(0)

In [None]:
# Ten consecutive seeds (1001..1010); each training cell runs once per seed.
seeds = [1001 + offset for offset in range(10)]

### Default DDPG

In [None]:
# Train one DefaultDDPG agent per seed. Each agent gets its own `save_dir`
# ('spec-def-ddpg-<seed>' under `name`) and starts from the shared baseline
# settings.
for seed in seeds:
    env.seed(seed)
    agent = DefaultDDPG(
        env,
        confidence,
        extractor=Mlp,
        extractor_params={'channels_': [400, 300], 'activ': 'relu'},
        seed=seed,
        lr=1e-4,
        batch_size=batch_size,
        gamma=gamma,
        grid_points=grid_points,
        strict_done=strict_done,
        gradient_clip=(5e3, 1e4),
        lr_ratio=.1,
        noise_theta=0.1,
        noise_sigma=0.05,
        replay_size=replay_size,
        replay_prioritized=False,
        replay_double=True,
        double=True,
        baseline_dir=baseline_dir,
        baseline_step=baseline_step,
        save_dir=os.path.join(name, 'spec-def-ddpg-{}'.format(seed)),
    )
    # Alternative (disabled): use the BCQ variant instead, built fresh ...
    #     agent = DefaultBCQ(
    #         env, confidence, extractor=Mlp, extractor_params={'channels_': [256, 256], 'activ': 'relu'}, seed=seed, lr=1e-3, gamma=gamma,
    #         grid_points=grid_points, strict_done=strict_done, act_samples=10, baseline_dir=baseline_dir, baseline_step=baseline_step,
    #         save_dir=os.path.join(name, 'spec-bl-bcq-{}'.format(seed)))
    # ... or restored from its saved state at step 0:
    #     agent = DefaultBCQ.load(os.path.join(name, 'spec-bl-bcq-{}'.format(seed)), 0, env=env)
    agent.run(
        steps,
        episode_length,
        log_interval=log_interval,
        save_interval=save_interval,
    )
    # Drop the reference before the next seed builds a new agent.
    del agent

### Lyapunov DDPG

In [None]:
# Train one LyapunovDDPG agent per seed, each with its own `save_dir`
# ('spec-lyap-ddpg-<seed>' under `name`).
for seed in seeds:
    env.seed(seed)
    agent = LyapunovDDPG(
        env,
        confidence,
        extractor=Mlp,
        extractor_params={'channels_': [400, 300], 'activ': 'relu'},
        seed=seed,
        lr=1e-4,
        batch_size=batch_size,
        gamma=gamma,
        grid_points=grid_points,
        strict_done=strict_done,
        gradient_clip=(5e3, 1e4),
        lr_ratio={'actor': 0.1, 'log_lambda': 0.01},
        target_ratio=0.,
        noise_theta=0.1,
        noise_sigma=0.05,
        replay_size=replay_size,
        replay_prioritized=False,
        replay_double=True,
        baseline_dir=baseline_dir,
        baseline_step=baseline_step,
        save_dir=os.path.join(name, 'spec-lyap-ddpg-{}'.format(seed)),
    )
    # Alternative (disabled): restore a previously saved agent instead.
    #     agent = LyapunovDDPG.load(os.path.join(name, 'spec-lyap-ddpg-{}'.format(seed)), 0, env=env)

    # Extra keyword arguments forwarded to `run`; `eval_interval` is None here.
    extra_args = dict(eval_interval=None, eval_trials=100)
    agent.run(
        steps,
        episode_length,
        log_interval=log_interval,
        save_interval=save_interval,
        **extra_args
    )
    # Drop the reference before the next seed builds a new agent.
    del agent

### Exploratory, or Lyapunov+Exploratory DDPG

In [None]:
# for seed in seeds:
#     env.seed(seed)
#     bl = LyapExpDDPG(
#         env, confidence, extractor=Mlp, extractor_params={'channels_': [400, 300], 'activ': 'relu'}, seed=seed, lr=1e-4,
#         batch_size=batch_size, gamma=gamma, grid_points=grid_points, strict_done=strict_done, replay_size=replay_size, replay_prioritized=False,
#         target_ratio=0., noise_theta=0., noise_sigma=0., gradient_clip=(5e3, 1e4), lr_ratio=.5, safe_decay=1e-6,
#         baseline_dir=baseline_dir, baseline_step=baseline_step, save_dir=os.path.join(name, 'spec-lyapexp-ddpg-{}'.format(seed)))
#     extra_args = {'eval_interval': 10000, 'eval_trials': 100}
#     bl.run(steps, episode_length, log_interval=log_interval, save_interval=save_interval, **extra_args)
#     del bl

In [None]:
# Train one ExplorerDDPG agent per seed, each with its own `save_dir`
# ('spec-exp-ddpg-<seed>' under `name`).
for seed in seeds:
    env.seed(seed)
    agent = ExplorerDDPG(
        env,
        confidence,
        extractor=Mlp,
        extractor_params={'channels_': [400, 300], 'activ': 'relu'},
        seed=seed,
        lr=1e-4,
        batch_size=batch_size,
        gamma=gamma,
        grid_points=grid_points,
        strict_done=strict_done,
        gradient_clip=(5e3, 1e4),
        lr_ratio={'actor': 0.1, 'expl_actor': 0.1, 'expl_log_lambda': 0.01},
        noise_theta=0.1,
        noise_sigma=0.05,
        replay_size=replay_size,
        replay_prioritized=False,
        replay_double=True,
        target_ratio=0.,
        safe_decay=1e-5,
        baseline_dir=baseline_dir,
        baseline_step=baseline_step,
        save_dir=os.path.join(name, 'spec-exp-ddpg-{}'.format(seed)),
    )
    # Alternative (disabled): use the BCQ variant of the explorer instead.
    #     agent = ExplorerBCQ(
    #         env, confidence, extractor=Mlp, extractor_params={'channels_': [400, 300], 'activ': 'relu'}, seed=seed, lr=3e-4, gamma=gamma,
    #         grid_points=grid_points, strict_done=strict_done, act_samples=10, baseline_dir=baseline_dir, baseline_step=baseline_step,
    #         save_dir=os.path.join(name, 'safespec-exp-bcq-{}'.format(seed)))

    # Extra keyword arguments forwarded to `run`; `eval_interval` is None here.
    extra_args = dict(eval_interval=None, eval_trials=100)
    agent.run(
        steps,
        episode_length,
        log_interval=log_interval,
        save_interval=save_interval,
        **extra_args
    )
    # Drop the reference before the next seed builds a new agent.
    del agent

### Auxiliary code blocks for use.

* To create the virtual "unsafe" samples,

In [None]:
from lyapunov_reachability.common.replay import Replay

# Build a replay buffer of randomly sampled transitions and pickle it into the
# baseline directory as '<baseline_step>-replay_reached'.

# Buffer capacity: replay_size * (1 / confidence - 1), truncated to an int.
virtual_memory_size = int(replay_size * (1. / confidence - 1.))
virtual_memory = Replay(virtual_memory_size)

# Fill the buffer to capacity. Each stored tuple is two independent observation
# samples around one action sample, followed by the constants 1. and 0.
# NOTE(review): presumably (obs, action, next_obs, reached-flag, done-flag) --
# confirm the field order against Replay.store.
for _sample_idx in range(virtual_memory_size):
    virtual_memory.store((
        env.observation_space.sample(),
        env.action_space.sample(),
        env.observation_space.sample(),
        1.,
        0.,))

# The baseline training cell appears later in the notebook, so the directory
# may not exist yet; create it rather than failing inside open().
os.makedirs(baseline_dir, exist_ok=True)
with open(os.path.join(baseline_dir, "{}-replay_reached".format(baseline_step)), 'wb') as f:
    pickle.dump(virtual_memory, f, pickle.HIGHEST_PROTOCOL)

* To train the baseline,

In [None]:
# Build the baseline DefaultDDPG agent (fixed seed 1234, both noise parameters
# set to zero) with its `save_dir` at '<name>/ddpg-initial', the same path
# that `baseline_dir` points to. `run` receives `baseline_step` as its step
# count.
baseline_agent = DefaultDDPG(
    env,
    confidence,
    extractor=Mlp,
    extractor_params={'channels_': [400, 300], 'activ': 'relu'},
    seed=1234,
    lr=5e-4,
    gamma=gamma,
    grid_points=grid_points,
    strict_done=strict_done,
    replay_size=replay_size,
    noise_theta=0.,
    noise_sigma=0.,
    save_dir=os.path.join(name, 'ddpg-initial'),
)
baseline_agent.run(
    baseline_step,
    episode_length,
    log_interval=log_interval,
    save_interval=save_interval,
)
# The agent object is not needed once the run has finished.
del baseline_agent

* To test the trained model,

In [None]:
# Choose which saved run to inspect and load it at step `steps`.
# NOTE(review): 9999 is not one of `seeds` (1001..1010) and the LyapExpDDPG
# training cell above is commented out -- confirm this checkpoint exists, or
# switch to one of the alternatives below with a trained seed.
seed = 9999
model = LyapExpDDPG.load(
    os.path.join(name, 'spec-lyapexp-ddpg-{}'.format(seed)),
    steps,
    env=env,
)
# Alternatives (disabled) for the other agent types:
# model = DefaultDDPG.load(os.path.join(name, 'spec-def-ddpg-{}'.format(seed)), steps, env=env)
# model = LyapunovDDPG.load(os.path.join(name, 'spec-lyap-ddpg-{}'.format(seed)), steps, env=env)

In [None]:
from cv2 import resize
from JSAnimation.IPython_display import display_animation
from matplotlib import animation
from IPython.display import display
from gridworld.utils import test, play, visualize
# Passed to cv2.resize as `dsize` for every rendered frame in the rollout cell.
frame_size = (150, 100)
# Path the rollout animation is saved to by `anim.save(...)`.
anim_name = os.path.join(name, 'trial.gif')

In [None]:
# Roll out one episode with the loaded `model`, capturing a resized RGB frame
# before each step, then save the frames as a GIF and display them inline.
obs, done = env.reset(), False

episode_rew = 0
episode_safety = 1.
discount = 1.  # equals gamma ** t: the weight of the reward received at step t
frames = []
t = 0

# Stop when the environment reports done, after exactly `episode_length`
# steps (the previous `t > episode_length` check allowed one extra step), or
# once the running safety product has reached zero.
while not done and t < episode_length and episode_safety != 0.:
    # Create image
    frame = env.render(mode='rgb_array')
    frames.append(resize(frame, dsize=frame_size,))

    # Do step
    obs, rew, done, info = env.step(model.step(obs))
    episode_safety = episode_safety * info['safety']
    # Discounted return sum_t gamma**t * r_t, accumulated forward in time.
    # The recursion `gamma * episode_rew + rew` is only correct when iterating
    # backwards; applied forwards it discounts the earliest rewards the most.
    episode_rew += discount * rew
    discount *= gamma
    t += 1

print("Total runtime: %.4f" % t)
print("Total reward: %.4f" % episode_rew)
print("Total safety: %.4f" % episode_safety)
env.close()

# Size the figure so that one image pixel maps to one point at 72 dpi.
fig = plt.figure(figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0), dpi=72)
patch = plt.imshow(frames[0])
plt.axis('off')

def animate(i):
    """Show the i-th recorded frame in the image artist."""
    patch.set_data(frames[i])

anim = animation.FuncAnimation(fig, animate, frames=len(frames), interval=50)
anim.save(anim_name, dpi=80, writer='imagemagick')
display(display_animation(anim, default_mode='loop'))

* To correct params.pkl,

In [None]:
import glob

# Rewrite the 'extractor' entry in params.pkl for every run directory that
# matches 'spec-*-ddpg-<seed>' under `name`.
# open() does not expand wildcards, so the pattern is resolved with glob first.
# NOTE: pickle.load can execute arbitrary code -- only run this on files that
# were produced by this project.
params_paths = sorted(glob.glob(
    os.path.join(glob.escape(name), 'spec-*-ddpg-{}'.format(seed), 'params.pkl')))
if not params_paths:
    # Fail loudly instead of silently patching nothing.
    raise FileNotFoundError(
        "no params.pkl found under {!r} for seed {}".format(name, seed))

for params_path in params_paths:
    with open(params_path, 'rb') as f:
        data = pickle.load(f)
    # data['ob_side'] = data.pop('ob_resize')
    data['extractor'] = 'Mlp'
    with open(params_path, 'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)