# Combining Tianshou's PPO with our CorePolicy

This notebook is mostly meant as a test and as a showcase of how a CorePolicy can be combined with the policies provided by Tianshou.

In [1]:
import tianshou as ts 
from tianshou.utils import TensorboardLogger

import torch
from torch.utils.tensorboard import SummaryWriter

import os
from datetime import datetime

from environments import Resetting
from networks import NetHackObsNet, GoalNetHackActor, GoalNetHackCritic
from policies import PPOBasedPolicy
from models import SelfModel, EnvModel
from intrinsic import ICM
from core import GoalCollector, GoalVectorReplayBuffer

from nle.env.tasks import NetHackGold

## Setup

In [2]:
# Reference environment: NetHackGold wrapped in the project's Resetting wrapper.
# Later cells read its observation_space / action_space to build the networks and the policy.
env = Resetting(NetHackGold())

In [3]:
num_train_envs = 5
num_test_envs = 5

# Every factory must build its OWN environment. Returning one shared instance
# from each lambda would make all vector slots (and the train and test
# collectors) step and reset the same underlying game, corrupting every
# trajectory.
train_envs = ts.env.DummyVectorEnv(
    [lambda: Resetting(NetHackGold()) for _ in range(num_train_envs)]
)
test_envs = ts.env.DummyVectorEnv(
    [lambda: Resetting(NetHackGold()) for _ in range(num_test_envs)]
)

In [4]:
# Observation encoder built from the environment's observation space.
obs_net = NetHackObsNet(env.observation_space)

# The same obs_net instance is handed to both the actor and the critic
# (presumably so they share the encoder's weights -- confirm in `networks`).
actor_net = GoalNetHackActor(obs_net, env.action_space)
critic_net = GoalNetHackCritic(obs_net)

In [5]:
# One sub-buffer per environment: the second argument must match the number of
# environments of the collector that the buffer is paired with.
train_buf = GoalVectorReplayBuffer(2000, num_train_envs)
# The test buffer is used together with `test_envs`, so it is sized with
# num_test_envs (not num_train_envs).
test_buf = GoalVectorReplayBuffer(1, num_test_envs)

env_model = EnvModel() # this does nothing, for the moment
# The self model receives the observation encoder, the action space, the ICM
# class and the training buffer.
self_model = SelfModel(obs_net, env.action_space, ICM, train_buf)

In [6]:
# Using a single optimizer for actor and critic simplifies the training loop and is more computationally efficient,
# BUT gradient updates in one network will influence the gradient updates in the other, and this might create unexpected problems...
#
# Parameters reachable from both networks must be handed to the optimizer only
# once. De-duplicate with dict.fromkeys (order-preserving) rather than set():
# torch.optim requires a collection whose ordering is deterministic across
# runs, otherwise the optimizer's state_dict layout is not reproducible.
combined_params = list(
    dict.fromkeys([*actor_net.parameters(), *critic_net.parameters()])
)
optimizer = torch.optim.Adam(combined_params, lr=3e-4)

## Custom PPO-based policy

In [7]:
# Assemble the PPO-based core policy from the pieces built in the setup cells.
policy = PPOBasedPolicy(
    # spaces taken from the reference environment
    observation_space=env.observation_space,
    action_space=env.action_space,
    # actor/critic networks and their joint optimizer
    act_net=actor_net,
    critic_net=critic_net,
    optim=optimizer,
    # CorePolicy-specific models
    self_model=self_model,
    env_model=env_model,
)

In [8]:
# Build one collector per (vector env, buffer) pair; both drive the same policy.
train_collector, test_collector = (
    GoalCollector(policy, vector_env, replay_buf)
    for vector_env, replay_buf in (
        (train_envs, train_buf),
        (test_envs, test_buf),
    )
)

In [9]:
# Training schedule.
num_epochs, num_steps_per_epoch = 5, 100

# Collection / evaluation / update sizes.
step_per_collect, episode_per_test, batch_size = 10, 6, 10

# TensorBoard logging: each run writes to its own timestamped directory.
timestamp = datetime.now().strftime("%d%m%Y-%H%M%S")
log_path = os.path.join("../logs", "ppo_based", timestamp)
writer = SummaryWriter(log_path)
logger = TensorboardLogger(writer)

In [10]:
# On-policy trainer wiring the policy, both collectors and the logger together.
trainer = ts.trainer.OnpolicyTrainer(
    policy=policy,
    # data collection
    train_collector=train_collector,
    test_collector=test_collector,
    # schedule
    max_epoch=num_epochs,
    step_per_epoch=num_steps_per_epoch,
    step_per_collect=step_per_collect,
    episode_per_test=episode_per_test,
    # optimisation
    repeat_per_collect=1,
    batch_size=batch_size,
    # reporting
    logger=logger,
)

In [11]:
# Iterating over the trainer runs training; each iteration yields the stats of one epoch.
# TODO: test that the gradient flows as expected through the various networks!
for epoch_stats in trainer:
    # TODO: replace with a more informative print, plots, logging, etc.
    print(epoch_stats)

Epoch #1: 101it [00:01, 63.77it/s, env_step=100, gradient_step=10, len=0, n/ep=0, n/st=10, rew=0.00]                         


Epoch #1: test_reward: -3.873333 ± 2.661645, best_reward: -3.873333 ± 2.661645 in #1
EpochStats(epoch=1, train_collect_stat=CollectStats(n_collected_episodes=0, n_collected_steps=10, collect_time=0.03685903549194336, collect_speed=271.30389785120116, returns=array([], dtype=float64), returns_stat=None, lens=array([], dtype=int64), lens_stat=None), test_collect_stat=CollectStats(n_collected_episodes=6, n_collected_steps=4130, collect_time=23.11383295059204, collect_speed=178.68087949014156, returns=array([-1.25, -1.89, -1.  , -5.05, -5.98, -8.07]), returns_stat=SequenceSummaryStats(mean=-3.873333333333281, std=2.6616452722996784, max=-1.0000000000000007, min=-8.069999999999828), lens=array([ 163,  247,  157,  730,  848, 1985]), lens_stat=SequenceSummaryStats(mean=688.3333333333334, std=640.5257389225059, max=1985.0, min=157.0)), training_stat=PPOTrainingStats(train_time=0.10531091690063477, smoothed_loss={'loss': 0.6283079218119383, 'clip_loss': 6.765127271712857e-08, 'vf_loss': 1.31908

Epoch #2: 101it [00:01, 69.43it/s, env_step=200, gradient_step=20, len=0, n/ep=0, n/st=10, rew=0.00]                         


KeyboardInterrupt: 

Note how this notebook is almost exactly the same as `ppo_baseline.ipynb`!