In [6]:
import gym
import numpy as np
import torch
import torch.nn
import torch.nn.functional as F
from torch.optim import Adam

from trpo import trpo

In [7]:
import random
from collections import namedtuple

# Taken from
# https://github.com/pytorch/tutorials/blob/master/Reinforcement%20(Q-)Learning%20with%20PyTorch.ipynb

# One environment step: the observation before acting, the action taken,
# the reward received, the observation afterwards, and a mask that the
# rollout loop sets to 0.0 on the terminal step and 1.0 otherwise.
Transition = namedtuple('Transition',
            ('state', 'action', 'reward', 'next_state', 'mask'))


class Memory(object):
    """Append-only buffer of ``Transition`` records.

    Transitions are kept in insertion order and are never evicted or
    shuffled; ``sample`` hands back everything collected so far.
    """

    def __init__(self):
        # Plain list of Transition tuples, oldest first.
        self.memory = []

    def push(self, *args):
        """Saves a transition.

        ``args`` must be the five ``Transition`` fields, in order:
        state, action, reward, next_state, mask.
        """
        self.memory.append(Transition(*args))

    def sample(self):
        """Return the whole buffer as a single field-wise ``Transition``.

        The list of per-step records is transposed, so each field of the
        result is a tuple holding that field for every stored step (for
        example ``batch.reward`` is the tuple of all rewards). Nothing is
        subsampled: the complete batch is returned, which is what an
        on-policy method that learns from a full collected batch needs.

        An empty buffer yields a ``Transition`` whose fields are all empty
        tuples, instead of failing with a confusing ``TypeError`` from the
        namedtuple constructor.
        """
        if not self.memory:
            return Transition(*[() for _ in Transition._fields])
        return Transition(*zip(*self.memory))

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

In [8]:
# Hyperparameter bundle handed to the `trpo` agent constructor.
# NOTE(review): the meaning of each field is defined by the `trpo` module,
# which is not visible here; presumably gamma is the discount factor, tau the
# GAE coefficient, damping the conjugate-gradient damping term, and delta the
# KL trust-region size. Confirm against trpo.py.
Args = namedtuple('Args',
        ('gamma', 'tau', 'damping', 'delta', 'cuda', 'hidden_size', 'lr'))

# Keyword arguments keep each value next to its name, so a field cannot be
# silently swapped with a neighbour.
trpo_args = Args(
    gamma=0.995,
    tau=1.0,
    damping=0.1,
    delta=0.02,
    # Only request the GPU when one is actually present, so the notebook
    # also runs on CPU-only machines (assumes `trpo` uses this flag to
    # decide tensor/device placement -- TODO confirm).
    cuda=torch.cuda.is_available(),
    hidden_size=(64,),
    lr=0.1,
)

In [9]:
# Environment and agent construction.
env_name = 'HalfCheetah-v2'
seed = 12345

env = gym.make(env_name)
# Sizes of the flat observation and action vectors (first dimension of each
# space's shape).
num_inputs = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]

# Seed every random number generator this notebook imports, not just the
# environment and torch, so a Restart & Run All reproduces the same rollout.
env.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

# Reuse the observation size computed above instead of re-deriving it inline.
agent = trpo(num_inputs, env.action_space, trpo_args)

In [10]:
# Fewest transitions to gather before each policy update. Collection always
# runs to the end of the current episode, so a batch may hold more than this.
min_batch_size = 1000

for epoch in range(1, 2):
    # Fresh buffer per epoch: each update sees only transitions gathered
    # since the previous update.
    memory = Memory()
    state = env.reset()
    reward_sum = 0
    while True:
        action = agent.select_action(state)
        next_state, reward, done, _ = env.step(action)
        # 0.0 marks the last step of an episode, 1.0 every other step.
        mask = 0.0 if done else 1.0
        memory.push(state, action, reward, next_state, mask)
        reward_sum += reward
        state = next_state
        if done:
            if len(memory) >= min_batch_size:
                break
            # The episode ended before the batch was full. Start a new
            # episode rather than stepping an environment that has already
            # reported done, which gym leaves undefined.
            state = env.reset()
    agent.update_parameters(memory)
    # reward_sum is the total reward over every transition in this batch
    # (it spans several episodes when episodes are shorter than the batch).
    print("epoch {}: reward {:.2f}".format(epoch, reward_sum))

tensor([ 6.7304e+27, -5.7420e+28, -1.4948e+28,  ..., -4.7447e+15,
         1.1106e+14,  8.9008e+15], device='cuda:0')
tensor([ 1.2500e+13, -9.5794e+13, -6.2780e+12,  ..., -2.0011e+13,
        -1.0172e+10,  6.7982e+13], device='cuda:0')
tensor([nan, nan, nan,  ..., nan, nan, nan], device='cuda:0')
tensor([nan, nan, nan,  ..., nan, nan, nan], device='cuda:0')
tensor([nan, nan, nan,  ..., nan, nan, nan], device='cuda:0')
tensor([nan, nan, nan,  ..., nan, nan, nan], device='cuda:0')
tensor([nan, nan, nan,  ..., nan, nan, nan], device='cuda:0')
tensor([nan, nan, nan,  ..., nan, nan, nan], device='cuda:0')
tensor([nan, nan, nan,  ..., nan, nan, nan], device='cuda:0')
tensor([nan, nan, nan,  ..., nan, nan, nan], device='cuda:0')
tensor([nan, nan, nan,  ..., nan, nan, nan], device='cuda:0')
tensor([nan, nan, nan,  ..., nan, nan, nan], device='cuda:0')
tensor([nan, nan, nan,  ..., nan, nan, nan], device='cuda:0')
tensor([nan, nan, nan,  ..., nan, nan, nan], device='cuda:0')
tensor([nan, nan, na