In [1]:
import gym
import numpy as np
import torch
import torch.nn
import torch.nn.functional as F
from torch.optim import Adam

from trpo import trpo

ModuleNotFoundError: No module named 'gym'

In [0]:
import random
from collections import namedtuple

# Taken from
# https://github.com/pytorch/tutorials/blob/master/Reinforcement%20(Q-)Learning%20with%20PyTorch.ipynb

# One environment step as stored in the rollout buffer.
# No done-mask field is kept (the original author considered it unnecessary).
Transition = namedtuple(
    'Transition',
    ['state', 'action', 'reward', 'next_state'],
)


class Memory:
    """Append-only buffer of `Transition` records for one batch of experience."""

    def __init__(self):
        # Transitions in insertion order.
        self.memory = []

    def push(self, *args):
        """Store one transition.

        The positional arguments are forwarded to `Transition`, so they must
        be given in field order: state, action, reward, next_state.
        """
        transition = Transition(*args)
        self.memory.append(transition)

    def sample(self):
        """Return the whole buffer as a single column-wise `Transition`.

        Nothing is drawn at random: every stored transition is included, in
        insertion order. Each field of the result is a tuple with one entry
        per stored transition (e.g. `.reward` holds all rewards).
        Original author's note: TRPO is an on-policy algorithm that learns
        from the complete collected batch.
        """
        # zip(*rows) transposes the list of rows into per-field columns.
        columns = zip(*self.memory)
        return Transition(*columns)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

In [0]:
# Configuration record handed to the `trpo` agent constructor.
Args = namedtuple('Args',
        ('gamma', 'tau', 'damping', 'delta',
        'device', 'cuda', 'hidden_size', 'lr'))

# The original call wrapped seven values in one tuple, so `Args` received a
# single positional argument and raised TypeError. Keyword arguments make the
# value-to-field mapping explicit and supply all eight fields.
# NOTE(review): `device` was the field assumed to be missing a value; it is
# derived from CUDA availability here. Confirm the type/value `trpo` expects.
_use_cuda = torch.cuda.is_available()
trpo_args = Args(
    gamma=0.995,
    tau=0.99,
    damping=0.1,
    delta=0.02,
    device="cuda" if _use_cuda else "cpu",
    cuda=_use_cuda,  # was hard-coded True; now False on machines without a GPU
    hidden_size=(64,),
    lr=0.1,
)

In [0]:
# Environment and agent construction for the experiment.
env_name = 'HalfCheetah-v2'
seed = 0

env = gym.make(env_name)
# Sizes of the first dimension of the observation and action spaces.
num_inputs = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]

# Seed every random number generator in scope, not only torch and the
# environment, so that anything sampled through NumPy or the stdlib `random`
# module is repeatable across Restart & Run All.
env.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

# Reuse `num_inputs` instead of recomputing the same expression.
agent = trpo(num_inputs, env.action_space, trpo_args)

In [0]:
# Training schedule: number of policy updates, and the minimum number of
# transitions gathered before each update.
NUM_EPOCHS = 10
MIN_BATCH_STEPS = 1000

for epoch in range(NUM_EPOCHS):
    memory = Memory()
    state = env.reset()
    reward_sum = 0
    num_episodes = 0
    # Collect at least MIN_BATCH_STEPS transitions and always stop on an
    # episode boundary (same exit condition as before: done AND batch full).
    while True:
        action = agent.select_action(state)
        next_state, reward, done, _ = env.step(action)
        memory.push(state, action, reward, next_state)
        reward_sum += reward
        state = next_state
        if done:
            num_episodes += 1
            if len(memory) >= MIN_BATCH_STEPS:
                break
            # The original kept stepping a finished environment here; start a
            # fresh episode instead.
            # NOTE(review): transitions carry no done-mask, so the batch does
            # not mark episode boundaries -- confirm `update_parameters` copes
            # with multi-episode batches.
            state = env.reset()
    agent.update_parameters(memory)
    # Average return per completed episode; identical to the raw sum when the
    # batch is exactly one episode.
    print("epoch {}: reward {:.2f}".format(epoch, reward_sum / num_episodes))