In [1]:
import gym
import numpy as np
import torch
import torch.nn
import torch.nn.functional as F
from torch.optim import Adam

from models import ActorCritic

In [2]:
import random
from collections import namedtuple

# Taken from
# https://github.com/pytorch/tutorials/blob/master/Reinforcement%20(Q-)Learning%20with%20PyTorch.ipynb

Transition = namedtuple('Transition', 
            ('state', 'action', 'reward', 'next_state')) # mask is unnecessary.

class Memory(object):
    def __init__(self):
        self.memory = []

    def push(self, *args):
        """Saves a transition."""
        self.memory.append(Transition(*args))

    def sample(self):
        return Transition(*zip(*self.memory))
        # Very useful.
        # The trpo is on-policy off-line algorithm.

    def __len__(self):
        return len(self.memory)

In [3]:
def getGAE(reward, value, gamma, tau):
    # (batch_size, shape)
    delta = reward[:, :-1] + gamma * value[:, 1:] - value[:, :-1]
    print("delta = ", delta)
    advantage = reward.clone()
    for i in reversed(range(delta.shape[-1])):
        advantage[0, i] = delta[0, i] + gamma * tau * advantage[0, i+1]    
    return (advantage - advantage.mean())/advantage.std()

In [4]:
def get_flat_params_from(model):
    params = []
    for param in model.parameters():
        params.append(param.data.view(-1))

    flat_params = torch.cat(params)
    return flat_params

def set_flat_params_to(model, flat_params):
    prev_ind = 0
    for param in model.parameters():
        flat_size = param.numel()
        param.data.copy_(
            flat_params[prev_ind:prev_ind + flat_size].view(param.size()))
        prev_ind += flat_size

In [5]:
class trpo(object):
    def __init__(self, 
                num_inputs,
                action_space,
                gamma,
                tau,
                damping,
                delta,
                cuda,
                hidden_size,
                lr=0.003):
        self.gamma, self.tau, self.damping, self.delta = gamma, tau, damping, delta
        self.device = torch.device("cuda" if cuda else "cpu")
        self.actor_critic = ActorCritic(num_inputs, action_space, hidden_size).to(self.device)
        self.critic_optim = Adam(self.actor_critic.parameters(), lr)

    def get_Hx(self, loss, parameters, damping):
        def func(x):
            grads = torch.autograd.grad(loss, parameters, create_graph=True)
            flat_grad = torch.cat([grad.view(-1) for grad in grads])
            grad_grads = torch.autograd.grad(flat_grad @ x, parameters)
            flat_grad_grad = torch.cat([grad_grad.contiguous().view(-1) for grad_grad in grad_grads]).data
            return flat_grad_grad + x * damping
        return func

    def get_loss(self, advantage, logp_a, logp_a_prev):
        return -(advantage * torch.exp(logp_a - logp_a_prev)).mean()
    
    def select_action(self, state):
        state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        action,_, _, _, _, _ = self.actor_critic(state)
        return action.detach().cpu().numpy()[0]

    def cg(self, A, b, iters, acc=1e-10):
        x = torch.zeros_like(b)
        d = b.clone()
        g = -b
        g_dot_g_old = 1
        for _ in range(iters):
            g_dot_g = np.dot(g, g)
            d = -g + g_dot_g / g_dot_g_old * d
            alpha = g_dot_g / torch.dot(d, A(d))
            x += alpha * d
            if g_dot_g < acc:
                break
            g_dot_g_old = g_dot_g
            g = A(x) - b
        return x
        # There are a lot of works hidden in conjugate_gradients
    
    def update_parameters(self, memory):
        state, action, reward, next_state = memory.sample()
        state = torch.FloatTensor(state).to(self.device)
        action = torch.FloatTensor(action).to(self.device)
        reward = torch.FloatTensor(reward).to(self.device).unsqueeze(1)
        next_state = torch.FloatTensor(next_state).to(self.device)

        _, logp_a, _, pi, d_kl, value = self.actor_critic(state, action)

        advantage = getGAE(reward, value, self.gamma, self.tau)
        loss = self.get_loss(advantage, logp_a, logp_a.clone().detach())
        grads = torch.autograd.grad(loss, self.actor_critic.actor.parameters())
        loss_grad = torch.cat([grad.view(-1) for grad in grads]).data

        invHg = self.cg(self.get_Hx(d_kl, self.actor_critic.actor.parameters(), self.damping), loss_grad, 10)
        lm = torch.sqrt(0.5 * loss_grad @ invHg / self.delta)
        fullstep = invHg / lm

        prev_params = get_flat_params_from(self.actor_critic.actor)
        # Line search:
        alpha = 1
        for _ in range(10):
            alpha = 0.5 * alpha
            new_params = prev_params + alpha * fullstep
            set_flat_params_to(self.actor_critic.actor, new_params)
            _, logp_a_new, _, _, d_kl, _ = self.actor_critic(state, action, pi)
            loss_new = get_loss(advantage, logp_a_new, logp_a.clone().detach())
            if loss_new < loss and d_kl < self.delta:
                break

        target_value = reward.clone()
        target_value[:, :-1] += self.gamma * value[:, 1:] - value[:, :-1] 
        value_loss = F.mse(value, target_value)
        self.critic_optim.zero_grad()
        value_loss.backward()
        self.critic_optim.step()
        

In [6]:
args = {'gamma':0.995,
        'tau': 0.99,
        'damping':0.1,
        'delta':0.02,
        'cuda':False,
        'hidden_size':(64,),
        'lr':0.003}

In [7]:
env_name = 'HalfCheetah-v2'
seed = 0

env = gym.make(env_name)
num_inputs = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]
env.seed(seed)
torch.manual_seed(seed)

agent = trpo(env.observation_space.shape[0], env.action_space, **args)

In [8]:
for epoch in range(10):
    memory = Memory()
    state = env.reset()
    reward_sum = 0
    done = False
    while not done or len(memory) < 1000:
        action = agent.select_action(state)
        next_state, reward, done, _ = env.step(action)
        memory.push(state, action, reward, next_state)
        reward_sum += reward
        state = next_state
    agent.update_parameters(memory)
    print("epoch {}: reward {:.2f}".format(epoch, reward_sum))

delta =  tensor([], size=(1000, 0), grad_fn=<SubBackward0>)


RuntimeError: There were no tensor arguments to this function (e.g., you passed an empty list of Tensors), but no fallback function is registered for schema aten::_cat.  This usually means that this function requires a non-empty list of Tensors.  Available functions are [CUDATensorId, CPUTensorId, VariableTensorId]