In [1]:
import os
if type(os.environ.get("DISPLAY")) is not str or len(os.environ.get("DISPLAY")) == 0:
    !bash ../xvfb start
    os.environ['DISPLAY'] = ':1'

In [2]:
import gym
import scipy.optimize

from models import *
from replay_memory import Memory
from running_state import ZFilter
from trpo import trpo_step
from utils import *

from tqdm import tqdm
import matplotlib.pyplot as plt

torch.utils.backcompat.broadcast_warning.enabled = True
torch.utils.backcompat.keepdim_warning.enabled = True

torch.set_default_tensor_type('torch.DoubleTensor')

In [3]:
def get_trpo_tar_mean_adv(rewards, masks, actions, values):
    returns = torch.Tensor(actions.size(0),1)
    deltas = torch.Tensor(actions.size(0),1)
    advantages = torch.Tensor(actions.size(0),1)

    prev_return = 0
    prev_value = 0
    prev_advantage = 0
    for i in reversed(range(rewards.size(0))):
        returns[i] = rewards[i] + gamma * prev_return * masks[i]
        deltas[i] = rewards[i] + gamma * prev_value * masks[i] - values.data[i]
        advantages[i] = deltas[i] + gamma * tau * prev_advantage * masks[i]

        prev_return = returns[i, 0]
        prev_value = values.data[i, 0]
        prev_advantage = advantages[i, 0]

    targets = returns
    advantages = (advantages - advantages.mean()) / advantages.std()

    return targets, advantages

def get_atrpo_tar_mean_adv(rewards, masks, actions, values):
    ro = torch.mean(rewards)

    deltas_ = torch.Tensor(actions.size(0),1)
    advantages = torch.Tensor(actions.size(0),1)
    targets = torch.Tensor(actions.size(0),1)

    prev_value = 0
    prev_advantage = 0
    for i in reversed(range(rewards.size(0))):
        targets[i] = rewards[i] - ro + prev_value * masks[i]
        deltas_[i] = rewards[i] - ro + prev_value * masks[i] - values.data[i]
        advantages[i] = deltas_[i] + tau * prev_advantage * masks[i]

        prev_value = values.data[i, 0]
        prev_advantage = advantages[i, 0]

    advantages = (advantages - advantages.mean()) / advantages.std()

    return targets, advantages

def get_atrpo_tar_no_mean_adv(rewards, masks, actions, values):
    ro = torch.mean(rewards)

    deltas_ = torch.Tensor(actions.size(0),1)
    advantages = torch.Tensor(actions.size(0),1)
    targets = torch.Tensor(actions.size(0),1)

    prev_value = 0
    prev_advantage = 0
    for i in reversed(range(rewards.size(0))):
        targets[i] = rewards[i] - ro + prev_value * masks[i]
        deltas_[i] = rewards[i] - ro + prev_value * masks[i] - values.data[i]
        advantages[i] = deltas_[i] + tau * prev_advantage * masks[i]

        prev_value = values.data[i, 0]
        prev_advantage = advantages[i, 0]

    # advantages = (advantages - advantages.mean()) / advantages.std()

    return targets, advantages


def get_atrpo_tar_adv_0(rewards, masks, actions, values):
    ro = torch.mean(rewards)

    advantages = torch.Tensor(actions.size(0),1)
    targets = torch.Tensor(actions.size(0),1)

    prev_value = 0
    for i in reversed(range(rewards.size(0))):
        targets[i] = rewards[i] - ro + prev_value * masks[i]
        advantages[i] = rewards[i] - ro + prev_value * masks[i] - values.data[i]

        prev_value = values.data[i, 0]

    return targets, advantages

def get_atrpo_tar_adv_1(rewards, masks, actions, values):
    ro = torch.mean(rewards)

    advantages = torch.Tensor(actions.size(0),1)
    targets = torch.Tensor(actions.size(0),1)

    prev_value = 0
    for i in reversed(range(rewards.size(0))):
        targets[i] = rewards[i] - ro + prev_value
        advantages[i] = rewards[i] - ro + prev_value - values.data[i]

        prev_value = values.data[i, 0]

    return targets, advantages

In [4]:
def update_params(batch, targ_adv_fun):
    rewards = torch.tensor(np.array(batch.reward))
    masks = torch.tensor(np.array(batch.mask))
    actions = torch.Tensor(np.concatenate(batch.action, 0)).detach()
    states = torch.tensor(np.array(batch.state)).detach()
    values = val(states)

    targets, advantages = targ_adv_fun(rewards, masks, actions, values)

    targets = targets.detach()
    advantages = advantages.detach()

    # Original code uses the same LBFGS to optimize the value loss
    def get_value_loss(flat_params):
        set_flat_params_to(val, torch.Tensor(flat_params))
        for param in val.parameters():
            if param.grad is not None:
                param.grad.data.fill_(0)

        value_loss = (val(states) - targets).pow(2).mean()

        # weight decay
        for param in val.parameters():
            value_loss += param.pow(2).sum() * l2_reg
        value_loss.backward()
        return (value_loss.data.double().numpy(), get_flat_grad_from(val).data.double().numpy())

    flat_params, _, opt_info = scipy.optimize.fmin_l_bfgs_b(get_value_loss, get_flat_params_from(val).double().numpy(), maxiter=25)
    set_flat_params_to(val, torch.Tensor(flat_params))

    action_means, action_log_stds, action_stds = agent(states)
    fixed_log_prob = normal_log_density(actions, action_means, action_log_stds, action_stds).data.clone().detach()

    def get_loss(volatile=False):
        if volatile:
            with torch.no_grad():
                action_means, action_log_stds, action_stds = agent(states)
        else:
            action_means, action_log_stds, action_stds = agent(states)
                
        log_prob = normal_log_density(actions, action_means, action_log_stds, action_stds)
        action_loss = -(advantages * torch.exp(log_prob - fixed_log_prob))
        return action_loss.mean()


    def get_kl():
        mean1, log_std1, std1 = agent(states)

        mean0 = mean1.data.detach()
        log_std0 = log_std1.data.detach()
        std0 = std1.data.detach()
        kl = log_std1 - log_std0 + (std0.pow(2) + (mean0 - mean1).pow(2)) / (2.0 * std1.pow(2)) - 0.5
        return kl.sum(1, keepdim=True)

    trpo_step(agent, get_loss, get_kl, max_kl, damping)


In [5]:
env_name = "Humanoid-v4"#"HalfCheetah-v4"
seed = 167
gamma = 0.99
tau = 0.97
l2_reg = 1e-2
max_kl = 1e-2
damping = 1e-1

In [6]:
#get_trpo_tar_mean_adv - trpo with normed advantege
#get_atrpo_tar_mean_adv - atrpo with normed advantege (Bad)
#get_atrpo_tar_adv_0 - atrpo with masks
#get_atrpo_tar_adv_1 - atrpo without masks

render = False
compare_plots = True
plt.figure(figsize=(5,3))
targ_adv_funs = [get_trpo_tar_mean_adv, get_atrpo_tar_no_mean_adv, get_atrpo_tar_adv_0]

ext_its = 3 #num of experiments
i_episodes = 5 # number of batches in an experiment

t_steps = 1000 # path len
batch_size = 15000 # t_steps * (?)
log_interval = max(1,i_episodes//10)

<Figure size 500x300 with 0 Axes>

In [7]:
targ_adv_funs_len = len(targ_adv_funs)
all_acc_batch_rewards = [[] for i in range(len(targ_adv_funs))]

In [8]:
for ext_it in tqdm(range(ext_its * targ_adv_funs_len)):
    cur_exp_id = ext_it // targ_adv_funs_len
    cur_f_id = ext_it % targ_adv_funs_len
    cur_targ_adv_func = targ_adv_funs[cur_f_id]
    cur_seed = seed + cur_exp_id

    env = gym.make(env_name, terminate_when_unhealthy = False)
    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    env.seed(cur_seed)
    torch.manual_seed(cur_seed)
    agent = ATRPOAgent(num_inputs, num_actions)
    val = Value(num_inputs)
    running_state = ZFilter((num_inputs,), clip=5)
    running_reward = ZFilter((1,), demean=False, clip=10)
    acc_batch_rewards = []

    for i_episode in range(i_episodes):
        memory = Memory()

        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        while num_steps < batch_size:
            state = env.reset()
            state = running_state(state)

            reward_sum = 0
            for t in range(t_steps):
                action = agent.act(state)
                action = action.data[0].numpy()
                next_state, reward, done, _ = env.step(action)
                reward_sum += reward

                next_state = running_state(next_state)

                mask = 0 if done else 1

                memory.push(state, np.array([action]), mask, next_state, reward)

                if render:
                    env.render()
                if done:
                    break

                state = next_state
            num_steps += (t-1)
            num_episodes += 1
            reward_batch += reward_sum

        reward_batch /= num_episodes
        acc_batch_rewards.append(reward_batch)
        batch = memory.sample()
        update_params(batch,cur_targ_adv_func)

        if i_episode % log_interval == 0:
            print('Episode {}\tFunc name: {}\tAverage reward {:.2f}'.format(
                i_episode, cur_targ_adv_func.__name__, reward_batch))

    plt.title(cur_targ_adv_func.__name__)

    plt.plot(acc_batch_rewards)
    plt.show()

    all_acc_batch_rewards[cur_f_id].append(acc_batch_rewards)

    if targ_adv_funs_len > 1 and compare_plots and cur_f_id == targ_adv_funs_len-1:
        plt.title(cur_exp_id)
        for i in range(targ_adv_funs_len):
            plt.plot(all_acc_batch_rewards[i][-1],label=targ_adv_funs[i].__name__)
        plt.legend()
        plt.show()

  deprecation(
  deprecation(
  deprecation(
  if not isinstance(terminated, (bool, np.bool8)):


Episode 0	Func name: get_trpo_tar_mean_adv	Average reward -1649.24


  0%|          | 0/9 [00:45<?, ?it/s]


KeyboardInterrupt: 

In [None]:
def save_array(fname, data):
    with open(fname, 'w') as f:
        print(data, file=f)

In [None]:
save_array('data/Human_Trpo_AtpoDelta_Atrpo0_100_0.txt', all_acc_batch_rewards)

In [None]:
all_acc_batch_rewards