In [1]:
# Mount Google Drive so the notebook can read the environment source and
# read/write checkpoints under "My Drive/Dissertation".
from google.colab import drive
drive.mount('/content/drive/')

# Copy the PointPush environment module into the working directory so that a
# later cell can run `from point_push import PointPushEnv`.
!cp "/content/drive/My Drive/Dissertation/envs/point_push.py" .

Mounted at /content/drive/


In [2]:
# for inference, not continued training
def save_model(model, name):
    """Serialise an inference-only checkpoint of ``model`` to Google Drive.

    The checkpoint stores the stored pheromone paths under ``'meta_controller'``
    and the controller's actor/critic weights under ``'controller'``. Optimizer
    state, target networks and the replay memory are not saved.

    Args:
        model: agent exposing ``pheromone_paths`` and ``controller.actor`` /
            ``controller.critic``.
        name: file name of the checkpoint inside the saved-models folder.
    """
    controller = model.controller
    checkpoint = {
        'meta_controller': model.pheromone_paths,
        'controller': {
            'critic': controller.critic.state_dict(),
            'actor': controller.actor.state_dict(),
        },
    }
    destination = f"/content/drive/My Drive/Dissertation/saved_models/point_push_pheromone/{name}"
    torch.save(checkpoint, destination)

import copy
def load_model(model, name):
    """Restore a checkpoint written by ``save_model`` into ``model`` in place.

    The pheromone paths are deep-copied from the checkpoint, the controller's
    actor and critic weights are loaded, and both target networks are rebuilt
    as deep copies of the freshly loaded networks. The model is then switched
    to evaluation mode.

    Args:
        model: agent to populate (same structure as the one that was saved).
        name: file name of the checkpoint inside the saved-models folder.
    """
    source = f"/content/drive/My Drive/Dissertation/saved_models/point_push_pheromone/{name}"
    checkpoint = torch.load(source)
    controller = model.controller

    model.pheromone_paths = copy.deepcopy(checkpoint['meta_controller'])

    weights = checkpoint['controller']
    controller.critic.load_state_dict(weights['critic'])
    controller.critic_target = copy.deepcopy(controller.critic)
    controller.actor.load_state_dict(weights['actor'])
    controller.actor_target = copy.deepcopy(controller.actor)

    # Checkpoints are meant for inference only, so leave everything in eval mode.
    model.eval()
    controller.eval()

In [3]:
%matplotlib inline

import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

from IPython import display
plt.ion()

# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs on CPU-only runtimes instead of failing on the first
# `.to(device)` call.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
class NormalizedEnv(gym.ActionWrapper):
    """Action wrapper that affinely maps between [-1, 1] and the env's action bounds."""

    def action(self, action):
        """Rescale ``action`` so that -1 maps to ``low`` and +1 maps to ``high``."""
        low, high = self.action_space.low, self.action_space.high
        half_range = (high - low) / 2.
        midpoint = (high + low) / 2.
        return half_range * action + midpoint

    def reverse_action(self, action):
        """Inverse of ``action``: map an env-scale action back to the normalized scale."""
        low, high = self.action_space.low, self.action_space.high
        inverse_half_range = 2. / (high - low)
        midpoint = (high + low) / 2.
        return inverse_half_range * (action - midpoint)

In [5]:
# Build the PointPush task and wrap it in NormalizedEnv so that actions are
# rescaled into the environment's action bounds before each step.
# NOTE(review): the meaning of the constructor argument `4` is defined in
# point_push.py, which is not visible here — confirm.
from point_push import PointPushEnv 
env = NormalizedEnv(PointPushEnv(4))

***

In [6]:
def plot_durations(episode_durations, goals_done):
    """Draw two stacked training curves and refresh the notebook output.

    Args:
        episode_durations: sequence of ``(x, y)`` pairs drawn on the upper
            panel, labelled 'Reward'.
        goals_done: sequence of ``(x, y)`` pairs drawn on the lower panel,
            labelled 'Goals done'.
    """
    fig, axs = plt.subplots(2, figsize=(10,10))

    panels = (
        (axs[0], episode_durations, 'Reward'),
        (axs[1], goals_done, 'Goals done'),
    )
    for axis, series, y_label in panels:
        # Split the (x, y) pairs into two parallel lists.
        xs, ys = list(map(list, zip(*series)))
        ys = torch.tensor(ys, dtype=torch.float)

        fig.suptitle('Training')
        axis.set_xlabel('Episode')
        axis.set_ylabel(y_label)

        axis.plot(xs, ys.numpy())

    plt.pause(0.001)  # pause a bit so that plots are updated
    display.clear_output(wait=True)

In [7]:
# [reference] https://github.com/matthiasplappert/keras-rl/blob/master/rl/random.py

class RandomProcess(object):
    """Base class for exploration-noise processes."""

    def reset_states(self):
        """Reset internal state; the base class keeps none, so this is a no-op."""
        return None


class AnnealedGaussianProcess(RandomProcess):
    """Noise base class whose sigma decays linearly with the number of samples.

    ``current_sigma`` follows ``m * n_steps + c`` and never drops below
    ``sigma_min``. Passing ``sigma_min=None`` disables annealing.
    """

    def __init__(self, mu, sigma, sigma_min, n_steps_annealing):
        self.mu = mu
        self.sigma = sigma
        self.n_steps = 0

        # Intercept of the linear schedule is always the starting sigma.
        self.c = sigma
        if sigma_min is None:
            # Constant schedule: zero slope, floor equal to sigma itself.
            self.m = 0.
            self.sigma_min = sigma
        else:
            self.m = -float(sigma - sigma_min) / float(n_steps_annealing)
            self.sigma_min = sigma_min

    @property
    def current_sigma(self):
        """Sigma for the current step count, floored at ``sigma_min``."""
        annealed = self.m * float(self.n_steps) + self.c
        return max(self.sigma_min, annealed)


# Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab
class OrnsteinUhlenbeckProcess(AnnealedGaussianProcess):
    """Ornstein-Uhlenbeck process with optionally annealed sigma.

    Args:
        theta: mean-reversion rate.
        mu: long-run mean the process reverts to.
        sigma: initial noise scale.
        dt: time step of the discretisation.
        x0: initial value; zeros of shape ``size`` when ``None``.
        size: shape of each sample.
        sigma_min: floor for the annealed sigma (``None`` = no annealing).
        n_steps_annealing: number of samples over which sigma is annealed.
    """

    def __init__(self, theta, mu=0., sigma=1., dt=1e-2, x0=None, size=1, sigma_min=None, n_steps_annealing=1000):
        super().__init__(mu=mu, sigma=sigma, sigma_min=sigma_min, n_steps_annealing=n_steps_annealing)
        self.theta = theta
        self.mu = mu
        self.dt = dt
        self.x0 = x0
        self.size = size
        self.reset_states()

    def sample(self):
        """Advance the process by one step and return the new value."""
        drift = self.theta * (self.mu - self.x_prev) * self.dt
        diffusion = self.current_sigma * np.sqrt(self.dt) * np.random.normal(size=self.size)
        x = self.x_prev + drift + diffusion
        self.x_prev = x
        self.n_steps += 1
        return x

    def reset_states(self):
        """Restart the process from ``x0`` (or from zeros when ``x0`` is ``None``)."""
        if self.x0 is not None:
            self.x_prev = self.x0
        else:
            self.x_prev = np.zeros(self.size)

In [8]:
def soft_update(target, source, tau):
    """Blend ``source`` parameters into ``target`` in place (Polyak averaging).

    Every target parameter becomes ``(1 - tau) * target + tau * source``.
    Parameters are paired in the order returned by ``parameters()``.
    """
    for dst, src in zip(target.parameters(), source.parameters()):
        blended = dst.data * (1.0 - tau) + src.data * tau
        dst.data.copy_(blended)

def hard_update(target, source):
    """Overwrite each parameter of ``target`` with the matching one of ``source``, in place."""
    parameter_pairs = zip(target.parameters(), source.parameters())
    for dst, src in parameter_pairs:
        dst.data.copy_(src.data)

In [9]:
# (state, action) -> (next_state, reward, done)
transition = namedtuple('transition', ('state', 'action', 'next_state', 'reward', 'done'))

# Same fields as `transition`, plus the state and action sequences that go with
# the stored step. The typename matches the variable name so that instances
# have an unambiguous repr and can be pickled.
transition_meta = namedtuple('transition_meta', ('state', 'action', 'next_state', 'reward', 'done', 'state_seq', 'action_seq'))


class _CyclicReplayBuffer(object):
    """Fixed-capacity replay buffer implemented as a cyclical queue.

    Once ``capacity`` records are stored, each new record overwrites the
    oldest one. Subclasses choose the record type through ``_record``.
    """

    # namedtuple type used to wrap the positional arguments given to `store`.
    _record = None

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def store(self, *args):
        """Wrap ``args`` in the record type and write it at the current position.

        Raises:
            TypeError: if ``args`` does not match the record's fields. The
                buffer is left untouched in that case.
        """
        # Build the record first so a bad call cannot leave a placeholder behind.
        record = self._record(*args)
        if len(self.memory) < self.capacity:
            self.memory.append(record)
        else:
            self.memory[self.position] = record
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` distinct records chosen uniformly at random.

        Raises:
            ValueError: if fewer than ``batch_size`` records are stored.
        """
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)


# replay memory D with capacity N
class ReplayMemory(_CyclicReplayBuffer):
    """Replay memory holding ``transition`` records."""

    _record = transition


# replay memory D with capacity N
class ReplayMemoryMeta(_CyclicReplayBuffer):
    """Replay memory holding ``transition_meta`` records."""

    _record = transition_meta

***

In [10]:
DEPTH = 128  # width of the hidden layers

class Actor(nn.Module):
    """Policy network: two ReLU hidden layers followed by a tanh output head.

    The tanh keeps every output component inside [-1, 1].
    """

    def __init__(self, nb_states, nb_actions):
        super().__init__()
        self.fc1 = nn.Linear(nb_states, DEPTH)
        self.fc2 = nn.Linear(DEPTH, DEPTH)
        self.head = nn.Linear(DEPTH, nb_actions)

    def forward(self, x):
        """Map a batch of states ``x`` to a batch of actions."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        return torch.tanh(self.head(hidden))

class Critic(nn.Module):
    """Twin Q-networks that score a (state, action) pair.

    Both heads share the same input (state and action concatenated along
    dimension 1) but have independent weights.
    """

    def __init__(self, nb_states, nb_actions):
        super().__init__()
        in_features = nb_states + nb_actions

        # Q1 architecture
        self.l1 = nn.Linear(in_features, DEPTH)
        self.l2 = nn.Linear(DEPTH, DEPTH)
        self.l3 = nn.Linear(DEPTH, 1)

        # Q2 architecture
        self.l4 = nn.Linear(in_features, DEPTH)
        self.l5 = nn.Linear(DEPTH, DEPTH)
        self.l6 = nn.Linear(DEPTH, 1)

    @staticmethod
    def _joint_input(state, action):
        """Concatenate state and action along dimension 1 as a float tensor."""
        return torch.cat([state, action], 1).float()

    def forward(self, state, action):
        """Return the pair ``(Q1, Q2)`` for the given batch."""
        sa = self._joint_input(state, action)

        q1 = self.l3(F.relu(self.l2(F.relu(self.l1(sa)))))
        q2 = self.l6(F.relu(self.l5(F.relu(self.l4(sa)))))
        return q1, q2

    def Q1(self, state, action):
        """Return only the first head's estimate (used for the actor loss)."""
        sa = self._joint_input(state, action)
        return self.l3(F.relu(self.l2(F.relu(self.l1(sa)))))

In [11]:
BATCH_SIZE = 64  # transitions sampled from the replay memory per update
GAMMA = 0.99     # discount factor applied to the bootstrapped target

# https://spinningup.openai.com/en/latest/algorithms/td3.html
class TD3(nn.Module):
    """TD3 agent: actor, twin critic, their target copies and a replay memory.

    Args:
        nb_states: size of the state vector fed to the actor and critic.
        nb_actions: size of the action vector.
        is_meta: when True the agent stores ``transition_meta`` records and
            ``update_policy`` relabels the sampled actions through the
            ``off_policy_correction`` callback.
    """

    def __init__(self, nb_states, nb_actions, is_meta=False):
        super(TD3, self).__init__()
        self.nb_states = nb_states
        self.nb_actions= nb_actions

        self.actor = Actor(self.nb_states, self.nb_actions)
        self.actor_target = Actor(self.nb_states, self.nb_actions)
        self.actor_optimizer  = optim.Adam(self.actor.parameters(), lr=0.0001)

        self.critic = Critic(self.nb_states, self.nb_actions)
        self.critic_target = Critic(self.nb_states, self.nb_actions)
        self.critic_optimizer  = optim.Adam(self.critic.parameters(), lr=0.0001)

        # Targets start as exact copies of the online networks.
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        self.is_meta = is_meta

        #Create replay buffer
        self.memory = ReplayMemory(100000) if not self.is_meta else ReplayMemoryMeta(100000)
        # Only referenced by the commented-out exploration line in select_action.
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=0.15, mu=0.0, sigma=0.2)

        # Hyper-parameters
        self.tau = 0.005               # soft-update rate for the target networks
        self.depsilon = 1.0 / 10000    # amount subtracted from epsilon per decayed step
        self.policy_noise=0.2          # std of the target-policy smoothing noise
        self.noise_clip=0.5            # clip range of the smoothing noise
        self.policy_freq=2             # actor/target update every `policy_freq` critic updates
        self.total_it = 0              # number of critic updates performed so far

        # Exploration state
        self.epsilon = 1.0
        self.is_training = True

    def update_policy(self, off_policy_correction=None):
        """Run one TD3 update from a replay batch.

        Does nothing until the memory holds at least ``BATCH_SIZE`` records.
        The critic is updated on every call; the actor and both target
        networks are updated every ``policy_freq`` calls.

        Args:
            off_policy_correction: callback used only when ``is_meta`` is True.
                It receives the sampled actions, state sequences and action
                sequences as numpy arrays and returns the relabelled actions.
                NOTE(review): the return value is used as a tensor below
                (``randn_like``, critic input) — confirm the callback returns one.
        """
        if len(self.memory) < BATCH_SIZE:
            return

        self.total_it += 1

        # in the form (state, action) -> (next_state, reward, done)
        transitions = self.memory.sample(BATCH_SIZE)

        if not self.is_meta:
            batch = transition(*zip(*transitions))
            action_batch = torch.cat(batch.action)
        else:
            batch = transition_meta(*zip(*transitions))

            action_batch = torch.cat(batch.action)
            state_seq_batch = torch.stack(batch.state_seq)
            action_seq_batch = torch.stack(batch.action_seq)

            action_batch = off_policy_correction(action_batch.cpu().numpy(), state_seq_batch.cpu().numpy(), action_seq_batch.cpu().numpy())

        state_batch = torch.cat(batch.state)
        next_state_batch = torch.cat(batch.next_state)
        reward_batch = torch.cat(batch.reward)
        done_mask = np.array(batch.done)
        # 1 for transitions that did not end the episode, 0 otherwise.
        not_done_mask = torch.from_numpy(1 - done_mask).float().to(device)

        # Target Policy Smoothing
        with torch.no_grad():
            # Select action according to policy and add clipped noise
            noise = (
                torch.randn_like(action_batch) * self.policy_noise
            ).clamp(-self.noise_clip, self.noise_clip).float()

            next_action = (
                self.actor_target(next_state_batch) + noise
            ).clamp(-1.0, 1.0).float()

            # Compute the target Q value
            # Clipped Double-Q Learning
            target_Q1, target_Q2 = self.critic_target(next_state_batch, next_action)
            target_Q = torch.min(target_Q1, target_Q2).squeeze(1)
            target_Q = (reward_batch + GAMMA * not_done_mask  * target_Q).float()

        # Critic update
        current_Q1, current_Q2 = self.critic(state_batch, action_batch)

        critic_loss = F.mse_loss(current_Q1, target_Q.unsqueeze(1)) + F.mse_loss(current_Q2, target_Q.unsqueeze(1))

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Delayed policy updates
        if self.total_it % self.policy_freq == 0:
            # Compute actor loss
            actor_loss = -self.critic.Q1(state_batch, self.actor(state_batch)).mean()

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Target update. The critic target deliberately uses a smaller rate
            # (2 * tau / 5) than the actor target (tau).
            soft_update(self.actor_target, self.actor, self.tau)
            soft_update(self.critic_target, self.critic, 2 * self.tau / 5)

    def eval(self):
        """Switch the agent and all four networks to evaluation mode.

        Follows the ``nn.Module.eval`` contract: the module's own ``training``
        flag is cleared as well and ``self`` is returned, so calls can be chained.
        """
        return super(TD3, self).eval()

    def observe(self, s_t, a_t, s_t1, r_t, done):
        """Store one transition in the replay memory."""
        self.memory.store(s_t, a_t, s_t1, r_t, done)

    def random_action(self):
        """Return a uniformly random action in [-1, 1] with shape (1, nb_actions)."""
        # Sample the (1, nb_actions) array directly; wrapping an ndarray in a
        # Python list before conversion is slow and triggers a PyTorch warning.
        sample = np.random.uniform(-1., 1., (1, self.nb_actions))
        return torch.as_tensor(sample, device=device, dtype=torch.float)

    def select_action(self, s_t, warmup, decay_epsilon):
        """Choose an action for state ``s_t``.

        Args:
            s_t: state tensor with a leading batch dimension of 1.
            warmup: when True, ignore the policy and return a random action.
            decay_epsilon: when True, reduce ``epsilon`` by ``depsilon``
                after choosing the action.

        Returns:
            Action tensor of shape (1, nb_actions), clamped to [-1, 1].
        """
        if warmup:
            return self.random_action()

        with torch.no_grad():
            action = self.actor(s_t).squeeze(0)
            #action += torch.from_numpy(self.is_training * max(self.epsilon, 0) * self.random_process.sample()).to(device).float()
            # A single uniform sample is drawn and broadcast to every action
            # dimension; the noise vanishes when is_training is False or
            # epsilon has decayed to zero.
            action += torch.from_numpy(self.is_training * max(self.epsilon, 0) * np.random.uniform(-1.,1.,1)).to(device).float()
            action = torch.clamp(action, -1., 1.)

            action = action.unsqueeze(0)

            if decay_epsilon:
                self.epsilon -= self.depsilon

            return action

In [12]:
from operator import itemgetter
class HIRO(nn.Module):
    """Hierarchical agent: a pheromone-path goal selector over a TD3 controller.

    Goals are relative offsets in the state dimensions listed in ``goal_dim``.
    The controller receives the state concatenated with the current goal.

    Args:
        nb_states: size of the environment's observation vector.
        nb_actions: size of the environment's action vector.
    """

    def __init__(self, nb_states, nb_actions):
        super(HIRO, self).__init__()
        self.nb_states = nb_states
        self.nb_actions= nb_actions
        self.goal_dim = [0, 1]   # indices of the state components a goal refers to
        self.goal_dimen = 2

        # a list of tuple of form (reward, path); keep top 5
        self.pheromone_paths = []

        self.controller = TD3(nb_states + len(self.goal_dim), nb_actions).to(device)

    def add_path(self, reward, path):
        """Remember ``path`` with its ``reward`` and keep only the five best paths."""
        # prefer higher-reward paths
        self.pheromone_paths.append((reward, path))
        self.pheromone_paths.sort(key=itemgetter(0), reverse=True)
        self.pheromone_paths = self.pheromone_paths[:5] # only keep top 5

    def teach_controller(self):
        """Run one update step of the controller."""
        self.controller.update_policy()

    def h(self, state, goal, next_state):
        """Goal transition: re-express ``goal`` relative to ``next_state``.

        All arguments carry a leading batch dimension.
        """
        return state[:,self.goal_dim] + goal - next_state[:,self.goal_dim]

    def _goal_distance(self, state, goal, next_state):
        """L2 distance between ``state + goal`` and ``next_state`` in the goal dimensions."""
        residual = state.squeeze(0)[self.goal_dim] + goal.squeeze(0) - next_state.squeeze(0)[self.goal_dim]
        return torch.pow(sum(torch.pow(residual, 2)), 0.5)

    def intrinsic_reward(self, reward, state, goal, next_state):
        """Controller reward: negative L2 distance to the goal (``reward`` is unused)."""
        return -self._goal_distance(state, goal, next_state)

    def goal_reached(self, state, goal, next_state, threshold = 0.1):
        """Return whether ``next_state`` lies within ``threshold`` of the goal."""
        return self._goal_distance(state, goal, next_state) <= threshold

    def observe_controller(self, s_t, a_t, s_t1, r_t, done):
        """Store one controller transition in the controller's replay memory."""
        self.controller.memory.store(s_t, a_t, s_t1, r_t, done)

    def select_goal(self, s_t, warmup, is_training):
        """Pick a goal for state ``s_t``, relative to the current position.

        During warm-up, or while no path is stored, the goal is uniform in
        [-1, 1]. Otherwise each stored path proposes the point ten steps past
        its position closest to the current one, and the proposals are
        averaged with weights ``reward - min_rew``.

        Args:
            s_t: state tensor of shape (1, nb_states).
            warmup: when True, always return a random goal.
            is_training: unused; kept for interface compatibility.

        Returns:
            Goal tensor of shape (1, len(goal_dim)) on the same device as ``s_t``.
        """
        if warmup or len(self.pheromone_paths) == 0:
            sample = np.random.uniform(-1., 1., (1, len(self.goal_dim)))
            return torch.as_tensor(sample, device=s_t.device, dtype=torch.float)

        cur_pos = s_t.squeeze(0)[self.goal_dim]

        goal = torch.zeros(len(self.goal_dim), device=s_t.device, dtype=torch.float)

        # Fixed reward offset for the weights.
        # NOTE(review): assumes stored rewards stay above -60, otherwise
        # tot_rew can reach zero — confirm against the environment.
        min_rew = -60
        tot_rew = sum([t[0] for t in self.pheromone_paths]) - len(self.pheromone_paths) * min_rew

        for rew, path in self.pheromone_paths:
            if not torch.is_tensor(path):
                # Accept a sequence of state vectors as well as a 2-D tensor.
                path = torch.stack(list(path))

            # Slice the goal dimensions directly instead of unpacking every
            # element of the path into its own tensor.
            positions = path[:, self.goal_dim]
            distances = torch.pow(torch.sum(torch.pow(positions - cur_pos, 2), axis=1), 0.5)
            chosen_i = int(torch.argmin(distances))

            # assume c = 10
            # basically, in chosen path, go 10 steps ahead from position closest
            # to the currently observed one
            chosen_point = path[min(chosen_i + 10, len(path) - 1)]

            goal += (rew - min_rew) * chosen_point[self.goal_dim]

        goal /= tot_rew
        goal = goal - cur_pos # make goal relative to given position

        return goal.unsqueeze(0)

    def select_action(self, s_t, g_t, warmup, decay_epsilon):
        """Ask the controller for an action given state ``s_t`` and goal ``g_t``."""
        sg_t = torch.cat([s_t, g_t], 1).float()
        return self.controller.select_action(sg_t, warmup, decay_epsilon)

In [None]:
import time
SAVE_OFFSET = 5  # numeric suffix of the next checkpoint file ("hiro_<n>")
def train_model():
    """Train a fresh HIRO agent on ``env``.

    Each episode alternates between choosing a goal (held for ``c`` steps and
    carried forward with ``agent.h``) and letting the controller act. The
    controller is trained on the intrinsic reward after the warm-up episodes.
    When an episode ends, its visited states are handed to the agent as a
    candidate pheromone path together with the episode's extrinsic reward.

    Returns:
        The trained agent once the mean reward of the last 100 episodes
        reaches 90 (a checkpoint is saved and ``SAVE_OFFSET`` incremented).
        ``None`` if training is abandoned early or the episode budget runs out.
    """
    global SAVE_OFFSET
    n_observations = env.observation_space.shape[0]
    n_actions = env.action_space.shape[0]

    agent = HIRO(n_observations, n_actions).to(device)

    max_episode_length = 500

    warmup = 100            # episodes with random goals/actions and no learning
    num_episodes = 4000 # M
    episode_durations = []  # (episode index, extrinsic reward)
    goal_durations = []     # (episode index, mean intrinsic reward per step)
    episode_rewards = []    # extrinsic reward per episode, for the running mean

    c = 10                  # number of steps each goal is held

    for i_episode in range(num_episodes):
        observation = env.reset()
        state = torch.from_numpy(observation).float().unsqueeze(0).to(device)

        in_warmup = i_episode <= warmup

        overall_reward = 0
        overall_intrinsic = 0
        episode_steps = 0
        done = False
        goals_done = 0

        # States visited this episode; concatenated once at the end instead of
        # growing a tensor with torch.cat on every step.
        visited_states = []

        while not done:
            goal = agent.select_goal(state, in_warmup, True)
            goal_done = False

            while not done and not goal_done:
                joint_goal_state = torch.cat([state, goal], axis=1).float()

                # agent pick action ...
                action = agent.select_action(state, goal, in_warmup, True)

                # env response with next_observation, reward, terminate_info
                observation, reward, done, info = env.step(action.detach().cpu().squeeze(0).numpy())
                next_state = torch.from_numpy(observation).float().unsqueeze(0).to(device)
                next_goal = agent.h(state, goal, next_state)
                joint_next_state = torch.cat([next_state, next_goal], axis=1).float()

                # Cut the episode off at the step limit; this is stored as `done`.
                if max_episode_length and episode_steps >= max_episode_length - 1:
                    done = True

                intrinsic_reward = agent.intrinsic_reward(reward, state, goal, next_state).unsqueeze(0)

                overall_reward += reward
                overall_intrinsic += intrinsic_reward

                goal_reached = agent.goal_reached(state, goal, next_state)

                # agent observe and update policy
                agent.observe_controller(joint_goal_state, action, joint_next_state, intrinsic_reward, done)

                visited_states.append(state)

                episode_steps += 1

                if goal_reached:
                    goals_done += 1

                if (episode_steps % c) == 0:
                    goal_done = True

                state = next_state
                goal = next_goal

                if not in_warmup:
                    agent.teach_controller()

        # once episode finishes, append full path to manager
        agent.add_path(overall_reward, torch.cat(visited_states))

        goal_durations.append((i_episode, overall_intrinsic / episode_steps))
        episode_durations.append((i_episode, overall_reward))
        episode_rewards.append(overall_reward)
        #plot_durations(episode_durations, goal_durations)

        if len(episode_rewards) > 100:
            recent_mean = np.mean(episode_rewards[-100:])
            if i_episode % 100 == 0:
                print(f"{i_episode}: {recent_mean}")
            if i_episode >= 300 and i_episode % 100 == 0 and recent_mean <= -49.0:
                print(f"Unlucky after {i_episode} eps! Terminating...")
                return None
            if recent_mean >= 90:
                print(f"Solved after {i_episode} episodes!")
                save_model(agent, f"hiro_{SAVE_OFFSET}")
                SAVE_OFFSET += 1
                return agent

    return None # did not train

In [None]:
state_max = torch.from_numpy(env.observation_space.high).to(device).float()
state_min = torch.from_numpy(env.observation_space.low).to(device).float()
state_mid = (state_max + state_min) / 2.
state_range = (state_max - state_min)
def eval_model(agent, episode_durations, goal_attack, action_attack, same_noise):
    """Evaluate ``agent`` under uniform observation noise of increasing size.

    For each noise level in ``np.arange(0.0, 0.51, 0.05)`` the agent plays 100
    episodes. Noise is uniform in ``[-level, level]``, scaled by
    ``state_range``, and the perturbed observation is clipped to the
    observation bounds. The mean episode reward is appended to
    ``episode_durations[round(level, 2)]``.

    Args:
        agent: trained agent; it is put in eval mode and its controller's
            ``is_training`` flag is set to False.
        episode_durations: dict mapping rounded noise level to a list of results.
        goal_attack: perturb the observation used for goal selection.
        action_attack: perturb the observation fed to the controller.
        same_noise: when both are attacked, reuse the same noise sample for
            the controller instead of drawing a second one.
    """
    def as_state(raw_observation):
        return torch.from_numpy(raw_observation).float().unsqueeze(0).to(device)

    def clip_to_bounds(tensor):
        return torch.max(torch.min(tensor, state_max), state_min).float()

    def uniform_noise(shape, magnitude):
        return torch.FloatTensor(shape).uniform_(-magnitude, magnitude).to(device)

    def observe(raw_observation, magnitude):
        """Return the (controller view, goal-selector view) of an observation."""
        controller_view = as_state(raw_observation)
        goal_view = as_state(raw_observation)

        # One sample is always drawn first, even if no attack ends up using it.
        noise = uniform_noise(controller_view.shape, magnitude)

        if goal_attack:
            goal_view = clip_to_bounds(goal_view + state_range * noise)
        if action_attack:
            if not same_noise:
                noise = uniform_noise(controller_view.shape, magnitude)
            controller_view = clip_to_bounds(controller_view + state_range * noise)
        return controller_view, goal_view

    agent.eval()
    agent.controller.eval()
    agent.controller.is_training = False

    max_episode_length = 500
    num_episodes = 100
    c = 10

    for l2norm in np.arange(0.0,0.51,0.05):

        reward_total = 0
        for _ in range(num_episodes):
            state, g_state = observe(env.reset(), l2norm)

            episode_steps = 0
            done = False
            while not done:
                # select a goal
                goal = agent.select_goal(g_state, False, False)

                goal_done = False
                while not (done or goal_done):
                    action = agent.select_action(state, goal, False, False)
                    observation, reward, done, info = env.step(action.detach().cpu().squeeze(0).numpy())

                    next_state, g_next_state = observe(observation, l2norm)

                    next_goal = agent.h(g_state, goal, g_next_state)

                    reward_total += reward

                    if max_episode_length and episode_steps >= max_episode_length - 1:
                        done = True
                    episode_steps += 1

                    # Result is not used; the call is kept as in the original loop.
                    agent.goal_reached(g_state, goal, g_next_state)

                    # Hold each goal for c steps.
                    goal_done = (episode_steps % c) == 0

                    state, g_state, goal = next_state, g_next_state, next_goal

        episode_durations[np.round(l2norm, 2)].append(reward_total / num_episodes)

In [13]:
# Observation bounds of the environment, used to scale and clip perturbations.
state_max = torch.from_numpy(env.observation_space.high).to(device).float()
state_min = torch.from_numpy(env.observation_space.low).to(device).float()
state_mid = (state_max + state_min) / 2.
state_range = (state_max - state_min)
def fgsm_attack(data, eps, data_grad):
    """Take one signed-gradient step on ``data`` and clip it to the state bounds.

    The step moves against the gradient: each component changes by
    ``eps * state_range`` in the direction that lowers the loss.

    Args:
        data: tensor to perturb.
        eps: step size as a fraction of ``state_range``.
        data_grad: gradient of the loss with respect to ``data``.

    Returns:
        The perturbed tensor, limited to ``[state_min, state_max]``.
    """
    step = eps * data_grad.sign() * state_range
    shifted = data - step
    return torch.max(torch.min(shifted, state_max), state_min)

def fgsm_action(state, goal, agent, eps, target, targeted):
    """Craft an FGSM-perturbed copy of ``state`` against the agent's controller.

    Targeted mode lowers the MSE between the controller's action and
    ``target``. Untargeted mode lowers the critic's Q1 value of the
    controller's own action. Only the state is perturbed; the goal is held fixed.

    Args:
        state: clean state tensor of shape (1, nb_states).
        goal: goal tensor concatenated to the state for the controller.
        agent: agent exposing ``controller.actor`` and ``controller.critic``.
        eps: step size passed to ``fgsm_attack``.
        target: action the targeted attack steers towards (ignored otherwise).
        targeted: choose the targeted or the untargeted loss.

    Returns:
        The perturbed state, detached from the autograd graph.
    """
    state = state.clone().detach().requires_grad_(True)
    goal = goal.clone().detach()

    sg_t = torch.cat([state, goal], 1).float()
    action = agent.controller.actor(sg_t)

    if targeted:
        loss = F.mse_loss(torch.clamp(action, -1., 1.), target)
    else:
        loss = agent.controller.critic.Q1(sg_t, action).mean()

    # Differentiate with respect to the input only. Unlike loss.backward(),
    # this leaves the actor's and critic's parameter gradients untouched, so
    # nothing accumulates across calls and no zero_grad() is needed.
    (data_grad,) = torch.autograd.grad(loss, state)

    # Detach so callers do not keep extending this graph with later computations.
    return fgsm_attack(state, eps, data_grad).float().detach()

def apply_fgsm(agent, episode_durations, targeted):
    """Evaluate ``agent`` while every observation is perturbed by FGSM.

    For each step size in ``np.arange(0.0, 0.201, 0.02)`` the agent plays 100
    episodes in which both the goal selector and the controller see only the
    perturbed state. The mean episode reward is appended to
    ``episode_durations[eps]``.

    Args:
        agent: trained agent; it is put in eval mode and its controller's
            ``is_training`` flag is set to False.
        episode_durations: dict mapping each step size to a list of results.
        targeted: use the targeted attack (towards the all-zero action)
            instead of the untargeted one.
    """
    # All-zero target action, sized from the agent rather than hard-coded.
    target_action = torch.zeros((1, agent.nb_actions), device=device, dtype=torch.float)

    agent.eval()
    agent.controller.eval()

    max_episode_length = 500
    agent.controller.is_training = False

    num_episodes = 100

    c = 10  # number of steps each goal is held

    for eps in np.arange(0.0, 0.201, 0.02):

        overall_reward = 0
        for _ in range(num_episodes):
            observation = env.reset()

            og_state = torch.from_numpy(observation).float().unsqueeze(0).to(device)

            # The first perturbation needs a goal, so pick one from the clean state.
            goal = agent.select_goal(og_state, False, False)
            state = fgsm_action(og_state, goal, agent, eps, target_action, targeted)

            episode_steps = 0
            done = False
            while not done:
                goal = agent.select_goal(state, False, False)

                goal_done = False
                while not done and not goal_done:
                    action = agent.select_action(state, goal, False, False)

                    observation, reward, done, info = env.step(action.detach().cpu().squeeze(0).numpy())

                    next_og_state = torch.from_numpy(observation).float().unsqueeze(0).to(device)
                    # Provisional goal from the clean next state, used only to craft the attack.
                    goal_temp = agent.h(state, goal, next_og_state)
                    next_state = fgsm_action(next_og_state, goal_temp, agent, eps, target_action, targeted)

                    next_goal = agent.h(state, goal, next_state)

                    overall_reward += reward

                    if max_episode_length and episode_steps >= max_episode_length - 1:
                        done = True
                    episode_steps += 1

                    if (episode_steps % c) == 0:
                        goal_done = True

                    state = next_state
                    goal = next_goal

        episode_durations[eps].append(overall_reward / num_episodes)

In [14]:
# Result tables for the uniform-noise evaluation: one dict per attack setting,
# keyed by the rounded noise level, each value an (initially empty) list.
noise_hrl = {name: {} for name in ('both', 'action_only', 'goal_only', 'both_same')}
for l2norm in np.arange(0,0.51,0.05):
    for i in noise_hrl.values():
        i[np.round(l2norm, 2)] = []

# Result tables for the FGSM evaluation, keyed by the step size eps.
targeted = {x: {} for x in ('goal', 'action')}
untargeted = {x: {} for x in ('goal', 'action')}
for eps in np.arange(0.0, 0.201, 0.02):
    for x in targeted:
        targeted[x][eps] = []
        untargeted[x][eps] = []

# Observation and action vector sizes of the wrapped environment; these are the
# two arguments HIRO's constructor takes (see the commented-out driver below).
n_observations = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]

#i = 2
#while i < 3:
#    #agent = train_model()
#    agent = HIRO(n_observations, n_actions).to(device)
#    load_model(agent, f"hiro_{i}")

#    if agent is not None:
#        # goal_attack, action_attack, same_noise
#        eval_model(agent, noise_hrl['both_same'], True, True, True)
#        eval_model(agent, noise_hrl['both'], True, True, False)
#        eval_model(agent, noise_hrl['action_only'], False, True, False)
#        eval_model(agent, noise_hrl['goal_only'], True, False, False)
#        print(f"{i} noise_hrl: {noise_hrl}")
#        i += 1

#print("----")
#print(f"noise_hrl: {noise_hrl}")

# Hard-coded FGSM results that REPLACE the empty `untargeted` / `targeted` tables built
# earlier in this cell, so the loop that follows can resume a partially completed sweep.
# Every untargeted 'action' list already holds 6 scores and every targeted 'action' list
# holds 5; the 'goal' lists are empty.
# NOTE(review): presumably these are the scores of agents hiro_0..hiro_5 (untargeted) and
# hiro_0..hiro_4 (targeted) pasted from earlier cell output — confirm before reusing.
untargeted = {'goal': {0.0: [], 0.02: [], 0.04: [], 0.06: [], 0.08: [], 0.1: [], 0.12: [], 0.14: [], 0.16: [], 0.18: [], 0.2: []}, 'action': {0.0: [95.18899999999152, 89.67999999998972, 75.64099999997917, 83.00299999998151, 80.18599999997733, 93.33199999998824], 0.02: [86.62999999998618, 81.11999999998467, -23.037999999984013, 22.224000000013348, 72.01799999998593, 34.57400000001203], 0.04: [-0.5440000000040107, 34.67400000001282, -45.76999999998847, 28.868000000020658, 13.01500000000504, -35.68199999997282], 0.06: [-0.2730000000045658, -37.04599999996998, -44.488999999982894, -42.93199999997945, -50.00000000000659, -50.00000000000659], 0.08: [-24.462999999975608, -50.00000000000659, -42.8689999999761, -47.213000000000086, -50.00000000000659, -50.00000000000659], 0.1: [-40.32199999997019, -50.00000000000659, -36.154999999971665, -50.00000000000659, -50.00000000000659, -50.00000000000659], 0.12: [-11.535999999993159, -50.00000000000659, -44.45599999998733, -48.56800000000047, -50.00000000000659, -48.541000000000366], 0.14: [-26.428999999979414, -50.00000000000659, -38.88099999997294, -50.00000000000659, -50.00000000000659, -40.08599999996813], 0.16: [-12.006999999999305, -50.00000000000659, -41.53699999997295, -50.00000000000659, -50.00000000000659, -41.614999999973364], 0.18: [-12.700999999982523, -50.00000000000659, -45.79099999998854, -47.80699999999771, -50.00000000000659, -44.75899999998388], 0.2: [-3.957000000004028, -47.61999999999725, -36.6349999999719, -47.80900000000134, -50.00000000000659, -46.140999999989816]}}
targeted = {'goal': {0.0: [], 0.02: [], 0.04: [], 0.06: [], 0.08: [], 0.1: [], 0.12: [], 0.14: [], 0.16: [], 0.18: [], 0.2: []}, 'action': {0.0: [95.33599999999153, 88.06399999998811, 80.60999999998089, 83.96199999997744, 76.57599999997275], 0.02: [80.5379999999728, 74.82499999997725, 50.530999999998684, 76.39099999997752, 27.687000000005913], 0.04: [-1.5880000000029943, 36.98700000001531, 18.739000000014535, 42.937000000014834, 13.194000000007344], 0.06: [-24.78999999997478, 16.978000000021506, -15.409999999997668, 8.34600000000472, -27.826999999976724], 0.08: [-43.63299999997973, -25.104999999983782, -42.2119999999728, -33.23599999997118, -30.40699999997626], 0.1: [-38.40399999997239, -30.274999999971826, -39.186999999971114, -42.093999999976916, -23.436999999980834], 0.12: [-47.507999999996834, -30.80699999997619, -45.98399999999152, -44.54499999998434, -42.20499999997394], 0.14: [-41.70599999997346, -36.409999999969884, -42.16099999997553, -50.00000000000659, -43.20099999998036], 0.16: [-43.766999999982275, -36.17499999997299, -50.00000000000659, -50.00000000000659, -32.940999999974856], 0.18: [-50.00000000000659, -40.394999999970125, -50.00000000000659, -50.00000000000659, -27.72099999997675], 0.2: [-47.41699999999537, -47.23700000000381, -50.00000000000659, -48.70400000000096, -7.892000000002875]}}

# Run the FGSM sweep for the saved agents hiro_5 and hiro_6, appending to the
# `untargeted` / `targeted` tables. The counter only advances after an agent has been
# evaluated, which keeps the loop usable with the (disabled) train_model() path.
i = 5
while i < 7:
    # agent = train_model()
    agent = HIRO(n_observations, n_actions).to(device)
    load_model(agent, f"hiro_{i}")

    if agent is None:
        continue

    # The untargeted attack is skipped for agent 5.
    # NOTE(review): presumably its untargeted scores are already in the table — confirm.
    if i != 5:
        apply_fgsm(agent, untargeted['action'], False)
    print(f"{i} fgsm (ut): {untargeted}")

    apply_fgsm(agent, targeted['action'], True)
    print(f"{i} fgsm (t): {targeted}")
    i += 1

print("----", f"fgsm (ut): {untargeted}", f"fgsm (t): {targeted}", sep="\n")

5 fgsm (ut): {'goal': {0.0: [], 0.02: [], 0.04: [], 0.06: [], 0.08: [], 0.1: [], 0.12: [], 0.14: [], 0.16: [], 0.18: [], 0.2: []}, 'action': {0.0: [95.18899999999152, 89.67999999998972, 75.64099999997917, 83.00299999998151, 80.18599999997733, 93.33199999998824], 0.02: [86.62999999998618, 81.11999999998467, -23.037999999984013, 22.224000000013348, 72.01799999998593, 34.57400000001203], 0.04: [-0.5440000000040107, 34.67400000001282, -45.76999999998847, 28.868000000020658, 13.01500000000504, -35.68199999997282], 0.06: [-0.2730000000045658, -37.04599999996998, -44.488999999982894, -42.93199999997945, -50.00000000000659, -50.00000000000659], 0.08: [-24.462999999975608, -50.00000000000659, -42.8689999999761, -47.213000000000086, -50.00000000000659, -50.00000000000659], 0.1: [-40.32199999997019, -50.00000000000659, -36.154999999971665, -50.00000000000659, -50.00000000000659, -50.00000000000659], 0.12: [-11.535999999993159, -50.00000000000659, -44.45599999998733, -48.56800000000047, -50.000000

In [None]:
Solved after 1257 episodes!
0 noise_hrl: {'both': {0.0: [95.2909999999919], 0.05: [78.10399999997867], 0.1: [44.27800000000538], 0.15: [31.345000000021365], 0.2: [10.017999999994569], 0.25: [-5.561000000002012], 0.3: [-18.698999999982536], 0.35: [-25.83199999997743], 0.4: [-27.708999999979145], 0.45: [-22.693999999986094], 0.5: [-27.021999999978583]}, 'action_only': {0.0: [95.30199999999166], 0.05: [81.63899999998573], 0.1: [63.00299999999265], 0.15: [33.00200000000814], 0.2: [25.028000000005797], 0.25: [17.199000000011623], 0.3: [26.37000000002001], 0.35: [11.992000000000896], 0.4: [-0.4559999999992143], 0.45: [-13.065999999994723], 0.5: [-29.70299999997987]}, 'goal_only': {0.0: [95.33499999999157], 0.05: [90.04299999999445], 0.1: [73.14399999997853], 0.15: [54.07999999999349], 0.2: [25.11900000001559], 0.25: [12.64499999999871], 0.3: [8.581999999994709], 0.35: [-10.018000000003735], 0.4: [-24.129999999976963], 0.45: [-30.950999999975465], 0.5: [-35.315999999973094]}, 'both_same': {0.0: [93.81199999998995], 0.05: [76.42699999998078], 0.1: [57.40299999998928], 0.15: [18.770000000001545], 0.2: [2.80199999999555], 0.25: [-4.572000000007936], 0.3: [0.4539999999988095], 0.35: [-12.586999999999215], 0.4: [-16.563999999982933], 0.45: [-30.467999999971653], 0.5: [-18.905999999988236]}}
Solved after 1360 episodes!
1 noise_hrl: {'both': {0.0: [86.63399999998369], 0.05: [82.4339999999887], 0.1: [48.590999999998445], 0.15: [-6.330000000007147], 0.2: [-12.051000000001752], 0.25: [-18.417999999989856], 0.3: [-26.297999999979716], 0.35: [-22.90399999998143], 0.4: [-20.922999999981734], 0.45: [-29.68999999997741], 0.5: [-32.62399999997568]}, 'action_only': {0.0: [88.52299999998493], 0.05: [83.18199999998681], 0.1: [88.6749999999834], 0.15: [73.4799999999855], 0.2: [64.80199999998943], 0.25: [55.62199999998642], 0.3: [20.42600000001088], 0.35: [-11.299999999992702], 0.4: [-18.852999999979705], 0.45: [-19.789999999987227], 0.5: [-21.36899999997521]}, 'goal_only': {0.0: [91.29999999998626], 0.05: [81.58099999998245], 0.1: [43.439000000014985], 0.15: [20.00900000000222], 0.2: [-2.10600000000459], 0.25: [-4.129000000006505], 0.3: [-0.44200000000631307], 0.35: [-3.101000000000928], 0.4: [-6.752000000003262], 0.45: [-12.037000000005241], 0.5: [-12.719999999996539]}, 'both_same': {0.0: [88.69399999998515], 0.05: [84.79799999998917], 0.1: [58.726999999982375], 0.15: [29.536000000018795], 0.2: [7.198999999998173], 0.25: [-26.13499999998094], 0.3: [-16.663999999985393], 0.35: [-18.52199999997714], 0.4: [-30.104999999975707], 0.45: [-28.27699999997295], 0.5: [-29.499999999978172]}}
Solved after 2621 episodes!
2 noise_hrl: {'both': {0.0: [81.65099999998483], 0.05: [49.85999999999136], 0.1: [22.49100000001333], 0.15: [-13.738999999998386], 0.2: [-25.728999999977283], 0.25: [-27.732999999973877], 0.3: [-33.13999999997524], 0.35: [-35.65599999997148], 0.4: [-29.472999999974252], 0.45: [-32.170999999973176], 0.5: [-33.186999999969174]}, 'action_only': {0.0: [82.03999999998227], 0.05: [83.99899999998745], 0.1: [57.29799999998582], 0.15: [57.68099999998702], 0.2: [43.00400000001679], 0.25: [33.19900000001503], 0.3: [23.431000000017658], 0.35: [18.658000000008634], 0.4: [6.737999999997974], 0.45: [4.555999999996748], 0.5: [-5.530000000008207]}, 'goal_only': {0.0: [75.19699999998303], 0.05: [38.43900000001104], 0.1: [-3.0650000000038835], 0.15: [-10.748000000007853], 0.2: [-26.215999999983197], 0.25: [-39.665999999970445], 0.3: [-36.41399999997078], 0.35: [-39.26399999997114], 0.4: [-41.12899999997113], 0.45: [-36.7639999999707], 0.5: [-34.48399999997185]}, 'both_same': {0.0: [79.49899999997146], 0.05: [40.52100000001273], 0.1: [10.970999999997346], 0.15: [1.775000000000408], 0.2: [-29.695999999974507], 0.25: [-24.490999999982684], 0.3: [-27.617999999985596], 0.35: [-30.87099999997583], 0.4: [-21.266999999981557], 0.45: [-30.762999999976156], 0.5: [-35.21999999997106]}}
Solved after 2506 episodes!
Solved after 1324 episodes!
4 noise_hrl: {'both': {0.0: [82.86099999997796, 75.23399999997478], 0.05: [78.31099999998806, 62.16699999998617], 0.1: [30.617000000012446, 19.892000000011258], 0.15: [-1.2210000000053645, -22.586999999981728], 0.2: [-23.570999999977563, -31.298999999975678], 0.25: [-23.160999999976134, -36.672999999970976], 0.3: [-41.75299999997135, -35.48099999997301], 0.35: [-40.52799999997053, -33.084999999974045], 0.4: [-45.146999999987564, -37.34599999996906], 0.45: [-43.82799999998813, -40.477999999968], 0.5: [-44.001999999984534, -35.39999999996874]}, 'action_only': {0.0: [81.22799999998507, 72.79999999997713], 0.05: [89.05199999998824, 91.31099999998806], 0.1: [71.9259999999859, 72.30499999997622], 0.15: [54.288999999984114, 60.3739999999854], 0.2: [59.48699999998503, 42.19500000000787], 0.25: [56.025999999992834, 27.151000000020826], 0.3: [42.60500000001395, 15.609000000014621], 0.35: [36.99200000001656, 24.121000000015147], 0.4: [16.721999999999962, 23.24300000002076], 0.45: [23.018000000020237, 9.564999999992487], 0.5: [14.391000000004976, 20.777000000005632]}, 'goal_only': {0.0: [82.85999999997725, 78.71599999998139], 0.05: [79.83699999997657, 51.01599999999151], 0.1: [32.23300000001611, 16.69900000001225], 0.15: [-11.505000000000953, -34.894999999971894], 0.2: [-34.40299999997073, -29.682999999978982], 0.25: [-45.830999999989714, -42.20999999997546], 0.3: [-44.416999999986274, -34.1969999999761], 0.35: [-46.510999999992016, -37.988999999970495], 0.4: [-50.00000000000659, -36.57599999997542], 0.45: [-50.00000000000659, -43.21899999997737], 0.5: [-50.00000000000659, -34.26899999997667]}, 'both_same': {0.0: [80.10399999998602, 70.68999999998108], 0.05: [74.40699999999315, 46.68199999999807], 0.1: [36.307000000017425, 18.212000000001336], 0.15: [9.20200000000499, -16.53499999998371], 0.2: [-9.064000000004832, -21.708999999982115], 0.25: [-28.87299999997671, -31.757999999975638], 0.3: [-39.68999999997099, -35.03499999997118], 0.35: [-35.945999999973395, 
-26.709999999977757], 0.4: [-36.7849999999695, -26.615999999981142], 0.45: [-37.229999999969, -27.818999999977986], 0.5: [-41.885999999978864, -28.858999999979964]}}
Solved after 2153 episodes!
Solved after 1102 episodes!
6 noise_hrl: {'both': {0.0: [93.59099999998845, 95.82699999999268], 0.05: [51.135999999978964, 25.124000000013538], 0.1: [-3.143000000003118, 5.1549999999992835], 0.15: [-24.03099999997523, -14.66699999999627], 0.2: [-19.47699999998875, -3.477000000002735], 0.25: [-27.6629999999764, -5.881000000002434], 0.3: [-20.65099999997652, -14.382999999983559], 0.35: [-39.82999999996838, -15.63699999999387], 0.4: [-42.39499999997662, -19.943999999991235], 0.45: [-35.70699999997134, -30.742999999973076], 0.5: [-31.962999999976546, -32.9909999999714]}, 'action_only': {0.0: [93.60699999998842, 95.48499999999072], 0.05: [85.97899999998273, 50.60600000000608], 0.1: [74.31699999998244, 27.577000000017225], 0.15: [56.92299999998438, 12.710000000005227], 0.2: [38.09700000001057, 24.54500000001763], 0.25: [20.017000000006522, 42.660000000012175], 0.3: [3.1530000000005725, 40.35600000001543], 0.35: [8.061999999995978, 18.135000000001487], 0.4: [2.0989999999998266, -3.8560000000063726], 0.45: [2.586999999999339, -3.2130000000009757], 0.5: [-10.934000000000491, -24.534999999976478]}, 'goal_only': {0.0: [93.91699999999013, 95.6009999999929], 0.05: [56.24699999999941, 41.92300000000236], 0.1: [18.88300000000399, -2.1600000000025203], 0.15: [-5.391000000005116, -4.570000000006349], 0.2: [-15.927999999981886, -13.373999999999848], 0.25: [-33.416999999974216, -14.340999999990078], 0.3: [-41.385999999971155, -18.89999999999105], 0.35: [-33.943999999975084, -21.92799999997983], 0.4: [-37.42599999997123, -26.471999999983495], 0.45: [-35.70099999996956, -18.90099999998466], 0.5: [-36.912999999972456, -23.31499999998268]}, 'both_same': {0.0: [93.33499999998924, 95.807999999993], 0.05: [41.81300000000162, 5.360999999998208], 0.1: [6.667999999997147, -1.8880000000038668], 0.15: [-13.700999999994508, -10.656999999996817], 0.2: [-33.60599999997231, 0.9379999999962118], 0.25: [-29.283999999974498, -19.04299999997892], 0.3: [-28.807999999981057, -13.233000000000207], 0.35: [-32.12299999997394, 
-16.607999999986767], 0.4: [-38.43399999997072, -30.036999999973823], 0.45: [-28.691999999977444, -32.42299999997433], 0.5: [-25.25299999997437, -32.53799999997977]}}

In [None]:
def eval_scale(agent, episode_durations):
    """Measure the agent's mean return on PointPush environments of different scales.

    For each scale in ``np.arange(1.0, 7.01, 0.5)`` a fresh
    ``NormalizedEnv(PointPushEnv(scale))`` is created and 100 episodes are rolled out,
    each cut off after 500 steps. The summed reward of those episodes divided by 100 is
    appended to ``episode_durations[np.round(scale, 2)]``.

    Args:
        agent: object providing ``eval()``, ``controller`` (with ``eval()``),
            ``select_goal``, ``select_action``, ``h`` and ``goal_reached``.
        episode_durations: dict with one list per rounded scale; mutated in place.

    Returns:
        None.
    """
    def as_batch(obs):
        # numpy observation -> float tensor with a leading batch axis, moved to `device`
        return torch.from_numpy(obs).float().unsqueeze(0).to(device)

    agent.eval()
    agent.controller.eval()
    agent.controller.is_training = False

    max_episode_length = 500
    num_episodes = 100
    c = 10  # a new goal is selected every c environment steps

    for scale in np.arange(1.0, 7.01, 0.5):
        env = NormalizedEnv(PointPushEnv(scale))

        total_reward = 0
        for _ in range(num_episodes):
            first_obs = env.reset()
            state = as_batch(first_obs)
            g_state = as_batch(first_obs)

            steps_taken = 0
            done = False
            while not done:
                # select a goal
                goal = agent.select_goal(g_state, False, False)

                goal_done = False
                while not (done or goal_done):
                    action = agent.select_action(state, goal, False, False)
                    obs, reward, done, _info = env.step(action.detach().cpu().squeeze(0).numpy())

                    next_state = as_batch(obs)
                    g_next_state = as_batch(obs)

                    # goal transition between two consecutive states
                    next_goal = agent.h(g_state, goal, g_next_state)

                    total_reward += reward

                    # hard cap on the episode length
                    steps_taken += 1
                    if max_episode_length and steps_taken >= max_episode_length:
                        done = True

                    # The result is not used; the call is kept so the agent sees the
                    # same sequence of calls as before.
                    agent.goal_reached(g_state, goal, g_next_state)

                    goal_done = (steps_taken % c) == 0

                    state, g_state, goal = next_state, g_next_state, next_goal

        episode_durations[np.round(scale, 2)].append(total_reward / num_episodes)

In [None]:
# One list of mean returns per environment scale, keyed by the rounded scale.
episodes = {np.round(scale, 2): [] for scale in np.arange(1.0, 7.01, 0.5)}

n_observations, n_actions = env.observation_space.shape[0], env.action_space.shape[0]

# Evaluate the saved agents hiro_0 .. hiro_6; the counter only advances once an agent
# has been evaluated, so the (disabled) train_model() path could retry on failure.
i = 0
while i < 7:
    # agent = train_model()
    agent = HIRO(n_observations, n_actions).to(device)
    load_model(agent, f"hiro_{i}")

    if agent is None:
        continue

    eval_scale(agent, episodes)
    print(f"{i} scale: {episodes}")
    i += 1

print("----")
print(f"scale: {episodes}")

0 scale: {1.0: [-5.231000000001368], 1.5: [24.51400000001438], 2.0: [80.13999999998688], 2.5: [61.24399999997684], 3.0: [86.9359999999865], 3.5: [93.56499999998947], 4.0: [93.77999999999241], 4.5: [94.09699999998963], 5.0: [93.19999999998832], 5.5: [93.25999999998812], 6.0: [92.19599999998658], 6.5: [91.98799999998619], 7.0: [91.98599999998616]}
1 scale: {1.0: [-5.231000000001368, -41.4489999999788], 1.5: [24.51400000001438, -15.471999999991793], 2.0: [80.13999999998688, 50.61200000000366], 2.5: [61.24399999997684, -25.99699999997498], 3.0: [86.9359999999865, 15.413000000012413], 3.5: [93.56499999998947, 40.74700000001278], 4.0: [93.77999999999241, 91.10499999998774], 4.5: [94.09699999998963, 91.35999999998494], 5.0: [93.19999999998832, 90.33699999998349], 5.5: [93.25999999998812, 92.07499999998652], 6.0: [92.19599999998658, 91.15699999998552], 6.5: [91.98799999998619, 82.28099999997453], 7.0: [91.98599999998616, 89.50099999998233]}
2 scale: {1.0: [-5.231000000001368, -41.4489999999788

In [None]:
def eval_starting_position(agent, episode_durations):
    """Measure the agent's mean return when the start state is randomly displaced.

    For each ``extra_range`` in ``np.arange(0.0, 0.401, 0.05)`` the global ``env`` is
    reset 100 times. After every reset the underlying state is overwritten with
    ``env.starting_point`` plus a uniform offset in ``+/-(0.1 + extra_range)`` per
    component; component 2 is then shifted by pi/2 and wrapped into ``[0, 2*pi)``.
    Episodes are cut off after 500 steps. The summed reward divided by 100 is appended
    to ``episode_durations[np.round(extra_range, 3)]``.

    Args:
        agent: object providing ``eval()``, ``controller`` (with ``eval()``),
            ``select_goal``, ``select_action``, ``h`` and ``goal_reached``.
        episode_durations: dict with one list per rounded ``extra_range``; mutated in
            place.

    Returns:
        None.
    """
    def as_batch(obs):
        # numpy observation -> float tensor with a leading batch axis, moved to `device`
        return torch.from_numpy(obs).float().unsqueeze(0).to(device)

    agent.eval()
    agent.controller.eval()
    agent.controller.is_training = False

    max_episode_length = 500
    num_episodes = 100
    c = 10  # a new goal is selected every c environment steps

    for extra_range in np.arange(0.0, 0.401, 0.05):
        total_reward = 0
        for _ in range(num_episodes):
            env.reset()

            # Overwrite the freshly reset state with a randomly displaced start.
            offset = np.random.uniform(-0.1 - extra_range, 0.1 + extra_range, env.starting_point.shape)
            env.unwrapped.state = np.array(env.starting_point + offset, dtype=np.float32)
            env.unwrapped.state[2] += math.pi / 2.  # start facing up
            env.unwrapped.state[2] = env.state[2] % (2 * math.pi)
            start_obs = env.normalised_state()

            state = as_batch(start_obs)
            g_state = as_batch(start_obs)

            steps_taken = 0
            done = False
            while not done:
                # select a goal
                goal = agent.select_goal(g_state, False, False)

                goal_done = False
                while not (done or goal_done):
                    action = agent.select_action(state, goal, False, False)
                    obs, reward, done, _info = env.step(action.detach().cpu().squeeze(0).numpy())

                    next_state = as_batch(obs)
                    g_next_state = as_batch(obs)

                    # goal transition between two consecutive states
                    next_goal = agent.h(g_state, goal, g_next_state)

                    total_reward += reward

                    # hard cap on the episode length
                    steps_taken += 1
                    if max_episode_length and steps_taken >= max_episode_length:
                        done = True

                    # The result is not used; the call is kept so the agent sees the
                    # same sequence of calls as before.
                    agent.goal_reached(g_state, goal, g_next_state)

                    goal_done = (steps_taken % c) == 0

                    state, g_state, goal = next_state, g_next_state, next_goal

        episode_durations[np.round(extra_range, 3)].append(total_reward / num_episodes)

In [None]:
# One list of mean returns per start-position spread, keyed by the rounded spread.
episodes = {np.round(extra_range, 3): [] for extra_range in np.arange(0.0, 0.401, 0.05)}

n_observations, n_actions = env.observation_space.shape[0], env.action_space.shape[0]

# Evaluate the saved agents hiro_0 .. hiro_6; the counter only advances once an agent
# has been evaluated, so the (disabled) train_model() path could retry on failure.
i = 0
while i < 7:
    # agent = train_model()
    agent = HIRO(n_observations, n_actions).to(device)
    load_model(agent, f"hiro_{i}")

    if agent is None:
        continue

    eval_starting_position(agent, episodes)
    print(f"{i} range: {episodes}")
    i += 1

print("----")
print(f"range: {episodes}")

0 range: {0.0: [95.22699999999162], 0.05: [93.84499999999167], 0.1: [89.4839999999894], 0.15: [89.49099999998742], 0.2: [72.07999999998204], 0.25: [74.93799999999125], 0.3: [45.68000000000614], 0.35: [41.407000000009646], 0.4: [24.016000000002734]}
1 range: {0.0: [95.22699999999162, 88.05799999998773], 0.05: [93.84499999999167, 74.29299999997713], 0.1: [89.4839999999894, 82.50699999998378], 0.15: [89.49099999998742, 79.86399999998241], 0.2: [72.07999999998204, 78.475999999983], 0.25: [74.93799999999125, 72.68499999999194], 0.3: [45.68000000000614, 75.27999999997718], 0.35: [41.407000000009646, 64.39199999999022], 0.4: [24.016000000002734, 55.416999999982934]}
2 range: {0.0: [95.22699999999162, 88.05799999998773, 88.1219999999847], 0.05: [93.84499999999167, 74.29299999997713, 88.43099999998874], 0.1: [89.4839999999894, 82.50699999998378, 70.98699999998067], 0.15: [89.49099999998742, 79.86399999998241, 75.2909999999852], 0.2: [72.07999999998204, 78.475999999983, 75.63799999997518], 0.25:

In [None]:
# Observation-space bounds as float tensors on `device`. save_trajectories scales its
# injected observation noise by `state_range` and clips noisy observations to
# [state_min, state_max].
state_max = torch.from_numpy(env.observation_space.high).float().to(device)
state_min = torch.from_numpy(env.observation_space.low).float().to(device)
state_range = state_max - state_min
state_mid = 0.5 * (state_max + state_min)
def save_trajectories(agent, episode_durations, dirty, num_episodes=10, l2norm=0.3):
    """Roll out ``agent`` in the global ``env`` and record what each level saw and did.

    A new list is appended to ``episode_durations`` and filled with one dict per
    episode::

        {"overall_reward": <summed reward>,
         "manager": [(step, clean_state, goal), ...],            # one per goal selection
         "worker":  [(step, concat(clean_state, goal), action), ...]}  # one per env step

    The recorded states are always the clean observations. When ``dirty`` is true the
    tensors actually fed to the agent are corrupted instead: independent uniform noise
    in ``[-l2norm, l2norm]`` scaled by ``state_range`` is added separately to the
    goal-selection input and to the action-selection input, and the result is clipped
    to ``[state_min, state_max]``.

    Args:
        agent: object providing ``eval()``, ``controller`` (with ``eval()``),
            ``select_goal``, ``select_action``, ``h`` and ``goal_reached``. A
            ``meta_controller`` attribute is switched to evaluation mode too when it
            exists, but it is not required.
        episode_durations: list that receives the new list of episode dicts; mutated
            in place.
        dirty: whether the agent's observations are corrupted with noise.
        num_episodes: number of episodes to record (default 10).
        l2norm: half-width of the uniform noise, as a fraction of ``state_range``
            (default 0.3). Despite the name the noise is drawn per component; it is
            not a bound on the L2 norm.

    Returns:
        None.
    """
    def to_batch(observation):
        # numpy observation -> float tensor with a leading batch axis, moved to `device`
        return torch.from_numpy(observation).float().unsqueeze(0).to(device)

    def observe(observation):
        # Tensor handed to the agent: the clean observation, or a noisy clipped copy.
        clean = to_batch(observation)
        if not dirty:
            return clean
        noise = torch.FloatTensor(clean.shape).uniform_(-l2norm, l2norm).to(device)
        noisy = clean + state_range * noise
        return torch.max(torch.min(noisy, state_max), state_min).float()

    agent.eval()
    agent.controller.eval()
    agent.controller.is_training = False
    # Only agent and agent.controller are used by the other evaluation helpers, so a
    # separate meta-controller module is treated as optional here.
    meta_controller = getattr(agent, "meta_controller", None)
    if meta_controller is not None:
        meta_controller.eval()
        meta_controller.is_training = False

    max_episode_length = 500
    c = 10  # a new goal is selected every c environment steps

    episode_durations.append([])

    for _ in range(num_episodes):
        path = {"overall_reward": 0, "manager": [], "worker": []}

        observation = env.reset()
        true_state = to_batch(observation)
        # Noise for the goal-selection input is drawn before the action-selection input.
        g_state = observe(observation)
        state = observe(observation)

        episode_steps = 0
        overall_reward = 0
        done = False
        while not done:
            # select a goal
            goal = agent.select_goal(g_state, False, False)
            path["manager"].append((episode_steps, true_state.detach().cpu().squeeze(0).numpy(), goal.detach().cpu().squeeze(0).numpy()))

            goal_done = False
            while not done and not goal_done:
                action = agent.select_action(state, goal, False, False)
                path["worker"].append((episode_steps, torch.cat([true_state, goal], 1).detach().cpu().squeeze(0).numpy(), action.detach().cpu().squeeze(0).numpy()))
                observation, reward, done, info = env.step(action.detach().cpu().squeeze(0).numpy())

                true_state = to_batch(observation)
                g_next_state = observe(observation)
                next_state = observe(observation)

                # goal transition between two consecutive (possibly noisy) states
                next_goal = agent.h(g_state, goal, g_next_state)

                overall_reward += reward

                # hard cap on the episode length
                if max_episode_length and episode_steps >= max_episode_length - 1:
                    done = True
                episode_steps += 1

                # The result is not used; the call is kept in case it has side effects.
                agent.goal_reached(g_state, goal, g_next_state)

                if (episode_steps % c) == 0:
                    goal_done = True

                state = next_state
                g_state = g_next_state
                goal = next_goal

        path["overall_reward"] = overall_reward
        episode_durations[-1].append(path)

In [None]:
# Record noisy-observation trajectories for the saved agents hiro_0 .. hiro_5.
# save_trajectories appends one list of episode dicts per agent to `episodes`.
episodes = []

n_observations = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]

i = 0
while i < 6:
    #agent = train_model()
    agent = HIRO(n_observations, n_actions).to(device)
    load_model(agent, f"hiro_{i}")

    if agent is not None:
        # third argument: dirty=True, i.e. the agent is fed noise-corrupted observations
        save_trajectories(agent, episodes, True)
        #print(f"{i} paths: {episodes}")
        i += 1

print("----")
#print(f"paths: {episodes}")

# Drop the runs of the agents at positions 1, 5 and 11 — the positions the former
# pop(1), pop(5 - 1), pop(11 - 2) chain addressed. Filtering by original index avoids
# the shifting offsets and does not raise IndexError when fewer than 12 agents were
# evaluated (with 6 agents the third pop was out of range, so nothing was ever saved).
# NOTE(review): confirm that 1, 5 and 11 are still the agents meant to be excluded.
excluded_agents = {1, 5, 11}
episodes = [paths for agent_idx, paths in enumerate(episodes) if agent_idx not in excluded_agents]

torch.save(episodes, "PointPush_dirty_eps.pt")

----
