In [1]:
from google.colab import drive
drive.mount('/content/drive/')

!cp "/content/drive/My Drive/Dissertation/envs/point_fall.py" .

Mounted at /content/drive/


In [2]:
# for inference, not continued training
def save_model(model, name):
    path = f"/content/drive/My Drive/Dissertation/saved_models/point_fall_hdqn/{name}" 

    torch.save({
      'meta_controller': model.meta_controller.state_dict(),
      'controller': {
          'critic': model.controller.critic.state_dict(),
          'actor': model.controller.actor.state_dict(),
      }
    }, path)

import copy
def load_model(model, name):
    path = f"/content/drive/My Drive/Dissertation/saved_models/point_fall_hdqn/{name}" 
    checkpoint = torch.load(path)

    model.meta_controller.load_state_dict(checkpoint['meta_controller'])
    model.meta_controller_target = copy.deepcopy(model.meta_controller)

    model.controller.critic.load_state_dict(checkpoint['controller']['critic'])
    model.controller.critic_target = copy.deepcopy(model.controller.critic)
    model.controller.actor.load_state_dict(checkpoint['controller']['actor'])
    model.controller.actor_target = copy.deepcopy(model.controller.actor)

    # model.eval() for evaluation instead
    model.eval()
    model.meta_controller.eval()
    model.controller.eval()

In [3]:
%matplotlib inline

import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

from IPython import display
plt.ion()

# if gpu is to be used
device = torch.device("cuda")

In [4]:
class NormalizedEnv(gym.ActionWrapper):
    """ Wrap action """

    def action(self, action):
        act_k = (self.action_space.high - self.action_space.low)/ 2.
        act_b = (self.action_space.high + self.action_space.low)/ 2.
        return act_k * action + act_b

    def reverse_action(self, action):
        act_k_inv = 2./(self.action_space.high - self.action_space.low)
        act_b = (self.action_space.high + self.action_space.low)/ 2.
        return act_k_inv * (action - act_b)

In [5]:
from point_fall import PointFallEnv 
env = NormalizedEnv(PointFallEnv(4))

***

In [6]:
# [reference] https://github.com/matthiasplappert/keras-rl/blob/master/rl/random.py

class RandomProcess(object):
    def reset_states(self):
        pass

class AnnealedGaussianProcess(RandomProcess):
    def __init__(self, mu, sigma, sigma_min, n_steps_annealing):
        self.mu = mu
        self.sigma = sigma
        self.n_steps = 0

        if sigma_min is not None:
            self.m = -float(sigma - sigma_min) / float(n_steps_annealing)
            self.c = sigma
            self.sigma_min = sigma_min
        else:
            self.m = 0.
            self.c = sigma
            self.sigma_min = sigma

    @property
    def current_sigma(self):
        sigma = max(self.sigma_min, self.m * float(self.n_steps) + self.c)
        return sigma


# Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab
class OrnsteinUhlenbeckProcess(AnnealedGaussianProcess):
    def __init__(self, theta, mu=0., sigma=1., dt=1e-2, x0=None, size=1, sigma_min=None, n_steps_annealing=1000):
        super(OrnsteinUhlenbeckProcess, self).__init__(mu=mu, sigma=sigma, sigma_min=sigma_min, n_steps_annealing=n_steps_annealing)
        self.theta = theta
        self.mu = mu
        self.dt = dt
        self.x0 = x0
        self.size = size
        self.reset_states()

    def sample(self):
        x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + self.current_sigma * np.sqrt(self.dt) * np.random.normal(size=self.size)
        self.x_prev = x
        self.n_steps += 1
        return x

    def reset_states(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros(self.size)

In [7]:
def soft_update(target, source, tau):
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(
            target_param.data * (1.0 - tau) + param.data * tau
        )

def hard_update(target, source):
    for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)

In [8]:
# (state, action) -> (next_state, reward, done)
transition = namedtuple('transition', ('state', 'action', 'next_state', 'reward', 'done'))

# replay memory D with capacity N
class ReplayMemory(object):
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    # implemented as a cyclical queue
    def store(self, *args):
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        
        self.memory[self.position] = transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)
  
# (state, action) -> (next_state, reward, done)
transition_meta = namedtuple('transition', ('state', 'action', 'next_state', 'reward', 'done', 'state_seq', 'action_seq'))

# replay memory D with capacity N
class ReplayMemoryMeta(object):
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    # implemented as a cyclical queue
    def store(self, *args):
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        
        self.memory[self.position] = transition_meta(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

***

In [9]:
DEPTH = 128

class Actor(nn.Module):
    def __init__(self, nb_states, nb_actions):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(nb_states, DEPTH)
        self.fc2 = nn.Linear(DEPTH, DEPTH)
        self.head = nn.Linear(DEPTH, nb_actions)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return torch.tanh(self.head(x))

class Critic(nn.Module):
    def __init__(self, nb_states, nb_actions):
        super(Critic, self).__init__()

        # Q1 architecture
        self.l1 = nn.Linear(nb_states + nb_actions, DEPTH)
        self.l2 = nn.Linear(DEPTH, DEPTH)
        self.l3 = nn.Linear(DEPTH, 1)

        # Q2 architecture
        self.l4 = nn.Linear(nb_states + nb_actions, DEPTH)
        self.l5 = nn.Linear(DEPTH, DEPTH)
        self.l6 = nn.Linear(DEPTH, 1)
    
    def forward(self, state, action):
        sa = torch.cat([state, action], 1).float()

        q1 = F.relu(self.l1(sa))
        q1 = F.relu(self.l2(q1))
        q1 = self.l3(q1)

        q2 = F.relu(self.l4(sa))
        q2 = F.relu(self.l5(q2))
        q2 = self.l6(q2)
        return q1, q2

    def Q1(self, state, action):
        sa = torch.cat([state, action], 1).float()

        q1 = F.relu(self.l1(sa))
        q1 = F.relu(self.l2(q1))
        q1 = self.l3(q1)
        return q1

In [10]:
BATCH_SIZE = 64
GAMMA = 0.99

# https://spinningup.openai.com/en/latest/algorithms/td3.html
class TD3(nn.Module):
    def __init__(self, nb_states, nb_actions, is_meta=False):
        super(TD3, self).__init__()
        self.nb_states = nb_states
        self.nb_actions= nb_actions
        
        self.actor = Actor(self.nb_states, self.nb_actions)
        self.actor_target = Actor(self.nb_states, self.nb_actions)
        self.actor_optimizer  = optim.Adam(self.actor.parameters(), lr=0.0001)

        self.critic = Critic(self.nb_states, self.nb_actions)
        self.critic_target = Critic(self.nb_states, self.nb_actions)
        self.critic_optimizer  = optim.Adam(self.critic.parameters(), lr=0.0001)

        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)
        
        self.is_meta = is_meta

        #Create replay buffer
        self.memory = ReplayMemory(100000) if not self.is_meta else ReplayMemoryMeta(100000)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=0.15, mu=0.0, sigma=0.2)

        # Hyper-parameters
        self.tau = 0.005
        self.depsilon = 1.0 / 20000
        self.policy_noise=0.2
        self.noise_clip=0.5
        self.policy_freq=2
        self.total_it = 0

        # 
        self.epsilon = 1.0
        self.is_training = True

    def update_policy(self, off_policy_correction=None):
        if len(self.memory) < BATCH_SIZE:
            return

        self.total_it += 1
        
        # in the form (state, action) -> (next_state, reward, done)
        transitions = self.memory.sample(BATCH_SIZE)

        if not self.is_meta:
            batch = transition(*zip(*transitions))
            action_batch = torch.cat(batch.action)
        else:
            batch = transition_meta(*zip(*transitions))

            action_batch = torch.cat(batch.action)
            state_seq_batch = torch.stack(batch.state_seq)
            action_seq_batch = torch.stack(batch.action_seq)

            action_batch = off_policy_correction(action_batch.cpu().numpy(), state_seq_batch.cpu().numpy(), action_seq_batch.cpu().numpy())
        
        state_batch = torch.cat(batch.state)
        next_state_batch = torch.cat(batch.next_state)
        reward_batch = torch.cat(batch.reward)
        done_mask = np.array(batch.done)
        not_done_mask = torch.from_numpy(1 - done_mask).float().to(device)

        # Target Policy Smoothing
        with torch.no_grad():
            # Select action according to policy and add clipped noise
            noise = (
                torch.randn_like(action_batch) * self.policy_noise
            ).clamp(-self.noise_clip, self.noise_clip).float()
            
            next_action = (
                self.actor_target(next_state_batch) + noise
            ).clamp(-1.0, 1.0).float()

            # Compute the target Q value
            # Clipped Double-Q Learning
            target_Q1, target_Q2 = self.critic_target(next_state_batch, next_action)
            target_Q = torch.min(target_Q1, target_Q2).squeeze(1)
            target_Q = (reward_batch + GAMMA * not_done_mask  * target_Q).float()
        
        # Critic update
        current_Q1, current_Q2 = self.critic(state_batch, action_batch)
      
        critic_loss = F.mse_loss(current_Q1, target_Q.unsqueeze(1)) + F.mse_loss(current_Q2, target_Q.unsqueeze(1))

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Delayed policy updates
        if self.total_it % self.policy_freq == 0:
            # Compute actor loss
            actor_loss = -self.critic.Q1(state_batch, self.actor(state_batch)).mean()
            
            # Optimize the actor 
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # print losses
            #if self.total_it % (50 * 50 if self.is_meta else 500 * 50) == 0:
            #    print(f"{self.is_meta} controller;\n\tcritic loss: {critic_loss.item()}\n\tactor loss: {actor_loss.item()}")

            # Target update
            soft_update(self.actor_target, self.actor, self.tau)
            soft_update(self.critic_target, self.critic, 2 * self.tau / 5)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def observe(self, s_t, a_t, s_t1, r_t, done):
        self.memory.store(s_t, a_t, s_t1, r_t, done)

    def random_action(self):
        return torch.tensor([np.random.uniform(-1.,1.,self.nb_actions)], device=device, dtype=torch.float)

    def select_action(self, s_t, warmup, decay_epsilon):
        if warmup:
            return self.random_action()

        with torch.no_grad():
            action = self.actor(s_t).squeeze(0)
            #action += torch.from_numpy(self.is_training * max(self.epsilon, 0) * self.random_process.sample()).to(device).float()
            action += torch.from_numpy(self.is_training * max(self.epsilon, 0) * np.random.uniform(-1.,1.,1)).to(device).float()
            action = torch.clamp(action, -1., 1.)

            action = action.unsqueeze(0)
            
            if decay_epsilon:
                self.epsilon -= self.depsilon
            
            return action

class DQN(nn.Module):
    def __init__(self, inputs, outputs, mem_len = 100000):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(inputs, DEPTH)
        self.fc2 = nn.Linear(DEPTH, DEPTH)
        self.head = nn.Linear(DEPTH, outputs)
        
        self.memory = ReplayMemory(mem_len)

        self.n_actions = outputs
        self.steps_done = 0
        
        self.EPS_START = 1.0
        self.EPS_END = 0.0
        self.EPS_DECAY = 10000 # in number of steps
        self.TAU = 0.001

        self.eps_printed = False

        self.policy_update = 2
        self.tot_updates = 0

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.head(x)
    
    def act(self, state, warmup, is_training):
        if warmup: 
            return torch.tensor([[random.randrange(self.n_actions)]], device=device, dtype=torch.long)

        if is_training:
            eps_threshold = self.EPS_END + (self.EPS_START - self.EPS_END) * (1. - min(1., self.steps_done / self.EPS_DECAY))
            self.steps_done += 1

            if eps_threshold <= 0.2 and not self.eps_printed:
                self.eps_printed = True
                print("EPS_THRESHOLD below 0.2")

            # With probability eps select a random action
            if random.random() < eps_threshold:
                return torch.tensor([[random.randrange(self.n_actions)]], device=device, dtype=torch.long)

        # otherwise select action = maxa Q∗(φ(st), a; θ)
        with torch.no_grad():
            return self(state).max(1)[1].view(1, 1)
    
    def experience_replay(self, optimizer, target):
        if len(self.memory) < BATCH_SIZE:
            return

        self.tot_updates += 1
        
        # in the form (state, action) -> (next_state, reward, done)
        transitions = self.memory.sample(BATCH_SIZE)
        batch = transition(*zip(*transitions))
        
        state_batch = torch.cat(batch.state)
        next_state_batch = torch.cat(batch.next_state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        done_mask = np.array(batch.done)
        not_done_mask = torch.from_numpy(1 - done_mask).float().to(device)
        
        current_Q_values = self(state_batch).gather(1, action_batch)
        # Compute next Q value based on which goal gives max Q values
        # Detach variable from the current graph since we don't want gradients for next Q to propagated
        next_max_q = target(next_state_batch).detach().max(1)[0]
        next_Q_values = not_done_mask * next_max_q
        # Compute the target of the current Q values
        target_Q_values = reward_batch + (GAMMA * next_Q_values)
        # Compute Bellman error (using Huber loss)
        loss = F.smooth_l1_loss(current_Q_values, target_Q_values.unsqueeze(1))
        loss_val = loss.item()

        # Optimize the model
        optimizer.zero_grad()
        loss.backward()
        for param in self.parameters():
            param.grad.data.clamp_(-1, 1)
        optimizer.step()

        if self.tot_updates % self.policy_update == 0:
            soft_update(target, self, self.TAU)

        return loss_val

In [11]:
class HIRO(nn.Module):
    def __init__(self, nb_states, nb_actions):
        super(HIRO, self).__init__()
        self.nb_states = nb_states
        self.nb_actions= nb_actions
        self.goal_dim = [0, 1]
        self.goal_dimen = 2
      
        learning_rate = 2.5e-4
        self.meta_controller = DQN(nb_states, 11 * 11).to(device)
        self.meta_controller_optimizer = optim.RMSprop(self.meta_controller.parameters(), lr=learning_rate)
        self.meta_controller_target = DQN(nb_states, 11 * 11, mem_len = 0).to(device)
        self.meta_controller_target.eval()

        self.max_goal_dist = torch.from_numpy(np.array([2., 3.])).to(device)
        self.goal_offset = torch.from_numpy(np.array([0.5, 1.5])).to(device)

        self.controller = TD3(nb_states + len(self.goal_dim), nb_actions).to(device)
        #self.controller.depsilon = 1.0 / 10000

    def teach_controller(self):
        self.controller.update_policy()
    def teach_meta_controller(self):
        self.meta_controller.experience_replay(self.meta_controller_optimizer, self.meta_controller_target)

    def h(self, state, goal, next_state):
        #return goal
        return state[:,self.goal_dim] + goal - next_state[:,self.goal_dim]
    #def intrinsic_reward(self, action, goal):
    #    return torch.tensor(1.0 if self.goal_reached(action, goal) else 0.0, device=device) 
    #def goal_reached(self, action, goal, threshold = 0.1):
    #    return torch.abs(action - goal) <= threshold
    def intrinsic_reward(self, reward, state, goal, next_state):
        #return torch.tensor(2 * reward if self.goal_reached(state, goal, next_state) else reward / 10, device=device) #reward / 2
        # just L2 norm
        return -torch.pow(sum(torch.pow(state.squeeze(0)[self.goal_dim] + goal.squeeze(0) - next_state.squeeze(0)[self.goal_dim], 2)), 0.5)
    def goal_reached(self, state, goal, next_state, threshold = 0.1):
        return torch.pow(sum(torch.pow(state.squeeze(0)[self.goal_dim] + goal.squeeze(0) - next_state.squeeze(0)[self.goal_dim], 2)), 0.5) <= threshold
        #return torch.pow(sum(goal.squeeze(0), 2), 0.5) <= threshold

    def observe_controller(self, s_t, a_t, s_t1, r_t, done):
        self.controller.memory.store(s_t, a_t, s_t1, r_t, done)
    def observe_meta_controller(self, s_t, a_t, s_t1, r_t, done, state_seq, action_seq):
        self.meta_controller.memory.store(s_t, a_t, s_t1, r_t, done)

    def action_to_2D(self, a):
        x = (a % 11)
        y = (a // 11)
        return -1.0 + 0.2 * torch.cat([x, y], axis=1).float()

    def convert_goal(self, a):
        return self.action_to_2D(a) * self.max_goal_dist + self.goal_offset

    def select_goal(self, s_t, warmup, is_training):
        return self.meta_controller.act(s_t, warmup, is_training)
      
    def select_action(self, s_t, g_t, warmup, decay_epsilon):
        sg_t = torch.cat([s_t, g_t], 1).float()
        return self.controller.select_action(sg_t, warmup, decay_epsilon)

In [None]:
import time
SAVE_OFFSET = 1
def train_model():
    global SAVE_OFFSET
    n_observations = env.observation_space.shape[0]
    n_actions = env.action_space.shape[0]
    
    agent = HIRO(n_observations, n_actions).to(device)
    
    max_episode_length = 500
    observation = None
    
    warmup = 200
    num_episodes = 10000 # M
    episode_durations = []
    goal_durations = []

    steps = 0
    c = 10

    for i_episode in range(num_episodes):
        observation = env.reset()
        state = torch.from_numpy(observation).float().unsqueeze(0).to(device)
        
        overall_reward = 0
        overall_intrinsic = 0
        episode_steps = 0
        done = False
        goals_done = 0

        while not done:
            goal_raw = agent.select_goal(state, i_episode <= warmup, True)
            goal = agent.convert_goal(goal_raw)
            #goal_durations.append((steps, goal[:,0]))

            state_seq, action_seq = None, None
            first_goal = goal
            goal_done = False
            total_extrinsic = 0

            while not done and not goal_done:
                joint_goal_state = torch.cat([state, goal], axis=1).float()

                # agent pick action ...
                action = agent.select_action(state, goal, i_episode <= warmup, True)
                
                # env response with next_observation, reward, terminate_info
                observation, reward, done, info = env.step(action.detach().cpu().squeeze(0).numpy())
                steps += 1
                next_state = torch.from_numpy(observation).float().unsqueeze(0).to(device)
                next_goal = agent.h(state, goal, next_state)
                joint_next_state = torch.cat([next_state, next_goal], axis=1).float()
                
                if max_episode_length and episode_steps >= max_episode_length -1:
                    done = True
                    
                extrinsic_reward = torch.tensor([reward], device=device)
                intrinsic_reward = agent.intrinsic_reward(reward, state, goal, next_state).unsqueeze(0)
                #intrinsic_reward = agent.intrinsic_reward(action, goal).unsqueeze(0)

                overall_reward += reward
                total_extrinsic += reward
                overall_intrinsic += intrinsic_reward

                goal_reached = agent.goal_reached(state, goal, next_state)
                #goal_done = agent.goal_reached(action, goal)

                # agent observe and update policy
                agent.observe_controller(joint_goal_state, action, joint_next_state, intrinsic_reward, done) #goal_done.item())

                if state_seq is None:
                    state_seq = state
                else:
                    state_seq = torch.cat([state_seq, state])
                if action_seq is None:
                    action_seq = action
                else:
                    action_seq = torch.cat([action_seq, action])

                episode_steps += 1

                if goal_reached:
                    goals_done += 1
                
                if (episode_steps % c) == 0:
                    agent.observe_meta_controller(state_seq[0].unsqueeze(0), goal_raw, next_state, torch.tensor([total_extrinsic], device=device), done,\
                                                  state_seq, action_seq)
                    goal_done = True

                    if i_episode > warmup:
                        agent.teach_meta_controller()

                state = next_state
                goal = next_goal
                
                if i_episode > warmup:
                    agent.teach_controller()

        goal_durations.append((i_episode, overall_intrinsic / episode_steps))
        episode_durations.append((i_episode, overall_reward))
        #plot_durations(episode_durations, goal_durations)

        _, dur = list(map(list, zip(*episode_durations)))
        if len(dur) > 100:
            if i_episode % 100 == 0:
                print(f"{i_episode}: {np.mean(dur[-100:])}")
            if i_episode >= 400 and i_episode % 100 == 0 and np.mean(dur[-100:]) <= -47.0:
                print(f"Unlucky after {i_episode} eps! Terminating...")
                return None
            if np.mean(dur[-100:]) >= 90:
                print(f"Solved after {i_episode} episodes!")
                save_model(agent, f"hiro_{SAVE_OFFSET}")
                SAVE_OFFSET += 1
                return agent

    return None # did not train

In [None]:
state_max = torch.from_numpy(env.observation_space.high).to(device).float()
state_min = torch.from_numpy(env.observation_space.low).to(device).float()
state_mid = (state_max + state_min) / 2.
state_range = (state_max - state_min)
def eval_model(agent, episode_durations, goal_attack, action_attack, same_noise):
    agent.eval()
    agent.meta_controller.eval()
    agent.controller.eval()

    max_episode_length = 500
    agent.meta_controller.is_training = False
    agent.controller.is_training = False

    num_episodes = 100

    c = 10

    for l2norm in np.arange(0.0,0.51,0.05):
        
        overall_reward = 0
        for i_episode in range(num_episodes):
            observation = env.reset()

            state = torch.from_numpy(observation).float().unsqueeze(0).to(device)
            g_state = torch.from_numpy(observation).float().unsqueeze(0).to(device)

            noise = torch.FloatTensor(state.shape).uniform_(-l2norm, l2norm).to(device)

            if goal_attack:
                g_state = g_state + state_range * noise
                g_state = torch.max(torch.min(g_state, state_max), state_min).float()
            if action_attack:
                if same_noise:
                    state = state + state_range * noise
                else:
                    state = state + state_range * torch.FloatTensor(state.shape).uniform_(-l2norm, l2norm).to(device)
                state = torch.max(torch.min(state, state_max), state_min).float()

            episode_steps = 0
            done = False
            while not done:
                # select a goal
                goal_raw = agent.select_goal(g_state, False, False)
                goal = agent.convert_goal(goal_raw)

                goal_done = False
                while not done and not goal_done:
                    action = agent.select_action(state, goal, False, False)
                    observation, reward, done, info = env.step(action.detach().cpu().squeeze(0).numpy())
                    
                    next_state = torch.from_numpy(observation).float().unsqueeze(0).to(device)
                    g_next_state = torch.from_numpy(observation).float().unsqueeze(0).to(device)

                    noise = torch.FloatTensor(state.shape).uniform_(-l2norm, l2norm).to(device)
                    if goal_attack:
                        g_next_state = g_next_state + state_range * noise
                        g_next_state = torch.max(torch.min(g_next_state, state_max), state_min).float()
                    if action_attack:
                        if same_noise:
                            next_state = next_state + state_range * noise
                        else:
                            next_state = next_state + state_range * torch.FloatTensor(next_state.shape).uniform_(-l2norm, l2norm).to(device)
                        next_state = torch.max(torch.min(next_state, state_max), state_min).float()

                    next_goal = agent.h(g_state, goal, g_next_state)
                                      
                    overall_reward += reward

                    if max_episode_length and episode_steps >= max_episode_length - 1:
                        done = True
                    episode_steps += 1

                    #goal_done = agent.goal_reached(action, goal)
                    goal_reached = agent.goal_reached(g_state, goal, g_next_state)

                    if (episode_steps % c) == 0:
                        goal_done = True

                    state = next_state
                    g_state = g_next_state
                    goal = next_goal

        episode_durations[np.round(l2norm, 2)].append(overall_reward / num_episodes)

In [12]:
state_max = torch.from_numpy(env.observation_space.high).to(device).float()
state_min = torch.from_numpy(env.observation_space.low).to(device).float()
state_mid = (state_max + state_min) / 2.
state_range = (state_max - state_min)
def fgsm_attack(data, eps, data_grad):
    sign_data_grad = data_grad.sign()

    perturbed_data = data + eps * sign_data_grad * state_range

    clipped_perturbed_data = torch.max(torch.min(perturbed_data, state_max), state_min)

    return clipped_perturbed_data

def fgsm_goal(g_state, agent, eps, target, targeted):
    g_state = g_state.clone().detach().requires_grad_(True)

    # initial forward pass
    goal = agent.meta_controller(g_state)

    if targeted:
        loss = F.mse_loss(goal, target)
    else:
        loss = F.mse_loss(goal, -goal) # doing -goal inverts the argmax 

    agent.meta_controller.zero_grad()

    # calc loss
    loss.backward()
    data_grad = g_state.grad.data

    # perturb state
    g_state_p = fgsm_attack(g_state, eps, data_grad).float()
    return g_state_p

def fgsm_action(state, goal, agent, eps, target, targeted):
    #state = torch.tensor(state, requires_grad=True)
    state = state.clone().detach().requires_grad_(True)
    goal = goal.clone().detach()

    sg_t = torch.cat([state, goal], 1).float()

    if targeted:
        # initial forward pass
        action = agent.controller.actor(sg_t)
        action = torch.clamp(action, -1., 1.)

        loss = F.mse_loss(action, target)
    else:
        loss = agent.controller.critic.Q1(sg_t, agent.controller.actor(sg_t)).mean()

    agent.controller.actor.zero_grad()

    # calc loss
    loss.backward()
    data_grad = state.grad.data
    # perturb state
    state_p = fgsm_attack(state, eps, data_grad).float()
    return state_p

def apply_fgsm(agent, episode_durations, goal_attack, targeted):
    TARGET_GOAL = torch.tensor([[0.0] * (11 * 11)], device=device, dtype=torch.float)
    TARGET_ACTION = torch.tensor([[0.0, 0.0]], device=device, dtype=torch.float)

    agent.eval()
    agent.meta_controller.eval()
    agent.controller.eval()

    max_episode_length = 500
    agent.meta_controller.is_training = False
    agent.controller.is_training = False

    num_episodes = 100

    c = 10

    for eps in np.arange(0.0, 0.201, 0.02):

        overall_reward = 0
        for i_episode in range(num_episodes):
            observation = env.reset()

            og_state = torch.from_numpy(observation).float().unsqueeze(0).to(device)

            if goal_attack: # target meta controller
                state = fgsm_goal(og_state, agent, eps, TARGET_GOAL, targeted)
            else: # target controller
                goal_raw = agent.select_goal(og_state, False, False)
                goal = agent.convert_goal(goal_raw)
                state = fgsm_action(og_state, goal, agent, eps, TARGET_ACTION, targeted)

            episode_steps = 0
            done = False
            while not done:
                goal_raw = agent.select_goal(state, False, False)
                goal = agent.convert_goal(goal_raw)

                goal_done = False
                while not done and not goal_done:
                    action = agent.select_action(state, goal, False, False)
                    
                    observation, reward, done, info = env.step(action.detach().cpu().squeeze(0).numpy())

                    next_og_state = torch.from_numpy(observation).float().unsqueeze(0).to(device)
                    if goal_attack: # target meta controller
                        next_state = fgsm_goal(next_og_state, agent, eps, TARGET_GOAL, targeted)
                    else: # target controller
                        goal_temp = agent.h(state, goal, next_og_state)
                        next_state = fgsm_action(next_og_state, goal_temp, agent, eps, TARGET_ACTION, targeted)

                    next_goal = agent.h(state, goal, next_state)
                                      
                    overall_reward += reward

                    if max_episode_length and episode_steps >= max_episode_length - 1:
                        done = True
                    episode_steps += 1

                    #goal_done = agent.goal_reached(action, goal)
                    goal_reached = agent.goal_reached(state, goal, next_state)

                    if (episode_steps % c) == 0:
                        goal_done = True

                    state = next_state
                    goal = next_goal

        episode_durations[eps].append(overall_reward / num_episodes)

In [13]:
noise_hrl = {'both': {}, 'action_only': {}, 'goal_only': {}, 'both_same': {}}
for l2norm in np.arange(0,0.51,0.05):
    for i in [noise_hrl['both'], noise_hrl['action_only'], noise_hrl['goal_only'], noise_hrl['both_same']]:
        i[np.round(l2norm, 2)] = []

targeted = {'goal': {}, 'action': {}}
untargeted = {'goal': {}, 'action': {}}
for eps in np.arange(0.0, 0.201, 0.02):
    for x in ['goal', 'action']:
        targeted[x][eps] = []
        untargeted[x][eps] = []

n_observations = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]

"""
i = 1
while i < 7:
    agent = train_model()
    #agent = HIRO(n_observations, n_actions).to(device)
    #load_model(agent, f"hiro_fall_{i}")

    if agent is not None:
        # goal_attack, action_attack, same_noise
        eval_model(agent, noise_hrl['both_same'], True, True, True)
        eval_model(agent, noise_hrl['both'], True, True, False)
        eval_model(agent, noise_hrl['action_only'], False, True, False)
        eval_model(agent, noise_hrl['goal_only'], True, False, False)
        print(f"{i} noise_hrl: {noise_hrl}")
        i += 1

print("----")
print(f"noise_hrl: {noise_hrl}")
"""

i = 0
while i < 6:
    if i == 3:
        i += 1
        continue
    #agent = train_model()
    agent = HIRO(n_observations, n_actions).to(device)
    load_model(agent, f"hiro_{i}")

    if agent is not None:
        apply_fgsm(agent, untargeted['action'], False, False)   
        apply_fgsm(agent, untargeted['goal'], True, False)  
        print(f"{i} fgsm (ut): {untargeted}")

        apply_fgsm(agent, targeted['goal'], True, True)
        apply_fgsm(agent, targeted['action'], False, True)   
        print(f"{i} fgsm (t): {targeted}")
        i += 1

print("----")
print(f"fgsm (ut): {untargeted}")
print(f"fgsm (t): {targeted}")

0 fgsm (ut): {'goal': {0.0: [94.33399999999672], 0.02: [79.13499999998595], 0.04: [3.34499999999847], 0.06: [-20.207999999985166], 0.08: [-24.75999999997292], 0.1: [-9.449999999988833], 0.12: [-1.5960000000028882], 0.14: [13.235000000000966], 0.16: [-12.7359999999976], 0.18: [-28.982999999974364], 0.2: [-22.20099999998438]}, 'action': {0.0: [97.37799999999562], 0.02: [75.39899999998914], 0.04: [90.94099999998551], 0.06: [96.4169999999944], 0.08: [-32.06299999996815], 0.1: [-41.24899999996838], 0.12: [-41.41499999997353], 0.14: [-37.329999999970184], 0.16: [-39.720999999970395], 0.18: [-23.177999999980006], 0.2: [-43.93699999998057]}}
0 fgsm (t): {'goal': {0.0: [97.76199999999584], 0.02: [85.28699999999209], 0.04: [6.8179999999969985], 0.06: [-26.58499999998092], 0.08: [-20.281999999983665], 0.1: [-13.813999999997026], 0.12: [-15.303999999994817], 0.14: [15.617000000010513], 0.16: [-5.809000000006777], 0.18: [-14.816999999993351], 0.2: [-28.074999999976303]}, 'action': {0.0: [93.1429999

In [None]:
Solved after 2564 episodes!
0 noise_hrl: {'both': {0.0: [94.51999999999411], 0.05: [77.73399999999089], 0.1: [62.49999999999135], 0.15: [64.97399999999148], 0.2: [48.31499999999807], 0.25: [12.52299999999834], 0.3: [-0.6109999999991379], 0.35: [-13.03800000000245], 0.4: [-19.677999999987172], 0.45: [-22.515999999976557], 0.5: [-23.790999999986585]}, 'action_only': {0.0: [96.03099999999391], 0.05: [77.5119999999846], 0.1: [73.84899999997789], 0.15: [54.39599999998985], 0.2: [44.23100000000369], 0.25: [32.61500000002243], 0.3: [2.1159999999987322], 0.35: [-14.945000000000137], 0.4: [-27.69299999997554], 0.45: [-25.040999999978858], 0.5: [-27.306999999979066]}, 'goal_only': {0.0: [97.48999999999606], 0.05: [91.30599999999794], 0.1: [81.75099999999196], 0.15: [71.89499999997756], 0.2: [81.89899999998096], 0.25: [76.84999999998966], 0.3: [77.25799999998907], 0.35: [73.86299999997289], 0.4: [74.97599999998678], 0.45: [72.03799999997823], 0.5: [69.99199999998356]}, 'both_same': {0.0: [95.85899999999512], 0.05: [69.76699999998402], 0.1: [55.15299999999188], 0.15: [48.93500000000657], 0.2: [31.406000000013943], 0.25: [13.855999999995424], 0.3: [15.421000000000479], 0.35: [7.552000000005685], 0.4: [-10.321000000004597], 0.45: [-19.07799999998664], 0.5: [-13.299999999990808]}}
6 noise_hrl: {'both': {0.0: [87.86099999998967, 94.68799999999457, -0.7399999999995339, 97.85799999999603, 96.187999999994, 40.563000000018434], 0.05: [18.52800000000326, 85.08399999998132, 3.9380000000035262, 85.43899999998497, 56.00399999998277, 75.66299999997736], 0.1: [8.384999999997586, 79.18999999998522, -4.991000000002036, 73.28199999998044, 52.31899999998814, 52.647999999994035], 0.15: [2.2269999999981445, 78.49499999998652, 10.846999999995345, 62.248999999976874, 52.656999999981544, 24.558000000016197], 0.2: [-8.079000000006069, 81.55699999998232, -7.719999999999308, 53.37499999999752, 47.77500000000174, 13.27000000000003], 0.25: [-29.808999999977168, 65.760999999974, -13.81799999999499, 28.01900000001038, 32.07700000002334, 11.439000000009614], 0.3: [-26.73699999997784, 45.11800000001087, -29.160999999976315, 12.801000000005542, 27.57900000001563, 8.894999999993901], 0.35: [-31.33599999997336, 10.566999999991904, -33.420999999973475, 1.9070000000003509, 20.259000000014645, 10.676000000005661], 0.4: [-35.671999999969586, 2.198999999995465, -32.81299999997248, 0.3749999999995863, 6.655999999996588, -4.912999999999815], 0.45: [-37.993999999971855, -0.8820000000009628, -36.8719999999737, -16.232999999983637, 13.763000000003421, -3.557000000003212], 0.5: [-41.08099999996774, -0.8850000000001097, -26.623999999973506, -18.87699999998581, -4.220000000005752, -14.017000000000246]}, 'action_only': {0.0: [92.34099999998766, 94.68999999999454, 1.9210000000001626, 97.86199999999606, 97.66399999999571, 37.643000000021466], 0.05: [10.818999999998814, 83.88699999998595, 4.689999999996502, 93.28199999999353, 57.04399999997311, 75.21099999998277], 0.1: [11.324000000007302, 79.42199999998529, 5.91499999999706, 90.39899999998755, 61.019999999989295, 43.70300000001355], 0.15: [7.674999999993611, 73.54199999998376, 4.018999999995917, 65.47399999997893, 48.516999999992976, 25.56000000001888], 0.2: [-2.761000000001606, 76.35499999997819, 17.087000000018623, 39.38200000001894, 52.45199999999775, 10.282999999994331], 0.25: [-17.065999999985884, 69.30099999998907, -3.8460000000022068, 5.648999999998402, 26.549000000013567, 3.9279999999953827], 0.3: [-14.445999999990159, 44.63800000001339, -14.41799999998965, -7.584000000005559, 5.600999999999053, -16.609999999998426], 0.35: [-34.75299999997355, 26.645000000014583, -23.104999999978126, -24.01899999997655, 5.521999999996938, -19.320999999989727], 0.4: [-43.2259999999774, 6.745999999993341, -37.65999999997381, -24.269999999979053, -21.434999999990758, -20.17099999998139], 0.45: [-40.476999999971184, 2.9919999999979634, -41.51099999997138, -22.635999999977116, -31.249999999979313, -29.529999999974816], 0.5: [-44.65999999998488, -8.47800000000474, -32.58699999997404, -28.638999999971407, -27.55699999997569, -32.73199999997101]}, 'goal_only': {0.0: [92.57099999998736, 97.63699999999564, 3.830999999998346, 97.85799999999607, 97.67399999999571, 37.471000000023814], 0.05: [89.137999999989, 96.1629999999963, 20.71400000000297, 89.68599999998553, 97.68699999999579, 66.9649999999773], 0.1: [86.4759999999864, 96.08899999999618, 50.04199999999733, 88.99399999999258, 97.61999999999561, 72.17499999997892], 0.15: [68.9559999999841, 94.50599999999659, 39.52300000001755, 93.9059999999913, 91.68299999999104, 67.66399999997721], 0.2: [51.01599999999133, 97.09799999999494, 53.64999999999424, 96.0809999999939, 85.59399999999269, 59.779999999978514], 0.25: [34.939000000014445, 89.48199999999594, 45.37100000000683, 96.95699999999391, 80.26899999998587, 58.64900000000308], 0.3: [40.4490000000168, 91.4759999999881, 40.285000000009305, 93.58799999998955, 66.79399999997987, 66.85199999999126], 0.35: [32.03000000001023, 96.23199999999356, 50.15900000000849, 93.81999999998936, 80.16999999998733, 79.13199999998966], 0.4: [49.59699999998716, 96.07899999999302, 43.88200000000962, 94.16599999999129, 82.3799999999879, 83.93599999998241], 0.45: [45.65700000000616, 95.39599999999102, 51.323999999994605, 93.47799999998813, 82.38399999998292, 79.6169999999791], 0.5: [45.86500000000504, 95.1909999999909, 48.0640000000039, 90.40899999998601, 87.68699999998445, 73.87399999997689]}, 'both_same': {0.0: [92.49999999998747, 96.1499999999939, 5.644999999995011, 97.85999999999603, 97.66499999999569, 25.14800000001758], 0.05: [36.53200000001549, 87.51499999999245, 12.757000000001488, 91.37199999998839, 53.85700000000113, 73.98599999997374], 0.1: [11.3809999999961, 84.79299999998233, 26.19600000001604, 76.96799999997437, 63.12599999998738, 53.82400000000198], 0.15: [-0.296000000001824, 78.49899999998844, 13.524000000003799, 62.940999999978, 49.22899999999972, 30.89500000001775], 0.2: [-20.264999999992277, 81.39899999998056, -7.880000000003701, 49.380999999997414, 48.41600000000588, 12.589000000006038], 0.25: [-26.395999999981893, 65.74799999998028, -12.696999999994969, 46.444999999999276, 30.243000000013318, 17.603000000009658], 0.3: [-28.698999999971665, 51.077999999995754, -12.946999999997281, 23.95900000001548, 24.562000000020248, 3.2049999999963346], 0.35: [-30.66799999997565, 36.45400000001939, -25.814999999977413, 23.235000000017063, 20.59900000001025, -12.154999999994521], 0.4: [-27.3279999999785, 12.342999999997224, -29.078999999972797, -1.5610000000018076, 23.22500000001262, -4.952000000004371], 0.45: [-31.828999999972343, 4.2679999999944656, -34.478999999969574, -5.965000000006495, 15.30500000000193, -13.189999999997621], 0.5: [-33.30299999997497, -16.213999999988467, -18.493999999987388, 4.618999999995867, 4.9009999999967935, -15.755999999992827]}}

In [None]:
def eval_scale(agent, episode_durations):
    agent.eval()
    agent.meta_controller.eval()
    agent.controller.eval()

    max_episode_length = 500
    agent.meta_controller.is_training = False
    agent.controller.is_training = False

    num_episodes = 100

    c = 10

    for scale in np.arange(1.0,7.01,0.5):
        env = NormalizedEnv(PointFallEnv(scale))

        overall_reward = 0
        for i_episode in range(num_episodes):
            observation = env.reset()

            state = torch.from_numpy(observation).float().unsqueeze(0).to(device)
            g_state = torch.from_numpy(observation).float().unsqueeze(0).to(device)

            episode_steps = 0
            done = False
            while not done:
                # select a goal
                goal_raw = agent.select_goal(g_state, False, False)
                goal = agent.convert_goal(goal_raw)

                goal_done = False
                while not done and not goal_done:
                    action = agent.select_action(state, goal, False, False)
                    observation, reward, done, info = env.step(action.detach().cpu().squeeze(0).numpy())
                    
                    next_state = torch.from_numpy(observation).float().unsqueeze(0).to(device)
                    g_next_state = torch.from_numpy(observation).float().unsqueeze(0).to(device)

                    next_goal = agent.h(g_state, goal, g_next_state)
                                      
                    overall_reward += reward

                    if max_episode_length and episode_steps >= max_episode_length - 1:
                        done = True
                    episode_steps += 1

                    #goal_done = agent.goal_reached(action, goal)
                    goal_reached = agent.goal_reached(g_state, goal, g_next_state)

                    if (episode_steps % c) == 0:
                        goal_done = True

                    state = next_state
                    g_state = g_next_state
                    goal = next_goal

        episode_durations[np.round(scale, 2)].append(overall_reward / num_episodes)

In [None]:
episodes = {}
for scale in np.arange(1.0,7.01,0.5):
    episodes[np.round(scale, 2)] = []

n_observations = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]

i = 0
while i < 6:
    if i == 3:
        i += 1
        continue
    #agent = train_model()
    agent = HIRO(n_observations, n_actions).to(device)
    load_model(agent, f"hiro_{i}")

    if agent is not None:
        # goal_attack, action_attack, same_noise
        eval_scale(agent, episodes)
        print(f"{i} scale: {episodes}")
        i += 1

print("----")
print(f"scale: {episodes}")

0 scale: {1.0: [66.95999999999025], 1.5: [55.811999999996836], 2.0: [98.59399999999691], 2.5: [92.1249999999889], 3.0: [95.14899999999675], 3.5: [97.99699999999622], 4.0: [97.6919999999959], 4.5: [96.70099999999444], 5.0: [94.73899999999036], 5.5: [96.84599999999456], 6.0: [96.70699999999422], 6.5: [95.90199999999327], 7.0: [95.38299999999188]}
1 scale: {1.0: [66.95999999999025, 83.9039999999899], 1.5: [55.811999999996836, 7.234999999996528], 2.0: [98.59399999999691, 79.95899999998268], 2.5: [92.1249999999889, 92.03499999999286], 3.0: [95.14899999999675, 98.08899999999632], 3.5: [97.99699999999622, 94.52299999999197], 4.0: [97.6919999999959, 92.5209999999868], 4.5: [96.70099999999444, 91.94599999998684], 5.0: [94.73899999999036, 91.08099999998532], 5.5: [96.84599999999456, 86.04699999998392], 6.0: [96.70699999999422, 59.848999999979796], 6.5: [95.90199999999327, 55.799999999985815], 7.0: [95.38299999999188, 44.612999999998074]}
2 scale: {1.0: [66.95999999999025, 83.9039999999899, -11.3

In [None]:
def eval_starting_position(agent, episode_durations):
    agent.eval()
    agent.meta_controller.eval()
    agent.controller.eval()

    max_episode_length = 500
    agent.meta_controller.is_training = False
    agent.controller.is_training = False

    num_episodes = 100

    c = 10

    for extra_range in np.arange(0.0, 0.401, 0.05):
        
        overall_reward = 0
        for i_episode in range(num_episodes):
            observation = env.reset()

            extra = np.random.uniform(-0.1 - extra_range, 0.1 + extra_range, env.starting_point.shape)
            #extra = np.random.uniform(0.1, 0.1 + extra_range, env.starting_point.shape)
            #extra = extra * (2*np.random.randint(0,2,size=env.starting_point.shape)-1)
            env.unwrapped.state = np.array(env.starting_point + extra, dtype=np.float32)
            env.unwrapped.state[2] += math.pi / 2. # start facing up
            env.unwrapped.state[2] = env.state[2] % (2 * math.pi)
            observation = env.normalised_state()

            state = torch.from_numpy(observation).float().unsqueeze(0).to(device)
            g_state = torch.from_numpy(observation).float().unsqueeze(0).to(device)

            episode_steps = 0
            done = False
            while not done:
                # select a goal
                goal_raw = agent.select_goal(g_state, False, False)
                goal = agent.convert_goal(goal_raw)

                goal_done = False
                while not done and not goal_done:
                    action = agent.select_action(state, goal, False, False)
                    observation, reward, done, info = env.step(action.detach().cpu().squeeze(0).numpy())
                    
                    next_state = torch.from_numpy(observation).float().unsqueeze(0).to(device)
                    g_next_state = torch.from_numpy(observation).float().unsqueeze(0).to(device)

                    next_goal = agent.h(g_state, goal, g_next_state)
                                      
                    overall_reward += reward

                    if max_episode_length and episode_steps >= max_episode_length - 1:
                        done = True
                    episode_steps += 1

                    #goal_done = agent.goal_reached(action, goal)
                    goal_reached = agent.goal_reached(g_state, goal, g_next_state)

                    if (episode_steps % c) == 0:
                        goal_done = True

                    state = next_state
                    g_state = g_next_state
                    goal = next_goal

        episode_durations[np.round(extra_range, 3)].append(overall_reward / num_episodes)

In [None]:
episodes = {}
for extra_range in np.arange(0.0, 0.401, 0.05):
    episodes[np.round(extra_range, 3)] = []

n_observations = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]

i = 0
while i < 6:
    if i == 3:
        i += 1
        continue
    #agent = train_model()
    agent = HIRO(n_observations, n_actions).to(device)
    load_model(agent, f"hiro_{i}")

    if agent is not None:
        # goal_attack, action_attack, same_noise
        eval_starting_position(agent, episodes)
        print(f"{i} range: {episodes}")
        i += 1

print("----")
print(f"range: {episodes}")

0 range: {0.0: [97.39399999999534], 0.05: [91.5309999999883], 0.1: [84.64699999998994], 0.15: [72.26799999998433], 0.2: [77.86399999998682], 0.25: [54.931999999983994], 0.3: [63.21199999998873], 0.35: [63.46400000000179], 0.4: [47.96500000000046]}
1 range: {0.0: [97.39399999999534, 92.41799999998757], 0.05: [91.5309999999883, 94.19399999998937], 0.1: [84.64699999998994, 83.77399999998704], 0.15: [72.26799999998433, 79.78199999998208], 0.2: [77.86399999998682, 77.18399999998209], 0.25: [54.931999999983994, 53.83399999998829], 0.3: [63.21199999998873, 46.61000000000517], 0.35: [63.46400000000179, 32.51000000001554], 0.4: [47.96500000000046, 27.219000000018426]}
2 range: {0.0: [97.39399999999534, 92.41799999998757, 93.21299999999279], 0.05: [91.5309999999883, 94.19399999998937, 90.24999999998934], 0.1: [84.64699999998994, 83.77399999998704, 93.18799999999278], 0.15: [72.26799999998433, 79.78199999998208, 88.74199999999351], 0.2: [77.86399999998682, 77.18399999998209, 88.24199999999252], 0

In [None]:
state_max = torch.from_numpy(env.observation_space.high).to(device).float()
state_min = torch.from_numpy(env.observation_space.low).to(device).float()
state_mid = (state_max + state_min) / 2.
state_range = (state_max - state_min)
def save_trajectories(agent, episode_durations, dirty):
    agent.eval()
    agent.meta_controller.eval()
    agent.controller.eval()

    max_episode_length = 500
    agent.meta_controller.is_training = False
    agent.controller.is_training = False

    num_episodes = 10

    c = 10

    l2norm = 0.3
    episode_durations.append([])
    
    for i_episode in range(num_episodes):
        path = {"overall_reward": 0, "manager": [], "worker": []}

        observation = env.reset()

        state = torch.from_numpy(observation).float().unsqueeze(0).to(device)
        state_ = torch.from_numpy(observation).float().unsqueeze(0).to(device)
        g_state = torch.from_numpy(observation).float().unsqueeze(0).to(device)
        g_state_ = torch.from_numpy(observation).float().unsqueeze(0).to(device)

        noise = torch.FloatTensor(state.shape).uniform_(-l2norm, l2norm).to(device)

        if dirty:
            g_state = g_state + state_range * noise
            g_state = torch.max(torch.min(g_state, state_max), state_min).float()
        if dirty:
            state = state + state_range * torch.FloatTensor(state.shape).uniform_(-l2norm, l2norm).to(device)
            state = torch.max(torch.min(state, state_max), state_min).float()

        episode_steps = 0
        overall_reward = 0
        done = False
        while not done:
            # select a goal
            goal_raw = agent.select_goal(g_state, False, False)
            goal = agent.convert_goal(goal_raw)
            path["manager"].append((episode_steps, g_state_.detach().cpu().squeeze(0).numpy(), goal.detach().cpu().squeeze(0).numpy()))

            goal_done = False
            while not done and not goal_done:
                action = agent.select_action(state, goal, False, False)
                path["worker"].append((episode_steps, torch.cat([state_, goal], 1).detach().cpu().squeeze(0).numpy(), action.detach().cpu().squeeze(0).numpy()))
                observation, reward, done, info = env.step(action.detach().cpu().squeeze(0).numpy())
                
                next_state = torch.from_numpy(observation).float().unsqueeze(0).to(device)
                g_next_state = torch.from_numpy(observation).float().unsqueeze(0).to(device)
                state_ = torch.from_numpy(observation).float().unsqueeze(0).to(device)
                g_state_ = torch.from_numpy(observation).float().unsqueeze(0).to(device)

                noise = torch.FloatTensor(state.shape).uniform_(-l2norm, l2norm).to(device)
                if dirty:
                    g_next_state = g_next_state + state_range * noise
                    g_next_state = torch.max(torch.min(g_next_state, state_max), state_min).float()
                if dirty:
                    next_state = next_state + state_range * torch.FloatTensor(next_state.shape).uniform_(-l2norm, l2norm).to(device)
                    next_state = torch.max(torch.min(next_state, state_max), state_min).float()

                next_goal = agent.h(g_state, goal, g_next_state)
                                  
                overall_reward += reward

                if max_episode_length and episode_steps >= max_episode_length - 1:
                    done = True
                episode_steps += 1

                #goal_done = agent.goal_reached(action, goal)
                goal_reached = agent.goal_reached(g_state, goal, g_next_state)

                if (episode_steps % c) == 0:
                    goal_done = True

                state = next_state
                g_state = g_next_state
                goal = next_goal

        path["overall_reward"] = overall_reward
        episode_durations[-1].append(path)

In [None]:
episodes = []

n_observations = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]

i = 0
while i < 10:
    #agent = train_model()
    agent = HIRO(n_observations, n_actions).to(device)
    load_model(agent, f"hiro_{i}")

    if agent is not None:
        # goal_attack, action_attack, same_noise
        save_trajectories(agent, episodes, True)
        #print(f"{i} paths: {episodes}")
        i += 1

print("----")
#print(f"paths: {episodes}")

torch.save(episodes, "PointFall_dirty_eps.pt")

----
