In [1]:
# Mount Google Drive at /content/drive/ so the checkpoint paths used by
# save_model / load_model ("/content/drive/My Drive/Dissertation/...") resolve.
from google.colab import drive
drive.mount('/content/drive/')

#!cp "/content/drive/My Drive/Dissertation/preprocessing.py" .
#!cp -r "/content/drive/My Drive/Dissertation/gym_maze" .
#!cp -r "/content/drive/My Drive/Dissertation/envs" .

Mounted at /content/drive/


In [2]:
%%javascript
// ClickConnect logs "Working" to the browser console and clicks the Colab
// toolbar connect button; the setInterval call on the closing line re-invokes
// it every 60000 ms (presumably to keep the runtime from disconnecting).
function ClickConnect(){
console.log("Working");
document.querySelector("colab-toolbar-button#connect").click()
}setInterval(ClickConnect,60000)

<IPython.core.display.Javascript object>

In [2]:
# for inference, not continued training
def save_model(model, name):
    """Write a checkpoint of both hierarchy levels to the Drive folder.

    For ``model.meta_controller`` and ``model.controller`` the checkpoint
    stores the critic/actor weights and the state of their optimisers under
    the keys 'critic', 'critic_opt', 'actor' and 'actor_opt'.  Target
    networks are not stored.

    Args:
        model: object exposing ``meta_controller`` and ``controller``.
        name: file name inside the ``saved_models`` directory.
    """
    def _snapshot(level):
        # One level's networks and optimisers as plain state dicts.
        return {
            'critic': level.critic.state_dict(),
            'critic_opt': level.critic_optimizer.state_dict(),
            'actor': level.actor.state_dict(),
            'actor_opt': level.actor_optimizer.state_dict(),
        }

    destination = f"/content/drive/My Drive/Dissertation/saved_models/{name}"
    checkpoint = {
        'meta_controller': _snapshot(model.meta_controller),
        'controller': _snapshot(model.controller),
    }
    torch.save(checkpoint, destination)

import copy
def load_model(model, name):
    """Restore a ``save_model`` checkpoint into ``model`` and switch to eval mode.

    For each level (``meta_controller`` then ``controller``) the critic and
    actor weights and optimiser states are loaded, and the target networks
    are replaced by deep copies of the freshly loaded online networks.

    Args:
        model: object exposing ``meta_controller`` and ``controller``.
        name: file name inside the ``saved_models`` directory.
    """
    source = f"/content/drive/My Drive/Dissertation/saved_models/{name}"
    checkpoint = torch.load(source)

    for level_name in ('meta_controller', 'controller'):
        level = getattr(model, level_name)
        saved = checkpoint[level_name]

        level.critic.load_state_dict(saved['critic'])
        level.critic_optimizer.load_state_dict(saved['critic_opt'])
        # Targets are not in the checkpoint; rebuild them from the online net.
        level.critic_target = copy.deepcopy(level.critic)

        level.actor.load_state_dict(saved['actor'])
        level.actor_optimizer.load_state_dict(saved['actor_opt'])
        level.actor_target = copy.deepcopy(level.actor)

    # Evaluation mode for the wrapper and both levels.
    model.eval()
    model.meta_controller.eval()
    model.controller.eval()

In [3]:
%matplotlib inline

import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

from IPython import display
plt.ion()

# if gpu is to be used
device = torch.device("cuda")

In [4]:
class NormalizedEnv(gym.ActionWrapper):
    """Action wrapper that maps between a unit range and the env's action bounds."""

    def action(self, action):
        """Affine map of ``action``: -1 goes to ``low`` and +1 to ``high``."""
        low, high = self.action_space.low, self.action_space.high
        half_range = (high - low) / 2.
        midpoint = (high + low) / 2.
        return half_range * action + midpoint

    def reverse_action(self, action):
        """Inverse of ``action``: map an env-scale action back to the unit range."""
        low, high = self.action_space.low, self.action_space.high
        inverse_half_range = 2. / (high - low)
        midpoint = (high + low) / 2.
        return inverse_half_range * (action - midpoint)

In [5]:
#!pip install gym[box2d]

In [6]:
# Shared environment used by the training, FGSM and noise-evaluation cells:
# MountainCarContinuous-v0 wrapped in NormalizedEnv so actions are rescaled.
env = NormalizedEnv(gym.make("MountainCarContinuous-v0"))

***

In [7]:
def plot_durations(episode_durations, goals_done):
    """Draw two stacked training curves and then clear the cell output.

    Args:
        episode_durations: sequence of ``(x, y)`` pairs for the top panel
            (y axis labelled 'Reward').
        goals_done: sequence of ``(x, y)`` pairs for the bottom panel
            (y axis labelled 'Goals done').
    """
    fig, axs = plt.subplots(2, figsize=(10,10))

    panels = (
        (0, episode_durations, 'Reward'),
        (1, goals_done, 'Goals done'),
    )
    for row, history, y_label in panels:
        # Split [(x, y), ...] into parallel x / y lists.
        xs, ys = list(map(list, zip(*history)))
        ys = torch.tensor(ys, dtype=torch.float)

        fig.suptitle('Training')
        axis = axs[row]
        axis.set_xlabel('Episode')
        axis.set_ylabel(y_label)
        axis.plot(xs, ys.numpy())

    plt.pause(0.001)  # give the GUI event loop a moment to redraw
    display.clear_output(wait=True)

In [8]:
# [reference] https://github.com/matthiasplappert/keras-rl/blob/master/rl/random.py

class RandomProcess(object):
    """Base class for exploration-noise generators."""

    def reset_states(self):
        """Nothing to reset in the base class."""
        return None


class AnnealedGaussianProcess(RandomProcess):
    """Noise base whose sigma decays linearly with ``n_steps`` down to a floor."""

    def __init__(self, mu, sigma, sigma_min, n_steps_annealing):
        self.mu = mu
        self.sigma = sigma
        self.n_steps = 0

        # sigma(n) = m * n + c, never below sigma_min.
        if sigma_min is None:
            # No annealing requested: constant sigma.
            self.m = 0.
            self.c = sigma
            self.sigma_min = sigma
        else:
            self.m = -float(sigma - sigma_min) / float(n_steps_annealing)
            self.c = sigma
            self.sigma_min = sigma_min

    @property
    def current_sigma(self):
        """Sigma for the current step count, floored at ``sigma_min``."""
        annealed = self.m * float(self.n_steps) + self.c
        return max(self.sigma_min, annealed)


# Discretisation follows http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab
class OrnsteinUhlenbeckProcess(AnnealedGaussianProcess):
    """Temporally correlated noise: x += theta*(mu - x)*dt + sigma*sqrt(dt)*N(0, 1)."""

    def __init__(self, theta, mu=0., sigma=1., dt=1e-2, x0=None, size=1, sigma_min=None, n_steps_annealing=1000):
        super().__init__(mu=mu, sigma=sigma, sigma_min=sigma_min, n_steps_annealing=n_steps_annealing)
        self.theta = theta
        self.mu = mu
        self.dt = dt
        self.x0 = x0
        self.size = size
        self.reset_states()

    def sample(self):
        """Advance the process by one step and return the new value."""
        previous = self.x_prev
        drift = self.theta * (self.mu - previous) * self.dt
        diffusion = self.current_sigma * np.sqrt(self.dt) * np.random.normal(size=self.size)
        x = previous + drift + diffusion
        self.x_prev = x
        self.n_steps += 1
        return x

    def reset_states(self):
        """Restart from ``x0`` when given, otherwise from a zero vector of length ``size``."""
        if self.x0 is not None:
            self.x_prev = self.x0
        else:
            self.x_prev = np.zeros(self.size)

In [9]:
def soft_update(target, source, tau):
    """Polyak-average ``source`` into ``target`` in place.

    Each target parameter becomes ``(1 - tau) * target + tau * source``.
    Parameters are paired positionally via ``parameters()``.
    """
    pairs = zip(target.parameters(), source.parameters())
    for tgt, src in pairs:
        blended = tgt.data * (1.0 - tau) + src.data * tau
        tgt.data.copy_(blended)

def hard_update(target, source):
    """Overwrite every parameter of ``target`` with the matching ``source`` value.

    Parameters are paired positionally via ``parameters()``; the copy is in place.
    """
    pairs = zip(target.parameters(), source.parameters())
    for dst, src in pairs:
        dst.data.copy_(src.data)

In [10]:
# One replay entry: (state, action) -> (next_state, reward, done)
transition = namedtuple('transition', ('state', 'action', 'next_state', 'reward', 'done'))

class ReplayMemory(object):
    """Replay memory D with capacity N, stored as a ring buffer of ``transition`` tuples."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def store(self, *args):
        """Insert ``transition(*args)``, overwriting the oldest entry once full."""
        has_room = len(self.memory) < self.capacity
        if has_room:
            # Grow by one placeholder slot; it is filled right below.
            self.memory.append(None)

        slot = self.position
        self.memory[slot] = transition(*args)
        self.position = (slot + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` distinct entries chosen uniformly at random."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of entries currently stored."""
        return len(self.memory)

***

In [11]:
class Actor(nn.Module):
    """Policy network: two 256-unit ReLU layers followed by a tanh output layer."""

    def __init__(self, nb_states, nb_actions):
        super().__init__()
        width = 256
        self.fc1 = nn.Linear(nb_states, width)
        self.fc2 = nn.Linear(width, width)
        self.head = nn.Linear(width, nb_actions)

    def forward(self, x):
        """Map a batch of states to actions in [-1, 1]."""
        hidden_1 = F.relu(self.fc1(x))
        hidden_2 = F.relu(self.fc2(hidden_1))
        return torch.tanh(self.head(hidden_2))

class Critic(nn.Module):
    """Twin Q-network: two independent MLPs scoring the same (state, action) pair."""

    def __init__(self, nb_states, nb_actions):
        super().__init__()
        in_features = nb_states + nb_actions

        # First Q estimator
        self.l1 = nn.Linear(in_features, 256)
        self.l2 = nn.Linear(256, 256)
        self.l3 = nn.Linear(256, 1)

        # Second Q estimator
        self.l4 = nn.Linear(in_features, 256)
        self.l5 = nn.Linear(256, 256)
        self.l6 = nn.Linear(256, 1)

    @staticmethod
    def _join(state, action):
        # Concatenate along the feature axis and cast to float32.
        return torch.cat([state, action], 1).float()

    def _first_q(self, sa):
        # Forward pass through the first estimator only.
        hidden = F.relu(self.l1(sa))
        hidden = F.relu(self.l2(hidden))
        return self.l3(hidden)

    def _second_q(self, sa):
        # Forward pass through the second estimator only.
        hidden = F.relu(self.l4(sa))
        hidden = F.relu(self.l5(hidden))
        return self.l6(hidden)

    def forward(self, state, action):
        """Return ``(q1, q2)``, one value per row from each estimator."""
        sa = self._join(state, action)
        q1 = self._first_q(sa)
        q2 = self._second_q(sa)
        return q1, q2

    def Q1(self, state, action):
        """Return only the first estimator's value."""
        return self._first_q(self._join(state, action))

In [12]:
# Number of transitions sampled from replay memory per TD3.update_policy call
# (updates are skipped until the memory holds at least this many).
BATCH_SIZE = 64
# Discount factor applied to the bootstrapped target Q-value in TD3.update_policy.
GAMMA = 0.99

# https://spinningup.openai.com/en/latest/algorithms/td3.html
class TD3(nn.Module):
    """Twin Delayed DDPG (TD3) learner.

    Bundles an actor and a twin critic with their target copies and Adam
    optimisers, a replay memory, and an epsilon-scaled exploration schedule.
    Tensors created here are placed on the module-level ``device``.
    """

    def __init__(self, nb_states, nb_actions):
        """Build the networks, optimisers, replay memory and hyper-parameters.

        Args:
            nb_states: size of the actor/critic state input.
            nb_actions: size of the action vector.
        """
        super(TD3, self).__init__()
        self.nb_states = nb_states
        self.nb_actions= nb_actions

        self.actor = Actor(self.nb_states, self.nb_actions)
        self.actor_target = Actor(self.nb_states, self.nb_actions)
        self.actor_optimizer  = optim.Adam(self.actor.parameters(), lr=0.0001)

        self.critic = Critic(self.nb_states, self.nb_actions)
        self.critic_target = Critic(self.nb_states, self.nb_actions)
        self.critic_optimizer  = optim.Adam(self.critic.parameters(), lr=0.0001)

        # Targets start as exact copies of the online networks.
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = ReplayMemory(2000000)
        # Kept for the alternative (currently unused) OU exploration noise.
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=0.15, mu=0.0, sigma=0.2)

        # Hyper-parameters
        self.tau = 0.005                # soft-update rate for the targets
        self.depsilon = 1.0 / 50000     # epsilon decrement per select_action call
        self.policy_noise=0.2           # std of the target-smoothing noise
        self.noise_clip=0.5             # clip range of the target-smoothing noise
        self.policy_freq=2              # actor/target update every N critic updates
        self.total_it = 0               # number of critic updates performed

        # Exploration state
        self.epsilon = 1.0
        self.is_training = True

    def update_policy(self):
        """Run one TD3 update from a replay minibatch.

        Does nothing until the memory holds ``BATCH_SIZE`` transitions.  The
        critic is updated every call; the actor and both targets are updated
        every ``policy_freq`` calls.
        """
        if len(self.memory) < BATCH_SIZE:
            return

        self.total_it += 1

        # in the form (state, action) -> (next_state, reward, done)
        transitions = self.memory.sample(BATCH_SIZE)
        batch = transition(*zip(*transitions))

        state_batch = torch.cat(batch.state)
        next_state_batch = torch.cat(batch.next_state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        done_mask = np.array(batch.done)
        # 1.0 where the episode continued, 0.0 where it ended.
        not_done_mask = torch.from_numpy(1 - done_mask).float().to(device)

        # Target Policy Smoothing
        with torch.no_grad():
            # Select action according to policy and add clipped noise
            noise = (
                torch.randn_like(action_batch) * self.policy_noise
            ).clamp(-self.noise_clip, self.noise_clip).float()

            next_action = (
                self.actor_target(next_state_batch) + noise
            ).clamp(-1.0, 1.0).float()

            # Compute the target Q value
            # Clipped Double-Q Learning
            target_Q1, target_Q2 = self.critic_target(next_state_batch, next_action)
            target_Q = torch.min(target_Q1, target_Q2).squeeze(1)
            target_Q = (reward_batch + GAMMA * not_done_mask  * target_Q).float()

        # Critic update
        current_Q1, current_Q2 = self.critic(state_batch, action_batch)

        critic_loss = F.mse_loss(current_Q1, target_Q.unsqueeze(1)) + F.mse_loss(current_Q2, target_Q.unsqueeze(1))

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Delayed policy updates
        if self.total_it % self.policy_freq == 0:
            # Compute actor loss
            actor_loss = -self.critic.Q1(state_batch, self.actor(state_batch)).mean()

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Target update
            soft_update(self.actor_target, self.actor, self.tau)
            soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        """Put all four networks into evaluation mode.

        Returns:
            ``self``, matching ``nn.Module.eval`` so calls can be chained.
        """
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()
        return self

    def observe(self, s_t, a_t, s_t1, r_t, done):
        """Store one transition in the replay memory."""
        self.memory.store(s_t, a_t, s_t1, r_t, done)

    def random_action(self):
        """Return a uniform random action in [-1, 1] with shape (1, nb_actions)."""
        # Sample the (1, nb_actions) array directly: building a tensor from a
        # list of ndarrays is slow and triggers a PyTorch warning.
        sample = np.random.uniform(-1., 1., (1, self.nb_actions))
        return torch.as_tensor(sample, device=device, dtype=torch.float)

    def select_action(self, s_t, warmup=True, decay_epsilon=True):
        """Choose an action for state ``s_t``.

        Args:
            s_t: state tensor passed to the actor.
            warmup: when true, ignore the actor and return ``random_action()``.
            decay_epsilon: when true, decrease ``epsilon`` by ``depsilon``.

        Returns:
            The actor output plus uniform noise scaled by ``max(epsilon, 0)``
            (no noise when ``is_training`` is false), clamped to [-1, 1] and
            with a leading batch axis.
        """
        if warmup:
            return self.random_action()

        with torch.no_grad():
            action = self.actor(s_t).squeeze(0)
            #action += torch.from_numpy(self.is_training * max(self.epsilon, 0) * self.random_process.sample()).to(device).float()
            # Draw one independent noise value per action dimension; a single
            # scalar would perturb every dimension by the same amount.
            exploration = np.random.uniform(-1., 1., self.nb_actions)
            action += torch.from_numpy(self.is_training * max(self.epsilon, 0) * exploration).to(device).float()
            action = torch.clamp(action, -1., 1.)

            action = action.unsqueeze(0)

            if decay_epsilon:
                self.epsilon -= self.depsilon

            return action

In [13]:
class HIRO(nn.Module):
    """Two-level agent: a meta-controller TD3 proposes goals, a controller TD3 acts.

    The meta-controller maps states to goals; the controller maps the
    concatenation of state and goal to environment actions.  In this version
    a goal has the same size as an action and counts as reached when the
    action is within a threshold of it (see ``goal_reached``).
    """

    def __init__(self, nb_states, nb_actions):
        super(HIRO, self).__init__()
        self.nb_states = nb_states
        self.nb_actions= nb_actions
        # List of nb_actions zeros: its length sets the goal size; the disabled
        # code paths below also use it as a column index list.
        self.goal_dim = [0] * nb_actions
      
        self.meta_controller = TD3(nb_states, len(self.goal_dim)).to(device)
        # Per-dimension goal scale applied in select_goal (all ones here).
        self.max_goal_dist = torch.from_numpy(np.array([1.0] * nb_actions)).to(device)
        # Override TD3's default epsilon decrement with a slower one.
        self.meta_controller.depsilon = 1.0 / 500000

        self.controller = TD3(nb_states + len(self.goal_dim), nb_actions).to(device)
        self.controller.depsilon = 1.0 / 500000

    def teach_controller(self):
        """Run one TD3 update on the controller."""
        self.controller.update_policy()
    def teach_meta_controller(self):
        """Run one TD3 update on the meta-controller."""
        self.meta_controller.update_policy()

    def h(self, state, goal, next_state):
        """Goal transition function; currently returns ``goal`` unchanged."""
        return goal
        #return state[:,self.goal_dim] + goal - next_state[:,self.goal_dim]
    def intrinsic_reward(self, action, goal):
        """Return a scalar tensor: 1.0 when ``goal_reached(action, goal)``, else 0.0."""
        return torch.tensor(1.0 if self.goal_reached(action, goal) else 0.0, device=device) 
    def goal_reached(self, action, goal, threshold = 0.1):
        """True when every element of ``|action - goal|`` is at most ``threshold``."""
        return torch.all(torch.abs(action - goal) <= threshold)
    #def intrinsic_reward(self, state, goal, next_state):
    #    return torch.tensor(1.0 if self.goal_reached(state, goal, next_state) else 0.0, device=device)
    #    # just L2 norm
    #    #return -torch.pow(sum(torch.pow(state.squeeze(0)[self.goal_dim] + goal.squeeze(0) - next_state.squeeze(0)[self.goal_dim], 2)), 0.5)
    #def goal_reached(self, state, goal, next_state, threshold = 0.1):
    #    return torch.abs(next_state[:,self.goal_dim] - state[:,self.goal_dim] - goal) <= threshold
    #    #return torch.pow(sum(goal.squeeze(0), 2), 0.5) <= threshold

    # correct goals to allow for use in experience replay
    def off_policy_correction(self, action_seq, state_seq, goal, next_state):
        """Return the goal to store for the meta-controller transition.

        Currently a no-op: ``goal`` is returned unchanged by the first
        statement, so everything after it is unreachable.
        """
        return goal # no off-policy correction
        # NOTE(review): unreachable from here on.  The code samples 8 clipped
        # Gaussian candidates around the observed state change, adds the mean
        # and the original goal, and picks the candidate with the highest score.
        action_seq = torch.stack(action_seq).to(device)
        state_seq = torch.stack(state_seq).to(device)

        mean = (next_state - state_seq[0])[:,self.goal_dim]
        std = 0.5 * (0.5 * self.max_goal_dist)

        candidates = [\
            torch.min(\
                torch.max(torch.from_numpy(np.random.normal(loc=mean.cpu(), scale=std.cpu(), size=len(self.goal_dim)).astype(np.float32)).to(device).unsqueeze(0), -self.max_goal_dist),\
                self.max_goal_dist) for _ in range(8)\
            ]
        candidates.append(mean)
        candidates.append(goal)
        candidates = torch.stack(candidates).to(device)

        # NOTE(review): the comprehension iterates `candidate` but the expression
        # uses `candidates[0]`, so every score is identical -- looks unintended;
        # confirm before re-enabling this path.
        surr_prob = [\
              -F.mse_loss(action_seq, self.controller.actor_target(torch.cat([state_seq, state_seq[0][:,self.goal_dim] + candidates[0] - state_seq[:,:,self.goal_dim]], 2).float()))\
              for candidate in candidates]
        index = int(np.argmax(surr_prob))
        goal_hat = candidates[index]
        return goal_hat

    def observe_controller(self, s_t, a_t, s_t1, r_t, done):
        """Store one transition in the controller's replay memory."""
        self.controller.memory.store(s_t, a_t, s_t1, r_t, done)
    def observe_meta_controller(self, s_t, a_t, s_t1, r_t, done):
        """Store one transition in the meta-controller's replay memory."""
        self.meta_controller.memory.store(s_t, a_t, s_t1, r_t, done)

    def select_goal(self, s_t, warmup=True, decay_epsilon=True):
        """Ask the meta-controller for a goal and scale it by ``max_goal_dist``."""
        return self.meta_controller.select_action(s_t, warmup, decay_epsilon) * self.max_goal_dist
    def select_action(self, s_t, g_t, warmup=True, decay_epsilon=True):
        """Ask the controller for an action given state and goal joined along dim 1."""
        sg_t = torch.cat([s_t, g_t], 1).float()
        return self.controller.select_action(sg_t, warmup, decay_epsilon)

In [14]:
import time
# Index used in the file name of the next saved model ("hiro_car_<SAVE_OFFSET>").
SAVE_OFFSET = 8
def train_model():
    """Train a fresh HIRO agent on the module-level ``env`` until it is solved.

    Episodes are capped at 200 steps.  Up to and including episode 100 both
    levels act randomly (warm-up) and no updates run.  The task counts as
    solved when the mean return of the last 100 episodes is at least 90,
    checked once more than 100 episodes exist; the agent is then saved as
    ``hiro_car_<SAVE_OFFSET>`` and ``SAVE_OFFSET`` is incremented.

    Returns:
        The trained agent, or ``None`` if 4000 episodes pass without solving.
    """
    global SAVE_OFFSET
    n_observations = env.observation_space.shape[0]
    n_actions = env.action_space.shape[0]

    agent = HIRO(n_observations, n_actions).to(device)

    max_episode_length = 200
    warmup = 100
    num_episodes = 4000 # M

    # Per-episode returns, kept as a flat list so the solved check does not
    # have to rebuild it from (episode, reward) tuples every episode.
    episode_rewards = []

    for i_episode in range(num_episodes):
        in_warmup = i_episode <= warmup

        observation = env.reset()
        state = torch.from_numpy(observation).float().unsqueeze(0).to(device)

        overall_reward = 0
        episode_steps = 0
        done = False

        while not done:
            goal = agent.select_goal(state, in_warmup)

            state_seq, action_seq = [], []
            first_goal = goal
            goal_done = False
            total_extrinsic = 0

            while not done and not goal_done:
                joint_goal_state = torch.cat([state, goal], axis=1).float()

                # agent pick action ...
                action = agent.select_action(state, goal, in_warmup)

                # env response with next_observation, reward, terminate_info
                observation, reward, done, _ = env.step(action.detach().cpu().squeeze(0).numpy())
                next_state = torch.from_numpy(observation).float().unsqueeze(0).to(device)
                next_goal = agent.h(state, goal, next_state)
                joint_next_state = torch.cat([next_state, next_goal], axis=1).float()

                # Enforce the step cap; it is stored as a terminal transition.
                if max_episode_length and episode_steps >= max_episode_length - 1:
                    done = True

                intrinsic_reward = agent.intrinsic_reward(action, goal).unsqueeze(0)

                overall_reward += reward
                total_extrinsic += reward

                goal_done = bool(agent.goal_reached(action, goal))

                # agent observe and update policy
                agent.observe_controller(joint_goal_state, action, joint_next_state, intrinsic_reward, done)

                state_seq.append(state)
                action_seq.append(action)

                episode_steps += 1

                if goal_done:
                    # perform off-policy correction and store
                    # NOTE(review): a segment that ends because the episode
                    # finished (without reaching the goal) is never stored for
                    # the meta-controller -- confirm this is intended.
                    goal_hat = agent.off_policy_correction(action_seq, state_seq, first_goal, next_state)
                    agent.observe_meta_controller(state_seq[0], goal_hat, next_state,  torch.tensor([total_extrinsic], device=device), done)

                    if not in_warmup:
                        agent.teach_meta_controller()

                state = next_state
                goal = next_goal

                if not in_warmup:
                    agent.teach_controller()

        episode_rewards.append(overall_reward)

        if len(episode_rewards) > 100 and np.mean(episode_rewards[-100:]) >= 90: #90 over 100
            print(f"Solved after {i_episode} episodes!")
            save_model(agent, f"hiro_car_{SAVE_OFFSET}")
            SAVE_OFFSET += 1
            return agent

    return None # did not train

In [15]:
# Upper bound of the observation space as a float tensor on `device`;
# fgsm_attack uses it to scale the perturbation and as the upper clipping bound.
state_max = torch.from_numpy(np.array(env.observation_space.high)).to(device).float()
def fgsm_attack(data, eps, data_grad, state_low=None, state_high=None):
    """Apply one FGSM step to ``data`` and clip to the valid observation box.

    The perturbation is ``eps * sign(data_grad) * state_high`` per dimension.
    The result is clipped to ``[state_low, state_high]``.  Clipping to the
    symmetric interval ``[-state_high, state_high]`` is wrong when the lower
    bound is not the negated upper bound: valid states below ``-state_high``
    would be altered even at ``eps == 0``.

    Args:
        data: state tensor to perturb.
        eps: attack strength, as a fraction of ``state_high``.
        data_grad: gradient of the attack loss with respect to ``data``.
        state_low: optional lower bound; defaults to ``env.observation_space.low``.
        state_high: optional upper bound; defaults to the module-level ``state_max``.

    Returns:
        The perturbed and clipped tensor.
    """
    if state_high is None:
        state_high = state_max
    if state_low is None:
        state_low = env.observation_space.low

    # Bring both bounds onto data's dtype and device so min/max broadcast cleanly.
    state_high = torch.as_tensor(state_high, dtype=data.dtype, device=data.device)
    state_low = torch.as_tensor(state_low, dtype=data.dtype, device=data.device)

    sign_data_grad = data_grad.sign()
    perturbed_data = data + eps * sign_data_grad * state_high

    return torch.max(torch.min(perturbed_data, state_high), state_low)

def fgsm_goal(g_state, agent, eps, target, targetted):
    """Perturb the meta-controller's input with FGSM and return the resulting goal.

    The loss is the MSE between the clamped actor output and a reference:
    ``target`` when ``targetted`` is false; otherwise ``target`` if the output
    is positive and ``-target`` if not.  The state is moved along the sign of
    the loss gradient by ``fgsm_attack`` and the goal is then chosen from the
    perturbed state without warm-up.
    """
    actor = agent.meta_controller.actor

    # Fresh leaf tensor so the gradient w.r.t. the input can be read back.
    adv_input = g_state.clone().detach().requires_grad_(True)

    # initial forward pass
    predicted = torch.clamp(actor(adv_input), -1., 1.)

    if targetted:
        # if < 0 then -1 else 1
        reference = target if predicted > 0 else -target
    else:
        reference = target
    loss = F.mse_loss(predicted, reference)

    actor.zero_grad()
    loss.backward()
    input_grad = adv_input.grad.data

    # perturb state
    perturbed = fgsm_attack(adv_input, eps, input_grad).float()
    return agent.select_goal(perturbed, False)

def fgsm_action(state, goal, agent, eps, target, targetted):
    """Perturb the controller's state input with FGSM and return the resulting action.

    Only ``state`` is attacked; ``goal`` is concatenated unchanged.  The loss
    is the MSE between the clamped actor output and a reference: ``target``
    when ``targetted`` is false; otherwise ``target`` if the output is
    positive and ``-target`` if not.
    """
    actor = agent.controller.actor

    # Fresh leaf tensor so the gradient w.r.t. the state can be read back.
    adv_state = state.clone().detach().requires_grad_(True)
    joint_input = torch.cat([adv_state, goal], 1).float()

    # initial forward pass
    predicted = torch.clamp(actor(joint_input), -1., 1.)

    if targetted:
        reference = target if predicted > 0 else -target
    else:
        reference = target
    loss = F.mse_loss(predicted, reference)

    actor.zero_grad()
    loss.backward()
    state_grad = adv_state.grad.data

    # perturb state
    perturbed = fgsm_attack(adv_state, eps, state_grad).float()
    return agent.select_action(perturbed, goal, False)

def apply_fgsm(agent, episode_durations, goal_attack, action_attack, targetted):
    """Evaluate ``agent`` under FGSM attacks over a sweep of eps values.

    For each eps in ``np.arange(0.0, 0.031, 0.0025)`` the agent plays 100
    episodes (capped at 200 steps) and the mean episode return is appended to
    ``episode_durations[eps]``.

    Args:
        agent: HIRO agent; switched to evaluation mode with exploration off.
        episode_durations: dict keyed by eps whose values are lists.
        goal_attack: attack the meta-controller's input via ``fgsm_goal``.
        action_attack: attack the controller's input via ``fgsm_action``.
        targetted: forwarded to the attack functions.
    """
    TARGET_GOAL = torch.tensor([[1.0]], device=device, dtype=torch.float)
    TARGET_ACTION = torch.tensor([[1.0]], device=device, dtype=torch.float)

    agent.eval()
    agent.meta_controller.eval()
    agent.controller.eval()

    max_episode_length = 200
    agent.meta_controller.is_training = False
    agent.controller.is_training = False

    num_episodes = 100

    def _as_state(obs):
        # numpy observation -> (1, n) float tensor on the device
        return torch.from_numpy(obs).float().unsqueeze(0).to(device)

    for eps in np.arange(0.0, 0.031, 0.0025):

        reward_total = 0
        for _episode in range(num_episodes):
            first_obs = env.reset()

            state = _as_state(first_obs)
            g_state = _as_state(first_obs)

            step_count = 0
            done = False
            while not done:
                if goal_attack:
                    goal = fgsm_goal(g_state, agent, eps, TARGET_GOAL, targetted)
                else:
                    goal = agent.select_goal(g_state, False)

                goal_done = False
                while not (done or goal_done):
                    if action_attack:
                        action = fgsm_action(state, goal, agent, eps, TARGET_ACTION, targetted)
                    else:
                        action = agent.select_action(state, goal, False)

                    obs, reward, done, info = env.step(action.detach().cpu().squeeze(0).numpy())

                    next_state = _as_state(obs)
                    g_next_state = _as_state(obs)

                    next_goal = agent.h(g_state, goal, g_next_state)

                    reward_total += reward

                    # Enforce the step cap.
                    if max_episode_length and step_count >= max_episode_length - 1:
                        done = True
                    step_count += 1

                    goal_done = agent.goal_reached(action, goal)
                    #goal_done = agent.goal_reached(g_state, goal, g_next_state)

                    state, g_state, goal = next_state, g_next_state, next_goal

        episode_durations[eps].append(reward_total / num_episodes)

In [16]:
def plot_fgsm(episode_durations):
    """Plot mean +/- one std of the average reward against eps for each attack type.

    Args:
        episode_durations: dict with keys 'both', 'goal_only' and
            'action_only'; each value maps eps to a list of average rewards
            (one entry per evaluated model).
    """
    plt.figure(2, figsize=(10,10))

    # Axis labels do not depend on the series, so set them once.  The raw
    # string keeps the backslash for mathtext without the invalid "\e" escape.
    plt.xlabel(r'$\epsilon$')
    plt.ylabel('Average Reward')

    for kk in ['both', 'goal_only', 'action_only']:
        per_eps = episode_durations[kk]
        x = np.array(list(per_eps.keys()))
        ys = np.array(list(per_eps.values()))

        mu = np.mean(ys, axis=1)
        stds = np.std(ys, axis=1)
        plt.plot(x, mu, label=kk)
        plt.fill_between(x, mu + stds, mu - stds, alpha=0.2)

    plt.legend()
    plt.pause(0.001)  # pause a bit so that plots are updated
    display.clear_output(wait=True)

In [17]:
# Re-binds the module-level state_max (also defined before fgsm_attack), here
# without the trailing .float() cast; eval_model uses it to scale the noise.
state_max = torch.from_numpy(np.array(env.observation_space.high)).to(device)
def eval_model(agent, episode_durations, goal_attack, action_attack, same_noise):
    """Evaluate ``agent`` under uniform observation noise over a sweep of widths.

    For each ``l2norm`` in ``np.arange(0, 0.31, 0.03)`` the agent plays 100
    episodes (capped at 200 steps) and the mean episode return is appended to
    ``episode_durations[l2norm]``.  Noise is uniform in
    ``[-l2norm/2, l2norm/2]`` and scaled by ``state_max``.

    Args:
        agent: HIRO agent; switched to evaluation mode with exploration off.
        episode_durations: dict keyed by noise width whose values are lists.
        goal_attack: add noise to the state seen by the meta-controller.
        action_attack: add noise to the state seen by the controller.
        same_noise: when both are attacked, reuse one noise sample for both
            instead of drawing a second one for the controller.
    """
    agent.eval()
    agent.meta_controller.eval()
    agent.controller.eval()

    max_episode_length = 200
    agent.meta_controller.is_training = False
    agent.controller.is_training = False

    num_episodes = 100

    def _as_state(obs):
        # numpy observation -> (1, n) float tensor on the device
        return torch.from_numpy(obs).float().unsqueeze(0).to(device)

    for l2norm in np.arange(0,0.31,0.03):

        def _draw_noise(shape):
            # Uniform sample in [-l2norm/2, l2norm/2] for the current width.
            return torch.FloatTensor(shape).uniform_(-l2norm/2, l2norm/2).to(device)

        reward_total = 0
        for _episode in range(num_episodes):
            first_obs = env.reset()

            state = _as_state(first_obs)
            g_state = _as_state(first_obs)

            noise = _draw_noise(state.shape)

            if goal_attack:
                g_state = (state + state_max * noise).float()

            if action_attack:
                applied = noise if same_noise else _draw_noise(state.shape)
                state = (state + state_max * applied).float()

            step_count = 0
            done = False
            while not done:
                # select a goal
                goal = agent.select_goal(g_state, False)

                goal_done = False
                while not (done or goal_done):
                    action = agent.select_action(state, goal, False)
                    obs, reward, done, info = env.step(action.detach().cpu().squeeze(0).numpy())

                    next_state = _as_state(obs)
                    g_next_state = _as_state(obs)

                    noise = _draw_noise(state.shape)
                    if goal_attack:
                        # Built from the clean next_state, before it is perturbed below.
                        g_next_state = (next_state + state_max * noise).float()
                    if action_attack:
                        applied = noise if same_noise else _draw_noise(next_state.shape)
                        next_state = (next_state + state_max * applied).float()

                    next_goal = agent.h(g_state, goal, g_next_state)

                    reward_total += reward

                    # Enforce the step cap.
                    if max_episode_length and step_count >= max_episode_length - 1:
                        done = True
                    step_count += 1

                    goal_done = agent.goal_reached(action, goal)
                    #goal_done = agent.goal_reached(g_state, goal, g_next_state)

                    state, g_state, goal = next_state, g_next_state, next_goal

        episode_durations[l2norm].append(reward_total / num_episodes)

In [18]:
def plot_norms(episode_durations):
    """Plot mean +/- one std of the average reward against the noise width.

    Args:
        episode_durations: dict mapping noise width to a list of average
            rewards (one entry per evaluated model).
    """
    plt.figure(2, figsize=(10,10))

    x = np.array(list(episode_durations.keys()))
    ys = np.array(list(episode_durations.values()))

    plt.xlabel('L2 Norm')
    plt.ylabel('Average Reward')

    # Centre line first, then the +/- one-std band around it.
    mu = np.mean(ys, axis=1)
    plt.plot(x, mu)
    stds = np.std(ys, axis=1)
    upper, lower = mu + stds, mu - stds
    plt.fill_between(x, upper, lower, alpha=0.2)

    plt.pause(0.001)  # pause a bit so that plots are updated
    display.clear_output(wait=True)

In [None]:
# FGSM experiment: results[attack_type][eps] -> list with one average reward
# per evaluated model, kept separately for targeted and untargeted attacks.
targeted = {'both': {}, 'goal_only': {}, 'action_only': {}}
untargeted = {'both': {}, 'goal_only': {}, 'action_only': {}}
# Pre-create an empty list for every eps value that apply_fgsm sweeps over
# (the same np.arange call, so the float keys match).
for eps in np.arange(0.0, 0.031, 0.0025):
    for x in ['both', 'goal_only', 'action_only']:
        targeted[x][eps] = []
        untargeted[x][eps] = []

#for i in range(27):
#    agent = HIRO(n_observations, n_actions).to(device)
#    load_model(agent, f"hiro_{i}")

n_observations = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]

# Evaluate the saved checkpoints hiro_car_0 .. hiro_car_24.
i = 0
while i < 25:
    #agent = train_model()
    agent = HIRO(n_observations, n_actions).to(device)
    load_model(agent, f"hiro_car_{i}")

    # NOTE(review): agent is constructed just above, so this check is always
    # true here; it only matters for the commented-out train_model() variant.
    if agent is not None:
        # apply_fgsm(agent, results, goal_attack, action_attack, targetted)
        apply_fgsm(agent, targeted['both'], True, True, True)
        apply_fgsm(agent, targeted['goal_only'], True, False, True)
        apply_fgsm(agent, targeted['action_only'], False, True, True)
        apply_fgsm(agent, untargeted['both'], True, True, False)
        apply_fgsm(agent, untargeted['goal_only'], True, False, False)
        apply_fgsm(agent, untargeted['action_only'], False, True, False)
        print(i)
        print(f"Targeted: {targeted}")
        print(f"Untargeted: {untargeted}")
        #plot_fgsm(episode_durations)
        i += 1

#plot_fgsm(episode_durations)
print(f"Targeted: {targeted}")
print(f"Untargeted: {untargeted}")

In [None]:
# Noise-robustness experiment: one dict per attack configuration, each mapping
# noise width -> list with one average reward per evaluated model.
same_noise = {}
diff_noise = {}
goal_only = {}
action_only = {}
# Pre-create an empty list for every width that eval_model sweeps over
# (the same np.arange call, so the float keys match).
for l2norm in np.arange(0,0.31,0.03):
    for i in [same_noise, diff_noise, goal_only, action_only]:
        i[l2norm] = []

# train 20 models, then eval them
# The counter starts at 8, matching SAVE_OFFSET's initial value (presumably
# resuming after models 0-7); it only advances when train_model() succeeds.
i = 8
while i < 20:
    agent = train_model()
    if agent is not None:
        # goal_attack, action_attack, same_noise
        eval_model(agent, same_noise, True, True, True)
        eval_model(agent, diff_noise, True, True, False)
        eval_model(agent, goal_only, True, False, False)
        eval_model(agent, action_only, False, True, False)
        print(i)
        print(f"same noise: {same_noise}")
        print(f"diff noise: {diff_noise}")
        print(f"goal only: {goal_only}")
        print(f"action only: {action_only}")
        i += 1

print(f"same noise: {same_noise}")
print(f"diff noise: {diff_noise}")
print(f"goal only: {goal_only}")
print(f"action only: {action_only}")

Solved after 2066 episodes!
8
same noise: {0.0: [92.59095277633676], 0.03: [92.35395171599171], 0.06: [92.20388535661603], 0.09: [92.71169753011381], 0.12: [92.59618738458768], 0.15: [92.7584651948859], 0.18: [92.36434758530817], 0.21: [92.40262899203535], 0.24: [92.38610653439387], 0.27: [92.30187901271091], 0.3: [91.43363380005451]}
diff noise: {0.0: [92.62273287520318], 0.03: [92.44289448742917], 0.06: [92.83069309145966], 0.09: [92.13894283003468], 0.12: [92.74252931500152], 0.15: [92.64742785497718], 0.18: [92.61343740518733], 0.21: [92.25343563176257], 0.24: [92.55239501351099], 0.27: [92.40034216128683], 0.3: [91.992329417301]}
goal only: {0.0: [92.47303022563243], 0.03: [92.67504889484393], 0.06: [92.68961129536734], 0.09: [92.52384252923561], 0.12: [92.68963870412948], 0.15: [92.16730989597505], 0.18: [92.48640872953753], 0.21: [92.41346093249773], 0.24: [92.08254049230072], 0.27: [92.01784836182223], 0.3: [92.0280654340321]}
action only: {0.0: [92.5620813146069], 0.03: [92.41