In [None]:
# Mount Google Drive at /content/drive/ so the checkpoint paths used by
# save_model / load_model are reachable (requires the google.colab package).
from google.colab import drive
drive.mount('/content/drive/')

# Disabled shell commands that would copy helper code from Drive into the
# current working directory.
#!cp "/content/drive/My Drive/Dissertation/preprocessing.py" .
#!cp -r "/content/drive/My Drive/Dissertation/gym_maze" .
#!cp -r "/content/drive/My Drive/Dissertation/envs" .

Mounted at /content/drive/


In [None]:
# Checkpoints written here contain only the network weights: enough for
# inference, not for resuming training.
def save_model(model, name):
    """Serialise ``model.state_dict()`` into the Drive ``saved_models`` folder.

    Args:
        model: object exposing ``state_dict()``.
        name: file name appended to the ``saved_models`` directory path.
    """
    destination = f"/content/drive/My Drive/Dissertation/saved_models/{name}"
    checkpoint = {'controller': model.state_dict()}
    torch.save(checkpoint, destination)

import copy
def load_model(model, name):
    """Load weights saved by ``save_model`` into ``model`` and switch it to eval mode.

    Args:
        model: ``nn.Module`` whose parameters are overwritten in place.
        name: checkpoint file name inside the Drive ``saved_models`` folder.

    Raises:
        RuntimeError: if the checkpoint lacks weights that ``model`` expects.
    """
    path = f"/content/drive/My Drive/Dissertation/saved_models/{name}"

    # Remap stored tensors onto the device the model already lives on, so a
    # checkpoint written on a GPU can still be loaded on a CPU-only runtime.
    first_param = next(iter(model.parameters()), None)
    map_location = first_param.device if first_param is not None else None
    checkpoint = torch.load(path, map_location=map_location)

    # strict=False tolerates extra keys in the checkpoint, but weights that are
    # absent would otherwise be silently left at their random initial values.
    result = model.load_state_dict(checkpoint['controller'], strict = False)
    missing = list(getattr(result, 'missing_keys', []))
    if missing:
        raise RuntimeError(f"Checkpoint {path!r} is missing weights: {missing}")

    model.eval()

In [None]:
%matplotlib inline

import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

from IPython import display
plt.ion()

# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# runtime without CUDA instead of failing at the first `.to(device)`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Shared Gym environment used by the training and evaluation code below.
env = gym.make('Acrobot-v1')

---
### Helper functions

In [None]:
def plot_durations(episode_durations):
    """Plot the per-episode reward curve collected during training.

    Args:
        episode_durations: sequence of ``(episode_index, reward)`` pairs.
            An empty sequence is a no-op.
    """
    if len(episode_durations) == 0:
        # zip(*[]) yields nothing, so the two-way unpacking below would raise
        # ValueError; there is nothing to draw yet anyway.
        return

    fig, axs = plt.subplots(2, figsize=(10,10))

    episodes, rewards = map(list, zip(*episode_durations))
    rewards = torch.tensor(rewards, dtype=torch.float)

    fig.suptitle('Training')
    axs[0].set_xlabel('Episode')
    axs[0].set_ylabel('Reward')
    # The second panel is labelled here, but nothing is plotted into it by
    # this function.
    axs[1].set_xlabel('Episode')
    axs[1].set_ylabel('State Visits')

    axs[0].plot(episodes, rewards.numpy())

    plt.pause(0.001)  # pause a bit so that plots are updated
    # wait=True defers the clearing until the next output arrives.
    display.clear_output(wait=True)

---
### Code

In [None]:
# One replay entry: (state, action) -> (next_state, reward, done)
transition = namedtuple('transition', ('state', 'action', 'next_state', 'reward', 'done'))

class ReplayMemory(object):
    """Fixed-capacity replay buffer (memory D with capacity N).

    Entries live in a list that grows until ``capacity`` is reached; after that
    the write cursor wraps around and existing entries are overwritten in the
    order they were written.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def store(self, *args):
        """Build a ``transition`` from ``args`` and write it at the cursor."""
        cursor = self.position
        still_growing = len(self.memory) < self.capacity
        if still_growing:
            # Reserve a slot; the assignment below fills it.
            self.memory.append(None)

        self.memory[cursor] = transition(*args)
        self.position = (cursor + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` stored transitions drawn without replacement."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of transitions currently held."""
        return len(self.memory)

In [None]:
def plot_norms(episode_durations):
    """Plot mean reward with a +/- one standard deviation band per perturbation size.

    Args:
        episode_durations: dict mapping a perturbation magnitude to a list of
            average rewards. The lists are stacked into one array and reduced
            along axis 1, so they are expected to have equal length.
            An empty dict is a no-op.
    """
    if len(episode_durations) == 0:
        # An empty dict gives 1-D empty arrays, for which the axis=1
        # reductions below would fail.
        return

    plt.figure(2, figsize=(10,10))

    x, ys = np.array(list(episode_durations.keys())), np.array(list(episode_durations.values()))

    # Raw string: "\m", "\p" and "\s" are not valid Python escape sequences and
    # trigger an invalid-escape warning in a normal string literal. The text
    # handed to matplotlib is unchanged.
    plt.title(r'Action Prediction $\mu$ and $\pm \sigma$ interval')
    plt.xlabel('L2 Norm')
    plt.ylabel('Average Reward')

    # NOTE(review): keys are divided by 10 before plotting; the reason is not
    # visible here -- confirm the intended x-axis scale.
    mu = np.mean(ys, axis=1)
    plt.plot(x / 10, mu)
    stds = np.std(ys, axis = 1)
    plt.fill_between(x / 10, mu + stds , mu - stds, alpha=0.2)

    plt.pause(0.001)  # pause a bit so that plots are updated
    display.clear_output(wait=True)

In [None]:
# Number of transitions sampled from replay memory for each optimisation step.
BATCH_SIZE = 64
# Discount factor applied to the bootstrapped next-state Q-value.
GAMMA = 0.99

def one_hot(n, v):
    """Return a ``(1, n)`` float array that is 1.0 at index ``v`` and 0.0 elsewhere."""
    encoded = np.zeros(n)
    encoded[v] = 1.0
    # Prepend a leading axis of length one.
    return encoded[np.newaxis, ...]

def rev_one_hot(a):
    """Return the index of the first strictly positive entry in row 0 of ``a``."""
    positive_positions = np.nonzero(a[0] > 0)
    first_axis_hits = positive_positions[0]
    return first_axis_hits[0]

class DQN(nn.Module):
    """Fully connected Q-network bundled with its replay memory and update step.

    The network maps an observation vector to one Q-value per action through
    two 256-unit ReLU layers. ``optimizer`` and ``target`` start as ``None``
    and must be assigned by the caller before ``experience_replay`` is used.

    NOTE(review): ``nn.Module`` registers any module assigned as an attribute,
    so once a network is assigned to ``target`` its weights also appear in this
    module's ``state_dict()`` / ``parameters()`` -- presumably why callers load
    state dicts with ``strict=False``; confirm before changing.
    """

    def __init__(self, inputs, outputs, mem_len = 2000000):
        super().__init__()
        # Layers are created in forward order.
        self.fc1 = nn.Linear(inputs, 256)
        self.fc2 = nn.Linear(256, 256)
        self.head = nn.Linear(256, outputs)

        self.memory = ReplayMemory(mem_len)
        self.optimizer = None
        # Separate network used to compute bootstrap targets, so they are not
        # affected by the weights currently being updated.
        self.target = None

        self.n_actions = outputs
        self.steps_done = 0

        # Exploration rate goes linearly from EPS_START to EPS_END over
        # EPS_DECAY training steps, then stays at EPS_END.
        self.EPS_START = 1.0
        self.EPS_END = 0.1
        self.EPS_DECAY = 10000 # in number of steps

    def forward(self, x):
        """Return the Q-values of every action for the batch of states ``x``."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        return self.head(hidden)

    def act(self, state, is_training):
        """Pick an action as a ``(1, 1)`` long tensor.

        When ``is_training`` is true the choice is epsilon-greedy and the step
        counter driving the epsilon schedule is advanced; otherwise the action
        with the highest Q-value is returned.
        """
        if is_training:
            progress = min(1., self.steps_done / self.EPS_DECAY)
            eps_threshold = self.EPS_END + (self.EPS_START - self.EPS_END) * (1. - progress)
            self.steps_done += 1

            # With probability eps select a random action
            explore = random.random() < eps_threshold
            if explore:
                choice = random.randrange(self.n_actions)
                return torch.tensor([[choice]], device=device, dtype=torch.long)

        # otherwise select the action with the largest predicted Q-value
        with torch.no_grad():
            _, greedy = self(state).max(1)
            return greedy.view(1, 1)

    def experience_replay(self):
        """Run one optimisation step on a random minibatch from replay memory.

        Does nothing until at least ``BATCH_SIZE`` transitions are stored.
        """
        if len(self.memory) < BATCH_SIZE:
            return

        # Turn a list of transitions into one transition of per-field tuples.
        sampled = self.memory.sample(BATCH_SIZE)
        batch = transition(*zip(*sampled))

        states = torch.cat(batch.state)
        next_states = torch.cat(batch.next_state)
        actions = torch.cat(batch.action)
        rewards = torch.cat(batch.reward)
        finished = np.array(batch.done)
        # 1.0 where the episode continued, 0.0 where it ended.
        keep_bootstrap = torch.from_numpy(1 - finished).float().to(device)

        # Q-value of the action that was actually taken.
        predicted_q = self(states).gather(1, actions)
        # Best next-state value from the target network, detached so that no
        # gradient flows through the target side.
        bootstrap_q = self.target(next_states).detach().max(1)[0]
        bootstrap_q = keep_bootstrap * bootstrap_q
        td_target = rewards + (GAMMA * bootstrap_q)
        # Bellman error measured with the Huber (smooth L1) loss.
        loss = F.smooth_l1_loss(predicted_q, td_target.unsqueeze(1))

        self.optimizer.zero_grad()
        loss.backward()
        # Clip every available gradient element-wise to [-1, 1].
        for grad in (p.grad for p in self.parameters()):
            if grad is not None:
                grad.data.clamp_(-1, 1)
        self.optimizer.step()

In [None]:
# Index used in the file name of the next saved checkpoint.
SAVE_OFFSET = 0

def train_model():
    """Train a fresh DQN on ``env`` until it is solved or the episode budget runs out.

    The run counts as solved once the mean reward over the last 100 episodes is
    at least -90 (checked only after more than 100 episodes). A solved agent is
    saved as ``dqn_acrobot_<SAVE_OFFSET>`` and ``SAVE_OFFSET`` is incremented.

    Returns:
        The trained ``DQN`` if the task was solved, otherwise ``None``.
    """
    global SAVE_OFFSET

    num_episodes = 2000 # M
    learning_rate = 2.5e-4
    target_sync_every = 10   # episodes between target-network refreshes
    reward_window = 100      # episodes averaged for the progress/solved check
    solved_threshold = -90
    report_every = 200

    # Get number of actions and observations from gym action space
    n_actions = env.action_space.n
    n_observations = env.observation_space.shape[0]

    # Initialize action-value function Q with random weights
    dqnAgent = DQN(n_observations, n_actions).to(device)
    dqnAgent.target = DQN(n_observations, n_actions).to(device)
    # Start the target network from the online weights. Without this the first
    # episode bootstraps from an unrelated, randomly initialised network.
    # strict=False because the agent's state dict also carries "target.*" keys.
    dqnAgent.target.load_state_dict(dqnAgent.state_dict(), strict = False)

    # Optimizer
    dqnAgent.optimizer = optim.RMSprop(dqnAgent.parameters(), lr=learning_rate)

    # Total reward of every finished episode. Kept as a flat list so the moving
    # average does not need to rebuild the whole history each episode.
    # list(enumerate(episode_rewards)) gives the (episode, reward) pairs that
    # plot_durations expects, should live plotting be wanted.
    episode_rewards = []

    for i_episode in range(num_episodes):
        observation = env.reset()
        # unsqueeze adds batch dimension
        state = torch.from_numpy(observation).float().unsqueeze(0).to(device)

        overall_reward = 0
        done = False
        while not done:
            # Execute action a_t in the environment and observe reward r_t and the next observation
            action = dqnAgent.act(state, True)
            observation, reward, done, _ = env.step(action.item())
            # Explicit float dtype keeps the TD target in the network's
            # precision regardless of the reward's Python/NumPy type.
            extrinsic_reward = torch.tensor([reward], device=device, dtype=torch.float)

            overall_reward += reward

            next_state = torch.from_numpy(observation).float().unsqueeze(0).to(device)

            # Store transition (state, action, next_state, reward, done) in D
            dqnAgent.memory.store(state, action, next_state, extrinsic_reward, done)

            state = next_state

            dqnAgent.experience_replay()

        # very needed! see https://stackoverflow.com/a/58730298
        if i_episode % target_sync_every == 0:
            dqnAgent.target.load_state_dict(dqnAgent.state_dict(), strict = False)

        episode_rewards.append(overall_reward)
        if len(episode_rewards) > reward_window:
            recent_mean = np.mean(episode_rewards[-reward_window:])
            if i_episode % report_every == 0:
                print(f"Episode {i_episode}: {recent_mean}")
            if recent_mean >= solved_threshold:
                print(f"Solved after {i_episode} episodes!")
                save_model(dqnAgent, f"dqn_acrobot_{SAVE_OFFSET}")
                SAVE_OFFSET += 1
                return dqnAgent

    return None # did not solve

In [None]:
#dqnAgent = train_model()

Episode 200: -231.24
Episode 400: -273.49
Episode 600: -148.46
Episode 800: -110.47
Episode 1000: -90.59
Solved after 1004 episodes!


In [None]:
# Per-dimension upper bound of the observation space; used to scale perturbations.
state_max = torch.from_numpy(env.observation_space.high).to(device)
def eval_model(dqnAgent, episode_durations, noise_levels=None, num_episodes=100, max_episode_length=500):
    """Measure the agent's average reward under uniform observation noise.

    For each noise level, every observation fed to the agent is perturbed by
    ``state_max * U(-noise/2, noise/2)`` and the mean episode reward over
    ``num_episodes`` episodes is appended to ``episode_durations[noise]``.

    Args:
        dqnAgent: agent exposing ``eval()`` and ``act(state, is_training)``.
        episode_durations: dict mapping noise level to a list of mean rewards.
            Missing keys are created on demand. Mutated in place.
        noise_levels: iterable of noise magnitudes. Defaults to
            ``np.arange(0, 0.31, 0.03)``.
        num_episodes: episodes averaged per noise level.
        max_episode_length: step cap per episode; a falsy value disables it.
    """
    if noise_levels is None:
        noise_levels = np.arange(0,0.31,0.03)

    dqnAgent.eval()

    for noise in noise_levels:
        overall_reward = 0

        for _ in range(num_episodes):
            observation = env.reset()
            # unsqueeze adds batch dimension
            state = torch.from_numpy(observation).float().unsqueeze(0).to(device)

            episode_steps = 0
            done = False
            while not done:
                # Only the agent's input is perturbed; the environment itself
                # keeps evolving from its true state.
                perturbation = torch.empty(state.shape).uniform_(-noise/2, noise/2).to(device)
                state = (state + state_max * perturbation).float()

                action = dqnAgent.act(state, False)
                observation, reward, done, _ = env.step(action.item())
                overall_reward += reward

                if max_episode_length and episode_steps >= max_episode_length - 1:
                    done = True
                episode_steps += 1

                state = torch.from_numpy(observation).float().unsqueeze(0).to(device)

        # setdefault avoids a KeyError when the caller's dict was not
        # pre-populated with exactly these float keys.
        episode_durations.setdefault(noise, []).append(overall_reward / num_episodes)

In [None]:
# Per-dimension lower bound of the observation space; used to clip perturbed states.
state_min = torch.from_numpy(env.observation_space.low).to(device)
def fgsm_attack(data, eps, data_grad):
    """Shift ``data`` by ``eps * sign(data_grad) * state_max`` and clip to the observation bounds.

    Args:
        data: state tensor to perturb.
        eps: step size, expressed as a fraction of ``state_max``.
        data_grad: gradient with respect to ``data``; only its sign is used.

    Returns:
        The perturbed state, clipped element-wise to ``[state_min, state_max]``.
    """
    step = eps * data_grad.sign() * state_max
    capped_above = torch.min(data + step, state_max)
    return torch.max(capped_above, state_min)

def fgsm_action(state, agent, eps, target, targetted):
    """Return the agent's greedy action on an FGSM-perturbed copy of ``state``.

    The perturbation direction is the sign of the gradient, with respect to the
    state, of the smooth-L1 loss between the agent's output and ``target``.

    Args:
        state: state tensor with a leading batch dimension.
        agent: callable module that also exposes ``zero_grad()`` and
            ``act(state, is_training)``.
        eps: perturbation size forwarded to ``fgsm_attack``.
        target: tensor compared against the agent's output in the loss.
        targetted: must be true; the untargeted variant has no loss defined.

    Raises:
        NotImplementedError: if ``targetted`` is false. Previously this path
            fell through to ``loss.backward()`` with ``loss`` unassigned.
    """
    if not targetted:
        raise NotImplementedError(
            "fgsm_action: the untargeted attack is not implemented; call with targetted=True"
        )

    # Work on a detached copy so gradients accumulate on the copy only.
    state_var = state.clone().detach().requires_grad_(True)

    # initial forward pass
    q_values = agent(state_var)
    loss = F.smooth_l1_loss(q_values, target)

    agent.zero_grad()

    # back-propagate to obtain d(loss)/d(state)
    loss.backward()
    data_grad = state_var.grad.data
    # perturb state
    state_p = fgsm_attack(state, eps, data_grad)

    return agent.act(state_p, False)

def apply_fgsm(agent, episode_durations, targetted, eps_values=None, num_episodes=100, max_episode_length=500):
    """Measure the agent's average reward while every action is chosen under an FGSM attack.

    For each ``eps``, actions come from ``fgsm_action`` with an all-zero target
    vector, and the mean episode reward over ``num_episodes`` episodes is
    appended to ``episode_durations[eps]``.

    Args:
        agent: agent passed to ``fgsm_action``; also needs ``eval()``.
        episode_durations: dict mapping eps to a list of mean rewards. Missing
            keys are created on demand. Mutated in place.
        targetted: forwarded to ``fgsm_action``.
        eps_values: iterable of perturbation sizes. Defaults to
            ``np.arange(0.0, 0.031, 0.0025)``.
        num_episodes: episodes averaged per eps value.
        max_episode_length: step cap per episode; a falsy value disables it.
    """
    if eps_values is None:
        eps_values = np.arange(0.0, 0.031, 0.0025)

    # One zero per action, sized from the environment instead of hard-coding
    # three entries.
    TARGET_ACTION = torch.zeros((1, env.action_space.n), device=device, dtype=torch.float)

    agent.eval()

    for eps in eps_values:

        overall_reward = 0

        for _ in range(num_episodes):
            observation = env.reset()
            # unsqueeze adds batch dimension
            state = torch.from_numpy(observation).float().unsqueeze(0).to(device)

            episode_steps = 0
            done = False
            while not done:
                action = fgsm_action(state, agent, eps, TARGET_ACTION, targetted)

                observation, reward, done, _ = env.step(action.item())
                overall_reward += reward

                if max_episode_length and episode_steps >= max_episode_length - 1:
                    done = True
                episode_steps += 1

                state = torch.from_numpy(observation).float().unsqueeze(0).to(device)

        # setdefault avoids a KeyError when the caller's dict was not
        # pre-populated with exactly these float keys.
        episode_durations.setdefault(eps, []).append(overall_reward / num_episodes)

In [None]:
# Result tables: perturbation size -> list of mean rewards, one entry appended
# per evaluated model. The key ranges mirror the sweeps inside apply_fgsm and
# eval_model.
fgsm_t = {}
for eps in np.arange(0.0, 0.031, 0.0025):
    fgsm_t[eps] = []

same_noise = {}
for l2norm in np.arange(0,0.31,0.03):
    same_noise[l2norm] = []

n_actions = env.action_space.n
n_observations = env.observation_space.shape[0]

# Evaluate the saved checkpoints dqn_acrobot_0 .. dqn_acrobot_9 under both
# uniform noise and the targeted FGSM attack.
i = 0
while i < 10:
    # Fresh network whose weights are then overwritten from the checkpoint.
    agent = DQN(n_observations, n_actions).to(device)
    load_model(agent, f"dqn_acrobot_{i}")

    eval_model(agent, same_noise)
    apply_fgsm(agent, fgsm_t, True)

    # Progress report: the tables printed here are cumulative over all models so far.
    print(i)
    print(f"Noise: {same_noise}")
    print(f"Targeted FGSM: {fgsm_t}")
    i += 1

print(f"Noise: {same_noise}")
print(f"Targeted FGSM: {fgsm_t}")

0
Noise: {0.0: [-80.48], 0.03: [-80.84], 0.06: [-80.58], 0.09: [-84.83], 0.12: [-89.87], 0.15: [-93.46], 0.18: [-97.34], 0.21: [-105.35], 0.24: [-109.4], 0.27: [-123.85], 0.3: [-134.67]}
Targeted FGSM: {0.0: [-82.98], 0.0025: [-79.96], 0.005: [-79.57], 0.0075: [-80.12], 0.01: [-77.61], 0.0125: [-81.13], 0.015: [-82.3], 0.0175: [-81.36], 0.02: [-81.34], 0.0225: [-81.79], 0.025: [-84.09], 0.0275: [-88.92], 0.03: [-88.07]}
1
Noise: {0.0: [-80.48, -84.42], 0.03: [-80.84, -81.59], 0.06: [-80.58, -86.92], 0.09: [-84.83, -91.07], 0.12: [-89.87, -91.3], 0.15: [-93.46, -90.86], 0.18: [-97.34, -100.46], 0.21: [-105.35, -98.82], 0.24: [-109.4, -107.64], 0.27: [-123.85, -114.37], 0.3: [-134.67, -124.78]}
Targeted FGSM: {0.0: [-82.98, -83.27], 0.0025: [-79.96, -82.58], 0.005: [-79.57, -87.45], 0.0075: [-80.12, -83.2], 0.01: [-77.61, -90.41], 0.0125: [-81.13, -90.33], 0.015: [-82.3, -103.96], 0.0175: [-81.36, -92.7], 0.02: [-81.34, -88.32], 0.0225: [-81.79, -104.32], 0.025: [-84.09, -103.34], 0.0275

In [None]:
# Reset the noise result table: noise level -> list of mean rewards, one entry
# appended per trained model.
same_noise = {}
for l2norm in np.arange(0,0.31,0.03):
    same_noise[l2norm] = []

# train 10 models, then eval them
# `i` only advances when train_model() returns an agent, so a run that does not
# solve the task is retried rather than counted.
i = 0
while i < 10:
    agent = train_model()
    if agent is not None:
        # goal_attack, action_attack, same_noise
        eval_model(agent, same_noise)
        print(f"{i} {same_noise}")
        i += 1

print(same_noise)

Episode 200: -275.46
Episode 400: -311.74
Episode 600: -158.6
Episode 800: -94.45
Episode 1000: -91.34
Solved after 1055 episodes!
0 {0.0: [-81.59], 0.03: [-80.63], 0.06: [-81.6], 0.09: [-84.59], 0.12: [-91.8], 0.15: [-91.29], 0.18: [-97.47], 0.21: [-107.19], 0.24: [-115.91], 0.27: [-120.63], 0.3: [-131.92]}
Episode 200: -208.29
Episode 400: -345.39
Episode 600: -137.59
Episode 800: -91.0
Solved after 918 episodes!
1 {0.0: [-81.59, -84.66], 0.03: [-80.63, -82.78], 0.06: [-81.6, -79.71], 0.09: [-84.59, -85.32], 0.12: [-91.8, -93.77], 0.15: [-91.29, -92.61], 0.18: [-97.47, -99.27], 0.21: [-107.19, -106.66], 0.24: [-115.91, -107.8], 0.27: [-120.63, -116.09], 0.3: [-131.92, -123.55]}
Episode 200: -201.93
Episode 400: -279.17
Episode 600: -290.21
Episode 800: -113.58
Solved after 952 episodes!
2 {0.0: [-81.59, -84.66, -80.69], 0.03: [-80.63, -82.78, -81.83], 0.06: [-81.6, -79.71, -85.09], 0.09: [-84.59, -85.32, -84.94], 0.12: [-91.8, -93.77, -87.92], 0.15: [-91.29, -92.61, -94.6], 0.18: [-9