# Solving Grid World with Value Propagation Networks and Actor-Critic methods
This notebook is intended to recreate the main results of this project, namely a comparison between three different models:
* A random agent (baseline)
* An Actor-Critic agent
* A VPN agent

We start out by defining the environment. The environment has been created from scratch and can be inspected in GridWorld.py.
The environment offers several difficulty levels. We will be using level 4, which entails a random start position, a random goal position and random walls, so the agents have to learn how to navigate random mazes.
For simplicity, we will be working on a 5x5 grid, which still makes this a non-trivial task.

In [None]:
from GridWorld import GridWorld
# Hyperparameters
# NOTE(review): the commented-out settings below are not read by this cell;
# presumably they are (re)defined by later training/testing cells - confirm
# before deleting them.
#GIVE_UP = 15
#N_EPISODES = 10_000
LEVEL = 4  # difficulty level handed to env.set_level (level 4: random start, goal and walls)
MAP_SIZE = 5  # side length of the grid (the notebook works on a 5x5 board)
MAP = [MAP_SIZE] * 4  # NOTE(review): four copies of MAP_SIZE; the meaning of each entry is defined by GridWorld - confirm
WALL_PCT = 0.32  # presumably the fraction of cells turned into walls - confirm against GridWorld.py
#TEST_COUNT = 200
#LOG_INTERVAL = 400
#DO_INTERMEDIATE_TESTS = True

#TEST_SIZE = 100
#LEARNING_RATE = 0.001
#GAMMA = 0.99
SEED = 543  # seed forwarded to the environment constructor
#REGULARIZATION_SCALAR = 0.002
#FPS = 0

# Shared environment instance; the network class defined later in this notebook
# reads its observation shape and direction list from this global.
env = GridWorld(map=MAP, seed=SEED, non_diag=False, rewards=(0.0, 1.0), wall_pct=WALL_PCT)

# Select the difficulty level chosen above.
env.set_level(LEVEL)

Imports for the neural networks:

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as f
from math import prod

Here we define our VPN class.

In [None]:
K = 10  # Number of value-propagation iterations run by the planning loop in VPN.forward

class VPN(nn.Module):
    """Value-propagation network that outputs a policy and a state value.

    A small MLP maps the flattened observation to three board-shaped maps
    (``r_in``, ``r_out`` and ``p``).  ``K`` rounds of a max-over-neighbours
    update turn those maps into a value map ``v``.  The value map is then
    concatenated with the flattened observation and fed to a two-layer policy
    network that yields action probabilities.

    The class reads the module-level ``env`` (observation shape and the list
    of move directions ``env.DIRS``) and the module-level iteration count
    ``K``.
    """

    def __init__(self):
        super().__init__()
        hidden_units = 32
        hidden_units2 = 64
        hidden_units_policy1 = 32

        obs_shape = env.observation_space.shape
        self.n_observation1 = obs_shape[1]
        self.n_observation2 = obs_shape[2]
        n_state_dims = self.n_observation1 * self.n_observation2
        n_obs_features = prod(obs_shape)
        n_actions = len(env.DIRS)

        # Shared trunk over the flattened observation.
        # NOTE: layers are created in the same order as before so that seeded
        # parameter initialisation and state_dict keys stay unchanged.
        self.affine1 = nn.Linear(n_obs_features, hidden_units)
        self.affine2 = nn.Linear(hidden_units, hidden_units2)

        # One board-sized head each for r_out, r_in and p.
        self.r_out = nn.Linear(hidden_units2, n_state_dims)
        self.r_in = nn.Linear(hidden_units2, n_state_dims)
        self.p = nn.Linear(hidden_units2, n_state_dims)

        # Policy network.  Its input is the flattened value map followed by
        # the flattened observation, so the width is derived from both rather
        # than hard-coded (for a 3-channel observation this is 4 * n_state_dims).
        self.policyNetwork1 = nn.Linear(n_state_dims + n_obs_features, hidden_units_policy1)
        self.policyHead = nn.Linear(hidden_units_policy1, n_actions)

        # Action & reward buffers (not touched by forward()).
        self.saved_actions = []
        self.saved_probabilities_of_actions = []
        self.rewards = []

        self.shape_of_board = (obs_shape[1], obs_shape[2])
        # Not read by forward(); kept because other code may rely on them.
        self.v_current = torch.zeros(self.shape_of_board)
        self.v_next = torch.zeros(self.shape_of_board)

    def forward(self, x):
        """Compute action probabilities and the value at the current position.

        Args:
            x: NumPy array of shape ``(channels, rows, cols)`` matching
                ``env.observation_space.shape``.  Cells where ``x[1] == 1``
                are taken as the current position.

        Returns:
            A tuple ``(action_prob, state_value)``: ``action_prob`` is the
            softmax over ``len(env.DIRS)`` actions and ``state_value`` holds
            the entries of the final value map at the cells where
            ``x[1] == 1``.
        """
        current_position = (x[1] == 1).nonzero()
        state = torch.from_numpy(x.flatten()).float()

        hidden = f.relu(self.affine1(state))
        hidden = f.relu(self.affine2(hidden))

        # Zero-pad every map by one cell on each side so that a window shifted
        # by one step in any direction stays inside the tensor.
        border = (1, 1, 1, 1)
        r_out = f.pad(torch.sigmoid(self.r_out(hidden)).reshape(self.shape_of_board), border)
        r_in = f.pad(torch.sigmoid(self.r_in(hidden)).reshape(self.shape_of_board), border)
        p = f.pad(torch.sigmoid(self.p(hidden)).reshape(self.shape_of_board), border)

        # One (row, column) window per direction: the inner board region moved
        # by that direction's offset.  The first tensor dimension is shifted by
        # j_dot and the second by i_dot (the +1 accounts for the padding).
        n_rows, n_cols = self.shape_of_board
        windows = [
            (slice(j_dot + 1, j_dot + 1 + n_rows), slice(i_dot + 1, i_dot + 1 + n_cols))
            for i_dot, j_dot in env.DIRS
        ]

        # Value iteration: K times, each cell takes the best candidate over its
        # neighbourhood, so information spreads one step further per round.
        v = torch.zeros(self.shape_of_board)
        for _ in range(K):
            v_padded = f.pad(v, border)
            # NOTE(review): all four maps are read at the neighbour's position;
            # verify against the intended VProp update rule (in particular
            # whether r_out should be taken at the cell being left).
            candidates = [
                v_padded[win] * p[win] + r_in[win] - r_out[win]
                for win in windows
            ]
            # Stack one candidate per direction instead of writing into a
            # fixed 8-row buffer, so the size always matches env.DIRS.
            v = torch.stack(candidates).max(dim=0)[0]

        # Policy on top of the planned values plus the raw observation.
        input_to_policy = torch.cat((v.flatten(), state), 0)
        action_logits = f.relu(self.policyNetwork1(input_to_policy))
        action_logits = self.policyHead(action_logits)
        action_prob = f.softmax(action_logits, dim=-1)

        # Value at the current position.
        state_value = v[current_position]

        return action_prob, state_value
