In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [6]:
from grid_world.Gridworld import Gridworld

In [7]:
game = Gridworld(size=4, mode='static')

game.display()

array([['+', '-', ' ', 'P'],
       [' ', 'W', ' ', ' '],
       [' ', ' ', ' ', ' '],
       [' ', ' ', ' ', ' ']], dtype='<U2')

In [8]:
game.makeMove('d')
game.makeMove('d')
game.makeMove('l')
game.display()

array([['+', '-', ' ', ' '],
       [' ', 'W', ' ', ' '],
       [' ', ' ', 'P', ' '],
       [' ', ' ', ' ', ' ']], dtype='<U2')

In [9]:
game.reward()

-1

In [10]:
game.board.render_np()

array([[[0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 0]],

       [[1, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]],

       [[0, 1, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]],

       [[0, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]]], dtype=uint8)

In [16]:
import numpy as np
import torch
import random
import matplotlib.pyplot as plt

l1 = 64
l2 = 150
l3 = 100
l4 = 4

model = torch.nn.Sequential(
    torch.nn.Linear(l1, l2),
    torch.nn.ReLU(),
    torch.nn.Linear(l2, l3),
    torch.nn.ReLU(),
    torch.nn.Linear(l3, l4),
)

loss_fn = torch.nn.MSELoss()
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

gamma = 0.9 # discount factor
epsilon = 1.0 # initialized as 1 and then decrease

In [17]:
action_set = {
    0: 'u',
    1: 'd',
    2: 'l',
    3: 't',
}

In [18]:
epochs = 1000

def train(epochs):
    losses = []

    for i in range(epochs):
        game = Gridworld(size=4, mode='static')
        state_ = game.board.render_np().reshape(1, 64) \
            + np.random.rand(1, 64) / 10.0
        state1 = torch.from_numpy(state_).float()
        is_over = False

        while (not is_over):
            # runs the Q-network to calculate the Q values for all actions
            qval = model(state1)
            qval_ = qval.data.numpy()
            # use epsilon-greedy to select an action
            if random.random() < epsilon:
                action_ = np.random.randint(0, 4)
            else:
                action_ = np.argmax(qval_)

            # take the action
            action = action_set[action_]
            game.makeMove(action)
            # after making the move, finds the maximum Q value from the
            # new state
            state2_ = game.board.render_np().reshape(1, 64) \
                + np.random.rand(1, 64) / 10.0
            state2 = torch.from_numpy(state2_).float()
            reward = game.reward()
            with torch.no_grad():
                newQ = model(state2.reshape(1, 64))
            maxQ = torch.max(newQ)

            if reward == -1:
                Y = reward + (gamma * maxQ)
            else:
                Y = reward

            Y = torch.Tensor([Y]).detach()
            X = qval.squeeze()[action_]
            loss = loss_fn(X, Y)
            optimizer.zero_grad()
            loss.backend()
            losses.append(loss.item())
            optimizer.step()
            state1 = state2
            if reward != -1:
                is_over = True

        if epsilon > 0.1:
            epsilon -= (1/epochs)

    return losses


In [19]:
losses = train(epochs)

UnboundLocalError: local variable 'epsilon' referenced before assignment