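This cell defines the game board: a 5×10 grid in which the player starts in the bottom-left corner, the goal is in the bottom-right corner, and stepping onto any of the bottom-row cells in between loses the game.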

In [1]:
import numpy as np


class Board:
    """Grid world with a start cell, a goal cell and a row of losing cells.

    Cell codes stored in ``state``: 0 = free, 2 = player, 1 = goal, -1 = losing cell.
    ``gameStatus`` is 0 while the game is running, 1 once the goal has been reached
    and -1 once the player has stepped onto a losing cell.
    """

    # Board dimensions (rows, columns).
    ROWS = 5
    COLS = 10

    # Class-level default so the flag exists even if __init__ is bypassed.
    verbose = True

    def __init__(self, verbose=True):
        """Create a fresh board.

        Args:
            verbose: when True (the default), print "You won." / "You lost."
                as soon as the game ends.
        """
        self.verbose = verbose
        self.state = self.createInitialBoard()

    def createInitialBoard(self):
        """Reset position, goal and game status and return a new board array.

        Returns:
            A ``(ROWS, COLS)`` float array with the player in the bottom-left
            corner, the goal in the bottom-right corner and losing cells in
            between them on the bottom row.
        """
        bottomRow = self.ROWS - 1
        lastCol = self.COLS - 1

        self.position = (bottomRow, 0)
        self.goal = (bottomRow, lastCol)
        self.gameStatus = 0

        initialBoard = np.zeros((self.ROWS, self.COLS))
        initialBoard[self.position] = 2
        initialBoard[self.goal] = 1
        # Every bottom-row cell strictly between start and goal loses the game.
        initialBoard[bottomRow, 1:lastCol] = -1

        return initialBoard

    def showBoard(self):
        """Print the board array followed by a blank separator."""
        print(self.state)
        print('\n')

    def checkConsequences(self, newPosition):
        """Update ``gameStatus`` for a move onto ``newPosition``.

        Must be called before the player marker is written to ``newPosition``,
        because it inspects the cell code currently stored there.

        Args:
            newPosition: ``(row, column)`` tuple the player is about to occupy.
        """
        if newPosition == self.goal:
            self.gameStatus = 1
            if self.verbose:
                print("You won.")
        elif self.state[newPosition] == -1:
            self.gameStatus = -1
            if self.verbose:
                print("You lost.")

    def getPosition(self):
        """Return the player's current ``(row, column)`` position."""
        return self.position

    def getGameStatus(self):
        """Return 0 (running), 1 (won) or -1 (lost)."""
        return self.gameStatus

    def _move(self, dy, dx):
        """Shift the player by ``(dy, dx)``; do nothing if that leaves the board."""
        y, x = self.position
        newPosition = (y + dy, x + dx)

        rows, cols = self.state.shape
        if not (0 <= newPosition[0] < rows and 0 <= newPosition[1] < cols):
            return

        self.state[self.position] = 0

        # Evaluate the target cell before the player marker overwrites its code.
        self.checkConsequences(newPosition)

        self.position = newPosition
        self.state[self.position] = 2

    def moveUp(self):
        """Move the player one row up (towards row 0) if possible."""
        self._move(-1, 0)

    def moveDown(self):
        """Move the player one row down if possible."""
        self._move(1, 0)

    def moveLeft(self):
        """Move the player one column to the left if possible."""
        self._move(0, -1)

    def moveRight(self):
        """Move the player one column to the right if possible."""
        self._move(0, 1)

This cell defines the agent, its epsilon-greedy policy, and its Q-learning table update.

In [6]:
import numpy as np
from Game import Board


class Agent:
    """Tabular Q-learning agent for the 5 x 10 board with four actions.

    Action indices: 0 = up, 1 = down, 2 = left, 3 (or anything else) = right.
    """

    NUM_ACTIONS = 4

    def __init__(self, epsilon=0.1, discountFactor=0.9, learnStep=0.5):
        """Create an agent with an all-zero Q-value table.

        Args:
            epsilon: probability of picking a uniformly random action when
                the policy is not executed greedily.
            discountFactor: weight of the best next-state Q-value in the update.
            learnStep: learning rate applied to the temporal-difference error.
        """
        self.epsilon = epsilon
        self.discountFactor = discountFactor
        self.learnStep = learnStep
        # Indexed as [row, column, action].
        self.q_valueTable = np.zeros((5, 10, self.NUM_ACTIONS))

    def execPolicy(self, gameboard: "Board", greedy=False):
        """Choose an action index for the board's current position.

        Args:
            gameboard: board providing ``getPosition()``.
            greedy: when True always return the highest-valued action;
                otherwise explore with probability ``epsilon``.

        Returns:
            The chosen action index.
        """
        position = gameboard.getPosition()

        if not greedy and np.random.random() < self.epsilon:
            return np.random.choice(self.NUM_ACTIONS)

        # argmax resolves ties in favour of the lowest action index.
        return self.q_valueTable[position].argmax()

    def execAction(self, gameboard: "Board", actionIndex):
        """Apply the move that ``actionIndex`` stands for to ``gameboard``."""
        if actionIndex == 0:
            gameboard.moveUp()
        elif actionIndex == 1:
            gameboard.moveDown()
        elif actionIndex == 2:
            gameboard.moveLeft()
        else:
            gameboard.moveRight()

    def getReward(self, gameboard: "Board"):
        """Return the reward for the board's current game status.

        Returns:
            -1 while the game is running, -100 after a loss, 0 otherwise.
        """
        status = gameboard.getGameStatus()

        if status == 0:
            return -1
        if status == -1:
            return -100
        return 0

    def updateTable(self, previousPosition, nextPosition, actionIndex, reward):
        """Apply one Q-learning update for a single transition.

        Args:
            previousPosition: ``(row, column)`` the action was taken from.
            nextPosition: ``(row, column)`` reached after the action.
            actionIndex: index of the action that was taken.
            reward: reward received for the transition.
        """
        oldQValue = self.q_valueTable[previousPosition][actionIndex]
        # Value of the best action available from the next position.
        qValueForNextPosition = self.q_valueTable[nextPosition].max()

        tdError = reward + self.discountFactor * qValueForNextPosition - oldQValue
        self.q_valueTable[previousPosition][actionIndex] = oldQValue + self.learnStep * tdError


This cell uses both classes defined above to train the agent for 100 episodes, then replays one greedy episode and prints the learned Q-value table.

In [4]:
import Game
import Agent

# Number of training episodes; each episode starts from a fresh board.
NUM_EPISODES = 100

agent = Agent.Agent()

# Training: play episodes with the exploring policy and update the Q-table
# after every single move.
for episode in range(NUM_EPISODES):
    game = Game.Board()

    while game.getGameStatus() == 0:
        firstPosition = game.getPosition()
        actionIndex = agent.execPolicy(game)
        agent.execAction(game, actionIndex)
        newPosition = game.getPosition()
        reward = agent.getReward(game)
        agent.updateTable(firstPosition, newPosition, actionIndex, reward)

# Evaluation: replay one episode following the learned policy greedily.
game = Game.Board()
game.showBoard()

# The greedy policy and the board are both deterministic and nothing is learned
# during the replay, so a rollout that is still running after as many moves as
# the board has cells must be revisiting a cell and would loop forever.
# Capping the replay there never cuts short a rollout that would terminate.
maxGreedySteps = game.state.size

for _ in range(maxGreedySteps):
    if game.getGameStatus() != 0:
        break
    actionIndex = agent.execPolicy(game, greedy=True)
    agent.execAction(game, actionIndex)
    game.showBoard()

if game.getGameStatus() == 0:
    print("Greedy policy did not reach a terminal state; the agent needs more training.")

print(agent.q_valueTable)


You lost.
You lost.
You lost.
You lost.
You lost.
You lost.
You lost.
You lost.
You lost.
You lost.
You won.
You won.
You won.
You lost.
You won.
You lost.
You lost.
You won.
You lost.
You won.
You won.
You won.
You won.
You won.
You won.
You won.
You won.
You won.
You lost.
You won.
You won.
You won.
You won.
You won.
You won.
You won.
You won.
You won.
You won.
You won.
You lost.
You won.
You won.
You won.
You won.
You won.
You lost.
You won.
You won.
You lost.
You won.
You lost.
You won.
You won.
You won.
You won.
You won.
You won.
You won.
You won.
You won.
You won.
You won.
You won.
You won.
You lost.
You won.
You won.
You won.
You won.
You won.
You won.
You lost.
You won.
You won.
You won.
You won.
You won.
You won.
You won.
You won.
You won.
You won.
You won.
You won.
You won.
You won.
You won.
You won.
You won.
You won.
You won.
You won.
You won.
You won.
You lost.
You lost.
You won.
You won.
You won.
[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0. 