In [1]:
import numpy as np

## Create Grid World

In [2]:
class Gridworld:
    """A small grid world containing coins, ghosts, walls and one pacman.

    Cells live in ``self.grid[y][x]`` (row first, then column):
    ``0`` is an empty cell, ``1`` a coin, ``-1`` a ghost and ``'x'`` a wall.
    Coins and ghosts together make up ``self.terminal_states``.
    Pacman's position is the dict ``self.pacman`` with keys ``'x'`` (column)
    and ``'y'`` (row).
    """

    # (dx, dy) offset per action. Insertion order is the order in which
    # available_actions() reports the actions.
    _MOVES = {"UP": (0, -1), "RIGHT": (1, 0), "DOWN": (0, 1), "LEFT": (-1, 0)}

    def __init__(self, height, width, coins=None, ghosts=None, walls=None):
        """Build the grid and place coins, ghosts and walls on it.

        Args:
            height: Number of rows.
            width: Number of columns.
            coins: Iterable of ``(row, col)`` pairs marked ``1``.
                Defaults to ``[(0, 4)]``.
            ghosts: Iterable of ``(row, col)`` pairs marked ``-1``.
                Defaults to ``[(2, 4)]``.
            walls: Iterable of ``(row, col)`` pairs marked ``'x'``.
                Defaults to ``[(1, 1), (1, 2), (1, 3), (2, 1)]``.

        Raises:
            IndexError: If any position lies outside the ``height`` x ``width`` grid.
        """
        # None sentinels + copies: a list literal used as a default would be
        # shared by every instance, so mutating one world's walls would leak
        # into all worlds created afterwards.
        coins = [(0, 4)] if coins is None else list(coins)
        ghosts = [(2, 4)] if ghosts is None else list(ghosts)
        walls = [(1, 1), (1, 2), (1, 3), (2, 1)] if walls is None else list(walls)

        self.height = height
        self.width = width
        self.walls = walls
        self.terminal_states = coins + ghosts

        # Pacman always starts at column 4 of the top row.
        self.pacman = {'x': 4, 'y': 0}

        # Every row is its own list, so writing one cell never touches another row.
        self.grid = [[0 for _ in range(width)] for _ in range(height)]

        # Later placements overwrite earlier ones: coins, then ghosts, then walls.
        for row, col in coins:
            self.grid[row][col] = 1
        for row, col in ghosts:
            self.grid[row][col] = -1
        for row, col in walls:
            self.grid[row][col] = 'x'

    def show(self):
        """Print the grid as an ASCII table, drawing pacman's cell as ``p``."""
        border = " " + " ---" * self.width

        print(border)
        for y in range(self.height):
            for x in range(self.width):
                is_pacman = self.pacman['x'] == x and self.pacman['y'] == y
                print(' | ', end='')
                print('p' if is_pacman else self.grid[y][x], end='')
            print(" |")
            print(border)

    def _target(self, dx, dy):
        """Return the cell ``(dx, dy)`` away from pacman, or None if off-grid or a wall."""
        x = self.pacman['x'] + dx
        y = self.pacman['y'] + dy
        if not (0 <= x < self.width and 0 <= y < self.height):
            return None
        if self.grid[y][x] == 'x':
            return None
        return {'x': x, 'y': y}

    def available_actions(self):
        """List the moves pacman can make from its current cell.

        Returns:
            A pair ``(actions, locations)``: the action names, in the order
            UP, RIGHT, DOWN, LEFT, and the matching ``{'x': ..., 'y': ...}``
            destination dicts. Moves that would leave the grid or enter a
            wall are omitted.
        """
        actions = []
        locations = []
        for name, (dx, dy) in self._MOVES.items():
            destination = self._target(dx, dy)
            if destination is not None:
                actions.append(name)
                locations.append(destination)
        return actions, locations

    def move(self, direction):
        """Move pacman one cell in ``direction`` if that cell is enterable.

        Pacman stays where it is when the move would leave the grid or hit a
        wall. An unrecognised direction prints a message and also leaves
        pacman in place.

        Args:
            direction: One of ``"UP"``, ``"RIGHT"``, ``"DOWN"``, ``"LEFT"``.

        Returns:
            A pair ``(cell_value, done)`` for the cell pacman occupies after
            the call; ``done`` is True whenever that cell's value is not ``0``.
        """
        offset = self._MOVES.get(direction)
        if offset is None:
            print("Cannot find movement.")
        else:
            destination = self._target(*offset)
            if destination is not None:
                self.pacman = destination

        cell_value = self.grid[self.pacman['y']][self.pacman['x']]
        return cell_value, cell_value != 0
    

# Build a 4-row by 5-column world with the default layout, list the moves
# open to pacman from its starting cell, then draw the grid.
g = Gridworld(4, 5)
print(g.available_actions())
g.show()


(['DOWN', 'LEFT'], [{'x': 4, 'y': 1}, {'x': 3, 'y': 0}])
  --- --- --- --- ---
 | 0 | 0 | 0 | 0 | p |
  --- --- --- --- ---
 | 0 | x | x | x | 0 |
  --- --- --- --- ---
 | 0 | x | 0 | 0 | -1 |
  --- --- --- --- ---
 | 0 | 0 | 0 | 0 | 0 |
  --- --- --- --- ---


## Formulate Bellman Equation.

The Bellman equation is defined as follows:

\begin{equation*}
V(s) = \max_a \left( R(s, a) + \gamma V(s') \right)
\end{equation*}

- V = value of a state
- s = current state
- a = action
- R = reward
- gamma = discount factor
- s' = next state (the prime means "next")

The environment is completely deterministic: the actions we take happen 100% of the time.

### Gamma
If gamma is equal to one, every square is going to have a value of one. **TODO:** turn this into a question for the reader, followed by an exercise to work it out.

A higher gamma means more long-term thinking: future rewards are discounted less.

In [5]:
# Grid dimensions, action count and discount factor used by the value fill.
height, width = 4, 5
actions = 4
gamma = 0.9

# One entry per grid cell; 'x' is the placeholder for "no value assigned".
policy = [['x' for _ in range(width)] for _ in range(height)]

g = Gridworld(4, 5)


# Starting at the end location work backwards to populate policy
def calculate_value_for_location(y, x, start=False):
    """Assign a value to ``policy[y][x]``, then recurse into each reachable neighbour.

    Reads the module-level ``g`` (the Gridworld) and ``gamma``, and writes
    into the module-level ``policy`` table. As a side effect ``g.pacman`` is
    moved to ``(x, y)``.

    Each cell is filled at most once. Its value is the cell's own grid value
    plus ``gamma`` times the largest value among the neighbours pacman can
    step to; neighbours still holding the ``'x'`` placeholder count as 0.
    The result is rounded to two decimals.

    Args:
        y: Row of the cell to fill.
        x: Column of the cell to fill.
        start: When True, a terminal cell is still valued and expanded
            instead of just recording its grid value and stopping.
    """
    # available_actions() works from g.pacman, so position pacman first.
    g.pacman = {'x': x, 'y': y}

    # Anything other than the 'x' placeholder means this cell is already done.
    if policy[y][x] != 'x':
        return

    _, neighbours = g.available_actions()
    cell_reward = g.grid[y][x]

    is_terminal = any(y == state[0] and x == state[1] for state in g.terminal_states)
    if is_terminal:
        policy[y][x] = cell_reward
        if not start:
            return

    neighbour_values = [
        0 if policy[loc['y']][loc['x']] == 'x' else policy[loc['y']][loc['x']]
        for loc in neighbours
    ]
    # default=0 keeps a cell with no legal moves from raising ValueError.
    best_next_value = max(neighbour_values, default=0)

    policy[y][x] = round(cell_reward + gamma * best_next_value, 2)

    # Expand neighbours in the order available_actions() returned them.
    for loc in neighbours:
        calculate_value_for_location(y=loc['y'], x=loc['x'])
        
        
# Begin the fill at row 0, column 4; start=True lets the fill continue past
# that cell even though it is a terminal state.
calculate_value_for_location(0,4, start=True)

In [6]:
# Display the filled-in table (cells never reached keep the 'x' placeholder).
policy

[[0.66, 0.73, 0.81, 0.9, 1.0],
 [0.59, 'x', 'x', 'x', 0.9],
 [0.53, 'x', 0.35, 0.32, -1],
 [0.48, 0.43, 0.39, 0.35, 0.32]]

A problem occurs when the environment is non-deterministic. To know the value of the current state you need to know the value of the next state, and to know the next state you need to know the previous state, and so on. We need a new method.