In [1]:
def diplay_grid(grid):
    """Print ``grid`` row by row, with a dashed separator after each row.

    Wall cells use tighter padding (``| Wall |``) so they occupy the same
    eight characters as a single-character cell such as ``|  0   |``.
    The wall marker is matched case-insensitively, so both ``'wall'`` and
    ``'Wall'`` get the narrow format.

    Args:
        grid: Iterable of rows; each row is an iterable of cell values
            (numbers or strings).

    Returns:
        None. The grid is written to standard output.
    """
    for row in grid:
        for state in row:
            # Only strings can be wall markers; numbers always take the wide format.
            is_wall = isinstance(state, str) and state.lower() == 'wall'
            if is_wall:
                print(f'| {state} |', end='')
            else:
                print(f'|  {state}   |', end='')
        print()
        print('----------------------------------')

In [2]:
def available_actions(grid, state, actions):
    """Return the moves from ``state`` that land on a walkable cell.

    A move is kept when its destination lies inside ``grid`` and does not
    hold a string; string cells (for example ``'Wall'``) count as blocked.
    The bounds are taken from ``grid`` itself, so the function works for
    any grid size.

    Args:
        grid: List of rows, each a list of cell values.
        state: ``(row, col)`` position to move from.
        actions: Iterable of ``(row_delta, col_delta)`` moves.

    Returns:
        List of the allowed entries of ``actions``, in their original order.
    """
    allowed = []
    for action in actions:
        row = state[0] + action[0]
        col = state[1] + action[1]
        # Check the row first so that grid[row] is only indexed when valid.
        if not 0 <= row < len(grid):
            continue
        if not 0 <= col < len(grid[row]):
            continue
        if isinstance(grid[row][col], str):
            continue
        allowed.append(action)
    return allowed

In [3]:
def get_policy(grid, actions):
    """Build a grid of move labels that follow the highest-valued neighbour.

    For every cell that is not fixed, the allowed move whose destination
    holds the largest value is chosen and written as ``'Down'``, ``'Up'``,
    ``'Right'`` or ``'Left'``. Fixed cells (any string, or a value equal to
    ``1`` or ``-1``) are copied from ``grid`` unchanged. A cell with no
    allowed move, or whose best move is not one of the four labelled
    directions, is left as ``0``.

    The result has the same shape as ``grid``; ``grid`` is not modified.

    Args:
        grid: List of rows of numeric values, with strings marking blocked
            cells.
        actions: Iterable of ``(row_delta, col_delta)`` moves.

    Returns:
        A new list of rows holding labels, copied fixed cells, or ``0``.
    """
    labels = {(1, 0): 'Down', (-1, 0): 'Up', (0, 1): 'Right', (0, -1): 'Left'}
    policy_grid = []
    for row, cells in enumerate(grid):
        policy_row = []
        for col, cell in enumerate(cells):
            if isinstance(cell, str) or cell in (1, -1):
                policy_row.append(cell)
                continue
            # Start below every possible value so that cells surrounded by
            # zero or negative values still receive a move.
            best_value = float('-inf')
            best_action = None
            for action in available_actions(grid, (row, col), actions):
                value = grid[row + action[0]][col + action[1]]
                # Strict comparison: on ties the earliest action wins.
                if best_value < value:
                    best_value = value
                    best_action = action
            if best_action is None:
                policy_row.append(0)
            else:
                policy_row.append(labels.get(tuple(best_action), 0))
        policy_grid.append(policy_row)
    return policy_grid

In [4]:
def Q_values(grid, actions, gamma=0.9, noise=0.2, num_iter=100, theta=0.05):
    """Iteratively update the values stored in ``grid`` and return it.

    ``grid`` is modified in place and the same list object is returned.
    Despite the name, one value per cell is stored (the maximum over the
    allowed moves), not one value per move.

    Each sweep visits every cell that is not fixed; a fixed cell is any
    string or a value equal to ``1`` or ``-1``. For each allowed move the
    intended destination is weighted by ``1 - noise`` and the other allowed
    destinations share ``noise`` equally, all discounted by ``gamma``. The
    cell is set to the best such value, rounded to two decimals. Cells are
    overwritten during the sweep, so later cells already see the new
    values of earlier ones.

    Edge cases: a cell with exactly one allowed move treats that move as
    certain (there is no other destination to slip to), and a cell with no
    allowed move keeps its current value.

    After every sweep the iteration number, the grid and the largest
    change ``delta`` are printed. Sweeping stops once ``delta <= theta``
    or after ``num_iter`` sweeps.

    Args:
        grid: List of rows of numeric values, with strings marking blocked
            cells. Mutated in place.
        actions: Iterable of ``(row_delta, col_delta)`` moves.
        gamma: Discount applied to destination values.
        noise: Total weight spread over the non-intended allowed moves.
        num_iter: Maximum number of sweeps.
        theta: Threshold on the largest per-sweep change that ends the loop.

    Returns:
        The same ``grid`` object, holding the updated values.
    """
    trans = 1 - noise
    for iteration in range(num_iter):
        delta = float('-inf')
        for row in range(len(grid)):
            for col in range(len(grid[row])):
                cell = grid[row][col]
                if isinstance(cell, str) or cell in (1, -1):
                    continue
                moves = available_actions(grid, (row, col), actions)
                if not moves:
                    # Enclosed cell: nothing to back up from.
                    continue
                if len(moves) > 1:
                    p_intended = trans
                    p_slip = noise / (len(moves) - 1)
                else:
                    # Single legal move: keep the weights summing to one.
                    p_intended = 1.0
                    p_slip = 0.0
                values = []
                for i in moves:
                    belman = p_intended * grid[row + i[0]][col + i[1]] * gamma
                    for j in moves:
                        if i == j:
                            continue
                        belman += p_slip * grid[row + j[0]][col + j[1]] * gamma
                    values.append(belman)
                new_value = round(max(values), 2)
                delta = max(abs(cell - new_value), delta)
                grid[row][col] = new_value
        print(f"Iteration {iteration}")
        diplay_grid(grid)
        print(f"delta = {delta:0.2f}")
        if delta <= theta:
            break
    return grid
                

In [5]:
# Each move is a (row_delta, col_delta) pair, listed as: down, up, right, left.
actions = [
    (1, 0),
    (-1, 0),
    (0, 1),
    (0, -1),
]

# Starting grid: 1 and -1 are the fixed-value cells, 'Wall' is blocked,
# and every other cell starts at 0.
grid = [
    [0, 0, 0, 1],
    [0, 'Wall', 0, -1],
    [0, 0, 0, 0],
]
diplay_grid(grid)

|  0   ||  0   ||  0   ||  1   |
----------------------------------
|  0   ||  Wall   ||  0   ||  -1   |
----------------------------------
|  0   ||  0   ||  0   ||  0   |
----------------------------------


In [6]:
# Q_values overwrites the grid it receives, so hand it a row-wise copy:
# `grid` keeps its initial layout and re-running this cell gives the same result.
value_grid = Q_values([row[:] for row in grid], actions)

Iteration 0
|  0.0   ||  0.0   ||  0.72   ||  1   |
----------------------------------
|  0.0   ||  Wall   ||  0.43   ||  -1   |
----------------------------------
|  0.0   ||  0.0   ||  0.31   ||  0.04   |
----------------------------------
delta = 0.72
Iteration 1
|  0.0   ||  0.52   ||  0.81   ||  1   |
----------------------------------
|  0.0   ||  Wall   ||  0.52   ||  -1   |
----------------------------------
|  0.0   ||  0.22   ||  0.4   ||  0.11   |
----------------------------------
delta = 0.52
Iteration 2
|  0.37   ||  0.65   ||  0.83   ||  1   |
----------------------------------
|  0.27   ||  Wall   ||  0.54   ||  -1   |
----------------------------------
|  0.23   ||  0.33   ||  0.43   ||  0.13   |
----------------------------------
delta = 0.37
Iteration 3
|  0.52   ||  0.69   ||  0.83   ||  1   |
----------------------------------
|  0.42   ||  Wall   ||  0.55   ||  -1   |
----------------------------------
|  0.36   ||  0.37   ||  0.44   ||  0.14   |
-----------------

In [7]:
# Turn the computed values into one move label per cell and print the result.
policy_grid = get_policy(grid=value_grid, actions=actions)
diplay_grid(policy_grid)

|  Right   ||  Right   ||  Right   ||  1   |
----------------------------------
|  Up   ||  Wall   ||  Up   ||  -1   |
----------------------------------
|  Up   ||  Right   ||  Up   ||  Left   |
----------------------------------
