In [19]:
import numpy as np
import pandas as pd
import seaborn as sns

# Value Iteration

In [20]:
# Initialize thevariables
cols = 4
rows = 3

gamma = 0.9
epsilon = 0.001

R = [[-0.04,-0.04,-0.04,1],
     [-0.04,0,-0.04,-1],
     [-0.04,-0.04,-0.04,-0.04]]

U = [[0,0,0,1],
     [0,0,0,-1],
     [0,0,0,0]]

p_intended = 0.8
p_slip = 0.1

actions = {
    'up':    (0, 1),
    'down':  (0, -1),
    'left':  (-1, 0),
    'right': (1, 0)
}

walls = [(1,1)]

In [21]:
U = [[0.0 for x in range(cols)] for y in range(rows)]

In [22]:
# Initialize the Terminal State utilities
U[1][3] = -1
U[2][3] = 1

In [23]:
def in_grid(x,y):
  return 0<x<cols and 0<y<rows and (x!=1 and y!=1)

In [24]:
def get_next_states(x,y,action):
  dx,dy = actions[action]

  ix,iy = dx+x, dy+y

  if not in_grid(ix,iy):
    ix,iy = x,y

  if action in ['up', 'down']:
    perp_actions = ['left', 'right']
  else:
    perp_actions = ['up', 'down']

  perp_states = []
  for perp_action in perp_actions:
    pdx, pdy = actions[perp_action]
    px, py = x + pdx, y + pdy
    if not in_grid(px, py):
        px, py = x, y
    perp_states.append((px, py))

  return [
        (p_intended, ix, iy),
        (p_slip, perp_states[0][0], perp_states[0][1]),
        (p_slip, perp_states[1][0], perp_states[1][1])
    ]

In [25]:
#Initialize Terminal States

terminal_states = [(3, 1), (3, 2)]

In [26]:
iteration = 0
while True:
    iteration += 1
    delta = 0
    U_new = [[0 for _ in range(cols)] for _ in range(rows)]

    for y in range(rows):
        for x in range(cols):
            if (x, y) in terminal_states or (x, y) in walls:
                U_new[y][x] = R[y][x]
                continue

            max_utility = float('-inf')
            for action in actions:
                expected_utility = 0
                for (prob, nx, ny) in get_next_states(x, y, action):
                    expected_utility += prob * U[ny][nx]
                max_utility = max(max_utility, expected_utility)

            U_new[y][x] = R[y][x] + gamma * max_utility
            delta = max(delta, abs(U_new[y][x] - U[y][x]))

    U = U_new

    print(f"Iteration {iteration}:")
    for row in U:
        print(["{0:.3f}".format(v) for v in row])
    print()

    if delta < epsilon * (1 - gamma) / gamma:
        break

print("Converged after", iteration, "iterations.")

Iteration 1:
['-0.040', '-0.040', '-0.040', '1.000']
['-0.040', '0.000', '-0.040', '-1.000']
['-0.040', '-0.040', '0.680', '-0.040']

Iteration 2:
['-0.076', '-0.076', '-0.076', '1.900']
['-0.076', '0.000', '0.442', '-1.000']
['-0.076', '0.442', '0.572', '-0.040']

Iteration 3:
['-0.108', '-0.108', '-0.108', '2.710']
['-0.108', '0.000', '0.451', '-1.000']
['-0.108', '0.451', '0.475', '-0.040']

Iteration 4:
['-0.138', '-0.138', '-0.138', '3.439']
['-0.138', '0.000', '0.383', '-1.000']
['-0.138', '0.383', '0.387', '-0.040']

Iteration 5:
['-0.164', '-0.164', '-0.164', '4.095']
['-0.164', '0.000', '0.308', '-1.000']
['-0.164', '0.308', '0.309', '-0.040']

Iteration 6:
['-0.187', '-0.187', '-0.187', '4.686']
['-0.187', '0.000', '0.238', '-1.000']
['-0.187', '0.238', '0.238', '-0.040']

Iteration 7:
['-0.209', '-0.209', '-0.209', '5.217']
['-0.209', '0.000', '0.174', '-1.000']
['-0.209', '0.174', '0.174', '-0.040']

Iteration 8:
['-0.228', '-0.228', '-0.228', '5.695']
['-0.228', '0.000', '

In [27]:
policy = [['' for _ in range(cols)] for _ in range(rows)]

for y in range(rows):
    for x in range(cols):
        if (x, y) in terminal_states:
            policy[y][x] = 'T'  # Terminal
            continue

        best_action = None
        best_value = float('-inf')
        for action in actions:
            expected_utility = 0
            for (prob, nx, ny) in get_next_states(x, y, action):
                expected_utility += prob * U[ny][nx]
            if expected_utility > best_value:
                best_value = expected_utility
                best_action = action

        arrows = {'up': '↑', 'down': '↓', 'left': '←', 'right': '→'}
        policy[y][x] = arrows[best_action]

# Print policy
for row in policy:
    print(row)


['↑', '↑', '↑', '↑']
['↑', '↑', '↑', 'T']
['↑', '→', '→', 'T']
