In [13]:
import numpy as np
import pandas as pd
import seaborn as sns

# Value Iteration

In [14]:
# Initialize thevariables
cols = 4
rows = 3

gamma = 0.9
epsilon = 0.001

R = [[-0.04,-0.04,-0.04,1],
     [-0.04,-0.04,-0.04,-1],
     [-0.04,-0.04,-0.04,-0.04]]

U = [[0,0,0,1],
     [0,0,0,-1],
     [0,0,0,0],
     [0,0,0,0]]

p_intended = 0.8
p_slip = 0.1

actions = {
    'up':    (0, -1),
    'down':  (0, 1),
    'left':  (-1, 0),
    'right': (1, 0)
}

In [15]:
U = [[0.0 for x in range(cols)] for y in range(rows)]

In [16]:
# Initialize the Terminal State utilities
U[1][3] = -1
U[0][3] = 1

In [17]:
def in_grid(x,y):
  return 0<x<cols and 0<y<rows

In [18]:
def get_next_states(x,y,action):
  dx,dy = actions[action]

  ix,iy = dx+x, dy+y

  if not in_grid(ix,iy):
    ix,iy = x,y

  if action in ['up', 'down']:
    perp_actions = ['left', 'right']
  else:
    perp_actions = ['up', 'down']

  perp_states = []
  for perp_action in perp_actions:
    pdx, pdy = actions[perp_action]
    px, py = x + pdx, y + pdy
    if not in_grid(px, py):
        px, py = x, y
    perp_states.append((px, py))

  return [
        (p_intended, ix, iy),
        (p_slip, perp_states[0][0], perp_states[0][1]),
        (p_slip, perp_states[1][0], perp_states[1][1])
    ]

terminal_states = [(3, 0), (3, 1)]

In [19]:
#Initialize Terminal States

terminal_states = [(3, 0), (3, 1)]

In [20]:
iteration = 0
while True:
    iteration += 1
    delta = 0
    U_new = [[0 for _ in range(cols)] for _ in range(rows)]

    for y in range(rows):
        for x in range(cols):
            if (x, y) in terminal_states:
                U_new[y][x] = R[y][x]
                continue

            max_utility = float('-inf')
            for action in actions:
                expected_utility = 0
                for (prob, nx, ny) in get_next_states(x, y, action):
                    expected_utility += prob * U[ny][nx]
                max_utility = max(max_utility, expected_utility)

            U_new[y][x] = R[y][x] + gamma * max_utility
            delta = max(delta, abs(U_new[y][x] - U[y][x]))

    U = U_new

    print(f"Iteration {iteration}:")
    for row in U:
        print(["{0:.3f}".format(v) for v in row])
    print()

    if delta < epsilon * (1 - gamma) / gamma:
        break

print("Converged after", iteration, "iterations.")

Iteration 1:
['-0.040', '-0.040', '-0.040', '1.000']
['-0.040', '-0.040', '-0.040', '-1.000']
['-0.040', '-0.040', '-0.040', '-0.040']

Iteration 2:
['-0.076', '-0.076', '-0.076', '1.000']
['-0.076', '-0.076', '-0.076', '-1.000']
['-0.076', '-0.076', '-0.076', '-0.076']

Iteration 3:
['-0.108', '-0.108', '-0.108', '1.000']
['-0.108', '-0.108', '-0.108', '-1.000']
['-0.108', '-0.108', '-0.108', '-0.108']

Iteration 4:
['-0.138', '-0.138', '-0.138', '1.000']
['-0.138', '-0.138', '-0.138', '-1.000']
['-0.138', '-0.138', '-0.138', '-0.138']

Iteration 5:
['-0.164', '-0.164', '-0.164', '1.000']
['-0.164', '-0.164', '-0.164', '-1.000']
['-0.164', '-0.164', '-0.164', '-0.164']

Iteration 6:
['-0.187', '-0.187', '-0.187', '1.000']
['-0.187', '-0.187', '-0.187', '-1.000']
['-0.187', '-0.187', '-0.187', '-0.187']

Iteration 7:
['-0.209', '-0.209', '-0.209', '1.000']
['-0.209', '-0.209', '-0.209', '-1.000']
['-0.209', '-0.209', '-0.209', '-0.209']

Iteration 8:
['-0.228', '-0.228', '-0.228', '1.0