In [14]:
import numpy as np

# GridWorld parameters
ROWS = 3
COLS = 4
REWARD_STEP = -0.04
DISCOUNT = 0.9
THRESHOLD = 0.01
ACTIONS = ['U', 'D', 'L', 'R']
ACTION_TO_DELTA = {
    'U': (-1, 0),
    'D': (1, 0),
    'L': (0, -1),
    'R': (0, 1)
}

# Grid setup
terminal_states = {(0, 3): 1, (1, 3): -1}
walls = [(1, 1)]

utilities = np.zeros((ROWS, COLS))
policy = np.full((ROWS, COLS), ' ')

# Define transition probabilities (80% intended, 10% sides)
def get_transitions(state, action):
    i, j = state
    transitions = []

    def move(a):
        di, dj = ACTION_TO_DELTA[a]
        ni, nj = i + di, j + dj
        if 0 <= ni < ROWS and 0 <= nj < COLS and (ni, nj) not in walls:
            return (ni, nj)
        return (i, j)

    intended = move(action)
    left = move(ACTIONS[(ACTIONS.index(action) - 1) % 4])
    right = move(ACTIONS[(ACTIONS.index(action) + 1) % 4])

    transitions.append((0.8, intended))
    transitions.append((0.1, left))
    transitions.append((0.1, right))
    return transitions

# Value Iteration
def value_iteration():
    global utilities
    while True:
        delta = 0
        new_utilities = np.copy(utilities)
        for i in range(ROWS):
            for j in range(COLS):
                if (i, j) in terminal_states or (i, j) in walls:
                    continue
                max_utility = float('-inf')
                for action in ACTIONS:
                    expected_utility = sum(prob * utilities[next_state]
                                           for prob, next_state in get_transitions((i, j), action))
                    if expected_utility > max_utility:
                        max_utility = expected_utility
                new_utilities[i, j] = REWARD_STEP + DISCOUNT * max_utility
                delta = max(delta, abs(new_utilities[i, j] - utilities[i, j]))
        utilities = new_utilities
        if delta < THRESHOLD:
            break

# Policy Extraction
def extract_policy():
    for i in range(ROWS):
        for j in range(COLS):
            if (i, j) in terminal_states:
                policy[i, j] = str(terminal_states[(i, j)])
            elif (i, j) in walls:
                policy[i, j] = 'W'
            else:
                best_action = None
                max_utility = float('-inf')
                for action in ACTIONS:
                    expected_utility = sum(prob * utilities[next_state]
                                           for prob, next_state in get_transitions((i, j), action))
                    if expected_utility > max_utility:
                        max_utility = expected_utility
                        best_action = action
                policy[i, j] = best_action

# Run value iteration and extract policy
value_iteration()
extract_policy()

# Display results
print("Utilities:")
print(np.round(utilities, 2))

print("\nOptimal Policy:")
for row in policy:
    print(' '.join(row))


Utilities:
[[-0.13 -0.1  -0.05  0.  ]
 [-0.16  0.   -0.05  0.  ]
 [-0.16 -0.13 -0.09 -0.05]]

Optimal Policy:
R R R 1
U W R -
R R U U
