In [1]:
import numpy as np

# States: 0 = A, 1 = B
# Actions: 0 = Left, 1 = Right
gamma = 0.9  # discount factor


In [2]:
# Transition probabilities and rewards
# P[s][a] = list of (probability, next_state, reward)
P = {
    0: {  # State A
        0: [(1.0, 0, 0)],        # Left -> Stay in A
        1: [(1.0, 1, 1)]         # Right -> Go to B, reward = 1
    },
    1: {  # State B
        0: [(1.0, 0, 2)],        # Left -> Go to A, reward = 2
        1: [(1.0, 1, 0)]         # Right -> Stay in B
    }
}

In [4]:
def bellman_update(V, state):
    """Performs Bellman update for a single state."""
    values = []
    for action in P[state]:
        q = 0
        for prob, next_state, reward in P[state][action]:
            q += prob * (reward + gamma * V[next_state])
        values.append(q)
    return max(values)  # Optimality Bellman Equation

In [5]:
# Initialize value function
V = np.zeros(2)


In [6]:
# Run Value Iteration
for _ in range(10):
    V_new = np.copy(V)
    for s in range(2):
        V_new[s] = bellman_update(V, s)
    V = V_new

print("Optimal Value Function:", V)


Optimal Value Function: [9.59842299 9.94122381]
