In [6]:
import numpy as np

# Define the MDP
# Example: a small three-state MDP described by its states, actions,
# transition probabilities, rewards, and gamma (discount factor).
states = ['S1', 'S2', 'S3']
actions = ['up', 'down', 'left', 'right']

# transitions[state][action][next_state] -> probability of landing in
# next_state after taking action in state.
transitions = {
    'S1': {
        'up': {'S1': 0.8, 'S2': 0.1, 'S3': 0.1},
        'down': {'S1': 0.2, 'S2': 0.3, 'S3': 0.5},
        'left': {'S1': 0.5, 'S2': 0.2, 'S3': 0.3},
        'right': {'S1': 0.1, 'S2': 0.7, 'S3': 0.2},
    },
    'S2': {
        'up': {'S1': 0.3, 'S2': 0.4, 'S3': 0.3},
        'down': {'S1': 0.1, 'S2': 0.2, 'S3': 0.7},
        'left': {'S1': 0.6, 'S2': 0.1, 'S3': 0.3},
        'right': {'S1': 0.4, 'S2': 0.5, 'S3': 0.1},
    },
    'S3': {
        'up': {'S1': 0.5, 'S2': 0.3, 'S3': 0.2},
        'down': {'S1': 0.4, 'S2': 0.1, 'S3': 0.5},
        'left': {'S1': 0.2, 'S2': 0.6, 'S3': 0.2},
        'right': {'S1': 0.3, 'S2': 0.4, 'S3': 0.3},
    },
}

# rewards[state][action][next_state] -> reward received for that transition.
rewards = {
    'S1': {
        'up': {'S1': 0, 'S2': 1, 'S3': -1},
        'down': {'S1': 0, 'S2': -1, 'S3': 1},
        'left': {'S1': 0, 'S2': 0, 'S3': 0},
        'right': {'S1': 0, 'S2': 0, 'S3': 0},
    },
    'S2': {
        'up': {'S1': 1, 'S2': -1, 'S3': 0},
        'down': {'S1': 0, 'S2': 0, 'S3': 1},
        'left': {'S1': -1, 'S2': 0, 'S3': 0},
        'right': {'S1': 0, 'S2': 1, 'S3': -1},
    },
    'S3': {
        'up': {'S1': -1, 'S2': 0, 'S3': 1},
        'down': {'S1': 0, 'S2': 1, 'S3': -1},
        'left': {'S1': 0, 'S2': -1, 'S3': 1},
        'right': {'S1': 1, 'S2': 0, 'S3': 0},
    },
}


gamma = 0.9  # Discount factor

# Sanity checks: the tables above are typed by hand, so fail fast on a typo
# instead of silently producing wrong state values later on.
assert all(
    abs(sum(transitions[s][a].values()) - 1.0) < 1e-9
    for s in states
    for a in actions
), "each transitions[state][action] distribution must sum to 1"
assert all(
    rewards[s][a].keys() == transitions[s][a].keys()
    for s in states
    for a in actions
), "rewards must define a value for every possible transition"

In [7]:
# One-step Bellman backup for a single (state, action) pair
def bellman_expectation(state_values, state, action):
    """Return the expected one-step return of taking `action` in `state`.

    For every successor listed in ``transitions[state][action]`` the
    transition reward plus the discounted current estimate of the successor's
    value is weighted by the transition probability, and the weighted terms
    are added up.

    Reads the module-level ``transitions``, ``rewards`` and ``gamma``.

    Args:
        state_values: Mapping from state to its current value estimate.
        state: The state the action is taken in.
        action: The action being evaluated.

    Returns:
        The probability-weighted sum described above (``0`` when the
        transition table lists no successors for the pair).
    """
    outcomes = transitions[state][action]
    expected_return = 0
    # Explicit accumulation keeps the floating-point addition order fixed.
    for successor, probability in outcomes.items():
        immediate = rewards[state][action][successor]
        expected_return += probability * (immediate + gamma * state_values[successor])
    return expected_return

# Initialize state values arbitrarily. The stopping test below runs until the
# estimates stop changing, so the random start does not affect the values
# that get printed.
state_values = {state: np.random.random() for state in states}

# Value iteration: repeatedly replace each state's value with the best
# one-step backup over all actions.
# Each sweep is only guaranteed to shrink the error by a factor of gamma, so a
# fixed budget of 100 sweeps at gamma = 0.9 (0.9**100 ~ 2.7e-5) can leave an
# error that still shows up in the third printed decimal. Instead, iterate
# until the largest per-state change falls below `tolerance`, and keep
# `num_iterations` as a safety cap.
num_iterations = 10000  # Upper bound on the number of sweeps
tolerance = 1e-10  # Stop once no state value moves by more than this
for i in range(num_iterations):
    new_state_values = state_values.copy()
    for state in states:
        values = [bellman_expectation(state_values, state, action) for action in actions]
        new_state_values[state] = max(values)  # Update state value
    # Largest change in this sweep; default covers an empty state list.
    max_change = max(
        (abs(new_state_values[state] - state_values[state]) for state in states),
        default=0.0,
    )
    state_values = new_state_values
    if max_change < tolerance:
        break
else:
    # Only reached when the loop ran out of sweeps without hitting `break`.
    print(f"Warning: value iteration did not converge within {num_iterations} sweeps")

# Print the final state values
print("Final State Values:")
for state, value in state_values.items():
    print(f"{state}: {value:.3f}")

Final State Values:
S1: 3.838
S2: 4.317
S3: 3.959
