<a href="https://colab.research.google.com/github/FrankHolzkamp/FundamentalsActiveInference/blob/main/POMDP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

# Grid world environment: a 2x2 board whose cells are (row, column) pairs
states = [(row, col) for row in range(2) for col in range(2)]  # Grid positions
actions = [
    'up',
    'down',
    'left',
    'right',
]
terminal_states = [(0, 1), (1, 1)]
# Reward collected when a move lands on the given cell; other cells yield 0
rewards = {
    (0, 1): 10,
    (1, 1): -10,
}

# Deterministic transition model (each state/action pair has exactly one successor)
def transition(state, action):
    """Return the cell reached by taking ``action`` from ``state``.

    A state listed in ``terminal_states`` is returned unchanged, whatever
    the action. Otherwise the first coordinate moves with 'up'/'down' and
    the second with 'left'/'right'; a move that would leave the 0..1 range
    is clamped at the border. Any other action leaves the position as is.
    """
    if state in terminal_states:
        return state

    row, col = state
    if action == 'up':
        return (max(row - 1, 0), col)
    if action == 'down':
        return (min(row + 1, 1), col)
    if action == 'left':
        return (row, max(col - 1, 0))
    if action == 'right':
        return (row, min(col + 1, 1))

    # Unrecognised action: stay put
    return (row, col)

# Start with a zero value estimate everywhere and a random action for each
# non-terminal state
value_function = dict.fromkeys(states, 0)
policy = dict(
    (cell, np.random.choice(actions))
    for cell in states
    if cell not in terminal_states
)

# Parameters
gamma = 0.9   # Discount factor
theta = 1e-4  # Convergence threshold

# Policy Evaluation
def policy_evaluation(policy, value_function):
    """Update ``value_function`` in place until it settles under ``policy``.

    Each sweep visits the non-terminal states in ``states`` order and
    overwrites their value with the reward of the successor state plus the
    discounted (``gamma``) value of that successor. Sweeping stops once the
    largest change seen in a sweep is below ``theta``. Nothing is returned.
    """
    to_update = [s for s in states if s not in terminal_states]

    converged = False
    while not converged:
        largest_change = 0
        for s in to_update:
            previous = value_function[s]
            successor = transition(s, policy[s])
            # Values are updated in place, so later states in the same
            # sweep already see this new estimate.
            value_function[s] = (
                rewards.get(successor, 0) + gamma * value_function[successor]
            )
            largest_change = max(
                largest_change, abs(previous - value_function[s])
            )
        converged = largest_change < theta

# Policy Improvement
def policy_improvement(policy, value_function):
    """Make ``policy`` greedy with respect to ``value_function``, in place.

    For every non-terminal state each action is scored by the reward of
    the state it leads to plus the discounted value of that state, and the
    highest-scoring action is stored (the first one in ``actions`` order
    wins ties).

    Returns True when no state's action changed, False otherwise.
    """
    changed = False

    for s in states:
        if s in terminal_states:
            continue

        previous_action = policy[s]

        scores = {}
        for candidate in actions:
            successor = transition(s, candidate)
            scores[candidate] = (
                rewards.get(successor, 0) + gamma * value_function[successor]
            )

        greedy_action = max(scores, key=lambda a: scores.get(a))
        policy[s] = greedy_action

        if previous_action != greedy_action:
            changed = True

    return not changed

# Policy Iteration
def policy_iteration():
    """Alternate evaluation and improvement until the policy stops changing.

    Works on the module-level ``policy`` and ``value_function``, both of
    which are modified in place. Nothing is returned.
    """
    stable = False
    while not stable:
        policy_evaluation(policy, value_function)
        stable = policy_improvement(policy, value_function)

# Solve the grid world, then report the result
policy_iteration()

# Output the optimal policy (terminal states have no action to show)
print("Optimal Policy:")
for state in states:
    if state in terminal_states:
        continue
    print(f"State {state}: {policy[state]}")

# Output the value function for every state, terminal ones included
print("\nValue Function:")
for state in states:
    print(f"State {state}: {value_function[state]:.2f}")

Optimal Policy:
State (0, 0): right
State (1, 0): up

Value Function:
State (0, 0): 10.00
State (0, 1): 0.00
State (1, 0): 9.00
State (1, 1): 0.00
