In [1]:
import numpy as np
import matplotlib.pyplot as plt

def setup_grid_environment(reward_terminal):
    """
    Define the grid environment with rewards and transition probabilities.
    """
    grid_rewards = np.full(9, -1)  # Default penalty for non-terminal states
    grid_rewards[0] = reward_terminal  # Terminal reward (upper-left corner)
    grid_rewards[2] = 10  # Terminal reward (upper-right corner)

    transitions = np.zeros((9, 4, 9))  # Transition matrix (states, actions, outcomes)
    
    directions = [(-3, 0), (3, 0), (0, -1), (0, 1)]  # Offset for actions (up, down, left, right)

    for state in range(9):
        if state in [0, 2]:
            continue  # Skip terminal states

        for action in range(4):
            intended = state + directions[action][0] + directions[action][1]
            if not (0 <= intended < 9):
                intended = state

            transitions[state, action, intended] += 0.8  # Main direction

            for perp in [(action + 1) % 4, (action + 3) % 4]:
                alternative = state + directions[perp][0] + directions[perp][1]
                if not (0 <= alternative < 9):
                    alternative = state

                transitions[state, action, alternative] += 0.1  # Side directions

    return grid_rewards, transitions

reward_terminal = 10
rewards, transitions = setup_grid_environment(reward_terminal)

for state in range(9):
    if state in [0, 2]:
        continue
    print(f"State {state}:")
    for action in range(4):
        print(f"  Action {action}:")
        for outcome in range(9):
            prob = transitions[state, action, outcome]
            if prob > 0:
                print(f"    To State {outcome}: Probability = {prob}")



State 1:
  Action 0:
    To State 1: Probability = 0.8
    To State 2: Probability = 0.1
    To State 4: Probability = 0.1
  Action 1:
    To State 0: Probability = 0.1
    To State 1: Probability = 0.1
    To State 4: Probability = 0.8
  Action 2:
    To State 0: Probability = 0.8
    To State 2: Probability = 0.1
    To State 4: Probability = 0.1
  Action 3:
    To State 0: Probability = 0.1
    To State 1: Probability = 0.1
    To State 2: Probability = 0.8
State 3:
  Action 0:
    To State 0: Probability = 0.8
    To State 4: Probability = 0.1
    To State 6: Probability = 0.1
  Action 1:
    To State 0: Probability = 0.1
    To State 2: Probability = 0.1
    To State 6: Probability = 0.8
  Action 2:
    To State 2: Probability = 0.8
    To State 4: Probability = 0.1
    To State 6: Probability = 0.1
  Action 3:
    To State 0: Probability = 0.1
    To State 2: Probability = 0.1
    To State 4: Probability = 0.8
State 4:
  Action 0:
    To State 1: Probability = 0.8
    To State 5:

In [2]:

class PolicyEvaluator:
    def __init__(self, rewards, transitions, gamma, initial_policy=None):
        self.num_states = len(rewards)
        self.num_actions = len(transitions[0])
        self.rewards = rewards
        self.transitions = transitions
        self.gamma = gamma
        self.values = np.zeros(self.num_states)
        self.policy = (
            np.random.randint(0, self.num_actions, self.num_states)
            if initial_policy is None
            else initial_policy
        )

    def update_values(self):
        change = 0
        for state in range(self.num_states):
            previous = self.values[state]
            selected_action = self.policy[state]
            self.values[state] = self.rewards[state] + self.gamma * np.sum(
                self.transitions[state, selected_action] * self.values
            )
            change = max(change, abs(previous - self.values[state]))
        return change

    def evaluate_policy(self, tolerance=1e-3):
        for _ in range(100):
            if self.update_values() < tolerance:
                break

    def refine_policy(self):
        adjustments = 0
        for state in range(self.num_states):
            current_action = self.policy[state]
            action_values = [
                np.sum(self.transitions[state, a] * self.values) for a in range(self.num_actions)
            ]
            self.policy[state] = np.argmax(action_values)
            if self.policy[state] != current_action:
                adjustments += 1
        return adjustments

    def optimize(self, tolerance=1e-3, visualize=True):
        iterations = 0
        while iterations < 500:
            self.evaluate_policy(tolerance)
            if self.refine_policy() == 0:
                break
            iterations += 1

        if visualize:
            self.display_results()

    def display_results(self):
        print("Optimized Policy:")
        self.display_grid(self.policy, "Policy")
        print("Value Function:")
        self.display_grid(self.values, "Values")

    def display_grid(self, data, label):
        grid = 3
        symbols = ["\u2191", "\u2193", "\u2190", "\u2192"]
        print(f"{label}:")
        for i in range(grid):
            row = []
            for j in range(grid):
                idx = i * grid + j
                if data is self.policy:
                    row.append("T" if idx in [0, 2] else symbols[data[idx]])
                else:
                    row.append(f"{data[idx]:.2f}")
            print(" ".join(row))
        print()

rewards, transitions = setup_grid_environment(10)
policy_eval = PolicyEvaluator(rewards, transitions, gamma=0.99)
policy_eval.optimize()


Optimized Policy:
Policy:
T → T
↑ ↑ ↑
↑ ↑ ↑

Value Function:
Values:
10.00 8.78 10.00
8.33 7.38 8.29
6.89 6.14 6.94



In [3]:

class ValueOptimizer:
    def __init__(self, rewards, transitions, gamma, tolerance=1e-3):
        self.num_states = len(rewards)
        self.num_actions = len(transitions[0])
        self.rewards = rewards
        self.transitions = transitions
        self.gamma = gamma
        self.tolerance = tolerance
        self.values = np.zeros(self.num_states)
        self.policy = np.zeros(self.num_states, dtype=int)

    def iterate_values(self):
        max_change = 0
        for state in range(self.num_states):
            old_value = self.values[state]
            value_per_action = [
                self.rewards[state] + self.gamma * np.sum(self.transitions[state, a] * self.values)
                for a in range(self.num_actions)
            ]
            self.values[state] = max(value_per_action)
            max_change = max(max_change, abs(old_value - self.values[state]))
        return max_change

    def train(self, max_iterations=500):
        for _ in range(max_iterations):
            if self.iterate_values() < self.tolerance:
                break
        self.policy = self.derive_policy()
        self.display_results()

    def derive_policy(self):
        policy = np.zeros(self.num_states, dtype=int)
        for state in range(self.num_states):
            action_rewards = [
                self.rewards[state] + self.gamma * np.sum(self.transitions[state, a] * self.values)
                for a in range(self.num_actions)
            ]
            policy[state] = np.argmax(action_rewards)
        return policy

    def display_results(self):
        print("Final Policy:")
        self.display_grid(self.policy, "Policy")
        print("Value Function:")
 
    def display_grid(self, data, label):
        grid = 3
        symbols = ["\u2191", "\u2193", "\u2190", "\u2192"]
        print(f"{label}:")
        for i in range(grid):
            row = []
            for j in range(grid):
                idx = i * grid + j
                if data is self.policy:
                    row.append("T" if idx in [0, 2] else symbols[data[idx]])
                else:
                    row.append(f"{data[idx]:.2f}")
            print(" ".join(row))
        print()

reward_values = [100, 3, 0, -3]
for reward in reward_values:
    rewards, transitions = setup_grid_environment(reward)
    value_opt = ValueOptimizer(rewards, transitions, gamma=0.99)
    value_opt.train()

Final Policy:
Policy:
T ↓ T
↑ ← ←
↑ ← ↑

Value Function:
Final Policy:
Policy:
T → T
← → ↑
↑ ↑ ↑

Value Function:
Final Policy:
Policy:
T → T
← → ↑
↑ ↑ ↑

Value Function:
Final Policy:
Policy:
T → T
← → ↑
↑ ↑ ↑

Value Function:
