In [1]:
import numpy as np
import matplotlib.pyplot as plt
from typing import Dict, List, Tuple, Optional

In [2]:
class MDP:
    """
    Markov Decision Process described by lookup tables.

    Transition probabilities and rewards are kept in dictionaries keyed by
    ``(s, a, s_next)`` tuples; any triple missing from a table is treated
    as having probability (or reward) 0.0.
    """

    def __init__(self, states: List[int], actions: List[int],
                 transitions: Dict, rewards: Dict, gamma: float = 0.9):
        """
        Store the model components and cache the state/action counts.

        Args:
            states: State identifiers.
            actions: Action identifiers.
            transitions: Mapping ``(s, a, s_next) -> P(s'|s,a)``.
            rewards: Mapping ``(s, a, s_next) -> R(s,a,s')``.
            gamma: Discount factor.
        """
        self.states, self.actions = states, actions
        self.transitions, self.rewards = transitions, rewards
        self.gamma = gamma
        # Sizes are captured once at construction time.
        self.n_states, self.n_actions = len(states), len(actions)

    def get_transition_prob(self, s: int, a: int, s_next: int) -> float:
        """Return P(s'|s,a), or 0.0 when the triple is not in the table."""
        key = (s, a, s_next)
        return self.transitions.get(key, 0.0)

    def get_reward(self, s: int, a: int, s_next: int) -> float:
        """Return R(s,a,s'), or 0.0 when the triple is not in the table."""
        key = (s, a, s_next)
        return self.rewards.get(key, 0.0)

In [4]:
def value_iteration(self, theta: float = 1e-6, max_iterations: int = 1000) -> Tuple[np.ndarray, np.ndarray, List[float]]:
        """
        Run synchronous value iteration on the MDP held by ``self``.

        Every sweep rebuilds the value array from the previous sweep's
        estimates: each state's new value is the maximum over actions of
        ``sum_s' P(s'|s,a) * (R(s,a,s') + gamma * V[s'])``.

        Note: state identifiers are used directly as indices into the value
        array.

        Args:
            theta: Convergence threshold; sweeping stops once the largest
                absolute value change within a sweep falls below it.
            max_iterations: Upper bound on the number of sweeps.

        Returns:
            A tuple ``(V, policy, deltas)``: the final value array, the
            result of ``self.extract_policy(V)``, and the largest value
            change recorded for each sweep that was run.
        """

        def backup(state, action, estimates):
            # Expected one-step return of taking `action` in `state`,
            # bootstrapping from the previous sweep's estimates.
            total = 0
            for successor in self.states:
                p = self.get_transition_prob(state, action, successor)
                r = self.get_reward(state, action, successor)
                total += p * (r + self.gamma * estimates[successor])
            return total

        estimates = np.zeros(self.n_states)
        deltas = []

        for iteration in range(max_iterations):
            refreshed = np.zeros(self.n_states)
            largest_change = 0

            for state in self.states:
                # Bellman optimality backup: keep the best action's return.
                refreshed[state] = max(
                    [backup(state, action, estimates) for action in self.actions]
                )
                change = abs(refreshed[state] - estimates[state])
                if change > largest_change:
                    largest_change = change

            estimates = refreshed
            deltas.append(largest_change)

            # Stop as soon as a full sweep changes no value by theta or more.
            if largest_change < theta:
                print(f"Value iteration converged after {iteration + 1} iterations")
                break

        # The greedy policy is derived from the final value estimates.
        return estimates, self.extract_policy(estimates), deltas