In [1]:
import numpy as np
import matplotlib.pyplot as plt
from typing import Dict, List, Tuple, Optional

In [2]:
class MDP:
    """
    Container for a finite Markov Decision Process.

    Transition probabilities and rewards are stored in dictionaries keyed
    by ``(s, a, s_next)`` tuples; any triple missing from a dictionary is
    treated as 0.0 by the accessor methods.
    """

    def __init__(self, states: List[int], actions: List[int],
                 transitions: Dict, rewards: Dict, gamma: float = 0.9):
        """
        Store the components of the MDP.

        Args:
            states: State identifiers.
            actions: Action identifiers.
            transitions: Mapping ``(s, a, s_next) -> probability``.
            rewards: Mapping ``(s, a, s_next) -> reward``.
            gamma: Discount factor.
        """
        self.states, self.actions = states, actions
        self.transitions, self.rewards = transitions, rewards
        self.gamma = gamma
        # Cache the set sizes so callers can allocate arrays directly.
        self.n_states, self.n_actions = len(states), len(actions)

    def get_transition_prob(self, s: int, a: int, s_next: int) -> float:
        """Return P(s'|s,a), or 0.0 when the triple has no entry."""
        key = (s, a, s_next)
        return self.transitions.get(key, 0.0)

    def get_reward(self, s: int, a: int, s_next: int) -> float:
        """Return R(s,a,s'), or 0.0 when the triple has no entry."""
        key = (s, a, s_next)
        return self.rewards.get(key, 0.0)

In [4]:
def value_iteration(self, theta: float = 1e-6, max_iterations: int = 1000) -> Tuple[np.ndarray, np.ndarray, List[float]]:
        """
        Run synchronous value iteration on the MDP.

        Every sweep recomputes all state values from the previous sweep's
        estimates and stops early once the largest change falls below
        ``theta``. State identifiers are used directly as array indices.

        Args:
            theta: Convergence threshold on the largest per-state change.
            max_iterations: Maximum number of sweeps to perform.

        Returns:
            V: Value estimate for each state.
            policy: Result of ``self.extract_policy(V)``.
            deltas: Largest value change observed in each sweep.
        """

        def backup(state, action, values):
            # Expected one-step return of taking `action` in `state`.
            total = 0
            for successor in self.states:
                p = self.get_transition_prob(state, action, successor)
                r = self.get_reward(state, action, successor)
                total += p * (r + self.gamma * values[successor])
            return total

        V = np.zeros(self.n_states)
        deltas = []

        for iteration in range(max_iterations):
            refreshed = np.zeros(self.n_states)
            delta = 0

            for state in self.states:
                # Bellman optimality update: best action value for this state.
                candidates = [backup(state, action, V) for action in self.actions]
                refreshed[state] = max(candidates)
                change = abs(refreshed[state] - V[state])
                delta = max(delta, change)

            V = refreshed
            deltas.append(delta)

            # Stop as soon as no state moved by at least theta.
            if delta < theta:
                print(f"Value iteration converged after {iteration + 1} iterations")
                break

        # Derive the policy from the final value estimates.
        return V, self.extract_policy(V), deltas

In [5]:
def extract_policy(self, V: np.ndarray) -> np.ndarray:
        """
        Extract the greedy policy from a value function.

        For every state the one-step lookahead value of each action is
        computed from ``V`` and the best action is selected. Ties go to the
        action that appears first in ``self.actions``.

        Args:
            V: Value function, indexed by state.

        Returns:
            policy: Optimal action (an element of ``self.actions``) for
                each state, indexed by state.
        """
        policy = np.zeros(self.n_states, dtype=int)

        for s in self.states:
            q_values = []
            for a in self.actions:
                q_value = 0
                for s_next in self.states:
                    prob = self.get_transition_prob(s, a, s_next)
                    reward = self.get_reward(s, a, s_next)
                    q_value += prob * (reward + self.gamma * V[s_next])
                q_values.append(q_value)

            # np.argmax gives a position within self.actions; translate it
            # back to the action itself so the policy stays valid when the
            # actions are not simply 0..n_actions-1.
            best_index = int(np.argmax(q_values))
            policy[s] = self.actions[best_index]

        return policy

In [6]:
def simulate_episode(self, policy: np.ndarray, start_state: int,
                        max_steps: int = 100) -> Tuple[List[int], List[int], List[float]]:
        """
        Simulate one episode by following ``policy`` from ``start_state``.

        At each step the action ``policy[current_state]`` is taken and the
        next state is sampled with ``np.random.choice`` from the successors
        that have positive transition probability. The episode ends after
        ``max_steps`` steps, or earlier when the current state has no
        successor with positive probability.

        Args:
            policy: Action to take in each state, indexed by state.
            start_state: State in which the episode begins.
            max_steps: Maximum number of transitions to simulate.

        Returns:
            states: Visited states, beginning with ``start_state``; always
                one entry longer than ``actions`` and ``rewards``.
            actions: Action taken at each completed transition.
            rewards: Reward received at each completed transition.
        """
        states = [start_state]
        actions = []
        rewards = []

        current_state = start_state

        for _ in range(max_steps):
            # Select action according to policy
            action = policy[current_state]

            # Collect the successors reachable with positive probability
            next_states = []
            next_state_probs = []
            for s_next in self.states:
                prob = self.get_transition_prob(current_state, action, s_next)
                if prob > 0:
                    next_states.append(s_next)
                    next_state_probs.append(prob)

            if not next_states:
                # Nowhere to go: end the episode without recording an action
                # for a transition that never happens.
                break

            # Normalize in floating point so integer-valued probabilities
            # (e.g. 1) do not break the in-place division.
            probs = np.array(next_state_probs, dtype=float)
            probs /= probs.sum()

            # Sample a position rather than a value so the state objects
            # from self.states are returned unchanged.
            chosen = np.random.choice(len(next_states), p=probs)
            next_state = next_states[chosen]

            actions.append(action)
            rewards.append(self.get_reward(current_state, action, next_state))
            states.append(next_state)
            current_state = next_state

        return states, actions, rewards