In [1]:
from typing import Dict, List, Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np

In [2]:
class MDP:
    """Container for a finite Markov Decision Process.

    Attributes:
        states: The list of states, stored as given.
        actions: The list of actions, stored as given.
        transitions: Mapping looked up with ``(state, action, next_state)``
            keys; the value is the transition probability P(s'|s,a).
        rewards: Mapping looked up with ``(state, action, next_state)``
            keys; the value is the reward R(s,a,s').
        gamma: Discount factor (defaults to 0.9).
        n_states: ``len(states)`` captured at construction time.
        n_actions: ``len(actions)`` captured at construction time.
    """

    def __init__(self, states: List, actions: List, transitions: Dict,
                 rewards: Dict, gamma: float = 0.9):
        """Store the MDP definition and cache the state/action counts."""
        # Counts are snapshots: they are not refreshed if the lists are
        # mutated after construction.
        self.n_states = len(states)
        self.n_actions = len(actions)

        self.states = states
        self.actions = actions
        self.transitions = transitions
        self.rewards = rewards
        self.gamma = gamma

    def get_transition_prob(self, state, action, next_state):
        """Return P(s'|s,a); triples absent from ``transitions`` yield 0.0."""
        key = (state, action, next_state)
        return self.transitions.get(key, 0.0)

    def get_reward(self, state, action, next_state):
        """Return R(s,a,s'); triples absent from ``rewards`` yield 0.0."""
        key = (state, action, next_state)
        return self.rewards.get(key, 0.0)

In [3]:
class PolicyEvaluationImprovement:
    """Policy evaluation and improvement algorithms for a tabular MDP.

    The wrapped ``mdp`` object must expose ``states``, ``n_states``,
    ``gamma``, ``get_transition_prob(s, a, s_next)`` and
    ``get_reward(s, a, s_next)``.
    """

    # The annotation is a forward reference so this class can be defined
    # without ``MDP`` already being in scope.
    def __init__(self, mdp: "MDP"):
        self.mdp = mdp

    def policy_evaluation(self, policy: Dict, theta: float = 1e-6,
                          max_iterations: int = 1000) -> np.ndarray:
        """
        Evaluate a deterministic policy using iterative policy evaluation.

        Each sweep performs a synchronous backup: every entry of the new
        value vector is computed from the previous sweep's values only.

        Args:
            policy: Dict mapping state -> action. Must contain every state
                in ``mdp.states``.
            theta: Convergence threshold on the largest absolute change of
                any state's value within one sweep.
            max_iterations: Maximum number of sweeps.

        Returns:
            Value function V(s) as an array ordered like ``mdp.states``.
            If ``max_iterations`` is exhausted first, the last computed
            values are returned and a message is printed.

        Raises:
            KeyError: If ``policy`` has no entry for one of the states.
        """
        mdp = self.mdp
        states = mdp.states
        gamma = mdp.gamma
        V = np.zeros(mdp.n_states)

        for iteration in range(max_iterations):
            delta = 0
            V_new = np.zeros(mdp.n_states)

            for s_idx, state in enumerate(states):
                action = policy[state]
                v = 0

                # Expected one-step return under the policy's action.
                # enumerate() yields the successor's index directly, which
                # avoids an O(n) list.index() lookup per transition.
                for s_next_idx, next_state in enumerate(states):
                    prob = mdp.get_transition_prob(state, action, next_state)
                    reward = mdp.get_reward(state, action, next_state)
                    v += prob * (reward + gamma * V[s_next_idx])

                V_new[s_idx] = v
                delta = max(delta, abs(V_new[s_idx] - V[s_idx]))

            # V_new is freshly allocated every sweep, so it can be adopted
            # without copying.
            V = V_new

            if delta < theta:
                print(f"Policy evaluation converged in {iteration + 1} iterations")
                break
        else:
            # Reached only when the loop ends without hitting `break`.
            print(f"Policy evaluation did not converge within {max_iterations} iterations")

        return V

In [4]:
def policy_improvement(self, V: np.ndarray,
                       old_policy: Optional[Dict] = None) -> Tuple[Dict, bool]:
    """
    Build the greedy policy with respect to a value function.

    For every state, each action's expected one-step return
    ``sum_s' P(s'|s,a) * (R(s,a,s') + gamma * V[s'])`` is computed and the
    action with the largest value is selected. Ties go to the action that
    appears first in ``mdp.actions`` (the comparison is strict).

    Args:
        V: Value function, indexed in the same order as ``mdp.states``.
        old_policy: Optional dict mapping state -> action to compare the
            greedy policy against. When omitted, no comparison is possible
            and the stability flag is returned as True, matching the
            behaviour of calls that pass only ``V``.

    Returns:
        A tuple ``(policy, policy_stable)`` where ``policy`` maps each state
        to its greedy action (None if ``mdp.actions`` is empty) and
        ``policy_stable`` is False if ``old_policy`` was given and differs
        from ``policy`` in at least one state.
    """
    mdp = self.mdp
    states = mdp.states
    gamma = mdp.gamma

    policy = {}
    policy_stable = True

    for state in states:
        best_action = None
        best_value = float('-inf')

        # Find best action for this state
        for action in mdp.actions:
            action_value = 0

            # enumerate() gives the successor's index directly instead of
            # an O(n) list.index() lookup per transition.
            for s_next_idx, next_state in enumerate(states):
                prob = mdp.get_transition_prob(state, action, next_state)
                reward = mdp.get_reward(state, action, next_state)
                action_value += prob * (reward + gamma * V[s_next_idx])

            if action_value > best_value:
                best_value = action_value
                best_action = action

        policy[state] = best_action

        # A state missing from old_policy counts as a change.
        if old_policy is not None and old_policy.get(state) != best_action:
            policy_stable = False

    return policy, policy_stable

# Attach the module-level `policy_improvement` function to the
# PolicyEvaluationImprovement class so instances can call it as a method
# (its first parameter is `self`).
PolicyEvaluationImprovement.policy_improvement = policy_improvement