<a href="https://colab.research.google.com/github/2303a51690/23CSBTB27-28/blob/main/RL(lab_1_MDPs).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

# --- Toy MDP definition ---
# State space: four integer-labelled states.
states = list(range(4))
# Action space: the same two actions are available in every state.
actions = list('ab')

# Transition model, looked up as T[state][action]. Every entry is a list of
# (probability, next_state, reward) outcomes; all transitions here are
# deterministic (a single outcome with probability 1.0).
T = {
    0: {
        'a': [(1.0, 1, 0)],
        'b': [(1.0, 2, 0)],
    },
    1: {
        'a': [(1.0, 0, 0)],
        'b': [(1.0, 3, 1)],  # reaching state 3 from state 1 pays reward 1
    },
    2: {
        'a': [(1.0, 3, 1)],  # reaching state 3 from state 2 pays reward 1
        'b': [(1.0, 0, 0)],
    },
    # Terminal state: both actions self-loop with zero reward.
    3: {
        'a': [(1.0, 3, 0)],
        'b': [(1.0, 3, 0)],
    },
}

# Discount factor applied to future rewards.
gamma = 0.9


In [2]:
def value_iteration(states, actions, T, gamma=0.9, theta=1e-6):
    """Solve a finite MDP by in-place value iteration.

    Parameters
    ----------
    states : sequence
        State labels. If every label is an integer that is a valid index
        into an array of ``len(states)`` entries, the label itself is used
        as the array index (so ``V[s]`` is the value of state ``s``).
        Otherwise labels may be any hashable objects and ``V[i]`` is the
        value of ``states[i]``.
    actions : sequence
        Action labels; the same actions are tried in every state.
    T : mapping
        ``T[s][a]`` is an iterable of ``(probability, next_state, reward)``.
    gamma : float, optional
        Discount factor.
    theta : float, optional
        Sweeps stop once the largest value change in a sweep is below this.

    Returns
    -------
    V : numpy.ndarray
        State values (layout described under ``states``).
    policy : dict
        Maps each state label to a greedy action w.r.t. ``V``; ties go to
        the action listed first in ``actions``.
    """
    n_states = len(states)
    if all(isinstance(s, (int, np.integer)) and -n_states <= s < n_states
           for s in states):
        # Labels are already usable as array indices: keep V[label] layout.
        def idx(s):
            return s
    else:
        # Arbitrary hashable labels: index V by first position in `states`.
        position = {}
        for i, s in enumerate(states):
            position.setdefault(s, i)
        idx = position.__getitem__

    V = np.zeros(n_states)  # Initialize value function

    def q_value(s, a):
        # One-step lookahead: expected reward plus discounted next value.
        return sum(p * (r + gamma * V[idx(s_)]) for p, s_, r in T[s][a])

    while True:
        delta = 0
        for s in states:
            i = idx(s)
            v = V[i]
            # In-place update: later states in this sweep see the new value.
            V[i] = max(q_value(s, a) for a in actions)
            delta = max(delta, abs(v - V[i]))
        if delta < theta:
            break

    # Derive the greedy policy from the converged value function.
    policy = {}
    for s in states:
        policy[s] = max(actions, key=lambda a: q_value(s, a))

    return V, policy

# Solve the MDP defined above with value iteration, then report the
# resulting state values and greedy policy.
V_vi, policy_vi = value_iteration(
    states=states,
    actions=actions,
    T=T,
    gamma=gamma,
)
print("Value Iteration:")
print("Values:", V_vi)
print("Policy:", policy_vi)


Value Iteration:
Values: [0.9 1.  1.  0. ]
Policy: {0: 'a', 1: 'b', 2: 'a', 3: 'a'}


In [3]:
def policy_iteration(states, actions, T, gamma=0.9, theta=1e-6, rng=None):
    """Solve a finite MDP by policy iteration from a random initial policy.

    Alternates iterative policy evaluation (in-place sweeps) with greedy
    policy improvement until no state's action changes.

    Parameters
    ----------
    states : sequence
        State labels. If every label is an integer that is a valid index
        into an array of ``len(states)`` entries, the label itself is used
        as the array index (so ``V[s]`` is the value of state ``s``).
        Otherwise labels may be any hashable objects and ``V[i]`` is the
        value of ``states[i]``.
    actions : sequence
        Action labels; the same actions are tried in every state.
    T : mapping
        ``T[s][a]`` is an iterable of ``(probability, next_state, reward)``.
    gamma : float, optional
        Discount factor.
    theta : float, optional
        Policy evaluation stops once the largest value change in a sweep
        is below this threshold.
    rng : object with a ``choice(n)`` method, optional
        Source of randomness for the initial policy, e.g. a
        ``numpy.random.Generator``. When ``None``, NumPy's global random
        state (``np.random``) is used.

    Returns
    -------
    V : numpy.ndarray
        State values of the final policy (layout described under ``states``).
    policy : dict
        Maps each state label to its action; ties during improvement go to
        the action listed first in ``actions``.
    """
    n_states = len(states)
    if all(isinstance(s, (int, np.integer)) and -n_states <= s < n_states
           for s in states):
        # Labels are already usable as array indices: keep V[label] layout.
        def idx(s):
            return s
    else:
        # Arbitrary hashable labels: index V by first position in `states`.
        position = {}
        for i, s in enumerate(states):
            position.setdefault(s, i)
        idx = position.__getitem__

    # Initialize an arbitrary policy. Draw an index rather than sampling the
    # labels themselves, so the labels are never coerced into a NumPy array
    # (which would fail for e.g. tuple labels).
    sampler = np.random if rng is None else rng
    policy = {s: actions[int(sampler.choice(len(actions)))] for s in states}
    V = np.zeros(n_states)

    def q_value(s, a):
        # One-step lookahead: expected reward plus discounted next value.
        return sum(p * (r + gamma * V[idx(s_)]) for p, s_, r in T[s][a])

    while True:
        # Policy Evaluation: sweep until V stops changing for this policy.
        while True:
            delta = 0
            for s in states:
                i = idx(s)
                v = V[i]
                V[i] = q_value(s, policy[s])
                delta = max(delta, abs(v - V[i]))
            if delta < theta:
                break

        # Policy Improvement: act greedily with respect to the evaluated V.
        policy_stable = True
        for s in states:
            old_action = policy[s]
            policy[s] = max(actions, key=lambda a: q_value(s, a))
            if old_action != policy[s]:
                policy_stable = False

        if policy_stable:
            break

    return V, policy

# Run policy iteration.
# policy_iteration draws its initial policy from NumPy's global random state,
# so seed it first: a fresh "Restart & Run All" then follows the same
# sequence of intermediate policies every time.
np.random.seed(0)
V_pi, policy_pi = policy_iteration(states, actions, T, gamma)
print("\nPolicy Iteration:")
print("Values:", V_pi)
print("Policy:", policy_pi)



Policy Iteration:
Values: [0.9 1.  1.  0. ]
Policy: {0: 'a', 1: 'b', 2: 'a', 3: 'a'}
