<a href="https://colab.research.google.com/github/2303A51937/23CSBTB27-28/blob/main/Assignment3(R).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import random
from collections import defaultdict

# -------------------------------
# Toy MDP Definition
# -------------------------------
states = [0, 1, 2, 3]   # terminal = 3
actions = [0, 1]        # 0 = left, 1 = right
gamma = 0.9

# Transition model: P[s][a] -> (next_state, reward, done)
P = {
    0: {0: (0, 0, False), 1: (1, 0, False)},
    1: {0: (0, 0, False), 1: (2, 0, False)},
    2: {0: (1, 0, False), 1: (3, 1, True)},   # reward when reaching state 3
    3: {0: (3, 0, True),  1: (3, 0, True)}    # terminal
}

# -------------------------------
# Environment Functions
# -------------------------------
def step(state, action):
    """Return next_state, reward, done given current state and action."""
    return P[state][action]

def generate_episode(policy):
    """Generate an episode following a given policy function."""
    episode = []
    state = 0  # start always from state 0
    done = False
    while not done:
        action = policy(state)
        next_state, reward, done = step(state, action)
        episode.append((state, action, reward))
        state = next_state
    return episode

# -------------------------------
# Monte Carlo Policy Evaluation
# -------------------------------
def mc_policy_evaluation(policy, episodes=5000, gamma=0.9):
    V = defaultdict(float)
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)

    for _ in range(episodes):
        episode = generate_episode(policy)
        G = 0
        visited = set()
        for state, action, reward in reversed(episode):
            G = gamma * G + reward
            if state not in visited:  # first-visit MC
                returns_sum[state] += G
                returns_count[state] += 1
                V[state] = returns_sum[state] / returns_count[state]
                visited.add(state)
    return dict(V)

# -------------------------------
# Monte Carlo Control (ε-greedy)
# -------------------------------
def mc_control_epsilon_greedy(episodes=10000, gamma=0.9, epsilon=0.1):
    Q = defaultdict(lambda: np.zeros(len(actions)))
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)

    def epsilon_greedy_policy(state):
        if random.random() < epsilon:
            return random.choice(actions)
        else:
            return np.argmax(Q[state])

    for _ in range(episodes):
        episode = []
        state = 0
        done = False
        while not done:
            action = epsilon_greedy_policy(state)
            next_state, reward, done = step(state, action)
            episode.append((state, action, reward))
            state = next_state

        # update Q-values
        G = 0
        visited = set()
        for state, action, reward in reversed(episode):
            G = gamma * G + reward
            if (state, action) not in visited:
                key = (state, action)
                returns_sum[key] += G
                returns_count[key] += 1
                Q[state][action] = returns_sum[key] / returns_count[key]
                visited.add((state, action))

    # derive greedy policy
    policy = {s: np.argmax(Q[s]) for s in Q.keys()}
    return dict(Q), policy

# -------------------------------
# Example Run
# -------------------------------
if __name__ == "__main__":
    # Random policy for evaluation
    def random_policy(state):
        return random.choice(actions)

    print("=== Monte Carlo Policy Evaluation ===")
    V = mc_policy_evaluation(random_policy, episodes=5000)
    for s in states:
        print(f"V({s}) = {V.get(s, 0):.3f}")

    print("\n=== Monte Carlo Control (ε-greedy) ===")
    Q, policy = mc_control_epsilon_greedy(episodes=20000, epsilon=0.1)
    for s in sorted(Q.keys()):
        print(f"State {s}: Q = {Q[s]}")
    print("Learned Policy:", policy)


=== Monte Carlo Policy Evaluation ===
V(0) = 0.759
V(1) = 0.900
V(2) = 1.000
V(3) = 0.000

=== Monte Carlo Control (ε-greedy) ===
State 0: Q = [0.71405064 0.80219979]
State 1: Q = [0.71891085 0.9       ]
State 2: Q = [0.80014693 1.        ]
Learned Policy: {0: np.int64(1), 1: np.int64(1), 2: np.int64(1)}
