<a href="https://colab.research.google.com/github/BigBroCat/Lab/blob/main/Monte_Carlo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import numpy as np

# Hyperparameters
gamma = 0.9  # Discount applied to future rewards when accumulating returns
alpha = 0.1  # Step size of the incremental value update
tolerance = 1e-3  # Early-stop threshold on the episode-to-episode RMS change
max_episodes = 1000  # Upper bound on the number of training episodes

# State space: five interior states with one terminal state on each side
states = ["A", "B", "C", "D", "E"]
terminal_states = ["Terminal (Left)", "Terminal (Right)"]
all_states = [terminal_states[0], *states, terminal_states[-1]]

# Reference values k/6, one per entry of all_states, used for the RMS error.
# NOTE(review): k/6 looks like the undiscounted (gamma = 1) solution of this
# walk; with gamma = 0.9 the estimates are not expected to converge to these
# targets exactly -- confirm which discount is intended.
true_values = [k / 6 for k in range(7)]

# Value table, one zero-initialised entry per state (terminals included)
V = dict.fromkeys(all_states, 0.0)

# Random policy: move to the left or right neighbour with equal probability
def random_walk(state):
    """Take one step of the random walk and return `(next_state, reward)`.

    A terminal `state` is returned unchanged with reward 0. Otherwise one of
    the two neighbours of `state` in `all_states` is drawn uniformly, and the
    reward is 1 only when that neighbour is "Terminal (Right)".
    """
    if state in terminal_states:
        return state, 0

    position = all_states.index(state)
    neighbours = [all_states[position - 1], all_states[position + 1]]
    next_state = np.random.choice(neighbours)

    # Only the right-hand terminal pays out; every other transition is worth 0.
    reached_right = next_state in terminal_states and next_state == "Terminal (Right)"
    reward = 1 if reached_right else 0
    return next_state, reward

# Monte Carlo Every-Visit Learning Algorithm
def monte_carlo_learning(alpha, gamma, max_episodes, tolerance):
    """Estimate state values with constant-alpha every-visit Monte Carlo.

    Each episode starts in state "C" and follows `random_walk` until a
    terminal state is reached. The discounted return is then accumulated
    backwards through the episode, and every occurrence of a state moves its
    estimate towards the return that followed that occurrence.

    Args:
        alpha: Step size of the incremental update `V += alpha * (G - V)`.
        gamma: Discount factor applied when accumulating the return.
        max_episodes: Maximum number of episodes to run.
        tolerance: Training stops early once the RMS error changes by less
            than this amount between two consecutive episodes.

    Returns:
        A `(V, rms_errors)` tuple: the module-level value table (updated in
        place) and the RMS error recorded after each completed episode.
    """
    global V
    rms_errors = []  # RMS error after each episode

    for _ in range(max_episodes):
        # Generate an episode as (state, reward received on leaving it) pairs.
        # Only non-terminal states are recorded; the terminal state carries
        # no reward and is never updated.
        episode_data = []
        current_state = "C"  # Start from the middle state
        while current_state not in terminal_states:
            next_state, reward = random_walk(current_state)
            episode_data.append((current_state, reward))
            current_state = next_state

        # Walk the episode backwards so G is the return following each step.
        # Every-visit MC: each occurrence of a state triggers an update, so
        # repeated visits within one episode are deliberately not filtered.
        G = 0
        for state, reward in reversed(episode_data):
            G = reward + gamma * G
            V[state] += alpha * (G - V[state])

        # Compute RMS error for this episode
        rms_errors.append(compute_rms_error(V, true_values))

        # Convergence check: stop once the RMS error barely changes.
        # NOTE(review): this compares consecutive errors only, so it can fire
        # while the error itself is still large but momentarily flat.
        if len(rms_errors) > 1 and abs(rms_errors[-1] - rms_errors[-2]) < tolerance:
            break

    return V, rms_errors

# Compute RMS Error
def compute_rms_error(V, true_values):
    """Return the root-mean-square gap between `V` and the reference values.

    Only the entries of the module-level `states` list are compared. The
    reference list is read with an offset of one, so `true_values[0]` is
    skipped and the k-th entry of `states` is matched with
    `true_values[k + 1]`.
    """
    squared_gaps = []
    for position, name in enumerate(states, start=1):
        gap = V[name] - true_values[position]
        squared_gaps.append(gap ** 2)
    return np.sqrt(np.mean(squared_gaps))

# Train with the module-level hyperparameters; V is rebound to the returned table
V, rms_errors = monte_carlo_learning(
    alpha=alpha,
    gamma=gamma,
    max_episodes=max_episodes,
    tolerance=tolerance,
)

# Report the learned estimate of each non-terminal state to four decimals
print("Final Estimated Value Function:")
for state in states:
    print(f"{state}: {V[state]:.4f}")



Final Estimated Value Function:
A: 0.0728
B: 0.2758
C: 0.5192
D: 0.6758
E: 0.8497
