In [2]:
import numpy as np
import random

class GridWorld:
    def __init__(self):
        self.rows = 4
        self.cols = 4
        self.state = (0, 0)
        self.goal = (3, 3)
        self.actions = [0, 1, 2, 3] # Up, Right, Down, Left

    def reset(self):
        self.state = (0, 0)
        return self.state

    def step(self, action):
        x, y = self.state
        if action == 0:   x = max(0, x - 1)
        elif action == 1: y = min(self.cols - 1, y + 1)
        elif action == 2: x = min(self.rows - 1, x + 1)
        elif action == 3: y = max(0, y - 1)
        
        self.state = (x, y)
        if self.state == self.goal: return self.state, 10, True
        else: return self.state, -1, False

def run_monte_carlo():
    env = GridWorld()
    V = np.zeros((4, 4))
    returns = {(i, j): [] for i in range(4) for j in range(4)}
    episodes = 1000
    gamma = 0.9

    print("Running Monte Carlo Policy Evaluation...")
    for _ in range(episodes):
        state = env.reset()
        episode = []
        done = False
        
        # Generate Episode
        while not done:
            action = random.choice(env.actions) # Random Policy
            next_state, reward, done = env.step(action)
            episode.append((state, reward))
            state = next_state
        
        # Calculate Returns
        G = 0
        for state, reward in reversed(episode):
            G = reward + gamma * G
            returns[state].append(G)
            # Update Value with average return
            V[state] = np.mean(returns[state])

    print("Estimated Value Function (V):\n", np.round(V, 2))

if __name__ == "__main__":
    run_monte_carlo()

Running Monte Carlo Policy Evaluation...
Estimated Value Function (V):
 [[-8.56 -8.17 -7.56 -7.04]
 [-8.22 -7.6  -6.57 -5.27]
 [-7.66 -6.57 -3.94 -0.31]
 [-7.14 -5.48 -0.16  0.  ]]


In [1]:
# exp1_mc_policy_evaluation.py

import numpy as np
import random
from collections import defaultdict

class GridWorld4x4:
    def __init__(self, start_state=0, goal_state=15, max_steps=100):
        self.n_rows = 4
        self.n_cols = 4
        self.n_states = self.n_rows * self.n_cols
        self.n_actions = 4  # 0:up,1:right,2:down,3:left
        self.start_state = start_state
        self.goal_state = goal_state
        self.max_steps = max_steps
        self.reset()

    def state_to_xy(self, s):
        return (s // self.n_cols, s % self.n_cols)

    def xy_to_state(self, r, c):
        return r * self.n_cols + c

    def reset(self):
        self.state = self.start_state
        self.steps = 0
        return self.state

    def step(self, action):
        r, c = self.state_to_xy(self.state)
        if action == 0:  # up
            r = max(0, r - 1)
        elif action == 1:  # right
            c = min(self.n_cols - 1, c + 1)
        elif action == 2:  # down
            r = min(self.n_rows - 1, r + 1)
        elif action == 3:  # left
            c = max(0, c - 1)
        next_state = self.xy_to_state(r, c)
        self.state = next_state
        self.steps += 1

        done = (self.state == self.goal_state) or (self.steps >= self.max_steps)
        reward = 0 if self.state == self.goal_state else -1
        return self.state, reward, done, {}

    def render(self):
        grid = np.full((self.n_rows, self.n_cols), '.')
        r, c = self.state_to_xy(self.state)
        gr, gc = self.state_to_xy(self.goal_state)
        grid[gr, gc] = 'G'
        grid[r, c] = 'A'
        print('\n'.join(' '.join(row) for row in grid))
        print()


def random_policy(env, s):
    return random.randrange(env.n_actions)


def mc_policy_eval(env, policy_fn, num_episodes=5000, gamma=1.0):
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)
    V = np.zeros(env.n_states)

    for _ in range(num_episodes):
        episode = []
        s = env.reset()
        done = False
        while not done:
            a = policy_fn(env, s)
            ns, r, done, _ = env.step(a)
            episode.append((s, a, r))
            s = ns

        G = 0.0
        visited_states = set()
        for t in range(len(episode) - 1, -1, -1):
            s_t, a_t, r_t = episode[t]
            G = gamma * G + r_t
            if s_t not in visited_states:
                visited_states.add(s_t)
                returns_sum[s_t] += G
                returns_count[s_t] += 1.0
                V[s_t] = returns_sum[s_t] / returns_count[s_t]

    return V


if __name__ == "__main__":
    env = GridWorld4x4()
    V = mc_policy_eval(env, random_policy, num_episodes=5000, gamma=1.0)
    print("State-value function V(s) under random policy:")
    print(np.round(V.reshape(4, 4), 2))


State-value function V(s) under random policy:
[[-22.88 -17.94 -17.1  -19.27]
 [-18.28 -13.72 -12.27 -14.17]
 [-17.34 -12.56 -10.08  -9.49]
 [-19.57 -14.76  -9.65   0.  ]]
