In [None]:
import numpy as np

In [None]:
class GridWorld:
    """Square grid environment with a single rewarding terminal cell.

    The agent starts in the top-left cell ``(0, 0)``; the bottom-right cell
    ``(size - 1, size - 1)`` is the terminal state. Positions are
    ``(row, column)`` tuples.
    """

    def __init__(self, size=4):
        """Create a ``size`` x ``size`` grid and place the agent at ``(0, 0)``."""
        self.size = size
        self.grid = np.zeros((size, size))
        self.grid[size - 1, size - 1] = 1  # Terminal state
        self.current_position = (0, 0)

    def reset(self):
        """Move the agent back to ``(0, 0)`` and return that position."""
        self.current_position = (0, 0)
        return self.current_position

    def step(self, action):
        """Apply one move and report the outcome.

        Actions: 0 = up, 1 = down, 2 = left, 3 = right. A move that would
        leave the grid keeps the agent on the edge; any other action value
        leaves the position untouched.

        Returns:
            ``(position, reward, done, None)`` where ``reward`` is 1 and
            ``done`` is True only when the agent stands on the terminal cell.
        """
        row, col = self.current_position
        last = self.size - 1

        if action == 0:  # Up
            self.current_position = (max(row - 1, 0), col)
        elif action == 1:  # Down
            self.current_position = (min(row + 1, last), col)
        elif action == 2:  # Left
            self.current_position = (row, max(col - 1, 0))
        elif action == 3:  # Right
            self.current_position = (row, min(col + 1, last))

        # The episode ends (and pays out) exactly when the terminal cell is occupied.
        done = self.current_position == (last, last)
        reward = 1 if done else 0

        return self.current_position, reward, done, None

In [None]:
class MonteCarloAgent:
    """Tabular first-visit Monte Carlo agent.

    ``Q[state, action]`` holds the mean of the discounted returns that
    followed the first occurrence of ``(state, action)`` in each episode
    passed to ``update_q_value``.
    """

    def __init__(self, num_states, num_actions, gamma=0.9):
        """Allocate zero-initialised tables of shape ``(num_states, num_actions)``."""
        self.num_states = num_states
        self.num_actions = num_actions
        self.gamma = gamma
        self.Q = np.zeros((num_states, num_actions))
        self.returns_sum = np.zeros((num_states, num_actions))
        # Counts start at zero. update_q_value increments the count before it
        # divides, so a division by zero cannot happen; starting at one would
        # instead add a phantom zero-valued return to every average.
        self.returns_count = np.zeros((num_states, num_actions))

    def update_q_value(self, episode):
        """Update Q from one finished episode using first-visit returns.

        Args:
            episode: sequence of ``(state, action, reward)`` tuples in the
                order they occurred; ``state`` and ``action`` index into Q.
        """
        # Index of the first time step at which each (state, action) pair occurs.
        # Built once so the first-visit test below is a constant-time lookup
        # instead of a scan over the episode prefix at every step.
        first_visit = {}
        for t, (state, action, _) in enumerate(episode):
            first_visit.setdefault((state, action), t)

        G = 0
        # Walk backwards so G accumulates the discounted return from step t onward.
        for t in reversed(range(len(episode))):
            state, action, reward = episode[t]
            G = self.gamma * G + reward
            if first_visit[(state, action)] == t:  # First visit
                self.returns_sum[state, action] += G
                self.returns_count[state, action] += 1
                self.Q[state, action] = self.returns_sum[state, action] / self.returns_count[state, action]

    def get_action(self, state, epsilon):
        """Epsilon-greedy action selection.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the largest Q-value for ``state``.
        ``np.argmax`` resolves ties in favour of the lowest action index.
        """
        if np.random.random() < epsilon:
            return np.random.choice(self.num_actions)
        else:
            return np.argmax(self.Q[state])


In [None]:
class TDAgent:
    """Tabular temporal-difference agent.

    Each update bootstraps from the largest Q-value of the next state,
    with step size ``alpha`` and discount factor ``gamma``.
    """

    def __init__(self, num_states, num_actions, alpha=0.1, gamma=0.9):
        """Store the hyper-parameters and allocate a zero Q-table."""
        self.num_states = num_states
        self.num_actions = num_actions
        self.alpha = alpha
        self.gamma = gamma
        self.Q = np.zeros((num_states, num_actions))

    def update_q_value(self, state, action, reward, next_state):
        """Move ``Q[state, action]`` a fraction ``alpha`` toward the TD target."""
        current = self.Q[state, action]
        # Target: immediate reward plus the discounted best value of the successor state.
        td_target = reward + self.gamma * np.max(self.Q[next_state])
        self.Q[state, action] = current + self.alpha * (td_target - current)

    def get_action(self, state, epsilon):
        """Return a random action with probability ``epsilon``, else the greedy one."""
        explore = np.random.random() < epsilon
        if explore:
            return np.random.choice(self.num_actions)
        return np.argmax(self.Q[state])

In [None]:
# Training the agents
random_seed = 42  # Fixed seed so training and evaluation give the same numbers on every run
# The agents draw from NumPy's global RNG (np.random.random / np.random.choice),
# so seeding it here makes the whole notebook repeatable under Restart & Run All.
np.random.seed(random_seed)

num_episodes = 100
epsilon = 0.1
max_steps_per_episode = 100  # Maximum number of steps per episode
# NOTE(review): max_steps_per_episode is not referenced by the training or
# evaluation cells visible in this notebook; episodes there run until the
# terminal state is reached.

env = GridWorld()
num_states = env.size ** 2  # One flat state index per grid cell
num_actions = 4  # Up, down, left, right

In [None]:
# Both agents share the same table dimensions and keep their default hyper-parameters.
agent_mc = MonteCarloAgent(num_states=num_states, num_actions=num_actions)
agent_td = TDAgent(num_states=num_states, num_actions=num_actions)

In [None]:
for episode in range(num_episodes):
    state, done = env.reset(), False

    # Run episode with Monte Carlo agent
    episode_mc = []  # Store (state, action, reward) tuples
    while not done:
        # Flatten the (row, column) position into a single Q-table row index.
        state_index = state[0] * env.size + state[1]
        action = agent_mc.get_action(state_index, epsilon)
        next_state, reward, done, _ = env.step(action)
        episode_mc.append((state_index, action, reward))
        state = next_state
    # Monte Carlo learns only once the whole trajectory is available.
    agent_mc.update_q_value(episode_mc)


In [None]:
for episode in range(num_episodes):
    # Run episode with Temporal-Difference agent
    state = env.reset()
    done = False
    while not done:
        # Flatten the (row, column) positions into Q-table row indices.
        state_index = state[0] * env.size + state[1]
        action = agent_td.get_action(state_index, epsilon)
        next_state, reward, done, _ = env.step(action)
        next_state_index = next_state[0] * env.size + next_state[1]
        # TD learns online: the table is updated after every single transition.
        agent_td.update_q_value(state_index, action, reward, next_state_index)
        state = next_state

In [None]:
import matplotlib.pyplot as plt

def evaluate_agent(agent, env, num_episodes=100, max_steps=1000):
    """Return the mean episode reward of ``agent`` acting greedily in ``env``.

    Args:
        agent: object exposing ``get_action(state_index, epsilon)``.
        env: environment exposing ``size``, ``reset()`` and ``step(action)``;
            positions are ``(row, column)`` tuples.
        num_episodes: number of evaluation episodes to run.
        max_steps: upper bound on steps per episode. Actions are requested
            with ``epsilon=0``; a policy that never reaches a terminal state
            would otherwise keep this loop running forever. An episode cut
            off by the bound contributes the reward collected up to that point.

    Returns:
        The average of the per-episode reward totals.
    """
    total_rewards = []
    for _ in range(num_episodes):
        state = env.reset()
        episode_reward = 0
        for _step in range(max_steps):
            action = agent.get_action(state[0] * env.size + state[1], epsilon=0)  # Greedy action
            next_state, reward, done, _info = env.step(action)
            episode_reward += reward
            state = next_state
            if done:
                break
        total_rewards.append(episode_reward)
    avg_reward = np.mean(total_rewards)
    return avg_reward

# Evaluate agents: score each trained agent with the same environment, Monte Carlo first.
avg_reward_mc, avg_reward_td = (
    evaluate_agent(trained_agent, env) for trained_agent in (agent_mc, agent_td)
)

print("Average reward for Monte Carlo agent:", avg_reward_mc)
print("Average reward for Temporal-Difference agent:", avg_reward_td)

Average reward for Monte Carlo agent: 1.0
Average reward for Temporal-Difference agent: 1.0


In [None]:
# Comparison of Policies: show each agent's full Q-table (rows = states, columns = actions).
for title, q_table in (
    ("Monte Carlo agent's Q-values:", agent_mc.Q),
    ("Temporal-Difference agent's Q-values:", agent_td.Q),
):
    print(title)
    print(q_table)

Monte Carlo agent's Q-values:
[[5.28797015e-003 3.56968197e-003 4.11969767e-003 1.90283322e-001]
 [5.24364339e-003 3.33626481e-001 1.67823474e-002 1.13356804e-002]
 [1.02260738e-003 1.71529412e-001 5.74198627e-005 5.83078385e-004]
 [1.14227977e-320 3.39097015e-319 5.89010835e-002 1.40072551e-319]
 [1.16535878e-002 0.00000000e+000 4.49819622e-006 9.01850075e-253]
 [7.38112500e-003 6.56100000e-002 1.19574225e-001 6.61013171e-001]
 [8.55782609e-002 3.07273500e-001 4.68642857e-001 7.73427706e-001]
 [8.43557143e-002 8.60419904e-001 2.35960327e-305 4.05000000e-001]
 [0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000]
 [7.29000000e-002 0.00000000e+000 0.00000000e+000 0.00000000e+000]
 [4.86000000e-001 1.96830000e-001 0.00000000e+000 0.00000000e+000]
 [3.07800000e-001 9.90291262e-001 4.92075000e-001 3.64500000e-001]
 [0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000]
 [0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000]
 [2.18700000e-001 0.00000000e+00

In [None]:
# Comparison of State Values: a state's value is taken as its best action value.
for title, q_table in (
    ("Monte Carlo agent's learned state values:", agent_mc.Q),
    ("Temporal-Difference agent's learned state values:", agent_td.Q),
):
    print(title)
    print(np.max(q_table, axis=1))


Monte Carlo agent's learned state values:
[0.19028332 0.33362648 0.17152941 0.05890108 0.01165359 0.66101317
 0.77342771 0.8604199  0.         0.0729     0.486      0.99029126
 0.         0.         0.2187     0.        ]
Temporal-Difference agent's learned state values:
[0.59047601 0.65609677 0.72899934 0.8099999  0.22363273 0.16049023
 0.38913216 0.89999999 0.         0.         0.06230217 1.
 0.         0.         0.         0.        ]
