# In [1]:
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

# In [2]:
class GridWorld:
    """Simple square grid world environment for TD learning.

    States are ``(x, y)`` tuples.  The agent starts at ``(0, 0)`` and an
    episode ends when it reaches ``(size - 1, size - 1)``.  Actions 0/2
    (up/down) decrement/increment ``x``; actions 1/3 (right/left)
    increment/decrement ``y``.
    """

    def __init__(self, size=5):
        """Create a ``size`` x ``size`` grid with the agent at the start.

        Args:
            size: Number of cells along each side of the grid.

        Raises:
            ValueError: If ``size`` is smaller than 1.  Such a grid would
                put the goal at negative coordinates that can never be
                reached, so episodes would never terminate.
        """
        if size < 1:
            raise ValueError(
                "size must be at least 1, got {!r}".format(size)
            )
        self.size = size
        self.goal = (size-1, size-1)
        self.start = (0, 0)
        self.reset()

    def reset(self):
        """Move the agent back to the start cell and return that position."""
        self.position = self.start
        return self.position

    def step(self, action):
        """Apply ``action`` and report the outcome.

        Actions: 0=up, 1=right, 2=down, 3=left.  A move that would leave
        the grid, or an action outside 0-3, leaves the position unchanged
        (the step penalty still applies).

        Args:
            action: Action code to apply.

        Returns:
            Tuple ``(position, reward, done)``: the new ``(x, y)``
            position, ``10.0`` when the goal is reached and ``-0.1``
            otherwise, and whether the goal was reached.
        """
        x, y = self.position

        # Update position based on action; moves are clipped at the walls.
        if action == 0 and x > 0:  # up
            x -= 1
        elif action == 1 and y < self.size - 1:  # right
            y += 1
        elif action == 2 and x < self.size - 1:  # down
            x += 1
        elif action == 3 and y > 0:  # left
            y -= 1

        self.position = (x, y)

        # Reward structure
        if self.position == self.goal:
            reward = 10.0
            done = True
        else:
            reward = -0.1  # Small penalty for each step
            done = False

        return self.position, reward, done

    def get_possible_actions(self):
        """Return the list of action codes; it is the same in every state."""
        return [0, 1, 2, 3]

# In [3]:
class TD0Agent:
    """Agent that keeps state-value estimates for the TD(0) algorithm."""

    def __init__(self, alpha=0.1, gamma=0.9, epsilon=0.1):
        """Store the hyper-parameters and create empty lookup tables.

        Args:
            alpha: Stored as ``self.alpha`` (presumably the TD step size;
                the update rule is not part of this class body).
            gamma: Stored as ``self.gamma`` (presumably the discount
                factor; verify against the update rule).
            epsilon: Stored as ``self.epsilon``.
        """
        # Tables default to 0.0 / 0 for states that were never written.
        self.visit_counts = defaultdict(int)
        self.V = defaultdict(float)  # state -> estimated value
        self.alpha, self.gamma, self.epsilon = alpha, gamma, epsilon

    def get_value(self, state):
        """Return the current value estimate for ``state``.

        Because ``V`` is a ``defaultdict``, looking up an unseen state
        also stores it with value ``0.0``.
        """
        estimate = self.V[state]
        return estimate

# In [4]:
def choose_action(self, env):
    """Pick an action with an epsilon-greedy rule.

    A uniform random number is drawn first; if it is below
    ``self.epsilon`` a random element of ``env.get_possible_actions()``
    is returned, otherwise the choice is delegated to
    ``self.greedy_action(env)``.
    """
    exploring = np.random.random() < self.epsilon
    if not exploring:
        # Exploit: defer to the value-based greedy choice.
        return self.greedy_action(env)

    # Explore: sample uniformly from the available actions.
    candidates = env.get_possible_actions()
    return np.random.choice(candidates)

def greedy_action(self, env):
    """Choose the action whose simulated next state has the highest value.

    Every action from ``env.get_possible_actions()`` is evaluated by
    simulating the resulting state from ``env.position`` and looking up
    its estimated value.  When several actions share the highest value,
    one of them is picked uniformly at random.  Always taking the first
    would make an agent with an all-zero value table choose action 0
    forever.

    Args:
        env: Environment exposing ``position``, ``size`` and
            ``get_possible_actions()``.

    Returns:
        One of the elements of ``env.get_possible_actions()``.

    Raises:
        IndexError: If the environment offers no actions.
    """
    actions = env.get_possible_actions()
    best_value = float('-inf')
    best_actions = []

    for action in actions:
        # Simulate next state
        next_state = self.simulate_next_state(env.position, action, env.size)
        value = self.get_value(next_state)

        if value > best_value:
            best_value = value
            best_actions = [action]
        elif value == best_value:
            best_actions.append(action)

    if not best_actions:
        # Nothing compared as >= -inf (e.g. every value is NaN) or there
        # are no actions at all: fall back to the first action, which
        # raises IndexError for an empty action list.
        return actions[0]
    if len(best_actions) == 1:
        # Unique maximum: no randomness is consumed.
        return best_actions[0]
    # Tie: index into the list so the original action object is returned.
    return best_actions[np.random.randint(len(best_actions))]

def simulate_next_state(self, position, action, size):
    """Predict the cell reached by taking ``action`` from ``position``.

    Actions: 0=up, 1=right, 2=down, 3=left on a ``size`` x ``size`` grid.
    A move that would leave the grid, or an action outside 0-3, yields
    the unchanged position.  The result is always an ``(x, y)`` tuple.
    """
    row, col = position

    if action == 0:  # up
        if row > 0:
            return (row - 1, col)
    elif action == 1:  # right
        if col < size - 1:
            return (row, col + 1)
    elif action == 2:  # down
        if row < size - 1:
            return (row + 1, col)
    elif action == 3:  # left
        if col > 0:
            return (row, col - 1)

    # Blocked by a wall or unknown action: stay in place.
    return (row, col)

# Attach the module-level functions to TD0Agent under their own names so
# they can be called as methods on agent instances.
for _method in (choose_action, greedy_action, simulate_next_state):
    setattr(TD0Agent, _method.__name__, _method)
del _method