In [2]:
import matplotlib.pyplot as plt
from collections import defaultdict
import seaborn as sns

# Seed NumPy's global random generator so the random action choices made by
# the agents below repeat from run to run.
# NOTE(review): `np` is not imported in the cells visible here — presumably
# `import numpy as np` lives in an earlier cell; confirm.
np.random.seed(42)

In [3]:
class GridWorld:
    """Square grid environment with a fixed start cell and goal cell.

    States are ``(row, col)`` tuples. The agent starts at ``(0, 0)`` and an
    episode ends when it reaches ``(size - 1, size - 1)``. Every step yields
    a reward of ``-1`` except the step that lands on the goal, which yields
    ``10``.
    """

    # (row, col) offset applied by each built-in action; 'up' decreases the
    # row index and 'left' decreases the column index.
    _MOVES = {
        'up': (-1, 0),
        'down': (1, 0),
        'left': (0, -1),
        'right': (0, 1),
    }

    def __init__(self, size=5):
        """Create a ``size`` x ``size`` grid with the agent on the start cell."""
        self.size = size
        self.start = (0, 0)
        self.goal = (size - 1, size - 1)
        self.state = self.start
        self.actions = ['up', 'down', 'left', 'right']

    def reset(self):
        """Move the agent back to the start cell and return that state."""
        self.state = self.start
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done)``.

        A move that would leave the grid keeps the agent on the border cell
        (the step is still charged).

        Raises:
            ValueError: if ``action`` is not one of ``self.actions``. This
                mirrors ``get_action_index`` and prevents a misspelled action
                from silently acting as a no-op that still costs ``-1``.
        """
        if action not in self.actions:
            raise ValueError(
                f"Unknown action {action!r}; expected one of {self.actions}"
            )

        # Actions appended to ``self.actions`` without a known offset keep the
        # agent in place.
        d_row, d_col = self._MOVES.get(action, (0, 0))
        row, col = self.state
        last = self.size - 1

        # Clamp to the grid so border moves leave the coordinate unchanged.
        row = min(last, max(0, row + d_row))
        col = min(last, max(0, col + d_col))
        self.state = (row, col)

        # Reward structure: +10 on reaching the goal, -1 per step otherwise.
        done = self.state == self.goal
        reward = 10 if done else -1

        return self.state, reward, done

    def get_action_index(self, action):
        """Return the position of ``action`` in ``self.actions``.

        Raises:
            ValueError: if ``action`` is not in ``self.actions``.
        """
        return self.actions.index(action)

In [4]:
class TDZeroAgent:
    """Tabular TD(0) state-value estimator for a uniformly random policy.

    ``env`` must expose ``actions`` (a sequence), ``reset()`` returning the
    initial state, and ``step(action)`` returning ``(state, reward, done)``.
    States must be hashable because they are used as keys of ``self.V``.
    """

    def __init__(self, env, alpha=0.1, gamma=0.9, epsilon=0.1):
        self.env = env
        self.alpha = alpha  # Learning rate
        self.gamma = gamma  # Discount factor
        self.epsilon = epsilon  # Stored but not read by this class's random policy
        self.V = defaultdict(float)  # State values, 0.0 for unseen states

    def choose_action(self):
        """Return an action drawn uniformly at random from ``env.actions``."""
        return np.random.choice(self.env.actions)

    def train(self, episodes=1000):
        """Run ``episodes`` episodes of TD(0) and return each episode's total reward.

        Returns:
            list: the undiscounted sum of rewards of every episode, in order.
        """
        episode_rewards = []

        for _ in range(episodes):
            state = self.env.reset()
            total_reward = 0
            done = False

            while not done:
                action = self.choose_action()
                next_state, reward, done = self.env.step(action)
                total_reward += reward

                # TD(0) Update: V(s) <- V(s) + α[r + γV(s') - V(s)]
                # The value of a terminal state is zero by definition, so do
                # not bootstrap from the table on the final transition; the
                # stored entry may be non-zero if that state is also visited
                # as a non-terminal state (e.g. start == goal).
                next_value = 0.0 if done else self.V[next_state]
                td_target = reward + self.gamma * next_value
                td_error = td_target - self.V[state]
                self.V[state] += self.alpha * td_error

                state = next_state

            episode_rewards.append(total_reward)

        return episode_rewards

In [5]:
class SARSAAgent:
    """SARSA (on-policy TD control) agent with an epsilon-greedy policy.

    ``env`` must expose ``actions`` (a sequence), ``reset()``,
    ``step(action)`` returning ``(state, reward, done)`` and
    ``get_action_index(action)``. States must be hashable because they are
    used as keys of ``self.Q``.
    """

    def __init__(self, env, alpha=0.1, gamma=0.9, epsilon=0.1):
        self.env = env
        self.alpha = alpha  # Learning rate
        self.gamma = gamma  # Discount factor
        self.epsilon = epsilon  # Probability of taking a random action
        # Action values per state: one zero-initialised entry per action.
        self.Q = defaultdict(lambda: np.zeros(len(env.actions)))

    def choose_action(self, state):
        """Epsilon-greedy action selection.

        With probability ``epsilon`` return a uniformly random action,
        otherwise the action with the highest ``Q[state]`` entry (the first
        such action on ties, per ``np.argmax``).
        """
        if np.random.random() < self.epsilon:
            return np.random.choice(self.env.actions)
        return self.env.actions[np.argmax(self.Q[state])]

    def train(self, episodes=1000):
        """Run ``episodes`` episodes of SARSA and return each episode's total reward.

        Returns:
            list: the undiscounted sum of rewards of every episode, in order.
        """
        episode_rewards = []

        for _ in range(episodes):
            state = self.env.reset()
            action = self.choose_action(state)
            total_reward = 0
            done = False

            while not done:
                next_state, reward, done = self.env.step(action)
                total_reward += reward
                next_action = self.choose_action(next_state)

                # SARSA Update: Q(s,a) <- Q(s,a) + α[r + γQ(s',a') - Q(s,a)]
                action_idx = self.env.get_action_index(action)
                next_action_idx = self.env.get_action_index(next_action)

                # Action values of a terminal state are zero by definition, so
                # do not bootstrap from the table on the final transition; the
                # stored entry may be non-zero if that state is also visited
                # as a non-terminal state (e.g. start == goal).
                if done:
                    next_value = 0.0
                else:
                    next_value = self.Q[next_state][next_action_idx]

                td_target = reward + self.gamma * next_value
                td_error = td_target - self.Q[state][action_idx]
                self.Q[state][action_idx] += self.alpha * td_error

                state = next_state
                action = next_action

            episode_rewards.append(total_reward)

        return episode_rewards