In [1]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import seaborn as sns

In [2]:
class GridWorld:
    """Simple square grid world environment for TD(λ) learning.

    States are ``(row, col)`` tuples on a ``size`` x ``size`` board. Moves
    that would leave the board are clamped to the edge, and moves into an
    obstacle leave the agent where it was.
    """

    # (row delta, col delta) for every recognised action name.
    _MOVES = {
        'up': (-1, 0),
        'down': (1, 0),
        'left': (0, -1),
        'right': (0, 1),
    }

    def __init__(self, size=5, goal_state=(4, 4), obstacle_states=None,
                 start_state=(0, 0)):
        """Create the environment.

        Args:
            size: Number of rows (and columns) of the grid.
            goal_state: ``(row, col)`` cell that ends the episode when entered.
            obstacle_states: Collection of blocked ``(row, col)`` cells.
                ``None`` selects the default single obstacle at ``(2, 2)``;
                an explicit empty collection means "no obstacles".
            start_state: ``(row, col)`` cell the agent is placed on by
                ``reset()``.
        """
        self.size = size
        self.goal_state = goal_state
        # Compare against None (not truthiness) so that an empty collection
        # is honoured instead of silently falling back to the default.
        self.obstacle_states = (
            [(2, 2)] if obstacle_states is None else obstacle_states
        )
        self.start_state = start_state
        self.current_state = start_state
        self.actions = ['up', 'down', 'left', 'right']

    def reset(self):
        """Reset environment to the starting state and return it."""
        self.current_state = self.start_state
        return self.current_state

    def step(self, action):
        """Take action and return next state, reward, and done flag.

        Args:
            action: One of ``self.actions``. Any other value is treated as
                "stay in place".

        Returns:
            Tuple ``(next_state, reward, done)``. The reward is ``-1`` when
            the move was blocked by an obstacle, ``10`` when the goal is
            reached and ``-0.1`` otherwise; ``done`` is True only when the
            resulting state is the goal.
        """
        row, col = self.current_state
        move = self._MOVES.get(action)

        if move is None:
            # Unknown action: the agent does not move.
            next_state = self.current_state
        else:
            # Clamp to the board so walls act as "bump and stay".
            last = self.size - 1
            next_state = (
                min(last, max(0, row + move[0])),
                min(last, max(0, col + move[1])),
            )

        # Check if next state is an obstacle
        if next_state in self.obstacle_states:
            next_state = self.current_state
            reward = -1
        elif next_state == self.goal_state:
            reward = 10
        else:
            reward = -0.1

        self.current_state = next_state
        done = (next_state == self.goal_state)

        return next_state, reward, done

    def get_state_index(self, state):
        """Convert 2D state to 1D row-major index."""
        return state[0] * self.size + state[1]

    def get_state_from_index(self, index):
        """Convert 1D row-major index to 2D state."""
        return (index // self.size, index % self.size)

In [3]:
class TDLambda:
    """TD(λ) algorithm with eligibility traces.

    Learns a tabular state-value function ``V`` for the behaviour produced
    by ``select_action`` (an epsilon-greedy "walk towards the goal" policy)
    on the supplied grid environment.
    """

    def __init__(self, env, alpha=0.1, gamma=0.95, lambda_param=0.8):
        """Set up the learner.

        Args:
            env: Environment exposing ``size``, ``goal_state``,
                ``current_state``, ``actions``, ``reset()``, ``step(action)``,
                ``get_state_index(state)`` and ``get_state_from_index(index)``.
            alpha: Step size applied to each TD update.
            gamma: Discount factor.
            lambda_param: Trace-decay parameter λ.
        """
        self.env = env
        self.alpha = alpha
        self.gamma = gamma
        self.lambda_param = lambda_param

        # Initialize value function (one entry per grid cell)
        self.num_states = env.size * env.size
        self.V = np.zeros(self.num_states)

        # Initialize eligibility traces
        self.eligibility_traces = np.zeros(self.num_states)

        # Track history for plotting
        self.episode_rewards = []
        self.episode_lengths = []
        self.value_history = []

    def select_action(self, epsilon=0.1):
        """Epsilon-greedy action selection.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the agent heads for the goal, closing the row gap first
        and then the column gap. Obstacles are not taken into account.
        """
        if np.random.random() < epsilon:
            return np.random.choice(self.env.actions)

        # Simple policy: move towards goal
        row, col = self.env.current_state
        goal_row, goal_col = self.env.goal_state

        if row < goal_row:
            return 'down'
        if row > goal_row:
            return 'up'
        if col < goal_col:
            return 'right'
        return 'left'

    def train_episode(self, epsilon=0.1, accumulating=True, max_steps=100):
        """
        Train for one episode using TD(λ).

        The episode's total reward, its length and a copy of ``V`` are
        appended to the history lists when the episode finishes.

        Args:
            epsilon: Exploration rate
            accumulating: If True, use accumulating traces; else replacing traces
            max_steps: Upper bound on the number of steps before the episode
                is cut off
        """
        state = self.env.reset()
        self.eligibility_traces = np.zeros(self.num_states)
        trace_decay = self.gamma * self.lambda_param

        episode_reward = 0
        steps = 0

        while steps < max_steps:
            # Select and take action
            action = self.select_action(epsilon)
            next_state, reward, done = self.env.step(action)

            episode_reward += reward
            steps += 1

            # Get state indices
            s_idx = self.env.get_state_index(state)
            s_next_idx = self.env.get_state_index(next_state)

            # TD error. A terminal state has no future return, so the
            # bootstrap term is dropped on the final transition rather than
            # relying on V[terminal] happening to be zero.
            bootstrap = 0.0 if done else self.gamma * self.V[s_next_idx]
            td_error = reward + bootstrap - self.V[s_idx]

            # Update eligibility trace for current state
            if accumulating:
                self.eligibility_traces[s_idx] += 1
            else:
                # Replacing traces
                self.eligibility_traces[s_idx] = 1

            # Update all values using eligibility traces
            self.V += self.alpha * td_error * self.eligibility_traces

            # Decay eligibility traces
            self.eligibility_traces *= trace_decay

            if done:
                break

            state = next_state

        self.episode_rewards.append(episode_reward)
        self.episode_lengths.append(steps)
        self.value_history.append(self.V.copy())

    def train(self, num_episodes=500, epsilon=0.1, accumulating=True,
              max_steps=100):
        """Train for multiple episodes, printing progress every 100 episodes.

        Args:
            num_episodes: Number of episodes to run.
            epsilon: Exploration rate passed to every episode.
            accumulating: If True, use accumulating traces; else replacing
                traces.
            max_steps: Per-episode step limit.
        """
        print(f"Training TD(λ) for {num_episodes} episodes...")
        print(f"Parameters: α={self.alpha}, γ={self.gamma}, λ={self.lambda_param}")

        for episode in range(num_episodes):
            self.train_episode(epsilon, accumulating, max_steps)

            if (episode + 1) % 100 == 0:
                avg_reward = np.mean(self.episode_rewards[-100:])
                avg_length = np.mean(self.episode_lengths[-100:])
                print(f"Episode {episode + 1}: Avg Reward = {avg_reward:.2f}, Avg Length = {avg_length:.1f}")

    def get_value_grid(self):
        """Convert value function to 2D grid indexed as ``grid[row, col]``."""
        grid = np.zeros((self.env.size, self.env.size))
        for index, value in enumerate(self.V):
            row, col = self.env.get_state_from_index(index)
            grid[row, col] = value
        return grid