In [7]:
import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd 
import random
import seaborn as sns
from gym import spaces
import time
from IPython.display import clear_output
import os
from PIL import Image
import imageio
from matplotlib.animation import FuncAnimation, PillowWriter

In [8]:
os.makedirs('output/gifs', exist_ok=True)
os.makedirs('output/plots', exist_ok=True)
os.makedirs('output/models', exist_ok=True)

print("✅ Libraries loaded successfully!")
print("📁 Output directories created: output/gifs, output/plots, output/models")

✅ Libraries loaded successfully!
📁 Output directories created: output/gifs, output/plots, output/models


In [9]:
def create_gridworld(height=5, width=5, start=(0,0)):
    """
    Create a generic GridWorld environment with customizable dimensions.
    
    Args:
        height: Grid height
        width: Grid width
        start: Starting position tuple (row, col)
    
    Returns:
        env: Dictionary containing environment configuration
    """
    
    # Generate two distinct goal positions
    goal1 = (random.randint(0, height-1), random.randint(0, width-1))
    while True:
        goal2 = (random.randint(0, height-1), random.randint(0, width-1))
        if goal2 != goal1:
            break

    # Generate obstacle that doesn't collide with start or goals
    while True:
        obstacle = (random.randint(0, height-1), random.randint(0, width-1))
        if obstacle not in [goal1, goal2, start]:
            break
    
    env = {
        "height": height,
        "width": width,
        "start": np.array(start, dtype=int),
        "goal1": np.array(goal1, dtype=int),
        "goal2": np.array(goal2, dtype=int),
        "agent_pos": np.array(start, dtype=int),
        "obstacle": np.array(obstacle, dtype=int),
        "observation_space": spaces.MultiDiscrete([height, width]),
        "action_space": spaces.Discrete(4)
    }
    
    return env

In [10]:
def reset(env, randomize=False):
    """
    Reset the environment.
    
    Args:
        env: Environment dictionary
        randomize: If True, regenerate goals and obstacle positions
    """
    if randomize:
        height = env["height"]
        width = env["width"]
        start = tuple(env["start"])
        
        # Regenerate goals
        goal1 = (random.randint(0, height-1), random.randint(0, width-1))
        while True:
            goal2 = (random.randint(0, height-1), random.randint(0, width-1))
            if goal2 != goal1:
                break
        
        # Regenerate obstacle
        while True:
            obstacle = (random.randint(0, height-1), random.randint(0, width-1))
            if obstacle not in [goal1, goal2, start]:
                break
        
        env["goal1"] = np.array(goal1, dtype=int)
        env["goal2"] = np.array(goal2, dtype=int)
        env["obstacle"] = np.array(obstacle, dtype=int)
    
    env["agent_pos"] = env["start"].copy()
    return env["agent_pos"]


In [11]:
def step(env, action):
    """
    Execute action in the environment.
    
    Actions: 0=UP, 1=RIGHT, 2=DOWN, 3=LEFT
    """
    r, c = env["agent_pos"]
    height = env["height"]
    width = env["width"]

    # Proposed new position
    new_r, new_c = r, c
    if action == 0:      # UP
        new_r = max(0, r - 1)
    elif action == 1:    # RIGHT
        new_c = min(width - 1, c + 1)
    elif action == 2:    # DOWN
        new_r = min(height - 1, r + 1)
    elif action == 3:    # LEFT
        new_c = max(0, c - 1)

    proposed_pos = np.array([new_r, new_c])

    # Check obstacle
    if np.array_equal(proposed_pos, env["obstacle"]):
        reward = -5
        done = False
        new_pos = env["agent_pos"].copy()
    # Check goals
    elif np.array_equal(proposed_pos, env["goal1"]) or np.array_equal(proposed_pos, env["goal2"]):
        reward = 10
        done = True
        new_pos = proposed_pos
    else:
        reward = -1
        done = False
        new_pos = proposed_pos

    env["agent_pos"] = new_pos
    return new_pos, reward, done


In [12]:
def render_plot(env, ax=None, title="GridWorld Environment"):
    """Render the grid with agent, goals, and obstacle."""
    height = env["height"]
    width = env["width"]
    grid = np.zeros((height, width))

    # Get positions
    r, c = env["agent_pos"]
    g1r, g1c = env["goal1"]
    g2r, g2c = env["goal2"]
    orr, orc = env["obstacle"]

    # Assign values
    grid[orr, orc] = 3
    grid[g1r, g1c] = 2
    grid[g2r, g2c] = 2
    grid[r, c] = 1

    if ax is None:
        fig, ax = plt.subplots(figsize=(8, 8))
    
    ax.clear()
    ax.imshow(grid, cmap="Pastel1", origin="upper")
    ax.set_xticks(range(width))
    ax.set_yticks(range(height))
    ax.grid(True, which="both", color="black", linewidth=1)
    ax.set_title(title, fontsize=14, fontweight='bold')
    
    return ax

In [13]:
def monte_carlo_train(height=5, width=5, episodes=10000, gamma=0.95, 
                     epsilon=0.3, epsilon_decay=0.995, epsilon_min=0.01, 
                     max_steps=200, save_gif=True, gif_episodes=[1, 100, 500, 1000, 5000, 10000]):
    """
    Monte Carlo training with dynamic environment regeneration each episode.
    
    Args:
        height: Grid height
        width: Grid width
        episodes: Number of training episodes
        gamma: Discount factor
        epsilon: Initial exploration rate
        epsilon_decay: Decay rate for epsilon
        epsilon_min: Minimum epsilon value
        max_steps: Maximum steps per episode
        save_gif: Whether to save GIF animations
        gif_episodes: List of episode numbers to capture for GIF
    """
    
    print("🚀 Starting Monte Carlo Training")
    print(f"Grid Size: {height}×{width} | Episodes: {episodes} | γ: {gamma}")
    print("-" * 70)
    
    # Initialize environment
    env = create_gridworld(height=height, width=width)
    n_actions = env["action_space"].n
    
    # Q-table and returns
    Q = np.zeros((height, width, n_actions))
    returns = {(s, a): [] for s in [(i, j) for i in range(height) for j in range(width)] 
               for a in range(n_actions)}
    
    # Tracking metrics
    rewards_per_episode = []
    steps_per_episode = []
    epsilon_history = []
    success_rate = []
    
    # GIF frames storage
    gif_frames = {ep: [] for ep in gif_episodes}
    
    for ep in range(1, episodes + 1):
        # Reset with NEW random environment
        reset(env, randomize=True)
        
        episode = []
        done = False
        steps = 0
        
        # Capture frames for GIF
        if save_gif and ep in gif_episodes:
            fig_temp, ax_temp = plt.subplots(figsize=(6, 6))
        
        while not done and steps < max_steps:
            s = tuple(env["agent_pos"])
            
            # ε-greedy policy
            if np.random.rand() < epsilon:
                a = np.random.randint(n_actions)
            else:
                a = np.argmax(Q[s[0], s[1]])
            
            new_state, reward, done = step(env, a)
            episode.append((s, a, reward))
            steps += 1
            
            # Save frame for GIF - FIXED VERSION
            if save_gif and ep in gif_episodes:
                render_plot(env, ax=ax_temp, title=f"Episode {ep} - Step {steps}")
                fig_temp.canvas.draw()
                
                # Use the modern method to get canvas buffer
                buf = fig_temp.canvas.buffer_rgba()
                frame = np.asarray(buf)
                
                # Convert RGBA to RGB
                frame = frame[:, :, :3]
                gif_frames[ep].append(frame)
        
        if save_gif and ep in gif_episodes:
            plt.close(fig_temp)
        
        # Decay epsilon
        epsilon = max(epsilon_min, epsilon * epsilon_decay)
        epsilon_history.append(epsilon)
        
        # Compute returns (first-visit MC)
        G = 0
        visited = set()
        for (s, a, r) in reversed(episode):
            G = gamma * G + r
            if (s, a) not in visited:
                returns[(s, a)].append(G)
                Q[s[0], s[1], a] = np.mean(returns[(s, a)])
                visited.add((s, a))
        
        # Track metrics
        total_reward = sum([r for (_, _, r) in episode])
        rewards_per_episode.append(total_reward)
        steps_per_episode.append(steps)
        
        # Success tracking
        success = 1 if done and total_reward > 0 else 0
        if len(success_rate) == 0:
            success_rate.append(success)
        else:
            success_rate.append(0.99 * success_rate[-1] + 0.01 * success)
        
        # Progress logging
        if ep % 500 == 0:
            recent_avg = np.mean(rewards_per_episode[-100:])
            recent_success = success_rate[-1] * 100
            avg_steps = np.mean(steps_per_episode[-100:])
            print(f"Ep {ep:5d} | Avg Reward: {recent_avg:6.2f} | " +
                  f"Success: {recent_success:5.1f}% | " +
                  f"Avg Steps: {avg_steps:5.1f} | ε: {epsilon:.4f}")
    
    # Save GIFs
    if save_gif:
        print("\n🎬 Generating GIF animations...")
        for ep in gif_episodes:
            if gif_frames[ep]:
                imageio.mimsave(f'output/gifs/monte_carlo_episode_{ep}.gif', 
                              gif_frames[ep], fps=5)
                print(f"✅ Saved: monte_carlo_episode_{ep}.gif")
    
    # Save plots
    print("\n📊 Saving performance plots...")
    results = {
        'rewards': rewards_per_episode,
        'steps': steps_per_episode,
        'success_rate': success_rate,
        'epsilon': epsilon_history
    }
    save_training_plots(results, algorithm='MonteCarlo', grid_size=f"{height}x{width}")
    
    # Save Q-table
    np.save('output/models/monte_carlo_Q_table.npy', Q)
    print("✅ Saved: monte_carlo_Q_table.npy")
    
    # Final stats
    print("\n" + "=" * 70)
    print("🎉 Monte Carlo Training Completed!")
    print("=" * 70)
    final_avg = np.mean(rewards_per_episode[-100:])
    final_success = success_rate[-1] * 100
    final_steps = np.mean(steps_per_episode[-100:])
    print(f"Final Avg Reward (last 100): {final_avg:.2f}")
    print(f"Final Success Rate: {final_success:.1f}%")
    print(f"Final Avg Steps: {final_steps:.1f}")
    print(f"Final Epsilon: {epsilon:.4f}")
    
    return Q, results, env

In [14]:
def q_learning_train(height=5, width=5, episodes=10000, gamma=0.95, 
                    alpha=0.1, epsilon=0.3, epsilon_decay=0.995, 
                    epsilon_min=0.01, max_steps=200, save_gif=True, 
                    gif_episodes=[1, 100, 500, 1000, 5000, 10000]):
    """
    Q-Learning training with dynamic environment regeneration each episode.
    
    Args:
        height: Grid height
        width: Grid width
        episodes: Number of training episodes
        gamma: Discount factor
        alpha: Learning rate
        epsilon: Initial exploration rate
        epsilon_decay: Decay rate for epsilon
        epsilon_min: Minimum epsilon value
        max_steps: Maximum steps per episode
        save_gif: Whether to save GIF animations
        gif_episodes: List of episode numbers to capture for GIF
    """
    
    print("🚀 Starting Q-Learning Training")
    print(f"Grid Size: {height}×{width} | Episodes: {episodes} | γ: {gamma} | α: {alpha}")
    print("-" * 70)
    
    # Initialize environment
    env = create_gridworld(height=height, width=width)
    n_actions = env["action_space"].n
    
    # Q-table
    Q = np.zeros((height, width, n_actions))
    
    # Tracking metrics
    rewards_per_episode = []
    steps_per_episode = []
    epsilon_history = []
    success_rate = []
    
    # GIF frames storage
    gif_frames = {ep: [] for ep in gif_episodes}
    
    for ep in range(1, episodes + 1):
        # Reset with NEW random environment
        reset(env, randomize=True)
        
        done = False
        steps = 0
        total_reward = 0
        
        # Capture frames for GIF
        if save_gif and ep in gif_episodes:
            fig_temp, ax_temp = plt.subplots(figsize=(6, 6))
        
        while not done and steps < max_steps:
            s = tuple(env["agent_pos"])
            
            # ε-greedy action selection
            if np.random.rand() < epsilon:
                a = np.random.randint(n_actions)
            else:
                a = np.argmax(Q[s[0], s[1]])
            
            # Take action
            new_state, reward, done = step(env, a)
            s_next = tuple(env["agent_pos"])
            total_reward += reward
            steps += 1
            
            # Q-Learning update
            best_next_action = np.argmax(Q[s_next[0], s_next[1]])
            td_target = reward + gamma * Q[s_next[0], s_next[1], best_next_action]
            td_error = td_target - Q[s[0], s[1], a]
            Q[s[0], s[1], a] += alpha * td_error
            
            # Save frame for GIF - FIXED VERSION
            if save_gif and ep in gif_episodes:
                render_plot(env, ax=ax_temp, title=f"Episode {ep} - Step {steps}")
                fig_temp.canvas.draw()
                
                # Use the modern method to get canvas buffer
                buf = fig_temp.canvas.buffer_rgba()
                frame = np.asarray(buf)
                
                # Convert RGBA to RGB
                frame = frame[:, :, :3]
                gif_frames[ep].append(frame)
        
        if save_gif and ep in gif_episodes:
            plt.close(fig_temp)
        
        # Decay epsilon
        epsilon = max(epsilon_min, epsilon * epsilon_decay)
        epsilon_history.append(epsilon)
        
        # Track metrics
        rewards_per_episode.append(total_reward)
        steps_per_episode.append(steps)
        
        # Success tracking
        success = 1 if done and total_reward > 0 else 0
        if len(success_rate) == 0:
            success_rate.append(success)
        else:
            success_rate.append(0.99 * success_rate[-1] + 0.01 * success)
        
        # Progress logging
        if ep % 500 == 0:
            recent_avg = np.mean(rewards_per_episode[-100:])
            recent_success = success_rate[-1] * 100
            avg_steps = np.mean(steps_per_episode[-100:])
            print(f"Ep {ep:5d} | Avg Reward: {recent_avg:6.2f} | " +
                  f"Success: {recent_success:5.1f}% | " +
                  f"Avg Steps: {avg_steps:5.1f} | ε: {epsilon:.4f}")
    
    # Save GIFs
    if save_gif:
        print("\n🎬 Generating GIF animations...")
        for ep in gif_episodes:
            if gif_frames[ep]:
                imageio.mimsave(f'output/gifs/q_learning_episode_{ep}.gif', 
                              gif_frames[ep], fps=5)
                print(f"✅ Saved: q_learning_episode_{ep}.gif")
    
    # Save plots
    print("\n📊 Saving performance plots...")
    results = {
        'rewards': rewards_per_episode,
        'steps': steps_per_episode,
        'success_rate': success_rate,
        'epsilon': epsilon_history
    }
    save_training_plots(results, algorithm='QLearning', grid_size=f"{height}x{width}")
    
    # Save Q-table
    np.save('output/models/q_learning_Q_table.npy', Q)
    print("✅ Saved: q_learning_Q_table.npy")
    
    # Final stats
    print("\n" + "=" * 70)
    print("🎉 Q-Learning Training Completed!")
    print("=" * 70)
    final_avg = np.mean(rewards_per_episode[-100:])
    final_success = success_rate[-1] * 100
    final_steps = np.mean(steps_per_episode[-100:])
    print(f"Final Avg Reward (last 100): {final_avg:.2f}")
    print(f"Final Success Rate: {final_success:.1f}%")
    print(f"Final Avg Steps: {final_steps:.1f}")
    print(f"Final Epsilon: {epsilon:.4f}")
    
    return Q, results, env

In [15]:
def save_training_plots(results, algorithm='MonteCarlo', grid_size='5x5', window=100):
    """Save comprehensive training plots as PNG files."""
    
    rewards = results['rewards']
    steps = results['steps']
    success = results['success_rate']
    epsilon = results['epsilon']
    episodes = list(range(1, len(rewards) + 1))
    
    # Create figure with subplots
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle(f'{algorithm} Training Results - Grid {grid_size}', 
                 fontsize=18, fontweight='bold', y=0.995)
    
    # 1. Rewards with moving average
    axes[0, 0].plot(episodes, rewards, alpha=0.3, color='blue', linewidth=0.8, label='Raw Reward')
    if len(rewards) >= window:
        moving_avg = np.convolve(rewards, np.ones(window)/window, mode='valid')
        axes[0, 0].plot(range(window, len(rewards) + 1), moving_avg, 
                       color='red', linewidth=2.5, label=f'{window}-Episode MA')
    axes[0, 0].set_xlabel('Episode', fontsize=12, fontweight='bold')
    axes[0, 0].set_ylabel('Total Reward', fontsize=12, fontweight='bold')
    axes[0, 0].set_title('Training Rewards Over Time', fontsize=14, fontweight='bold')
    axes[0, 0].legend(loc='lower right', fontsize=10)
    axes[0, 0].grid(True, alpha=0.3, linestyle='--')
    
    # 2. Success rate
    axes[0, 1].plot(episodes, np.array(success) * 100, color='green', linewidth=2.5)
    axes[0, 1].fill_between(episodes, 0, np.array(success) * 100, alpha=0.3, color='green')
    axes[0, 1].set_xlabel('Episode', fontsize=12, fontweight='bold')
    axes[0, 1].set_ylabel('Success Rate (%)', fontsize=12, fontweight='bold')
    axes[0, 1].set_title('Success Rate Evolution', fontsize=14, fontweight='bold')
    axes[0, 1].set_ylim(0, 105)
    axes[0, 1].grid(True, alpha=0.3, linestyle='--')
    axes[0, 1].axhline(y=80, color='red', linestyle='--', linewidth=1.5, label='80% Target')
    axes[0, 1].legend(loc='lower right', fontsize=10)
    
    # 3. Steps per episode
    axes[1, 0].plot(episodes, steps, color='purple', alpha=0.4, linewidth=0.8, label='Raw Steps')
    if len(steps) >= window:
        steps_ma = np.convolve(steps, np.ones(window)/window, mode='valid')
        axes[1, 0].plot(range(window, len(steps) + 1), steps_ma, 
                       color='darkviolet', linewidth=2.5, label=f'{window}-Episode MA')
    axes[1, 0].set_xlabel('Episode', fontsize=12, fontweight='bold')
    axes[1, 0].set_ylabel('Steps to Completion', fontsize=12, fontweight='bold')
    axes[1, 0].set_title('Steps per Episode', fontsize=14, fontweight='bold')
    axes[1, 0].legend(loc='upper right', fontsize=10)
    axes[1, 0].grid(True, alpha=0.3, linestyle='--')
    
    # 4. Epsilon decay
    axes[1, 1].plot(episodes, epsilon, color='orange', linewidth=2.5)
    axes[1, 1].fill_between(episodes, 0, epsilon, alpha=0.3, color='orange')
    axes[1, 1].set_xlabel('Episode', fontsize=12, fontweight='bold')
    axes[1, 1].set_ylabel('Epsilon (ε)', fontsize=12, fontweight='bold')
    axes[1, 1].set_title('Exploration Rate Decay', fontsize=14, fontweight='bold')
    axes[1, 1].grid(True, alpha=0.3, linestyle='--')
    
    plt.tight_layout()
    
    # Save figure
    filename = f'output/plots/{algorithm}_training_results_{grid_size}.png'
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    print(f"✅ Saved: {filename}")
    plt.close()
    
    # Additional plot: Convergence analysis
    fig2, ax = plt.subplots(figsize=(14, 6))
    
    if len(rewards) >= window:
        moving_avg = np.convolve(rewards, np.ones(window)/window, mode='valid')
        ax.plot(range(window, len(rewards) + 1), moving_avg, 
               color='blue', linewidth=3, label='Reward MA')
    
    ax2 = ax.twinx()
    ax2.plot(episodes, np.array(success) * 100, color='green', 
            linewidth=3, label='Success Rate', alpha=0.7)
    
    ax.set_xlabel('Episode', fontsize=13, fontweight='bold')
    ax.set_ylabel('Average Reward', fontsize=13, fontweight='bold', color='blue')
    ax2.set_ylabel('Success Rate (%)', fontsize=13, fontweight='bold', color='green')
    ax.tick_params(axis='y', labelcolor='blue')
    ax2.tick_params(axis='y', labelcolor='green')
    
    ax.set_title(f'{algorithm} Convergence Analysis - Grid {grid_size}', 
                fontsize=16, fontweight='bold')
    ax.grid(True, alpha=0.3, linestyle='--')
    ax.legend(loc='upper left', fontsize=11)
    ax2.legend(loc='upper right', fontsize=11)
    
    filename2 = f'output/plots/{algorithm}_convergence_analysis_{grid_size}.png'
    plt.savefig(filename2, dpi=300, bbox_inches='tight')
    print(f"✅ Saved: {filename2}")
    plt.close()


In [16]:
def visualize_policy(env, Q, algorithm='MonteCarlo'):
    """Visualize and save the learned policy."""
    height = env["height"]
    width = env["width"]
    arrows = {0: '↑', 1: '→', 2: '↓', 3: '←'}
    
    fig, ax = plt.subplots(figsize=(max(10, width*2), max(8, height*2)))
    
    # Create policy grid
    policy_grid = np.zeros((height, width))
    
    for r in range(height):
        for c in range(width):
            best_action = np.argmax(Q[r, c])
            q_max = Q[r, c, best_action]
            if q_max != 0:
                policy_grid[r, c] = best_action + 1
    
    # Plot
    cmap = plt.cm.get_cmap('Set3', 5)
    im = ax.imshow(policy_grid, cmap=cmap, origin='upper', alpha=0.6)
    
    # Add arrows
    for r in range(height):
        for c in range(width):
            best_action = np.argmax(Q[r, c])
            q_max = Q[r, c, best_action]
            if q_max != 0:
                ax.text(c, r, arrows[best_action], ha='center', va='center', 
                       fontsize=20, fontweight='bold', color='black')
            else:
                ax.text(c, r, '?', ha='center', va='center', 
                       fontsize=16, color='gray', alpha=0.5)
    
    ax.set_xticks(range(width))
    ax.set_yticks(range(height))
    ax.set_title(f'{algorithm} Learned Policy - Grid {height}×{width}', 
                fontsize=16, fontweight='bold', pad=20)
    ax.grid(True, which='both', color='black', linewidth=1.5)
    
    # Add legend
    legend_labels = ['0: Unexplored', '1: UP ↑', '2: RIGHT →', '3: DOWN ↓', '4: LEFT ←']
    handles = [plt.Rectangle((0,0),1,1, fc=cmap(i)) for i in range(5)]
    ax.legend(handles, legend_labels, loc='center left', bbox_to_anchor=(1, 0.5), fontsize=11)
    
    plt.tight_layout()
    filename = f'output/plots/{algorithm}_learned_policy_{height}x{width}.png'
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    print(f"✅ Saved: {filename}")
    plt.close()

In [17]:
GRID_HEIGHT = 10
GRID_WIDTH = 10

# Train Monte Carlo
Q_mc, results_mc, env_mc = monte_carlo_train(
    height=GRID_HEIGHT,
    width=GRID_WIDTH,
    episodes=10000,
    gamma=0.95,
    epsilon=0.3,
    epsilon_decay=0.995,
    epsilon_min=0.01,
    max_steps=200,
    save_gif=True,
    gif_episodes=[1, 100, 500, 1000, 5000, 10000]
)

# Visualize policy
visualize_policy(env_mc, Q_mc, algorithm='MonteCarlo')

🚀 Starting Monte Carlo Training
Grid Size: 10×10 | Episodes: 10000 | γ: 0.95
----------------------------------------------------------------------
Ep   500 | Avg Reward: -188.72 | Success:   1.5% | Avg Steps: 185.3 | ε: 0.0245
Ep  1000 | Avg Reward: -198.77 | Success:   0.5% | Avg Steps: 198.6 | ε: 0.0100
Ep  1500 | Avg Reward: -189.63 | Success:   3.9% | Avg Steps: 190.1 | ε: 0.0100
Ep  2000 | Avg Reward: -195.96 | Success:   1.9% | Avg Steps: 196.0 | ε: 0.0100
Ep  2500 | Avg Reward: -187.11 | Success:   2.6% | Avg Steps: 187.9 | ε: 0.0100
Ep  3000 | Avg Reward: -193.92 | Success:   1.2% | Avg Steps: 194.4 | ε: 0.0100
Ep  3500 | Avg Reward: -197.58 | Success:   1.6% | Avg Steps: 197.8 | ε: 0.0100
Ep  4000 | Avg Reward: -188.94 | Success:   4.2% | Avg Steps: 189.4 | ε: 0.0100
Ep  4500 | Avg Reward: -190.94 | Success:   2.1% | Avg Steps: 191.5 | ε: 0.0100
Ep  5000 | Avg Reward: -191.60 | Success:   2.6% | Avg Steps: 192.2 | ε: 0.0100
Ep  5500 | Avg Reward: -186.09 | Success:   2.8% | A

  cmap = plt.cm.get_cmap('Set3', 5)


✅ Saved: output/plots/MonteCarlo_learned_policy_10x10.png


In [18]:
Q_ql, results_ql, env_ql = q_learning_train(
    height=GRID_HEIGHT,
    width=GRID_WIDTH,
    episodes=10000,
    gamma=0.95,
    alpha=0.1,
    epsilon=0.3,
    epsilon_decay=0.995,
    epsilon_min=0.01,
    max_steps=200,
    save_gif=True,
    gif_episodes=[1, 100, 500, 1000, 5000, 10000]
)

# Visualize policy
visualize_policy(env_ql, Q_ql, algorithm='QLearning')

🚀 Starting Q-Learning Training
Grid Size: 10×10 | Episodes: 10000 | γ: 0.95 | α: 0.1
----------------------------------------------------------------------
Ep   500 | Avg Reward: -78.26 | Success:  12.7% | Avg Steps:  85.0 | ε: 0.0245
Ep  1000 | Avg Reward: -76.72 | Success:   9.8% | Avg Steps:  84.4 | ε: 0.0100
Ep  1500 | Avg Reward: -81.58 | Success:  10.4% | Avg Steps:  89.3 | ε: 0.0100
Ep  2000 | Avg Reward: -100.96 | Success:  11.5% | Avg Steps: 106.7 | ε: 0.0100
Ep  2500 | Avg Reward: -107.55 | Success:  13.4% | Avg Steps: 113.1 | ε: 0.0100
Ep  3000 | Avg Reward: -108.16 | Success:  10.7% | Avg Steps: 113.7 | ε: 0.0100
Ep  3500 | Avg Reward: -102.57 | Success:  19.8% | Avg Steps: 107.4 | ε: 0.0100
Ep  4000 | Avg Reward: -108.95 | Success:  21.3% | Avg Steps: 114.1 | ε: 0.0100
Ep  4500 | Avg Reward: -156.08 | Success:  14.2% | Avg Steps: 157.8 | ε: 0.0100
Ep  5000 | Avg Reward: -176.86 | Success:   8.7% | Avg Steps: 177.7 | ε: 0.0100
Ep  5500 | Avg Reward: -147.84 | Success:  12.6

  cmap = plt.cm.get_cmap('Set3', 5)


✅ Saved: output/plots/QLearning_learned_policy_10x10.png


In [19]:
def compare_algorithms(results_mc, results_ql, grid_size='5x5'):
    """Compare Monte Carlo and Q-Learning performance."""
    
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle(f'Algorithm Comparison: Monte Carlo vs Q-Learning - Grid {grid_size}', 
                 fontsize=18, fontweight='bold', y=0.995)
    
    window = 100
    episodes = list(range(1, len(results_mc['rewards']) + 1))
    
    # 1. Rewards comparison
    if len(results_mc['rewards']) >= window:
        mc_ma = np.convolve(results_mc['rewards'], np.ones(window)/window, mode='valid')
        ql_ma = np.convolve(results_ql['rewards'], np.ones(window)/window, mode='valid')
        ma_episodes = list(range(window, len(results_mc['rewards']) + 1))
        
        axes[0, 0].plot(ma_episodes, mc_ma, color='blue', linewidth=2.5, label='Monte Carlo')
        axes[0, 0].plot(ma_episodes, ql_ma, color='red', linewidth=2.5, label='Q-Learning')
    
    axes[0, 0].set_xlabel('Episode', fontsize=12, fontweight='bold')
    axes[0, 0].set_ylabel('Average Reward', fontsize=12, fontweight='bold')
    axes[0, 0].set_title('Rewards Comparison (100-Episode MA)', fontsize=14, fontweight='bold')
    axes[0, 0].legend(fontsize=11)
    axes[0, 0].grid(True, alpha=0.3, linestyle='--')
    
    # 2. Success rate comparison
    axes[0, 1].plot(episodes, np.array(results_mc['success_rate']) * 100, 
                   color='blue', linewidth=2.5, label='Monte Carlo', alpha=0.8)
    axes[0, 1].plot(episodes, np.array(results_ql['success_rate']) * 100, 
                   color='red', linewidth=2.5, label='Q-Learning', alpha=0.8)
    axes[0, 1].fill_between(episodes, 0, np.array(results_mc['success_rate']) * 100, 
                           alpha=0.2, color='blue')
    axes[0, 1].fill_between(episodes, 0, np.array(results_ql['success_rate']) * 100, 
                           alpha=0.2, color='red')
    axes[0, 1].set_xlabel('Episode', fontsize=12, fontweight='bold')
    axes[0, 1].set_ylabel('Success Rate (%)', fontsize=12, fontweight='bold')
    axes[0, 1].set_title('Success Rate Comparison', fontsize=14, fontweight='bold')
    axes[0, 1].legend(fontsize=11)
    axes[0, 1].grid(True, alpha=0.3, linestyle='--')
    axes[0, 1].set_ylim(0, 105)
    
    # 3. Steps comparison
    if len(results_mc['steps']) >= window:
        mc_steps_ma = np.convolve(results_mc['steps'], np.ones(window)/window, mode='valid')
        ql_steps_ma = np.convolve(results_ql['steps'], np.ones(window)/window, mode='valid')
        
        axes[1, 0].plot(ma_episodes, mc_steps_ma, color='blue', linewidth=2.5, label='Monte Carlo')
        axes[1, 0].plot(ma_episodes, ql_steps_ma, color='red', linewidth=2.5, label='Q-Learning')
    
    axes[1, 0].set_xlabel('Episode', fontsize=12, fontweight='bold')
    axes[1, 0].set_ylabel('Average Steps', fontsize=12, fontweight='bold')
    axes[1, 0].set_title('Steps to Completion (100-Episode MA)', fontsize=14, fontweight='bold')
    axes[1, 0].legend(fontsize=11)
    axes[1, 0].grid(True, alpha=0.3, linestyle='--')
    
    # 4. Final performance statistics
    mc_final_reward = np.mean(results_mc['rewards'][-100:])
    ql_final_reward = np.mean(results_ql['rewards'][-100:])
    mc_final_success = results_mc['success_rate'][-1] * 100
    ql_final_success = results_ql['success_rate'][-1] * 100
    mc_final_steps = np.mean(results_mc['steps'][-100:])
    ql_final_steps = np.mean(results_ql['steps'][-100:])
    
    categories = ['Avg Reward\n(last 100)', 'Success Rate\n(%)', 'Avg Steps\n(last 100)']
    mc_values = [mc_final_reward, mc_final_success, mc_final_steps]
    ql_values = [ql_final_reward, ql_final_success, ql_final_steps]
    
    x = np.arange(len(categories))
    width = 0.35
    
    # Normalize for visualization
    max_vals = [max(mc_final_reward, ql_final_reward), 
                max(mc_final_success, ql_final_success), 
                max(mc_final_steps, ql_final_steps)]
    
    bars1 = axes[1, 1].bar(x - width/2, mc_values, width, label='Monte Carlo', 
                          color='blue', alpha=0.7)
    bars2 = axes[1, 1].bar(x + width/2, ql_values, width, label='Q-Learning', 
                          color='red', alpha=0.7)
    
    axes[1, 1].set_ylabel('Value', fontsize=12, fontweight='bold')
    axes[1, 1].set_title('Final Performance Metrics', fontsize=14, fontweight='bold')
    axes[1, 1].set_xticks(x)
    axes[1, 1].set_xticklabels(categories, fontsize=10)
    axes[1, 1].legend(fontsize=11)
    axes[1, 1].grid(True, alpha=0.3, linestyle='--', axis='y')
    
    # Add value labels on bars
    for bars in [bars1, bars2]:
        for bar in bars:
            height = bar.get_height()
            axes[1, 1].text(bar.get_x() + bar.get_width()/2., height,
                          f'{height:.1f}', ha='center', va='bottom', fontsize=9)
    
    plt.tight_layout()
    
    filename = f'output/plots/algorithm_comparison_{grid_size}.png'
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    print(f"✅ Saved: {filename}")
    plt.close()
    
    # Print summary statistics
    print("\n" + "=" * 70)
    print("📊 ALGORITHM COMPARISON SUMMARY")
    print("=" * 70)
    print(f"\n{'Metric':<30} {'Monte Carlo':>15} {'Q-Learning':>15} {'Winner':>10}")
    print("-" * 70)
    
    winner_reward = "MC" if mc_final_reward > ql_final_reward else "QL"
    winner_success = "MC" if mc_final_success > ql_final_success else "QL"
    winner_steps = "MC" if mc_final_steps < ql_final_steps else "QL"  # Lower is better
    
    print(f"{'Avg Reward (last 100)':<30} {mc_final_reward:>15.2f} {ql_final_reward:>15.2f} {winner_reward:>10}")
    print(f"{'Success Rate (%)':<30} {mc_final_success:>15.1f} {ql_final_success:>15.1f} {winner_success:>10}")
    print(f"{'Avg Steps (last 100)':<30} {mc_final_steps:>15.1f} {ql_final_steps:>15.1f} {winner_steps:>10}")
    print("=" * 70)

# Compare the two algorithms
compare_algorithms(results_mc, results_ql, grid_size=f'{GRID_HEIGHT}x{GRID_WIDTH}')


✅ Saved: output/plots/algorithm_comparison_10x10.png

📊 ALGORITHM COMPARISON SUMMARY

Metric                             Monte Carlo      Q-Learning     Winner
----------------------------------------------------------------------
Avg Reward (last 100)                  -193.59         -180.02         QL
Success Rate (%)                           0.4             9.0         QL
Avg Steps (last 100)                     192.4           181.0         QL


In [20]:
def test_agent(env, Q, num_tests=10, max_steps=50, algorithm='MonteCarlo'):
    """Test trained agent on new random environments."""
    
    print("\n" + "=" * 70)
    print(f"🧪 TESTING {algorithm.upper()} AGENT")
    print("=" * 70)
    
    test_results = []
    
    for test_ep in range(1, num_tests + 1):
        # Generate new random environment
        reset(env, randomize=True)
        
        total_reward = 0
        steps = 0
        done = False
        
        while not done and steps < max_steps:
            s = tuple(env["agent_pos"])
            
            # Greedy policy (no exploration)
            action = np.argmax(Q[s[0], s[1]])
            new_state, reward, done = step(env, action)
            total_reward += reward
            steps += 1
        
        success = done and total_reward > 0
        test_results.append({
            'episode': test_ep,
            'steps': steps,
            'reward': total_reward,
            'success': success
        })
        
        status = "✅ SUCCESS" if success else "❌ FAILED"
        print(f"Test {test_ep:2d}/{num_tests}: {status} | Steps: {steps:3d} | Reward: {total_reward:6.1f}")
    
    # Summary
    print("\n" + "-" * 70)
    successes = sum([r['success'] for r in test_results])
    avg_steps = np.mean([r['steps'] for r in test_results])
    avg_reward = np.mean([r['reward'] for r in test_results])
    
    print(f"Success Rate: {successes}/{num_tests} ({successes/num_tests*100:.1f}%)")
    print(f"Average Steps: {avg_steps:.1f}")
    print(f"Average Reward: {avg_reward:.1f}")
    print("=" * 70)
    
    return test_results

# Test Monte Carlo agent
test_results_mc = test_agent(env_mc, Q_mc, num_tests=20, algorithm='MonteCarlo')




🧪 TESTING MONTECARLO AGENT
Test  1/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test  2/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test  3/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test  4/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test  5/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test  6/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test  7/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test  8/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test  9/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test 10/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test 11/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test 12/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test 13/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test 14/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test 15/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test 16/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test 17/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test 18/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test 19/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Tes

In [21]:
# Test Q-Learning agent
test_results_ql = test_agent(env_ql, Q_ql, num_tests=20, algorithm='QLearning')


🧪 TESTING QLEARNING AGENT
Test  1/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test  2/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test  3/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test  4/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test  5/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test  6/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test  7/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test  8/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test  9/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test 10/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test 11/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test 12/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test 13/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test 14/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test 15/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test 16/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test 17/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test 18/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test 19/20: ❌ FAILED | Steps:  50 | Reward:  -50.0
Test

In [22]:
def generate_summary_report(results_mc, results_ql, test_results_mc, test_results_ql, 
                           grid_height, grid_width):
    """Generate and save a comprehensive summary report."""
    
    report = []
    report.append("=" * 80)
    report.append("🎮 GRIDWORLD REINFORCEMENT LEARNING - TRAINING SUMMARY REPORT")
    report.append("=" * 80)
    report.append(f"\n📐 Grid Configuration: {grid_height} × {grid_width}")
    report.append(f"🔄 Dynamic Environment: Goals and obstacles randomized each episode")
    report.append("")
    
    # Monte Carlo Results
    report.append("\n" + "=" * 80)
    report.append("🎲 MONTE CARLO RESULTS")
    report.append("=" * 80)
    mc_final_reward = np.mean(results_mc['rewards'][-100:])
    mc_final_success = results_mc['success_rate'][-1] * 100
    mc_final_steps = np.mean(results_mc['steps'][-100:])
    mc_test_success = sum([r['success'] for r in test_results_mc]) / len(test_results_mc) * 100
    
    report.append(f"Training Episodes: {len(results_mc['rewards'])}")
    report.append(f"Final Average Reward (last 100): {mc_final_reward:.2f}")
    report.append(f"Final Success Rate: {mc_final_success:.1f}%")
    report.append(f"Final Average Steps: {mc_final_steps:.1f}")
    report.append(f"Test Success Rate (20 episodes): {mc_test_success:.1f}%")
    
    # Q-Learning Results
    report.append("\n" + "=" * 80)
    report.append("🔄 Q-LEARNING RESULTS")
    report.append("=" * 80)
    ql_final_reward = np.mean(results_ql['rewards'][-100:])
    ql_final_success = results_ql['success_rate'][-1] * 100
    ql_final_steps = np.mean(results_ql['steps'][-100:])
    ql_test_success = sum([r['success'] for r in test_results_ql]) / len(test_results_ql) * 100
    
    report.append(f"Training Episodes: {len(results_ql['rewards'])}")
    report.append(f"Final Average Reward (last 100): {ql_final_reward:.2f}")
    report.append(f"Final Success Rate: {ql_final_success:.1f}%")
    report.append(f"Final Average Steps: {ql_final_steps:.1f}")
    report.append(f"Test Success Rate (20 episodes): {ql_test_success:.1f}%")
    
    # Comparison
    report.append("\n" + "=" * 80)
    report.append("⚔️ ALGORITHM COMPARISON")
    report.append("=" * 80)
    
    if mc_final_reward > ql_final_reward:
        report.append(f"🏆 Reward Winner: Monte Carlo ({mc_final_reward:.2f} vs {ql_final_reward:.2f})")
    else:
        report.append(f"🏆 Reward Winner: Q-Learning ({ql_final_reward:.2f} vs {mc_final_reward:.2f})")
    
    if mc_final_success > ql_final_success:
        report.append(f"🏆 Success Winner: Monte Carlo ({mc_final_success:.1f}% vs {ql_final_success:.1f}%)")
    else:
        report.append(f"🏆 Success Winner: Q-Learning ({ql_final_success:.1f}% vs {mc_final_success:.1f}%)")
    
    if mc_final_steps < ql_final_steps:
        report.append(f"🏆 Efficiency Winner: Monte Carlo ({mc_final_steps:.1f} vs {ql_final_steps:.1f} steps)")
    else:
        report.append(f"🏆 Efficiency Winner: Q-Learning ({ql_final_steps:.1f} vs {mc_final_steps:.1f} steps)")
    
    # Output files
    report.append("\n" + "=" * 80)
    report.append("📁 GENERATED OUTPUT FILES")
    report.append("=" * 80)
    report.append("\n🎬 GIF Animations:")
    report.append("  - Monte Carlo: episodes 1, 100, 500, 1000, 5000, 10000")
    report.append("  - Q-Learning: episodes 1, 100, 500, 1000, 5000, 10000")
    report.append("\n📊 Performance Plots:")
    report.append("  - MonteCarlo_training_results.png")
    report.append("  - MonteCarlo_convergence_analysis.png")
    report.append("  - MonteCarlo_learned_policy.png")
    report.append("  - QLearning_training_results.png")
    report.append("  - QLearning_convergence_analysis.png")
    report.append("  - QLearning_learned_policy.png")
    report.append("  - algorithm_comparison.png")
    report.append("\n💾 Saved Models:")
    report.append("  - monte_carlo_Q_table.npy")
    report.append("  - q_learning_Q_table.npy")
    
    report.append("\n" + "=" * 80)
    report.append("✅ TRAINING COMPLETED SUCCESSFULLY")
    report.append("=" * 80)
    
    # Print report
    report_text = "\n".join(report)
    print(report_text)
    
    # Save report to file
    with open('output/training_summary_report.txt', 'w') as f:
        f.write(report_text)
    
    print("\n✅ Summary report saved: output/training_summary_report.txt")

# Generate the summary report
generate_summary_report(results_mc, results_ql, test_results_mc, test_results_ql, 
                       GRID_HEIGHT, GRID_WIDTH)


🎮 GRIDWORLD REINFORCEMENT LEARNING - TRAINING SUMMARY REPORT

📐 Grid Configuration: 10 × 10
🔄 Dynamic Environment: Goals and obstacles randomized each episode


🎲 MONTE CARLO RESULTS
Training Episodes: 10000
Final Average Reward (last 100): -193.59
Final Success Rate: 0.4%
Final Average Steps: 192.4
Test Success Rate (20 episodes): 0.0%

🔄 Q-LEARNING RESULTS
Training Episodes: 10000
Final Average Reward (last 100): -180.02
Final Success Rate: 9.0%
Final Average Steps: 181.0
Test Success Rate (20 episodes): 0.0%

⚔️ ALGORITHM COMPARISON
🏆 Reward Winner: Q-Learning (-180.02 vs -193.59)
🏆 Success Winner: Q-Learning (9.0% vs 0.4%)
🏆 Efficiency Winner: Q-Learning (181.0 vs 192.4 steps)

📁 GENERATED OUTPUT FILES

🎬 GIF Animations:
  - Monte Carlo: episodes 1, 100, 500, 1000, 5000, 10000
  - Q-Learning: episodes 1, 100, 500, 1000, 5000, 10000

📊 Performance Plots:
  - MonteCarlo_training_results.png
  - MonteCarlo_convergence_analysis.png
  - MonteCarlo_learned_policy.png
  - QLearning_traini