# Model Evaluation Notebook

This notebook evaluates all trained RL models and produces comparison figures for the final report.

**Models Evaluated:**
- Single-Snake: DQN, Double DQN, Dueling DQN, PER DQN, Noisy DQN, PPO, A2C, REINFORCE
- Two-Snake: Classic DQN, PPO Co-evolution, PPO Curriculum, DQN Curriculum
- Baselines: Random Agent, Shortest Path (A*)

**Output:**
- Comparison figures saved to `results/figures/`
- Metrics saved to `results/data/`

In [None]:
# Cell 1: Imports
import sys
from pathlib import Path

# Resolve the project root so the `core` and `scripts` packages are importable.
# Works when launched from notebooks/ as well as from another directory (e.g. papermill).
if Path.cwd().name == 'notebooks':
    # Launched from notebooks/: the root is one level up
    project_root = Path.cwd().parent
elif (Path.cwd() / 'core').exists():
    # Already at the root (it contains the `core` package)
    project_root = Path.cwd()
else:
    # Fall back to the parent directory; it is not checked for `core`
    project_root = Path.cwd().parent

sys.path.insert(0, str(project_root))

import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
from datetime import datetime
from typing import Dict, List, Tuple, Optional

from core.environment_vectorized import VectorizedSnakeEnv
from core.environment_two_snake_vectorized import VectorizedTwoSnakeEnv
from core.environment import SnakeEnv
from core.networks import (
    DQN_MLP, DQN_CNN, 
    DuelingDQN_MLP, NoisyDQN_MLP,
    PPO_Actor_MLP, PPO_Critic_MLP,
    PPO_Actor_CNN, PPO_Critic_CNN
)
from core.utils import set_seed, get_device
from scripts.baselines.random_agent import RandomAgent
from scripts.baselines.shortest_path import ShortestPathAgent

# Report the resolved root and the PyTorch/CUDA setup used for this run
print(
    f"Project root: {project_root}",
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    sep="\n",
)

In [None]:
# Cell 2: Configuration
# ============== CONFIGURATION ==============
# Module-level settings shared by every later cell in this notebook.

# Evaluation settings
NUM_EPISODES = 100          # Episodes per model (default of every evaluate_* function)
GRID_SIZE = 10              # Board side length passed to the environments and CNN models
MAX_STEPS = 1000            # Per-episode step cap (env max_steps and rollout loop bound)
SEED = 42                   # Base seed for set_seed(), env.reset() and RandomAgent
NUM_ENVS = 100              # NOTE(review): not referenced by any visible cell; the
                            # vectorized evaluators create one env per episode instead

# Paths (all under <project_root>/results)
WEIGHTS_DIR = project_root / 'results' / 'weights'   # read-only here; never created
FIGURES_DIR = project_root / 'results' / 'figures'
DATA_DIR = project_root / 'results' / 'data'

# Ensure the output directories exist (the weights directory is only read)
FIGURES_DIR.mkdir(parents=True, exist_ok=True)
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Seed first, then pick the device that models and environments are created on
set_seed(SEED)
device = get_device()
print(f"Using device: {device}")
print(f"Weights directory: {WEIGHTS_DIR}")

## Utility Functions

In [None]:
# Cell 4: Model Loading Functions

def get_input_dim(use_flood_fill: bool = False, use_selective: bool = False, use_enhanced: bool = False) -> int:
    """Return the feature-vector length implied by the feature flags.

    When several flags are set the first match in this order wins:
    enhanced (24), selective (19), flood-fill (14). With no flag set the
    basic feature size of 11 is returned.
    """
    sizes_by_priority = (
        (use_enhanced, 24),
        (use_selective, 19),
        (use_flood_fill, 14),
    )
    for enabled, size in sizes_by_priority:
        if enabled:
            return size
    return 11


def parse_model_info(filename: str) -> Dict:
    """Infer model metadata from a checkpoint file name.

    Matching is case-insensitive and purely substring based.

    Args:
        filename: Checkpoint file name, e.g. ``dqn_mlp_floodfill_128x128_5000ep_x.pt``.

    Returns:
        Dict with keys ``algorithm``, ``network`` ('mlp' or 'cnn'),
        ``state_rep``, ``use_flood_fill``, ``use_enhanced``,
        ``use_selective`` and ``hidden_dims``. Anything that cannot be
        detected keeps its default ('unknown' algorithm, 'mlp' network,
        'basic' state, (128, 128) hidden dims).
    """
    name = filename.lower()

    # Most specific tokens first: every DQN variant also contains 'dqn'
    algorithm_tokens = (
        ('dueling', 'dueling_dqn'),
        ('noisy', 'noisy_dqn'),
        ('per_dqn', 'per_dqn'),
        ('double_dqn', 'double_dqn'),
        ('dqn', 'dqn'),
        ('ppo', 'ppo'),
        ('a2c', 'a2c'),
        ('reinforce', 'reinforce'),
    )
    algorithm = next(
        (label for token, label in algorithm_tokens if token in name),
        'unknown',
    )

    has_flood_fill = 'floodfill' in name or 'flood_fill' in name
    has_enhanced = 'enhanced' in name
    has_selective = 'selective' in name

    # The flags are independent; the label uses selective > enhanced > flood_fill
    if has_selective:
        state_rep = 'selective'
    elif has_enhanced:
        state_rep = 'enhanced'
    elif has_flood_fill:
        state_rep = 'flood_fill'
    else:
        state_rep = 'basic'

    return {
        'algorithm': algorithm,
        'network': 'cnn' if 'cnn' in name else 'mlp',
        'state_rep': state_rep,
        'use_flood_fill': has_flood_fill,
        'use_enhanced': has_enhanced,
        'use_selective': has_selective,
        # Only the 256x256 layout is recognised; everything else is 128x128
        'hidden_dims': (256, 256) if '256x256' in name else (128, 128),
    }


def load_dqn_model(filepath: Path, device: torch.device) -> Tuple[torch.nn.Module, Dict]:
    """Build a DQN-family network and load its weights from a checkpoint.

    The architecture is inferred from the file name via parse_model_info:
    CNN checkpoints get a DQN_CNN; MLP checkpoints get a dueling, noisy or
    plain DQN MLP depending on the detected algorithm.

    Args:
        filepath: Path to the checkpoint file.
        device: Device the checkpoint is mapped to and the model is moved to.

    Returns:
        (model, info): the model in eval mode and the parsed metadata dict.
    """
    # NOTE(security): weights_only=False lets torch.load unpickle arbitrary
    # objects, so only checkpoints from a trusted source should be loaded.
    checkpoint = torch.load(filepath, map_location=device, weights_only=False)
    info = parse_model_info(filepath.name)

    input_dim = get_input_dim(
        use_flood_fill=info['use_flood_fill'],
        use_selective=info['use_selective'],
        use_enhanced=info['use_enhanced']
    )

    if info['network'] == 'cnn':
        model = DQN_CNN(grid_size=GRID_SIZE, input_channels=3, output_dim=3)
    else:
        # Algorithms not listed here (dqn, double_dqn, per_dqn, ...) use the plain MLP
        mlp_classes = {'dueling_dqn': DuelingDQN_MLP, 'noisy_dqn': NoisyDQN_MLP}
        mlp_cls = mlp_classes.get(info['algorithm'], DQN_MLP)
        model = mlp_cls(input_dim=input_dim, output_dim=3, hidden_dims=info['hidden_dims'])

    # Known wrapper keys are tried in order; otherwise the checkpoint itself
    # is treated as the state dict.
    state_dict = checkpoint
    for key in ('policy_net', 'model'):
        if key in checkpoint:
            state_dict = checkpoint[key]
            break
    model.load_state_dict(state_dict)

    model.to(device)
    model.eval()

    return model, info


def load_ppo_model(filepath: Path, device: torch.device) -> Tuple[torch.nn.Module, Dict]:
    """Build a PPO actor network and load policy weights from a checkpoint.

    Used for PPO, A2C and REINFORCE checkpoints alike: only the actor/policy
    weights are loaded, into a PPO_Actor_CNN or PPO_Actor_MLP chosen from
    the file name via parse_model_info.

    Args:
        filepath: Path to the checkpoint file.
        device: Device the checkpoint is mapped to and the model is moved to.

    Returns:
        (model, info): the actor in eval mode and the parsed metadata dict.
    """
    # NOTE(security): weights_only=False lets torch.load unpickle arbitrary
    # objects, so only checkpoints from a trusted source should be loaded.
    checkpoint = torch.load(filepath, map_location=device, weights_only=False)
    info = parse_model_info(filepath.name)

    input_dim = get_input_dim(
        use_flood_fill=info['use_flood_fill'],
        use_selective=info['use_selective'],
        use_enhanced=info['use_enhanced']
    )

    if info['network'] == 'cnn':
        model = PPO_Actor_CNN(grid_size=GRID_SIZE, input_channels=3, output_dim=3)
    else:
        model = PPO_Actor_MLP(input_dim=input_dim, output_dim=3, hidden_dims=info['hidden_dims'])

    # Checkpoint layouts differ per trainer; the first key present wins:
    #   'actor' / 'actor_state_dict' (A2C format) / 'policy' (REINFORCE format)
    #   / 'policy_net' / 'model'. Otherwise the checkpoint is the state dict.
    state_dict = checkpoint
    for key in ('actor', 'actor_state_dict', 'policy', 'policy_net', 'model'):
        if key in checkpoint:
            state_dict = checkpoint[key]
            break
    model.load_state_dict(state_dict)

    model.to(device)
    model.eval()

    return model, info


# Cell-completion marker: shows in the notebook output that the loaders are defined
print("Model loading functions defined.")

In [None]:
# Cell 5: Evaluation Functions

def _summarize_episodes(scores: np.ndarray, rewards: np.ndarray, lengths: np.ndarray) -> Dict:
    """Reduce per-episode arrays to the summary dict returned by the evaluators."""
    return {
        'avg_score': float(np.mean(scores)),
        'std_score': float(np.std(scores)),
        'max_score': int(np.max(scores)),
        'min_score': int(np.min(scores)),
        'avg_reward': float(np.mean(rewards)),
        'avg_length': float(np.mean(lengths)),
        'scores': scores.tolist()
    }


def _run_greedy_evaluation(
    model: torch.nn.Module,
    info: Dict,
    num_episodes: int,
    device: torch.device
) -> Dict:
    """Roll out `model` greedily in a vectorized single-snake env and summarize it.

    One environment is created per episode (num_envs=num_episodes), all reset
    with SEED. At every step the action is the argmax over dim 1 of the model
    output, so the same routine serves Q-networks and policy networks.

    Args:
        model: Network mapping a batch of observations to per-action values.
        info: Metadata from parse_model_info; selects grid vs. feature
            observations and the feature flags passed to the environment.
        num_episodes: Number of episodes (and parallel environments).
        device: Device the environment is created on.

    Returns:
        Dict with avg_score, std_score, max_score, min_score, avg_reward,
        avg_length and the per-episode 'scores' list.
    """
    # CNN models consume the grid observation; MLP models the feature vector
    state_rep = 'grid' if info['network'] == 'cnn' else 'feature'

    env = VectorizedSnakeEnv(
        num_envs=num_episodes,
        grid_size=GRID_SIZE,
        action_space_type='relative',
        state_representation=state_rep,
        max_steps=MAX_STEPS,
        use_flood_fill=info['use_flood_fill'],
        use_enhanced_features=info['use_enhanced'],
        use_selective_features=info['use_selective'],
        device=device
    )

    obs = env.reset(seed=SEED)

    scores = np.zeros(num_episodes)
    rewards = np.zeros(num_episodes)
    lengths = np.zeros(num_episodes)
    done_mask = np.zeros(num_episodes, dtype=bool)
    episode_rewards = np.zeros(num_episodes)
    # Stays None if the loop below never runs (MAX_STEPS <= 0)
    info_dict = None

    for step in range(MAX_STEPS):
        with torch.no_grad():
            # Greedy action selection for evaluation
            actions = model(obs).argmax(dim=1)

        obs, step_rewards, dones, info_dict = env.step(actions)

        # Only the first episode of each slot counts: rewards stop accumulating
        # once a slot is marked done (presumably the env keeps stepping finished
        # slots -- confirm against VectorizedSnakeEnv).
        episode_rewards += step_rewards.cpu().numpy() * ~done_mask

        # Record slots that finished on this step
        new_done = dones.cpu().numpy() & ~done_mask
        for idx in np.flatnonzero(new_done):
            scores[idx] = info_dict['scores'][idx].item()
            rewards[idx] = episode_rewards[idx]
            lengths[idx] = step + 1
        done_mask |= new_done

        if done_mask.all():
            break

    # Slots still running after MAX_STEPS are recorded as full-length episodes
    remaining = ~done_mask
    if remaining.any():
        if info_dict is not None:
            scores[remaining] = info_dict['scores'][remaining].cpu().numpy()
        rewards[remaining] = episode_rewards[remaining]
        lengths[remaining] = MAX_STEPS

    return _summarize_episodes(scores, rewards, lengths)


def evaluate_dqn_model(
    model: torch.nn.Module,
    info: Dict,
    num_episodes: int = NUM_EPISODES,
    device: torch.device = device
) -> Dict:
    """Evaluate a DQN-family model with greedy (argmax-Q) action selection.

    Args:
        model: Q-network returning one value per action for each observation.
        info: Metadata dict from parse_model_info for this checkpoint.
        num_episodes: Number of evaluation episodes.
        device: Device the evaluation environment is created on.

    Returns:
        Summary dict: avg_score, std_score, max_score, min_score, avg_reward,
        avg_length and the per-episode 'scores' list.
    """
    return _run_greedy_evaluation(model, info, num_episodes, device)


def evaluate_policy_model(
    model: torch.nn.Module,
    info: Dict,
    num_episodes: int = NUM_EPISODES,
    device: torch.device = device
) -> Dict:
    """Evaluate a policy gradient model (PPO, A2C, REINFORCE) greedily.

    The most likely action (argmax of the actor output) is taken at every
    step rather than sampling from the policy.

    Args:
        model: Actor network returning one score per action for each observation.
        info: Metadata dict from parse_model_info for this checkpoint.
        num_episodes: Number of evaluation episodes.
        device: Device the evaluation environment is created on.

    Returns:
        Summary dict: avg_score, std_score, max_score, min_score, avg_reward,
        avg_length and the per-episode 'scores' list.
    """
    return _run_greedy_evaluation(model, info, num_episodes, device)


def evaluate_baseline(
    agent,
    num_episodes: int = NUM_EPISODES
) -> Dict:
    """Evaluate a baseline agent (Random or A*) on the non-vectorized SnakeEnv.

    Episodes run one after another; episode i is reset with seed SEED + i
    and is cut off after MAX_STEPS steps.

    Args:
        agent: Object exposing ``get_action(env)``; it is handed the
            environment itself rather than the observation.
        num_episodes: Number of episodes to play.

    Returns:
        Summary dict: avg_score, std_score, max_score, min_score, avg_reward,
        avg_length and the per-episode 'scores' list.
    """
    env = SnakeEnv(
        grid_size=GRID_SIZE,
        action_space_type='relative',
        state_representation='feature',
        max_steps=MAX_STEPS
    )

    episode_scores, episode_rewards, episode_lengths = [], [], []

    for episode_idx in range(num_episodes):
        _, step_info = env.reset(seed=SEED + episode_idx)
        reward_sum = 0

        for step_idx in range(MAX_STEPS):
            chosen_action = agent.get_action(env)
            _, reward, terminated, truncated, step_info = env.step(chosen_action)
            reward_sum += reward

            if terminated or truncated:
                break

        # The score is read from the info dict of the last step taken
        episode_scores.append(step_info['score'])
        episode_rewards.append(reward_sum)
        episode_lengths.append(step_idx + 1)

    return {
        'avg_score': float(np.mean(episode_scores)),
        'std_score': float(np.std(episode_scores)),
        'max_score': int(np.max(episode_scores)),
        'min_score': int(np.min(episode_scores)),
        'avg_reward': float(np.mean(episode_rewards)),
        'avg_length': float(np.mean(episode_lengths)),
        'scores': episode_scores
    }


# Cell-completion marker: shows in the notebook output that the evaluators are defined
print("Evaluation functions defined.")

## Single-Snake Model Evaluation

In [None]:
# Cell 7: Discover and Evaluate Single-Snake Models

# Weight files to evaluate: display name -> glob pattern relative to WEIGHTS_DIR.
# Every pattern pins `5000ep`, so checkpoints trained for a different number of
# episodes are never matched; there is no fallback, and a missing file is
# reported as NOT FOUND by the evaluation loop.
# The display names are load-bearing: the evaluation loop picks the policy-gradient
# loader for names containing 'ppo', 'a2c' or 'reinforce', and later cells match on
# these names for bar colours, family grouping and the flood-fill pairs.
single_snake_models = {
    # DQN variants
    'DQN MLP Basic': 'dqn_mlp_128x128_5000ep_*.pt',
    'DQN MLP Flood-fill': 'dqn_mlp_floodfill_128x128_5000ep_*.pt',
    'DQN CNN': 'dqn_cnn_5000ep_*.pt',
    'Double DQN MLP': 'double_dqn_mlp_128x128_5000ep_*.pt',
    'Double DQN MLP Flood-fill': 'double_dqn_mlp_floodfill_128x128_5000ep_*.pt',
    'Double DQN CNN': 'double_dqn_cnn_5000ep_*.pt',
    'Dueling DQN MLP': 'dueling_dqn_mlp_128x128_5000ep_*.pt',
    'Dueling DQN MLP Flood-fill': 'dueling_dqn_mlp_floodfill_128x128_5000ep_*.pt',
    'PER DQN MLP': 'per_dqn_mlp_128x128_5000ep_*.pt',
    'PER DQN MLP Flood-fill': 'per_dqn_mlp_floodfill_128x128_5000ep_*.pt',
    # Policy gradient
    'PPO MLP': 'ppo_mlp_128x128_5000ep_*.pt',
    'PPO MLP Flood-fill': 'ppo_mlp_floodfill_128x128_5000ep_*.pt',
    'PPO CNN': 'ppo_cnn_5000ep_*.pt',
    # NOTE(review): unlike the other MLP patterns this one has no `_mlp_` segment -- confirm the file name
    'A2C MLP Flood-fill': 'a2c_floodfill_128x128_5000ep_*.pt',
    'REINFORCE MLP': 'reinforce_mlp_128x128_5000ep_*.pt',
    'REINFORCE MLP Flood-fill': 'reinforce_mlp_floodfill_128x128_5000ep_*.pt',
    'REINFORCE CNN': 'reinforce_cnn_5000ep_*.pt',
}

# Find weight files
def find_weight_file(pattern: str) -> Optional[Path]:
    """Return the newest file in WEIGHTS_DIR matching `pattern`, or None.

    Args:
        pattern: Glob pattern evaluated relative to WEIGHTS_DIR.

    Returns:
        The match with the latest modification time, or None when nothing
        matches.
    """
    candidates = WEIGHTS_DIR.glob(pattern)
    # Latest modification time wins when several checkpoints match
    return max(candidates, key=lambda path: path.stat().st_mtime, default=None)


# Evaluate every configured checkpoint; results maps display name -> metrics dict
results = {}
print("Evaluating single-snake models...")
print("=" * 60)

for model_name, pattern in single_snake_models.items():
    filepath = find_weight_file(pattern)

    if filepath is None:
        print(f"  {model_name}: NOT FOUND ({pattern})")
        continue

    print(f"  {model_name}: {filepath.name}")

    try:
        # The display name decides which loader/evaluator pair is used
        is_policy = any(tag in model_name.lower() for tag in ('ppo', 'a2c', 'reinforce'))
        if is_policy:
            loader, evaluator = load_ppo_model, evaluate_policy_model
        else:
            loader, evaluator = load_dqn_model, evaluate_dqn_model

        model, info = loader(filepath, device)
        result = evaluator(model, info)

        # Attach provenance and the metadata parsed from the file name
        result.update(
            model_path=str(filepath),
            algorithm=info['algorithm'],
            network=info['network'],
            state_rep=info['state_rep'],
        )

        results[model_name] = result
        print(f"    -> Avg Score: {result['avg_score']:.2f} +/- {result['std_score']:.2f}")

    except Exception as e:
        # Best effort: a checkpoint that fails is reported and skipped
        print(f"    -> ERROR: {e}")

print("\nEvaluation complete!")
print(f"Successfully evaluated: {len(results)} models")

## Baseline Evaluation

In [None]:
# Cell 9: Evaluate Baselines

print("Evaluating baseline agents...")
print("=" * 60)

# Metadata shared by both baselines: no network and no learned state representation
baseline_tags = {'algorithm': 'baseline', 'network': 'n/a', 'state_rep': 'n/a'}

# Random Agent
print("  Random Agent...")
random_agent = RandomAgent(action_space_type='relative', seed=SEED)
random_result = evaluate_baseline(random_agent)
random_result.update(baseline_tags)
results['Random Agent'] = random_result
print(f"    -> Avg Score: {random_result['avg_score']:.2f} +/- {random_result['std_score']:.2f}")

# Shortest Path (A*)
print("  Shortest Path (A*)...")
astar_agent = ShortestPathAgent(action_space_type='relative')
astar_result = evaluate_baseline(astar_agent)
astar_result.update(baseline_tags)
results['Shortest Path (A*)'] = astar_result
print(f"    -> Avg Score: {astar_result['avg_score']:.2f} +/- {astar_result['std_score']:.2f}")

print("\nBaseline evaluation complete!")

## Results Summary

In [None]:
# Cell 11: Create Results DataFrame

# One row per evaluated model or baseline; metadata missing from a result becomes 'unknown'
summary_data = [
    {
        'Model': model_name,
        'Avg Score': result['avg_score'],
        'Std Score': result['std_score'],
        'Max Score': result['max_score'],
        'Min Score': result['min_score'],
        'Avg Reward': result['avg_reward'],
        'Avg Length': result['avg_length'],
        'Algorithm': result.get('algorithm', 'unknown'),
        'Network': result.get('network', 'unknown'),
        'State Rep': result.get('state_rep', 'unknown'),
    }
    for model_name, result in results.items()
]

# Best model first
df = pd.DataFrame(summary_data).sort_values('Avg Score', ascending=False)

print(
    "\n" + "=" * 80,
    "SINGLE-SNAKE MODEL COMPARISON",
    "=" * 80,
    df[['Model', 'Avg Score', 'Std Score', 'Max Score', 'Avg Length']].to_string(index=False),
    "=" * 80,
    sep="\n",
)

## Visualizations

In [None]:
# Cell 13: Generate Comparison Charts

# Ascending order so the best model ends up at the top of the horizontal bar chart
df_sorted = df.sort_values('Avg Score', ascending=True)

# Figure 1: Overall Comparison Bar Chart
fig1, ax1 = plt.subplots(figsize=(12, 8))

# The first matching substring decides the bar colour, so order matters
color_rules = [
    ('Random', '#888888'),     # Gray for baselines
    ('Shortest', '#888888'),   # Gray for baselines
    ('PPO', '#2ecc71'),        # Green for PPO
    ('A2C', '#27ae60'),        # Darker green for A2C
    ('REINFORCE', '#1abc9c'),  # Teal for REINFORCE
    ('DQN', '#3498db'),        # Blue for DQN variants
]
colors = [
    next((color for token, color in color_rules if token in model), '#9b59b6')  # Purple for others
    for model in df_sorted['Model']
]

bars = ax1.barh(
    df_sorted['Model'],
    df_sorted['Avg Score'],
    xerr=df_sorted['Std Score'],
    capsize=3,
    color=colors,
    alpha=0.8,
)
ax1.set_xlabel('Average Score (Food Eaten)', fontsize=12)
ax1.set_title('Single-Snake Model Performance Comparison', fontsize=14)
ax1.grid(axis='x', alpha=0.3)

# Value labels sit just past the end of each error bar
for bar, score, std in zip(bars, df_sorted['Avg Score'], df_sorted['Std Score']):
    label_x = bar.get_width() + std + 0.5
    label_y = bar.get_y() + bar.get_height()/2
    ax1.text(label_x, label_y, f'{score:.1f}', va='center', fontsize=9)

plt.tight_layout()
fig1.savefig(FIGURES_DIR / 'single_snake_comparison.png', dpi=150, bbox_inches='tight')
print(f"Saved: {FIGURES_DIR / 'single_snake_comparison.png'}")
plt.show()

In [None]:
# Cell 14: Algorithm Family Comparison

# Group rows by algorithm family using substrings of the display name
is_baseline = df['Model'].str.contains('Random|Shortest')
dqn_models = df[df['Model'].str.contains('DQN', case=False) & ~is_baseline]
pg_models = df[df['Model'].str.contains('PPO|A2C|REINFORCE', case=False)]
baselines = df[is_baseline]

fig2, ax2 = plt.subplots(figsize=(10, 6))

families = ['Baselines', 'DQN Family', 'Policy Gradient']
family_frames = [baselines, dqn_models, pg_models]

# Mean of the per-model average scores; an empty family is plotted as 0
means = [
    frame['Avg Score'].mean() if len(frame) > 0 else 0
    for frame in family_frames
]
# Series.std() is the sample standard deviation (ddof=1) and is NaN for a
# single value, so a family needs at least two models to get an error bar.
stds = [
    frame['Avg Score'].std() if len(frame) > 1 else 0
    for frame in family_frames
]

bars = ax2.bar(families, means, yerr=stds, capsize=5,
               color=['#888888', '#3498db', '#2ecc71'], alpha=0.8)
ax2.set_ylabel('Average Score', fontsize=12)
ax2.set_title('Algorithm Family Comparison', fontsize=14)
ax2.grid(axis='y', alpha=0.3)

# Value labels just above each bar
for bar, mean in zip(bars, means):
    ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
             f'{mean:.1f}', ha='center', fontsize=11)

plt.tight_layout()
fig2.savefig(FIGURES_DIR / 'algorithm_family_comparison.png', dpi=150, bbox_inches='tight')
print(f"Saved: {FIGURES_DIR / 'algorithm_family_comparison.png'}")
plt.show()

In [None]:
# Cell 15: Flood-Fill Impact Analysis

# (basic model, flood-fill model) display-name pairs to compare
pairs = [
    ('DQN MLP Basic', 'DQN MLP Flood-fill'),
    ('Double DQN MLP', 'Double DQN MLP Flood-fill'),
    ('Dueling DQN MLP', 'Dueling DQN MLP Flood-fill'),
    ('PER DQN MLP', 'PER DQN MLP Flood-fill'),
    ('PPO MLP', 'PPO MLP Flood-fill'),
    ('REINFORCE MLP', 'REINFORCE MLP Flood-fill'),
]

# Only pairs where both variants were evaluated are included
flood_fill_comparison = [
    {
        'Algorithm': basic.replace(' MLP Basic', '').replace(' MLP', ''),
        'Without Flood-fill': results[basic]['avg_score'],
        'With Flood-fill': results[floodfill]['avg_score'],
        'Improvement': results[floodfill]['avg_score'] - results[basic]['avg_score'],
    }
    for basic, floodfill in pairs
    if basic in results and floodfill in results
]

if not flood_fill_comparison:
    print("No matching pairs found for flood-fill comparison.")
else:
    df_ff = pd.DataFrame(flood_fill_comparison)

    fig3, ax3 = plt.subplots(figsize=(10, 6))

    # Two bars per algorithm, centred on the tick position
    x = np.arange(len(df_ff))
    width = 0.35

    bars1 = ax3.bar(
        x - width/2, df_ff['Without Flood-fill'], width,
        label='Without Flood-fill', color='#e74c3c', alpha=0.8,
    )
    bars2 = ax3.bar(
        x + width/2, df_ff['With Flood-fill'], width,
        label='With Flood-fill', color='#2ecc71', alpha=0.8,
    )

    ax3.set_xlabel('Algorithm', fontsize=12)
    ax3.set_ylabel('Average Score', fontsize=12)
    ax3.set_title('Impact of Flood-Fill Features', fontsize=14)
    ax3.set_xticks(x)
    ax3.set_xticklabels(df_ff['Algorithm'], rotation=45, ha='right')
    ax3.legend()
    ax3.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    fig3.savefig(FIGURES_DIR / 'flood_fill_impact.png', dpi=150, bbox_inches='tight')
    print(f"Saved: {FIGURES_DIR / 'flood_fill_impact.png'}")
    plt.show()

    print("\nFlood-Fill Impact:")
    print(df_ff.to_string(index=False))

## Export Results

In [None]:
# Cell 17: Save Results to Files

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# JSON payload: every metric except the per-episode 'scores' list, to keep the file small
json_results = {
    model_name: {key: value for key, value in result.items() if key != 'scores'}
    for model_name, result in results.items()
}

# Timestamped copies keep the history; *_latest copies are overwritten on each run
json_path = DATA_DIR / f'evaluation_results_{timestamp}.json'
csv_path = DATA_DIR / f'single_snake_summary_{timestamp}.csv'
latest_json = DATA_DIR / 'evaluation_results_latest.json'
latest_csv = DATA_DIR / 'single_snake_summary_latest.csv'

# Full results as JSON
with open(json_path, 'w') as f:
    json.dump(json_results, f, indent=2)
print(f"Saved: {json_path}")

# Summary table as CSV
df.to_csv(csv_path, index=False)
print(f"Saved: {csv_path}")

# Latest versions for easy access
with open(latest_json, 'w') as f:
    json.dump(json_results, f, indent=2)
print(f"Saved: {latest_json}")

df.to_csv(latest_csv, index=False)
print(f"Saved: {latest_csv}")

In [None]:
# Cell 18: Final Summary

print("\n" + "=" * 80, "EVALUATION COMPLETE", "=" * 80, sep="\n")

print(f"\nModels Evaluated: {len(results)}")
print(f"Episodes per Model: {NUM_EPISODES}")
print(f"Grid Size: {GRID_SIZE}x{GRID_SIZE}")

# Top 5 performers by average score
print("\nTop 5 Performers:")
top5 = df.nlargest(5, 'Avg Score')
for rank, (_, row) in enumerate(top5.iterrows(), start=1):
    print(f"  {rank}. {row['Model']}: {row['Avg Score']:.2f} +/- {row['Std Score']:.2f}")

# Best model per family; families are matched on the display name
print("\nBest in Category:")
model_names = df['Model']

dqn_rows = df[model_names.str.contains('DQN', case=False) & ~model_names.str.contains('Random|Shortest')]
dqn_best = dqn_rows.nlargest(1, 'Avg Score')
if not dqn_best.empty:
    winner = dqn_best.iloc[0]
    print(f"  DQN Family: {winner['Model']} ({winner['Avg Score']:.2f})")

pg_rows = df[model_names.str.contains('PPO|A2C|REINFORCE', case=False)]
pg_best = pg_rows.nlargest(1, 'Avg Score')
if not pg_best.empty:
    winner = pg_best.iloc[0]
    print(f"  Policy Gradient: {winner['Model']} ({winner['Avg Score']:.2f})")

print("\nOutput Files:")
print(f"  - Figures: {FIGURES_DIR}")
print(f"  - Data: {DATA_DIR}")
print("=" * 80)