In [1]:
import os
import torch
import gymnasium as gym
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

# ---------- DQN Definition ----------
class DQN(nn.Module):
    def __init__(self, input_size, output_size):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(input_size, 512)
        self.fc2 = nn.Linear(512, 512)
        self.fc3 = nn.Linear(512, 256)
        self.fc4 = nn.Linear(256, output_size)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.xavier_uniform_(module.weight)
            module.bias.data.fill_(0.01)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        return self.fc4(x)

In [2]:
# ---------- Evaluation Function ----------
def evaluate_model(model, env, episodes=10):
    rewards = []
    for _ in range(episodes):
        state, _ = env.reset()
        total_reward = 0
        done = False
        while not done:
            with torch.no_grad():
                state_tensor = torch.FloatTensor(state).unsqueeze(0).to(device)
                q_values = model(state_tensor)
                action = q_values.argmax().item()
            next_state, reward, terminated, truncated, _ = env.step(action)
            state = next_state
            total_reward += reward
            done = terminated or truncated
        rewards.append(total_reward)
    return np.mean(rewards), np.std(rewards)

In [3]:

# ---------- Setup ----------
device = torch.device("cuda" if torch.cuda.is_available()
                      else "mps" if torch.backends.mps.is_available()
                      else "cpu")
print(f"Using device: {device}")

env = gym.make("LunarLander-v2")

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

model_dir = "models"
model_files = [f for f in os.listdir(model_dir) if f.endswith('.pth')]

results = []

Using device: mps


In [4]:
# ---------- Evaluate Each Model ----------
for filename in model_files:
    model_path = os.path.join(model_dir, filename)
    try:
        print(f"🔍 Evaluating {filename}...")
        model = DQN(state_dim, action_dim).to(device)
        checkpoint = torch.load(model_path, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        model.eval()

        mean_reward, std_reward = evaluate_model(model, env, episodes=10)
        results.append((filename, mean_reward, std_reward))

        print(f"✅ {filename}: Avg Reward = {mean_reward:.2f} ± {std_reward:.2f}")
    except Exception as e:
        print(f"⚠️ Failed to evaluate {filename}: {e}")

🔍 Evaluating checkpoint_episode_1000.pth...
✅ checkpoint_episode_1000.pth: Avg Reward = -101.44 ± 31.15
🔍 Evaluating checkpoint_episode_3000.pth...
✅ checkpoint_episode_3000.pth: Avg Reward = 65.51 ± 97.38
🔍 Evaluating checkpoint_episode_3500.pth...
✅ checkpoint_episode_3500.pth: Avg Reward = 180.55 ± 111.57
🔍 Evaluating best_avg_model.pth...
✅ best_avg_model.pth: Avg Reward = 211.54 ± 122.84
🔍 Evaluating checkpoint_episode_500.pth...
✅ checkpoint_episode_500.pth: Avg Reward = -169.29 ± 21.33
🔍 Evaluating checkpoint_episode_1500.pth...
✅ checkpoint_episode_1500.pth: Avg Reward = -3.79 ± 20.20
🔍 Evaluating best_model.pth...
✅ best_model.pth: Avg Reward = 196.07 ± 98.55
🔍 Evaluating checkpoint_episode_2000.pth...
✅ checkpoint_episode_2000.pth: Avg Reward = 22.16 ± 64.62
🔍 Evaluating checkpoint_episode_0.pth...
✅ checkpoint_episode_0.pth: Avg Reward = -622.85 ± 123.19
🔍 Evaluating checkpoint_episode_2500.pth...
✅ checkpoint_episode_2500.pth: Avg Reward = 202.49 ± 72.59
🔍 Evaluating final_

In [5]:
# ---------- Summary ----------
print("\n📊 Evaluation Summary:")
for fname, mean_r, std_r in sorted(results, key=lambda x: x[1], reverse=True):
    print(f"{fname:30s} --> Avg Reward: {mean_r:.2f} ± {std_r:.2f}")

env.close()



📊 Evaluation Summary:
final_model.pth                --> Avg Reward: 222.46 ± 70.01
best_avg_model.pth             --> Avg Reward: 211.54 ± 122.84
checkpoint_episode_2500.pth    --> Avg Reward: 202.49 ± 72.59
best_model.pth                 --> Avg Reward: 196.07 ± 98.55
checkpoint_episode_3500.pth    --> Avg Reward: 180.55 ± 111.57
checkpoint_episode_3000.pth    --> Avg Reward: 65.51 ± 97.38
checkpoint_episode_2000.pth    --> Avg Reward: 22.16 ± 64.62
checkpoint_episode_1500.pth    --> Avg Reward: -3.79 ± 20.20
checkpoint_episode_1000.pth    --> Avg Reward: -101.44 ± 31.15
checkpoint_episode_500.pth     --> Avg Reward: -169.29 ± 21.33
checkpoint_episode_0.pth       --> Avg Reward: -622.85 ± 123.19
