In [None]:
import numpy as np
import torch
import random
from pettingzoo.mpe import simple_spread_v3
from train import DQN


# Action definitions: integer codes 0-4, in the order
# NOTHING, LEFT, RIGHT, DOWN, UP.
NOTHING, LEFT, RIGHT, DOWN, UP = range(5)


class DQLPolicy:
    """Greedy policy backed by a per-agent DQN checkpoint.

    On construction the checkpoint ``{model_dir}/{agent}.pt`` is loaded into a
    ``DQN(input_shape, output_actions)`` network, which is then switched to
    eval mode. ``choose_action`` returns the index of the largest network
    output for this agent's observation.
    """

    def __init__(self, agent, input_shape=18, output_actions=5, model_dir="models"):
        """Create the policy and load its weights.

        Args:
            agent: Agent name. Used both as the checkpoint file stem and as
                the key into the observations mapping in ``choose_action``.
            input_shape: First argument passed to ``DQN``. Defaults to 18,
                the value previously hard-coded here (presumably the
                observation length the models were trained on -- confirm
                against the training script).
            output_actions: Second argument passed to ``DQN``. Defaults to 5,
                the value previously hard-coded here.
            model_dir: Directory containing the ``<agent>.pt`` checkpoints.
        """
        self.agent = agent
        self.input_shape = input_shape
        self.output_actions = output_actions
        self.model_dir = model_dir
        self._load_model()

    def _load_model(self):
        """Build the network and load this agent's saved state dict."""
        self.model = DQN(self.input_shape, self.output_actions)
        # NOTE: torch.load unpickles the file; only load checkpoints you trust.
        # map_location="cpu" lets a checkpoint that was saved from a CUDA
        # device be deserialized on a machine without a GPU.
        state_dict = torch.load(
            f"{self.model_dir}/{self.agent}.pt", map_location="cpu"
        )
        self.model.load_state_dict(state_dict)
        self.model.eval()

    def choose_action(self, observations):
        """Return the greedy action index for this agent.

        Args:
            observations: Mapping from agent name to that agent's observation
                (a NumPy array or any array-like of numbers).

        Returns:
            int: Index of the maximum value in the network output.
        """
        # as_tensor converts ndarrays exactly like from_numpy(...).float()
        # but also accepts plain sequences.
        obs_tensor = torch.as_tensor(observations[self.agent], dtype=torch.float32)
        with torch.no_grad():
            return self.model(obs_tensor).argmax().item()


def test_rl_policy(num_of_runs=5, seeds=None, max_cycles=25, local_ratio=0.5,
                   render_mode="human"):
    """Run the trained DQN policies in simple_spread and print average rewards.

    For every run the environment is reset with a seed, each agent acts
    greedily through its ``DQLPolicy`` until no agents remain, and the
    per-agent reward divided by ``max_cycles`` is summed over the episode
    and printed.

    Args:
        num_of_runs: Number of episodes to run.
        seeds: Optional sequence of seeds. Run ``i`` uses ``seeds[i]`` when it
            exists; otherwise a random seed in [0, 1000] is drawn.
        max_cycles: Passed to the environment and used as the divisor for the
            reward average.
        local_ratio: Passed through to the environment.
        render_mode: Passed through to the environment. Defaults to
            ``"human"`` (the previous fixed value); pass ``None`` for a
            headless run.
    """
    env = simple_spread_v3.parallel_env(
        render_mode=render_mode,
        local_ratio=local_ratio,
        max_cycles=max_cycles,
    )

    # try/finally so the environment is closed even when loading a model or
    # stepping the environment raises.
    try:
        agents = [f"agent_{i}" for i in range(3)]
        policies = {agent: DQLPolicy(agent) for agent in agents}

        for run in range(num_of_runs):
            seed = seeds[run] if seeds and run < len(seeds) else random.randint(0, 1000)
            observations, _ = env.reset(seed=seed)

            avg_rewards = {agent: 0.0 for agent in agents}

            while env.agents:
                # Only agents that are still active get an action.
                actions = {
                    agent: policies[agent].choose_action(observations)
                    for agent in env.agents
                }
                observations, rewards, _, _, _ = env.step(actions)
                # Iterate the returned rewards rather than env.agents: the
                # parallel env drops finished agents from env.agents as soon
                # as step() returns, which would skip the last step's rewards.
                for agent, reward in rewards.items():
                    avg_rewards[agent] += reward / max_cycles

            print(f"[Seed {seed}] Average Rewards: " + ", ".join([f"{a}: {avg_rewards[a]:.2f}" for a in agents]))
    finally:
        env.close()


if __name__ == "__main__":
    # Demo invocation: three evaluation episodes, each with a pinned seed.
    test_rl_policy(seeds=[42, 7, 123], num_of_runs=3)

In [2]:
# Imports
from evaluation.evaluator import test

# Evaluation setup: every agent is driven by its trained RL model.
seeds = [73, 45, 24]  # alternatively: None
num_of_runs = 3
policies = ["rl"] * 3

test(policies, num_of_runs, seeds, DEBUG=True)


Policies: ['rl', 'rl', 'rl']
Runs: 3
Seeds: [73, 45, 24]
AVERAGE REWARDS [73]: {'agent_0': -0.91, 'agent_1': -0.91, 'agent_2': -0.91}
FINAL REWARDS   [73]: {'agent_0': -0.98, 'agent_1': -0.98, 'agent_2': -0.98}

AVERAGE REWARDS [45]: {'agent_0': -0.48, 'agent_1': -0.48, 'agent_2': -0.4}
FINAL REWARDS   [45]: {'agent_0': -0.43, 'agent_1': -0.43, 'agent_2': -0.43}

AVERAGE REWARDS [24]: {'agent_0': -0.72, 'agent_1': -0.74, 'agent_2': -0.74}
FINAL REWARDS   [24]: {'agent_0': -1.0, 'agent_1': -1.0, 'agent_2': -1.0}



In [3]:
# Imports
from evaluation.evaluator import test

# Evaluation setup.
# Policy codes: "rl" = Reinforcement Learning, "sp" = Simple Policy,
# "cp" = Complex Policy. Position i in the list applies to agent_i.
seeds = [42, 1337, 7]  # optional; alternatively: seeds = None
num_of_runs = 3
policies = [
    "rl",  # agent_0
    "sp",  # agent_1
    "cp",  # agent_2
]

test(policies, num_of_runs, seeds, DEBUG=True)

Policies: ['rl', 'sp', 'cp']
Runs: 3
Seeds: [42, 1337, 7]
AVERAGE REWARDS [42]: {'agent_0': -0.63, 'agent_1': -0.63, 'agent_2': -0.63}
FINAL REWARDS   [42]: {'agent_0': -0.48, 'agent_1': -0.48, 'agent_2': -0.48}

AVERAGE REWARDS [1337]: {'agent_0': -1.14, 'agent_1': -1.12, 'agent_2': -1.14}
FINAL REWARDS   [1337]: {'agent_0': -0.54, 'agent_1': -0.54, 'agent_2': -0.54}

AVERAGE REWARDS [7]: {'agent_0': -0.71, 'agent_1': -0.71, 'agent_2': -0.71}
FINAL REWARDS   [7]: {'agent_0': -0.64, 'agent_1': -0.64, 'agent_2': -0.64}



In [7]:
# Imports
from evaluation.evaluator import test

# Evaluation setup: every agent follows the simple hand-coded policy.
seeds = [69, 85, 10]  # alternatively: None
num_of_runs = 3
policies = ["sp"] * 3

test(policies, num_of_runs, seeds, DEBUG=True)

Policies: ['sp', 'sp', 'sp']
Runs: 3
Seeds: [69, 85, 10]
AVERAGE REWARDS [69]: {'agent_0': -0.75, 'agent_1': -0.89, 'agent_2': -0.89}
FINAL REWARDS   [69]: {'agent_0': -0.83, 'agent_1': -0.83, 'agent_2': -0.83}

AVERAGE REWARDS [85]: {'agent_0': -0.46, 'agent_1': -0.56, 'agent_2': -0.42}
FINAL REWARDS   [85]: {'agent_0': -0.15, 'agent_1': -0.15, 'agent_2': -0.15}

AVERAGE REWARDS [10]: {'agent_0': -1.22, 'agent_1': -1.18, 'agent_2': -1.1}
FINAL REWARDS   [10]: {'agent_0': -0.4, 'agent_1': -0.4, 'agent_2': -0.4}



In [9]:
# Imports
from evaluation.evaluator import test

# Evaluation setup: every agent follows the coordination-based complex policy.
seeds = [40, 21, 88]  # alternatively: None
num_of_runs = 3
policies = ["cp"] * 3

test(policies, num_of_runs, seeds, DEBUG=True)

Policies: ['cp', 'cp', 'cp']
Runs: 3
Seeds: [40, 21, 88]
AVERAGE REWARDS [40]: {'agent_0': -0.58, 'agent_1': -0.58, 'agent_2': -0.58}
FINAL REWARDS   [40]: {'agent_0': -0.34, 'agent_1': -0.34, 'agent_2': -0.34}

AVERAGE REWARDS [21]: {'agent_0': -0.41, 'agent_1': -0.41, 'agent_2': -0.41}
FINAL REWARDS   [21]: {'agent_0': -0.26, 'agent_1': -0.26, 'agent_2': -0.26}

AVERAGE REWARDS [88]: {'agent_0': -0.5, 'agent_1': -0.44, 'agent_2': -0.5}
FINAL REWARDS   [88]: {'agent_0': -0.35, 'agent_1': -0.35, 'agent_2': -0.35}

