In [5]:
# Cell 1: Imports
import numpy as np
import torch
import matplotlib.pyplot as plt
from env import TreasureGuardianEnv
from maddpg import MADDPG

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cpu


In [6]:
env = TreasureGuardianEnv()
MAX_VILLAINS = 2  # or whatever maximum number of villains you want to support
MAX_KEYS = 3          # max number of keys that can appear in the game
MAX_WALLS = 20        # estimate based on your map
MAX_PITS = 5          # estimated upper bound on pit count


print("Observation Space:", env.observation_space)
print("Observation Space Type:", type(env.observation_space))


Observation Space: Dict('guardian': Box(0, 9, (2,), int32), 'keys': Box(0, 9, (3, 2), int32), 'pits': Box(0, 9, (2, 2), int32), 'treasure': Box(0, 9, (2,), int32), 'villains': Box(0, 9, (1, 2), int32), 'walls': Box(0, 9, (15, 2), int32))
Observation Space Type: <class 'gymnasium.spaces.dict.Dict'>


In [8]:
env = TreasureGuardianEnv()
num_agents = 1 + env.num_villains  # Guardian + Villains
maddpg = MADDPG(env=env, num_agents=num_agents)
print("MADDPG initialized successfully!")


TypeError: empty() received an invalid combination of arguments - got (tuple, dtype=NoneType, device=NoneType), but expected one of:
 * (tuple of ints size, *, tuple of names names, torch.memory_format memory_format = None, torch.dtype dtype = None, torch.layout layout = None, torch.device device = None, bool pin_memory = False, bool requires_grad = False)
 * (tuple of ints size, *, torch.memory_format memory_format = None, Tensor out = None, torch.dtype dtype = None, torch.layout layout = None, torch.device device = None, bool pin_memory = False, bool requires_grad = False)


In [4]:
print("Observation Space Keys:", env.observation_space.spaces.keys())


Observation Space Keys: dict_keys(['guardian', 'keys', 'pits', 'treasure', 'villains', 'walls'])


In [5]:
n_episodes = 3000
max_steps = 100
reward_log = []

def pad_villain_obs(villain_obs, max_villains=MAX_VILLAINS):
    villain_obs = np.array(villain_obs)

    if villain_obs.ndim == 1:
        villain_obs = np.expand_dims(villain_obs, axis=0)

    k = villain_obs.shape[0]
    obs_dim = villain_obs.shape[1] if villain_obs.ndim == 2 else 0

    if k < max_villains:
        pad = np.full((max_villains - k, obs_dim), -1, dtype=villain_obs.dtype)
        return np.concatenate([villain_obs, pad], axis=0)
    elif k > max_villains:
        return villain_obs[:max_villains]  # Optional truncation
    else:
        return villain_obs


In [6]:
def flatten_obs(obs_dict, max_villains, max_keys, max_walls, max_pits):
    def pad(arr, max_len):
        if len(arr) == 0:
            return np.zeros((max_len, 2), dtype=np.float32)
        return np.vstack([arr, np.zeros((max_len - len(arr), 2), dtype=np.float32)])[:max_len]

    guardian = np.array(obs_dict['guardian'], dtype=np.float32).flatten()
    villains = pad(obs_dict['villains'], max_villains).flatten()
    keys = pad(obs_dict['keys'], max_keys).flatten()
    walls = pad(obs_dict['walls'], max_walls).flatten()
    pits = pad(obs_dict['pits'], max_pits).flatten()
    treasure = np.array(obs_dict['treasure'], dtype=np.float32).flatten()
    
    return np.concatenate([guardian, villains, keys, walls, pits, treasure])


In [8]:
for ep in range(1, n_episodes + 1):
    obs_raw = env.reset()
    guardian_obs, villains_obs_dict = obs_raw

    # Ensure villain observations are converted to list format
    villains_obs_list = [
        np.array(villains_obs_dict[i], dtype=np.float32)  # ensure float array
        for i in sorted(villains_obs_dict.keys())
    ]

    print("guardian_obs: ", guardian_obs)
    print("villains_obs_list", villains_obs_list)
    
    # Padding (if needed) — already a list of arrays now
    villains_obs = pad_villain_obs(villains_obs_list, MAX_VILLAINS)

    # Check if guardian_obs is a dict
    if isinstance(guardian_obs, dict):
        print("guardian_obs is a dict. Keys:", guardian_obs.keys())
    
    guardian_flat = flatten_obs(guardian_obs, MAX_VILLAINS, MAX_KEYS, MAX_WALLS, MAX_PITS)
    villains_flat = [flatten_obs(v_obs, MAX_VILLAINS, MAX_KEYS, MAX_WALLS, MAX_PITS) for v_obs in villains_obs_list]
    
    # Pad if fewer villains
    while len(villains_flat) < MAX_VILLAINS:
        villains_flat.append(np.zeros_like(guardian_flat))
    
    obs = [guardian_flat] + villains_flat
    
    total_reward = np.zeros(1 + MAX_VILLAINS)

    print("Guardian obs shape:", guardian_flat.shape)
    print("Villain obs shape:", villains_flat[0].shape)
    print("Full obs shape:", np.array(obs).shape)
    

    for step in range(max_steps):
        action_list = maddpg.act(obs)

        actions = {
            "guardian": action_list[0],
            "villains": action_list[1:1 + env.num_villains]
        }

        next_obs_raw, rewards_raw, done_raw, _ = env.step(actions)
        guardian_next, villains_next = next_obs_raw

        villains_next = pad_villain_obs(villains_next, MAX_VILLAINS)
        next_obs = [guardian_next] + [villains_next[i] for i in range(MAX_VILLAINS)]

        # Unpack rewards
        guardian_reward, villain_reward = rewards_raw
        rewards = [guardian_reward] + [villain_reward for _ in range(MAX_VILLAINS)]

        done = [done_raw] * (1 + MAX_VILLAINS)

        maddpg.step(obs, action_list, rewards, next_obs, done)

        obs = next_obs
        total_reward += np.array(rewards)

        if any(done):
            break

    reward_log.append(total_reward)

    if ep % 100 == 0:
        avg_rewards = np.mean(reward_log[-100:], axis=0)
        print(f"Episode {ep} - Avg Reward: {avg_rewards}")



guardian_obs:  {'guardian': array([0, 0]), 'villains': array([[2, 4]]), 'keys': array([[0, 2],
       [3, 4],
       [1, 1]]), 'walls': array([[0, 1],
       [4, 0],
       [6, 0],
       [4, 9],
       [9, 6],
       [0, 3],
       [9, 2],
       [7, 3],
       [7, 6],
       [5, 0],
       [3, 6],
       [6, 6],
       [5, 9],
       [3, 2],
       [1, 9]]), 'treasure': array([2, 1]), 'pits': array([[9, 7],
       [3, 3]])}
villains_obs_list []
guardian_obs is a dict. Keys: dict_keys(['guardian', 'villains', 'keys', 'walls', 'treasure', 'pits'])
Guardian obs shape: (64,)
Villain obs shape: (64,)
Full obs shape: (3, 64)


NameError: name 'maddpg' is not defined

In [None]:
# Cell 4: Plot Rewards

reward_log = np.array(reward_log)
plt.figure(figsize=(12, 6))
for i in range(num_agents):
    plt.plot(reward_log[:, i], label=f"Agent {i}")
plt.xlabel("Episode")
plt.ylabel("Reward")
plt.title("Training Rewards per Agent")
plt.legend()
plt.grid()
plt.show()


In [None]:
# Cell 5: Save Trained Models
maddpg.save("maddpg_models/")
print("Models saved to 'maddpg_models/'")
