In [14]:
from gymnasium.envs.registration import register

register(
    id="TreasureGuardian-v0",
    entry_point=r"C:\Users\malli\Desktop\Sem_6\RL\The Treasure Gaurdian\env.py:BaseTreasureGuardianEnv",
)


In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import random
import gymnasium as gym
from copy import deepcopy
from env import LightTreasureGuardianEnv

🕹️ 3. Define Your Actor and Critic

In [16]:
class Actor(nn.Module):
    def __init__(self, obs_dim, act_dim):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(obs_dim, 128),
            nn.ReLU(),
            nn.Linear(128, act_dim)  # Output logits for discrete actions
        )

    def forward(self, obs):
        return self.fc(obs)

class Critic(nn.Module):
    def __init__(self, total_obs_dim, total_act_dim):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(total_obs_dim + total_act_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

    def forward(self, obs_cat, act_cat):
        x = torch.cat([obs_cat, act_cat], dim=-1)
        return self.fc(x)


🤖 4. MADDPG Agent Class

In [17]:
class MADDPGAgent:
    def __init__(self, obs_dim, act_dim, total_obs_dim, total_act_dim, lr=1e-3):
        self.actor = Actor(obs_dim, act_dim)
        self.target_actor = deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr)

        self.critic = Critic(total_obs_dim, total_act_dim)
        self.target_critic = deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=lr)

    def get_action(self, obs):
        obs_tensor = torch.FloatTensor(obs).unsqueeze(0)
        logits = self.actor(obs_tensor)
        probs = torch.softmax(logits, dim=-1)
        action = torch.multinomial(probs, 1).item()
        return action

    def update(self, batch, agents, agent_idx, gamma=0.95, tau=0.01):
        obs_n, act_n, rew_n, next_obs_n, done_n = batch

        obs = torch.FloatTensor([o[agent_idx] for o in obs_n])
        actions = torch.LongTensor([a[agent_idx] for a in act_n]).unsqueeze(1)
        rewards = torch.FloatTensor([r[agent_idx] for r in rew_n])
        dones = torch.FloatTensor(done_n)

        obs_cat = torch.FloatTensor([np.concatenate(o) for o in obs_n])
        act_cat = torch.FloatTensor([a for a in act_n])

        # Critic Update
        q = self.critic(obs_cat, act_cat).squeeze()

        with torch.no_grad():
            next_obs_cat = torch.FloatTensor([np.concatenate(o) for o in next_obs_n])
            next_actions = torch.stack([
                torch.softmax(agents[i].target_actor(torch.FloatTensor(o[i]).unsqueeze(0)), dim=-1).argmax(dim=-1).float()
                for i in range(len(agents))
                for o in [next_obs_n]
            ], dim=-1)
            target_q = self.target_critic(next_obs_cat, next_actions)
            target_q = rewards + gamma * target_q.squeeze() * (1 - dones)

        critic_loss = F.mse_loss(q, target_q)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Actor Update
        pred_actions = act_cat.clone()
        pred_actions[:, agent_idx] = torch.softmax(self.actor(obs), dim=-1).argmax(dim=-1).float()
        actor_loss = -self.critic(obs_cat, pred_actions).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Soft Update
        self._soft_update(self.actor, self.target_actor, tau)
        self._soft_update(self.critic, self.target_critic, tau)

    def _soft_update(self, main, target, tau):
        for tp, mp in zip(target.parameters(), main.parameters()):
            tp.data.copy_(tp.data * (1 - tau) + mp.data * tau)


📥 5. Replay Buffer

In [18]:
class ReplayBuffer:
    def __init__(self, capacity=100000):
        self.buffer = []
        self.capacity = capacity

    def add(self, transition):
        if len(self.buffer) >= self.capacity:
            self.buffer.pop(0)
        self.buffer.append(transition)

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)


🎮 6. Initialize Agents, Environment, and Train

In [19]:
# Replace this with your environment import or registration if needed
env = gym.make("TreasureGuardian-v0")

agents = [
    MADDPGAgent(2, 4, 4, 2),  # Guardian
    MADDPGAgent(2, 4, 4, 2)   # Villains
]

buffer = ReplayBuffer()
episodes = 1000
batch_size = 64

for ep in range(episodes):
    obs, _ = env.reset()
    obs_n = [obs["guardian"], obs["villains"]]
    ep_reward = [0, 0]
    terminated = False

    while not terminated:
        actions = [agent.get_action(o) for agent, o in zip(agents, obs_n)]
        step_action = {
            "guardian": actions[0],
            "villains": np.array(actions[1])
        }

        next_obs, reward, done, _, _ = env.step(step_action)
        next_obs_n = [next_obs["guardian"], next_obs["villains"]]
        reward_n = [reward[0], reward[1]]

        buffer.add((obs_n, actions, reward_n, next_obs_n, [int(done)]))
        obs_n = next_obs_n
        terminated = done

        for i in range(2):
            ep_reward[i] += reward_n[i]

        if len(buffer) >= batch_size:
            batch = buffer.sample(batch_size)
            for i in range(2):
                agents[i].update(batch, agents, i)

    print(f"Episode {ep}: Guardian: {ep_reward[0]} | Villains: {ep_reward[1]}")


ValueError: too many values to unpack (expected 2)

💾 7. Save Models (Optional)

In [None]:
torch.save(agents[0].actor.state_dict(), "guardian_actor.pth")
torch.save(agents[1].actor.state_dict(), "villains_actor.pth")
