In [1]:
import numpy as np
import random
from collections import deque, namedtuple
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from env import TreasureGuardianEnv  # Import your game environment

In [2]:

# Replay Buffer

Transition = namedtuple("Transition", ["state", "actions", "rewards", "next_state", "done"])

class ReplayBuffer:
    def __init__(self, capacity=100000):
        self.buffer = deque(maxlen=capacity)
    
    def add(self, state, actions, rewards, next_state, done):
        self.buffer.append(Transition(state, actions, rewards, next_state, done))
    
    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        return Transition(*zip(*batch))
    
    def __len__(self):
        return len(self.buffer)

In [3]:
# Define Multi-Agent Training
class MultiAgentReplayBuffer:
    def __init__(self, max_size=100000):
        self.buffer = deque(maxlen=max_size)
    
    def add(self, experience):
        self.buffer.append(experience)
    
    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return np.array(states), np.array(actions), np.array(rewards), np.array(next_states), np.array(dones)
    
    def size(self):
        return len(self.buffer)

In [4]:
# Train MADDPG

def train_maddpg(env, n_episodes=500, batch_size=128):
    n_agents = env.n_agents
    agents = [MADDPGAgent(env.state_dim, env.action_dim) for _ in range(n_agents)]
    replay_buffer = MultiAgentReplayBuffer()
    
    for episode in range(n_episodes):
        obs = env.reset()
        done = False
        while not done:
            actions = [agent.actor(obs[i]) for i, agent in enumerate(agents)]
            next_obs, rewards, done, _ = env.step(actions)
            replay_buffer.add((obs, actions, rewards, next_obs, done))
            
            if replay_buffer.size() > batch_size:
                samples = replay_buffer.sample(batch_size)
                for idx, agent in enumerate(agents):
                    agent.update(samples, agents, idx)
            
            obs = next_obs
        print(f"Episode {episode+1} completed")
    return agents

In [5]:

# Neural Network Models

def build_actor(obs_dim, act_dim, hidden_dim=64):
    inputs = Input(shape=(obs_dim,))
    x = Dense(hidden_dim, activation='relu')(inputs)
    x = Dense(hidden_dim, activation='relu')(x)
    outputs = Dense(act_dim, activation='softmax')(x)
    return tf.keras.Model(inputs, outputs)

def build_critic(state_dim, total_act_dim, hidden_dim=128):
    inputs = Input(shape=(state_dim + total_act_dim,))
    x = Dense(hidden_dim, activation='relu')(inputs)
    x = Dense(hidden_dim, activation='relu')(x)
    outputs = Dense(1)(x)
    return tf.keras.Model(inputs, outputs)


In [6]:

# MADDPG Agent

class MADDPGAgent:
    def __init__(self, agent_id, obs_dim, act_dim, state_dim, total_act_dim,
                 actor_lr=1e-3, critic_lr=1e-3, gamma=0.95, tau=0.01):
        self.agent_id = agent_id
        self.gamma = gamma
        self.tau = tau
        
        self.actor = build_actor(obs_dim, act_dim)
        self.critic = build_critic(state_dim, total_act_dim)
        self.target_actor = build_actor(obs_dim, act_dim)
        self.target_critic = build_critic(state_dim, total_act_dim)
        
        self.target_actor.set_weights(self.actor.get_weights())
        self.target_critic.set_weights(self.critic.get_weights())
        
        self.actor_optimizer = Adam(learning_rate=actor_lr)
        self.critic_optimizer = Adam(learning_rate=critic_lr)
    
    def select_action(self, obs):
        obs = np.expand_dims(obs, axis=0)
        action_probs = self.actor.predict(obs, verbose=0)[0]
        return np.argmax(action_probs), action_probs
    
    def update(self, samples, all_agents, agent_index):
        states, actions, rewards, next_states, dones = samples
        states = np.array(states)
        next_states = np.array(next_states)
        actions_tensor = np.concatenate(actions, axis=1)
        
        next_actions = [agent.target_actor.predict(next_states, verbose=0) for agent in all_agents]
        next_actions_cat = np.concatenate(next_actions, axis=1)
        target_q = self.target_critic.predict(np.hstack([next_states, next_actions_cat]), verbose=0)
        y = rewards[:, agent_index].reshape(-1, 1) + self.gamma * target_q * (1 - dones.reshape(-1, 1))
        
        with tf.GradientTape() as tape:
            q_values = self.critic(np.hstack([states, actions_tensor]), training=True)
            critic_loss = tf.reduce_mean(tf.square(y - q_values))
        grads = tape.gradient(critic_loss, self.critic.trainable_variables)
        self.critic_optimizer.apply_gradients(zip(grads, self.critic.trainable_variables))
        
        with tf.GradientTape() as tape:
            current_action = self.actor(states, training=True)
            actions_modified = np.copy(actions)
            actions_modified[agent_index] = current_action
            actions_concat = np.concatenate(actions_modified, axis=1)
            actor_loss = -tf.reduce_mean(self.critic(np.hstack([states, actions_concat])))
        grads = tape.gradient(actor_loss, self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(zip(grads, self.actor.trainable_variables))
        
        for target_param, param in zip(self.target_actor.trainable_variables, self.actor.trainable_variables):
            target_param.assign(self.tau * param + (1 - self.tau) * target_param)
        for target_param, param in zip(self.target_critic.trainable_variables, self.critic.trainable_variables):
            target_param.assign(self.tau * param + (1 - self.tau) * target_param)
        
        return critic_loss.numpy(), actor_loss.numpy()


In [7]:

# Training Loop

def train_maddpg(env, n_episodes=500, batch_size=128):
    replay_buffer = ReplayBuffer()
    init_obs = env.reset()
    state_dim = np.prod(init_obs.shape)  # Ensure correct shape
    act_dim = 4  # Discrete actions
    total_agents = 1 + env.num_villains
    total_act_dim = total_agents * act_dim
    
    agents = [MADDPGAgent(i, state_dim, act_dim, state_dim, total_act_dim) for i in range(total_agents)]
    
    for episode in range(n_episodes):
        obs = env.reset()
        global_state = obs.flatten()
        done = False
        step = 0
        
        while not done:
            actions_list = []
            actions_onehots = []
            for agent in agents:
                a_int, a_onehot = agent.select_action(global_state)
                actions_list.append(a_int)
                actions_onehots.append(a_onehot)
            
            env_actions = {"guardian": actions_list[0], "villains": np.array(actions_list[1:])}
            next_obs, rewards, done, _ = env.step(env_actions)  # Handle different env output
            
            replay_buffer.add(global_state, np.array(actions_onehots), np.array(rewards), next_obs.flatten(), done)
            global_state = next_obs.flatten()
            step += 1
            
            if len(replay_buffer) > batch_size and step % 100 == 0:
                samples = replay_buffer.sample(batch_size)
                for idx, agent in enumerate(agents):
                    agent.update(samples, agents, idx)
        
        print(f"Episode {episode+1} completed")
    
    return agents

In [8]:
if __name__ == '__main__':
    env = TreasureGuardianEnv(grid_size=10, num_villains=3, num_keys=5, num_pits=3, render_mode=None, max_steps=200)
    device = "/GPU:0" if tf.config.list_physical_devices('GPU') else "/CPU:0"

    # Ensure proper observation retrieval
    init_obs = env.reset()  # If reset() returns a dict, extract relevant state
    if isinstance(init_obs, dict):
        init_obs = init_obs["observation"]  # Change this key based on your env structure

    with tf.device(device):
        trained_agents = train_maddpg(env, n_episodes=500, batch_size=128)

    env.close()


KeyError: 'observation'