# IMPORTS

In [None]:
import copy
import os
import random
import sys
import time
import warnings
from collections import deque, namedtuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from unityagents import UnityEnvironment

In [None]:
def hidden_init(layer):
    """Return symmetric uniform-init bounds ``(-lim, lim)`` for a linear layer.

    ``lim`` is ``1 / sqrt(fan_in)``, where ``fan_in`` is the number of inputs
    feeding the layer.

    Args:
        layer: A module exposing a 2-D ``weight`` laid out as
            ``(out_features, in_features)``, as ``nn.Linear`` does.

    Returns:
        Tuple ``(-lim, lim)`` suitable for ``weight.data.uniform_(*bounds)``.
    """
    # nn.Linear stores its weight as (out_features, in_features), so the
    # input count is dimension 1; dimension 0 would be the fan-out.
    fan_in = layer.weight.data.size()[1]
    lim = 1. / np.sqrt(fan_in)
    return -lim, lim

# Actor and Critic Models

In [None]:
class Actor(nn.Module):
    """Policy network: maps a batch of states to tanh-bounded actions.

    Layout: Linear -> BatchNorm1d -> ReLU -> Linear -> ReLU -> Linear -> tanh.

    Args:
        state_dim: Number of input features per state.
        hidden: Indexable pair; ``hidden[0]`` and ``hidden[1]`` are the widths
            of the two hidden layers.
        action_dim: Number of output action components.
        seed: Value passed to ``torch.manual_seed`` before the layers are built.
    """

    def __init__(self, state_dim, hidden, action_dim, seed):
        super().__init__()
        torch.manual_seed(seed)
        first_width = hidden[0]
        second_width = hidden[1]
        self.fc1 = nn.Linear(state_dim, first_width)
        self.bn1 = nn.BatchNorm1d(first_width)
        self.fc2 = nn.Linear(first_width, second_width)
        self.fc3 = nn.Linear(second_width, action_dim)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw the linear weights; biases and batch-norm are left as built."""
        for layer in (self.fc1, self.fc2):
            low, high = hidden_init(layer)
            layer.weight.data.uniform_(low, high)
        # The output layer starts with very small weights.
        self.fc3.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state):
        """Return the action for ``state``; every component lies in [-1, 1]."""
        features = self.fc1(state)
        features = F.relu(self.bn1(features))
        features = F.relu(self.fc2(features))
        return torch.tanh(self.fc3(features))

In [None]:
class Critic(nn.Module):
    """Value network scoring (state, action) pairs.

    The state goes through Linear -> BatchNorm1d -> ReLU, is concatenated
    with the action along dim 1, then passes Linear -> ReLU -> Dropout(0.2)
    -> Linear.

    NOTE(review): the output layer is ``action_dim`` wide, so ``forward``
    returns one value per action component rather than a single scalar per
    sample -- confirm this is intended.

    Args:
        state_dim: Number of input features per state.
        hidden: Indexable pair; ``hidden[0]`` and ``hidden[1]`` are the widths
            of the two hidden layers.
        action_dim: Number of action components (also the output width).
        seed: Value passed to ``torch.manual_seed`` before the layers are built.
    """

    def __init__(self, state_dim, hidden, action_dim, seed):
        super().__init__()
        torch.manual_seed(seed)
        first_width = hidden[0]
        second_width = hidden[1]
        self.fc1 = nn.Linear(state_dim, first_width)
        self.bn1 = nn.BatchNorm1d(first_width)
        # The action is injected after the first layer, hence the wider input.
        self.fc2 = nn.Linear(first_width + action_dim, second_width)
        self.dropout = nn.Dropout(p=0.2)
        self.fc3 = nn.Linear(second_width, action_dim)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw the linear weights; biases and batch-norm are left as built."""
        for layer in (self.fc1, self.fc2):
            low, high = hidden_init(layer)
            layer.weight.data.uniform_(low, high)
        self.fc3.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state, action):
        """Return the critic output for a batch of ``state``/``action`` pairs."""
        state_features = F.relu(self.bn1(self.fc1(state)))
        joint = torch.cat((state_features, action), dim=1)
        joint = F.relu(self.fc2(joint))
        joint = self.dropout(joint)
        return self.fc3(joint)

# Exploration Noise (Ornstein-Uhlenbeck) and Replay Buffer

In [None]:
class OUNoise:
    """Ornstein-Uhlenbeck process used as temporally correlated action noise.

    Each call to ``sample`` moves the internal state towards ``mu`` by a
    fraction ``theta`` and adds zero-mean Gaussian noise scaled by ``sigma``.

    Args:
        size: Number of noise components (one per action dimension).
        seed: Seed applied to both Python's ``random`` and NumPy's global
            generator, so the noise sequence is reproducible.
        mu: Long-run mean the process reverts to.
        theta: Mean-reversion rate.
        sigma: Scale of the random perturbation added at every step.
    """

    def __init__(self, size, seed, mu=0., theta=0.15, sigma=0.1):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        # Kept from the original so callers relying on `random` being seeded
        # here see no change.
        random.seed(seed)
        # The noise itself is drawn from NumPy, so NumPy must be seeded too;
        # seeding only `random` left the noise sequence non-reproducible.
        np.random.seed(seed)
        self.reset()

    def reset(self):
        """Set the internal state back to the mean ``mu``."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new state array."""
        # Zero-mean Gaussian increments: uniform [0, 1) draws are always
        # positive and would push every action in the same direction.
        noise = np.random.standard_normal(len(self.state))
        dx = self.theta * (self.mu - self.state) + self.sigma * noise
        # Rebind instead of updating in place so arrays handed out by earlier
        # calls are not silently modified by later ones.
        self.state = self.state + dx
        return self.state

class ReplayBuffer:
    """Bounded FIFO store of transitions with uniform random mini-batches.

    Args:
        action_dim: Accepted for interface compatibility; not used here.
        buffer_size: Maximum number of transitions kept; the oldest entries
            are dropped once the limit is reached.
        batch_size: Number of transitions returned by ``sample``.
        seed: Seed applied to Python's ``random`` module.
    """

    def __init__(self, action_dim, buffer_size, batch_size, seed):
        random.seed(seed)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"],
        )
        self.memory = deque(maxlen=buffer_size)

    def add(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        transition = self.experience(state, action, reward, next_state, done)
        self.memory.append(transition)

    def sample(self):
        """Draw ``batch_size`` transitions without replacement.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of float
            tensors on the module-level ``device``; each is built by
            vertically stacking the matching field of the sampled transitions.
        """
        batch = random.sample(self.memory, k=self.batch_size)

        def stacked(field):
            # One row block per sampled transition.
            return np.vstack([getattr(item, field) for item in batch])

        states = torch.FloatTensor(stacked("state")).to(device)
        actions = torch.FloatTensor(stacked("action")).to(device)
        rewards = torch.FloatTensor(stacked("reward")).to(device)
        next_states = torch.FloatTensor(stacked("next_state")).to(device)
        # Done flags are cast to 0/1 integers before becoming floats.
        done_flags = stacked("done").astype(np.uint8)
        dones = torch.FloatTensor(done_flags).to(device)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.memory)

# Agent code that combines all previous models

In [None]:
class Agent:
    """One DDPG learner: local/target actor and critic, OU noise, replay.

    Every call to ``step`` stores the transition in the replay buffer and,
    once the buffer holds more than ``batch_size`` transitions, runs one
    learning update on a sampled mini-batch.

    Args:
        state_dim: Number of features in one observation.
        action_dim: Number of components in one action.
        replay_buffer: Object offering ``add(...)``, ``sample()`` and
            ``__len__``; MultiAgent passes the same instance to every agent.
        batch_size: Minimum buffer fill (exclusive) before learning starts.
        seed: Seed for ``random``, the networks and the noise process.
    """

    GAMMA = 0.99  # discount applied to the bootstrapped next-state value
    TAU = 1e-3    # blend factor for the soft target-network update

    def __init__(self, state_dim, action_dim, replay_buffer, batch_size, seed):
        self.state_dim = state_dim
        self.batch_size = batch_size
        self.action_dim = action_dim

        random.seed(seed)
        self.actor_local = Actor(state_dim, (128, 64), action_dim, seed).to(device)
        self.actor_target = Actor(state_dim, (128, 64), action_dim, seed).to(device)
        self.critic_local = Critic(state_dim, (128, 64), action_dim, seed).to(device)
        self.critic_target = Critic(state_dim, (128, 64), action_dim, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=1e-4)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=1e-4, weight_decay=1e-6)
        # Start each target network as an exact copy of its local network.
        self.actor_target.load_state_dict(self.actor_local.state_dict())
        self.critic_target.load_state_dict(self.critic_local.state_dict())
        self.noise = OUNoise(action_dim, seed)
        self.memory = replay_buffer

    def act(self, state, noise=True):
        """Return the actor's action for ``state``, clipped to [-1, 1].

        Args:
            state: Array-like accepted by ``torch.FloatTensor``. MultiAgent
                passes it with a leading batch dimension of 1.
            noise: When true, one OU noise sample is added before clipping.
        """
        state = torch.FloatTensor(state).to(device)
        # Temporarily switch to eval mode for the forward pass, then restore
        # training mode.
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().numpy()
        self.actor_local.train()
        if noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def step(self, state, action, reward, next_state, done):
        """Store one transition and learn from a sampled batch if possible."""
        self.memory.add(state, action, reward, next_state, done)
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

    def learn(self, experiences):
        """Run one critic update, one actor update and both soft updates.

        Args:
            experiences: Tuple ``(states, actions, rewards, next_states,
                dones)`` of tensors, as returned by the replay buffer.
        """
        states, actions, rewards, next_states, dones = experiences

        # The TD target is a fixed regression label. Building it under
        # no_grad keeps backward() from traversing the target networks and
        # accumulating gradients on them that are never used or zeroed.
        with torch.no_grad():
            actions_next = self.actor_target(next_states)
            Q_targets_next = self.critic_target(next_states, actions_next)
            Q_targets = rewards + (self.GAMMA * Q_targets_next * (1 - dones))

        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # Actor ascends the critic's value estimate of its own actions.
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.soft_update(self.critic_local, self.critic_target, self.TAU)
        self.soft_update(self.actor_local, self.actor_target, self.TAU)

    def reset(self):
        """Reset the exploration noise process to its mean."""
        self.noise.reset()

    def soft_update(self, local, target, tau):
        """Blend ``local`` parameters into ``target`` in place.

        Each target parameter becomes ``tau * local + (1 - tau) * target``.
        Only ``parameters()`` are blended; module buffers are not touched.
        """
        for target_param, local_param in zip(target.parameters(), local.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)


# Separate Agent for each player

In [None]:
class MultiAgent:
    """Group of independent Agent instances that share one replay buffer.

    Args:
        num_agents: Number of Agent instances to create.
        state_dim: Number of features in one agent's observation.
        action_dim: Number of components in one agent's action.
        seed: Seed forwarded to the buffer and to every agent.
    """

    def __init__(self, num_agents, state_dim, action_dim, seed):
        batch_size = 128
        capacity = int(1e5)
        self.num_agents = num_agents
        self.memory = ReplayBuffer(action_dim, capacity, batch_size, seed)
        self.agents = []
        for _ in range(num_agents):
            member = Agent(state_dim, action_dim, self.memory, batch_size, seed)
            self.agents.append(member)

    def act(self, states):
        """Return a list with one action per agent, in agent order.

        Each agent receives its own row of ``states`` wrapped in a batch of
        size one.
        """
        chosen = []
        for idx in range(self.num_agents):
            observation = np.array([states[idx]])
            chosen.append(self.agents[idx].act(observation))
        return chosen

    def step(self, states, actions, rewards, next_states, dones):
        """Forward the idx-th slice of every argument to the idx-th agent."""
        for idx in range(self.num_agents):
            self.agents[idx].step(
                states[idx],
                actions[idx],
                rewards[idx],
                next_states[idx],
                dones[idx],
            )

    def reset(self):
        """Reset every agent."""
        for member in self.agents:
            member.reset()

    def save(self, key):
        """Write each agent's local actor and critic weights to disk.

        File names are ``actor_<index>_<key>.pth`` and
        ``critic_<index>_<key>.pth`` in the current working directory.
        """
        for i, member in enumerate(self.agents):
            actor_weights = member.actor_local.state_dict()
            critic_weights = member.critic_local.state_dict()
            torch.save(actor_weights, f'actor_{i}_{key}.pth')
            torch.save(critic_weights, f'critic_{i}_{key}.pth')

# Module-level compute device read by ReplayBuffer.sample and Agent:
# CUDA when PyTorch reports it as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Environment Initialization using Unity

In [None]:
# Launch the Unity environment from the "Tennis.x86" build; the name is given
# without a directory, so it is resolved against the working directory.
env = UnityEnvironment(file_name="Tennis.x86")

In [None]:
# Use the first brain the environment exposes, then reset once in training
# mode to obtain that brain's initial environment info.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=True)[brain_name]

In [None]:
# Problem dimensions read from the environment's first reset.
_first_observations = env_info.vector_observations
NUM_STATES = _first_observations.shape[1]
NUM_ACTIONS = brain.vector_action_space_size
NUM_AGENTS = len(env_info.agents)

# Agent Init

In [None]:
# Size the team from the environment (NUM_AGENTS) instead of a literal 2, so
# the agent count cannot drift from what the environment actually reports.
agent = MultiAgent(num_agents=NUM_AGENTS, state_dim=NUM_STATES, action_dim=NUM_ACTIONS, seed=10)

# Multi agent training loop

In [None]:
def multi_ddpg(n_episodes=5000, max_t=2000):
    """Train the module-level ``agent`` on the module-level ``env``.

    Each episode resets the environment and the agent, then alternates
    ``agent.act`` / ``env.step`` / ``agent.step`` until any agent reports done
    or ``max_t`` steps have elapsed. The episode score is the maximum of the
    per-agent reward sums. Once at least 100 episodes have run and the
    100-episode average reaches 0.5, weights are saved whenever that average
    sets a new best.

    Args:
        n_episodes: Number of training episodes to run.
        max_t: Upper bound on environment steps per episode.

    Returns:
        List with one score (max over agents) per episode.
    """
    start_time = time.time()
    recent_scores = deque(maxlen=100)
    all_scores = []
    best_avg_score = -np.inf

    for episode in range(1, n_episodes + 1):
        episode_start = time.time()
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations

        agent.reset()
        # One accumulator per agent, sized from the observations rather than
        # a hard-coded agent count.
        episode_scores = np.zeros(len(states))

        for t in range(max_t):
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done

            agent.step(states, actions, rewards, next_states, dones)
            episode_scores += rewards
            states = next_states

            if np.any(dones):
                break

        max_agent_score = np.max(episode_scores)
        recent_scores.append(max_agent_score)
        all_scores.append(max_agent_score)

        avg_score = np.mean(recent_scores)
        elapsed = time.time() - start_time
        episode_time = time.time() - episode_start

        status = _format_progress(episode, episode_time, elapsed, episode_scores, avg_score)
        # The carriage return lets the next episode overwrite this line.
        print(status, end='\r')

        if episode % 100 == 0 or episode == n_episodes:
            # The leading newline preserves the last overwritten status line.
            print(f'\n[Checkpoint] {status}')

        # The averaging window is full from episode 100 onwards, so that is
        # the first episode at which the solved criterion can be evaluated.
        if episode >= 100 and avg_score >= 0.5:
            if avg_score > best_avg_score:
                best_avg_score = avg_score
                agent.save(episode)
            # The format spec here was corrupted by a stray inline note and
            # raised ValueError; it also printed the current average under the
            # "Best" label.
            print(
                f'[Solved] Episode {episode:4d} | Best Avg Score: {best_avg_score:.2f}'
            )

    return all_scores


def _format_progress(episode, episode_time, elapsed, episode_scores, avg_score):
    """Build the status text shared by the per-episode and checkpoint prints."""
    per_agent = ", ".join(f"{s:.2f}" for s in episode_scores)
    return (
        f'Episode {episode:4d} | Time: {episode_time:.2f}s / {elapsed:.2f}s | '
        f'Scores: {per_agent} | '
        f'Avg (100) Score: {avg_score:.2f}'
    )


# Training

In [None]:
# Run the full training loop; `scores` holds one value per episode.
# The original status message was cut off mid-sentence ('Starting to').
print('Starting training...')
scores = multi_ddpg()

# DQN for Evaluation

In [None]:
class DQN(nn.Module):
    """Fully connected Q-network with two equally wide ReLU hidden layers.

    Args:
        state_dim: Number of input features (default 24).
        hidden_dim: Width of both hidden layers (default 128).
        action_dim: Number of discrete actions / output values (default 4).
    """

    def __init__(self, state_dim=24, hidden_dim=128, action_dim=4):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        """Return one Q-value per action for each row of ``x``."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)


class DQNAgent:
    """Epsilon-greedy DQN agent with its own replay memory.

    Args:
        seed: Seed for ``torch`` and ``random``; applied before the networks
            are created so their initial weights are reproducible.
        load_path: Optional checkpoint (a ``state_dict`` written by
            ``save_model``). When given but missing on disk, a warning is
            emitted and the freshly initialised weights are kept.
    """

    def __init__(self, seed=0, load_path=None):
        # Seed first: seeding after construction left the initial weights
        # dependent on whatever the global RNG state happened to be.
        torch.manual_seed(seed)
        random.seed(seed)

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.q_network = DQN().to(self.device)
        self.target_network = DQN().to(self.device)
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=0.0005)

        if load_path:
            if os.path.exists(load_path):
                # map_location lets a checkpoint saved on a GPU load on a
                # CPU-only machine. NOTE: torch.load unpickles the file, so
                # only load checkpoints from a trusted source.
                weights = torch.load(load_path, map_location=self.device)
                self.q_network.load_state_dict(weights)
            else:
                warnings.warn(
                    f"Checkpoint '{load_path}' not found; using freshly "
                    "initialised weights."
                )
        self._sync_target()

        # Hyperparameters
        self.replay_mem = deque(maxlen=100000)
        self.bSize = 128          # mini-batch size
        self.gamma = 0.99         # discount factor
        self.max_ep = 1.0         # current exploration rate (epsilon)
        self.minimum_ep = 0.01    # floor for epsilon
        self.decay_ep = 0.999     # multiplicative decay applied per train()
        self.target_up = 1        # train() runs every `target_up` steps
        self.step_count = 0

    def _sync_target(self):
        """Copy the online network's weights into the target network."""
        self.target_network.load_state_dict(self.q_network.state_dict())

    def _as_tensor(self, values, dtype):
        """Stack a sequence of per-transition values into one tensor."""
        # Going through a single ndarray avoids the slow element-by-element
        # conversion of a tuple of arrays.
        return torch.as_tensor(np.asarray(values), dtype=dtype, device=self.device)

    def do_action(self, state):
        """Return an action index chosen epsilon-greedily for ``state``."""
        if random.random() < self.max_ep:
            # The action count comes from the network's output layer, so this
            # no longer depends on a global defined in another cell.
            return random.randint(0, self.q_network.fc3.out_features - 1)
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        self.q_network.eval()
        with torch.no_grad():
            action_values = self.q_network(state)
        self.q_network.train()
        return int(np.argmax(action_values.cpu().numpy()))

    def step(self, state, action, reward, next_state, done):
        """Store one transition and train when enough samples are stored."""
        self.replay_mem.append((state, action, reward, next_state, done))
        self.step_count += 1
        if len(self.replay_mem) >= self.bSize and self.step_count % self.target_up == 0:
            self.train()

    def train(self):
        """Run one gradient step on a random mini-batch, then sync and decay.

        After the optimiser step the target network is overwritten with the
        online weights and epsilon is multiplied by ``decay_ep`` (bounded
        below by ``minimum_ep``).
        """
        experiences = random.sample(self.replay_mem, self.bSize)
        states, actions, rewards, next_states, dones = zip(*experiences)
        states = self._as_tensor(states, torch.float32)
        actions = self._as_tensor(actions, torch.long)
        rewards = self._as_tensor(rewards, torch.float32)
        next_states = self._as_tensor(next_states, torch.float32)
        dones = self._as_tensor(dones, torch.float32)

        q_values = self.q_network(states).gather(1, actions.unsqueeze(1)).squeeze(1)
        with torch.no_grad():
            next_q_values = self.target_network(next_states).max(1)[0]
            targets = rewards + (self.gamma * next_q_values * (1 - dones))

        loss = F.mse_loss(q_values, targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self._sync_target()
        self.max_ep = max(self.minimum_ep, self.max_ep * self.decay_ep)

    def save_model(self, path):
        """Write the online network's ``state_dict`` to ``path``."""
        torch.save(self.q_network.state_dict(), path)
        print(f"Saved model to {path}")


# DDQN for Evaluation

In [None]:
class DDQN(nn.Module):
    """Fully connected Q-network with two equally wide ReLU hidden layers.

    Args:
        state_dim: Number of input features (default 24).
        hidden_dim: Width of both hidden layers (default 64).
        action_dim: Number of discrete actions / output values (default 4).
    """

    def __init__(self, state_dim=24, hidden_dim=64, action_dim=4):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        """Return one Q-value per action for each row of ``x``."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)


class DDQNAgent:
    """Epsilon-greedy Double-DQN agent with its own replay memory.

    The online network picks the next action and the target network scores
    it when the bootstrap target is built.

    Args:
        seed: Seed for ``torch`` and ``random``; applied before the networks
            are created so their initial weights are reproducible.
        load_path: Optional checkpoint (a ``state_dict`` written by
            ``save_model``). When given but missing on disk, a warning is
            emitted and the freshly initialised weights are kept.
    """

    def __init__(self, seed=0, load_path=None):
        # Seed first: seeding after construction left the initial weights
        # dependent on whatever the global RNG state happened to be.
        torch.manual_seed(seed)
        random.seed(seed)

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.q_network = DDQN().to(self.device)
        self.target_network = DDQN().to(self.device)
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=0.0005)

        if load_path:
            if os.path.exists(load_path):
                # map_location lets a checkpoint saved on a GPU load on a
                # CPU-only machine. NOTE: torch.load unpickles the file, so
                # only load checkpoints from a trusted source.
                weights = torch.load(load_path, map_location=self.device)
                self.q_network.load_state_dict(weights)
            else:
                warnings.warn(
                    f"Checkpoint '{load_path}' not found; using freshly "
                    "initialised weights."
                )
        self._sync_target()

        self.replay_mem = deque(maxlen=100000)
        self.bSize = 128          # mini-batch size
        self.gamma = 0.99         # discount factor
        self.max_ep = 1.0         # current exploration rate (epsilon)
        self.minimum_ep = 0.01    # floor for epsilon
        self.decay_ep = 0.999     # multiplicative decay applied per train()
        self.target_up = 1        # train() runs every `target_up` steps
        self.step_count = 0

    def _sync_target(self):
        """Copy the online network's weights into the target network."""
        self.target_network.load_state_dict(self.q_network.state_dict())

    def _as_tensor(self, values, dtype):
        """Stack a sequence of per-transition values into one tensor."""
        # Going through a single ndarray avoids the slow element-by-element
        # conversion of a tuple of arrays.
        return torch.as_tensor(np.asarray(values), dtype=dtype, device=self.device)

    def do_action(self, state):
        """Return an action index chosen epsilon-greedily for ``state``."""
        if random.random() < self.max_ep:
            # Derive the action count from the output layer instead of the
            # hard-coded upper bound 3.
            return random.randint(0, self.q_network.fc3.out_features - 1)
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        self.q_network.eval()
        with torch.no_grad():
            action_values = self.q_network(state)
        self.q_network.train()
        return int(np.argmax(action_values.cpu().numpy()))

    def step(self, state, action, reward, next_state, done):
        """Store one transition and train when enough samples are stored."""
        self.replay_mem.append((state, action, reward, next_state, done))
        self.step_count += 1
        if len(self.replay_mem) >= self.bSize and self.step_count % self.target_up == 0:
            self.train()

    def train(self):
        """Run one Double-DQN gradient step, then sync target and decay epsilon.

        After the optimiser step the target network is overwritten with the
        online weights and epsilon is multiplied by ``decay_ep`` (bounded
        below by ``minimum_ep``).
        """
        experiences = random.sample(self.replay_mem, self.bSize)
        states, actions, rewards, next_states, dones = zip(*experiences)
        states = self._as_tensor(states, torch.float32)
        actions = self._as_tensor(actions, torch.long)
        rewards = self._as_tensor(rewards, torch.float32)
        next_states = self._as_tensor(next_states, torch.float32)
        dones = self._as_tensor(dones, torch.float32)

        q_values = self.q_network(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        with torch.no_grad():
            # Online network selects the action, target network evaluates it.
            next_actions = self.q_network(next_states).max(1)[1].unsqueeze(1)
            next_q_values = self.target_network(next_states).gather(1, next_actions).squeeze(1)
            targets = rewards + (self.gamma * next_q_values * (1 - dones))

        loss = F.mse_loss(q_values, targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self._sync_target()
        self.max_ep = max(self.minimum_ep, self.max_ep * self.decay_ep)

    def save_model(self, path):
        """Write the online network's ``state_dict`` to ``path``."""
        torch.save(self.q_network.state_dict(), path)
        print(f"Saved model to {path}")


# Evaluation Code

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import os

# 2-component action vectors that the DQN/DDQN action indices are mapped to
# before being sent to the environment.
discrete_actions = [(-1,0),(1,0),(0,1),(0,-1)]

dqn_agents = [DQNAgent(seed=0, load_path='dqn_agent_0_ep6000.pth'),
              DQNAgent(seed=1, load_path='dqn_agent_1_ep6000.pth')]
ddqn_agents = [DDQNAgent(seed=0, load_path='ddqn_agent_0_ep10000.pth'),
               DDQNAgent(seed=1, load_path='ddqn_agent_1_ep10000.pth')]
# Evaluate every value-based agent greedily. Previously only the DQN agents
# had epsilon zeroed, leaving the DDQN agents at epsilon = 1.0 (purely random
# actions). The loop variable is not called `agent`, so the module-level
# MultiAgent bound to that name is not overwritten.
for eval_agent in dqn_agents + ddqn_agents:
    eval_agent.max_ep = 0.0  # greedy actions

maddpg_agent = MultiAgent(num_agents=2, state_dim=NUM_STATES, action_dim=NUM_ACTIONS,seed=10)
# map_location lets actor checkpoints saved on a GPU load on a CPU-only host.
maddpg_agent.agents[0].actor_local.load_state_dict(torch.load('updated_actor_0.pth', map_location=device))
maddpg_agent.agents[1].actor_local.load_state_dict(torch.load('updated_actor_1.pth', map_location=device))
maddpg_agent.agents[0].actor_local.eval()
maddpg_agent.agents[1].actor_local.eval()

def test_dqn_agents(agents, episodes=100):
    """Evaluate DQN agents; return one list of episode rewards per agent.

    Args:
        agents: Sequence of agents exposing ``do_action(state) -> index``.
        episodes: Number of evaluation episodes to play.
    """
    return _evaluate_discrete_agents(agents, episodes, "DQN")


def test_ddqn_agents(agents, episodes=100):
    """Evaluate DDQN agents; return one list of episode rewards per agent.

    Args:
        agents: Sequence of agents exposing ``do_action(state) -> index``.
        episodes: Number of evaluation episodes to play.
    """
    return _evaluate_discrete_agents(agents, episodes, "DDQN")


def _evaluate_discrete_agents(agents, episodes, label, max_steps=1000):
    """Shared evaluation loop for agents that act through discrete indices.

    Each episode resets the module-level ``env`` with ``train_mode=False``
    and plays until any agent reports done or ``max_steps`` steps elapse.
    Action indices are mapped through the module-level ``discrete_actions``.

    Args:
        agents: Sequence of agents exposing ``do_action(state) -> index``.
        episodes: Number of episodes to play.
        label: Prefix used in the per-episode progress line.
        max_steps: Upper bound on steps per episode.

    Returns:
        ``per_agent_rewards[i][e]`` is agent ``i``'s reward sum in episode ``e``.
    """
    per_agent_rewards = [[] for _ in agents]
    for ep in range(episodes):
        env_info = env.reset(train_mode=False)[brain_name]
        states = env_info.vector_observations
        episode_rewards = np.zeros(len(agents))
        for _ in range(max_steps):
            actions_idx = [member.do_action(states[i]) for i, member in enumerate(agents)]
            actions = np.array([discrete_actions[idx] for idx in actions_idx])
            env_info = env.step(actions)[brain_name]
            states = env_info.vector_observations
            episode_rewards += env_info.rewards
            if np.any(env_info.local_done):
                break
        for history, total in zip(per_agent_rewards, episode_rewards):
            history.append(total)
        # Built per agent so the line also works for a team size other than
        # two; for two agents the text is identical to the previous format.
        summary = ", ".join(f"Agent {i}: {total:.3f}" for i, total in enumerate(episode_rewards))
        print(f"{label} Episode {ep + 1}/{episodes} | {summary}")
    return per_agent_rewards

def test_maddpg_agent(agent, episodes=100):
    """Evaluate a MultiAgent; return one list of episode rewards per agent.

    Each episode resets the module-level ``env`` with ``train_mode=False``
    and plays until any agent reports done.

    Args:
        agent: Object exposing ``num_agents`` and ``agents``, where every
            entry offers ``act(state, noise=...)``.
        episodes: Number of evaluation episodes to play.
    """
    per_agent_rewards = [[] for _ in range(agent.num_agents)]
    for ep in range(episodes):
        env_info = env.reset(train_mode=False)[brain_name]
        states = env_info.vector_observations
        episode_rewards = np.zeros(agent.num_agents)
        while True:
            # Query each actor directly with noise disabled: going through
            # agent.act would add OU exploration noise to every evaluation
            # action.
            actions = [
                agent.agents[i].act(np.array([states[i]]), noise=False)
                for i in range(agent.num_agents)
            ]
            env_info = env.step(actions)[brain_name]
            episode_rewards += env_info.rewards
            states = env_info.vector_observations
            if np.any(env_info.local_done):
                break
        for history, total in zip(per_agent_rewards, episode_rewards):
            history.append(total)
        # Built per agent so the line also works for a team size other than
        # two; for two agents the text is identical to the previous format.
        summary = ", ".join(f"Agent {i}: {total:.3f}" for i, total in enumerate(episode_rewards))
        print(f"MADDPG Episode {ep + 1}/{episodes} | {summary}")
    return per_agent_rewards

# Play 5 evaluation episodes per algorithm. Each result is a list with one
# entry per agent, holding that agent's reward total for every episode.
dqn_per_agent_rewards = test_dqn_agents(dqn_agents, episodes=5)
ddqn_rewards = test_ddqn_agents(ddqn_agents, episodes=5)
maddpg_per_agent_rewards = test_maddpg_agent(maddpg_agent, episodes=5)

# Plotting results: one figure per algorithm, one line per agent.
for algo_label, per_agent_series in (
    ('DQN', dqn_per_agent_rewards),
    ('DDQN', ddqn_rewards),
    ('MADDPG', maddpg_per_agent_rewards),
):
    plt.figure(figsize=(10,6))
    for i, rewards in enumerate(per_agent_series):
        # The x-axis counts episodes starting from 1.
        episode_axis = range(1, len(rewards)+1)
        plt.plot(episode_axis, rewards, label=f'{algo_label} Agent {i}')
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.title(f'{algo_label} Per-Agent Reward per Episode')
    plt.legend()
    plt.show()
