# Setup: dependencies needed to run this notebook

## Setting up Gym Particles

In [None]:
%%capture
# Colab setup cell: installs the PettingZoo MPE extras (the notebook imports
# `pettingzoo.mpe` below) and box2d-py. `%%capture` silences pip's output.
# NOTE(review): versions are unpinned. This notebook imports `simple_adversary_v2`
# and reads `env.dones`; confirm the installed PettingZoo release still provides
# both, or pin a known-good version here.
!pip install pettingzoo[mpe]
!pip3 install box2d-py

## Needed Libraries

In [None]:
import tensorflow as tf
import os
import tensorflow_probability as tfp
import threading
import random
import numpy as np
import gym
import copy
from tensorflow.keras import backend as K
from pettingzoo.mpe import simple_adversary_v2
from google.colab import files


# Actor, Critic Networks

## Actor Network

In [None]:
class ActorNetwork(tf.keras.Model):
    """Policy network that maps an observation batch to unnormalised action logits.

    Architecture: three ReLU dense layers (2048 -> 1024 -> 512 units) followed by
    a linear layer with ``output_dims`` units. Every layer computes in float64.
    """

    def __init__(self, output_dims, id):
        """Build the layers and derive the checkpoint location.

        Args:
            output_dims: Number of units of the final (logit) layer.
            id: Identifier appended to the model name; it becomes part of the
                checkpoint file name.
        """
        super(ActorNetwork, self).__init__()
        self.output_dims = output_dims
        self.model_name = f'Actor {id}'

        # Where the weights are written/read when the model is saved/loaded.
        self.checkpoint_dir = os.path.join(os.getcwd(), 'Agent Models')
        self.checkpoint_file = os.path.join(self.checkpoint_dir, self.model_name + '.h5')

        # Layer names must be unique inside a model: name-keyed weight files
        # (such as the '.h5' checkpoint above) cannot hold two layers called
        # 'Dense_Layer_1', which is what the third layer used to be named.
        self.dense_layer_1 = tf.keras.layers.Dense(units=2048, activation='relu', name='Dense_Layer_1',
                                                   dtype=tf.float64)
        self.dense_layer_2 = tf.keras.layers.Dense(units=1024, activation='relu', name='Dense_Layer_2',
                                                   dtype=tf.float64)
        self.dense_layer_3 = tf.keras.layers.Dense(units=512, activation='relu', name='Dense_Layer_3',
                                                   dtype=tf.float64)
        # No activation: despite the attribute name this layer emits logits,
        # the softmax is applied by the caller.
        self.action_probs = tf.keras.layers.Dense(units=self.output_dims, activation=None, name='Action_Logits',
                                                  dtype=tf.float64)

    def call(self, state):
        """Return the action logits for ``state`` (one row per observation)."""
        hidden = self.dense_layer_1(state)
        hidden = self.dense_layer_2(hidden)
        hidden = self.dense_layer_3(hidden)
        return self.action_probs(hidden)

## Critic Network

In [None]:
class CriticNetwork(tf.keras.Model):
    """Value network that maps an observation batch to one scalar state value each.

    Architecture: three ReLU dense layers (2048 -> 1024 -> 512 units) followed by
    a single linear unit. Every layer computes in float64.
    """

    def __init__(self, output_dims, id):
        """Build the layers and derive the checkpoint location.

        Args:
            output_dims: Stored on the instance; the value head itself always
                has exactly one unit.
            id: Identifier appended to the model name; it becomes part of the
                checkpoint file name.
        """
        super(CriticNetwork, self).__init__()
        self.output_dims = output_dims
        self.model_name = f'Critic {id}'

        # Where the weights are written/read when the model is saved/loaded.
        self.checkpoint_dir = os.path.join(os.getcwd(), 'Agent Models')
        self.checkpoint_file = os.path.join(self.checkpoint_dir, self.model_name + '.h5')

        # Layer names must be unique inside a model: name-keyed weight files
        # (such as the '.h5' checkpoint above) cannot hold two layers called
        # 'Dense_Layer_1', which is what the third layer used to be named.
        self.dense_layer_1 = tf.keras.layers.Dense(units=2048, activation='relu', name='Dense_Layer_1',
                                                   dtype=tf.float64)
        self.dense_layer_2 = tf.keras.layers.Dense(units=1024, activation='relu', name='Dense_Layer_2',
                                                   dtype=tf.float64)
        self.dense_layer_3 = tf.keras.layers.Dense(units=512, activation='relu', name='Dense_Layer_3',
                                                   dtype=tf.float64)
        self.state_value = tf.keras.layers.Dense(units=1, activation=None, name='State_Value',
                                                 dtype=tf.float64)

    def call(self, state):
        """Return the state-value estimates for ``state``, shape ``(batch, 1)``."""
        hidden = self.dense_layer_1(state)
        hidden = self.dense_layer_2(hidden)
        hidden = self.dense_layer_3(hidden)
        return self.state_value(hidden)

# Agent

In [None]:
class Agent:
    """One advantage actor-critic learner with its own actor and critic networks.

    The agent buffers the states, actions and rewards of the current episode
    (there is no replay buffer) and performs a single gradient step on each
    network when :meth:`learn` is called.
    """

    def __init__(self, output_dims, initial_hyper_parameters, id):
        """
        Args:
            output_dims: Number of discrete actions (size of the actor output).
            initial_hyper_parameters: Mapping that provides at least
                'actor_learning_rate', 'critic_learning_rate',
                'discount_factor' and 'entropy_coefficient'.
            id: Identifier used for logging and for the checkpoint file names.
        """
        # Agent's parameters needed for logging
        self.id = id
        self.cum_sum = 0
        self.episode_num = 0

        # Keep a private copy: the caller may hand the very same dict to several
        # agents, and later per-agent updates must not leak between them.
        self.hyper_parameters = dict(initial_hyper_parameters)

        # These are the parameters tuned by population based training
        self.actor_learning_rate = self.hyper_parameters['actor_learning_rate']
        self.critic_learning_rate = self.hyper_parameters['critic_learning_rate']

        self.actor_network = ActorNetwork(output_dims=output_dims, id=self.id)
        self.critic_network = CriticNetwork(output_dims=1, id=self.id)

        self.actor_network.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=self.actor_learning_rate))
        self.critic_network.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=self.critic_learning_rate))

        # Actor-Critic is on-policy, so only the current episode is buffered
        self.states = []
        self.actions = []
        self.rewards = []
        self.episode_rewards = []
        self.scores = []
        self.actor_losses = []
        self.critic_losses = []

    def save_models(self):
        """Write the actor and critic weights to their checkpoint files."""
        for network in (self.actor_network, self.critic_network):
            # The checkpoint directory is not created anywhere else.
            os.makedirs(network.checkpoint_dir, exist_ok=True)
            network.save_weights(network.checkpoint_file)

    def load_models(self):
        """Restore the actor and critic weights from their checkpoint files."""
        self.actor_network.load_weights(self.actor_network.checkpoint_file)
        self.critic_network.load_weights(self.critic_network.checkpoint_file)

    def choose_action(self, state):
        """Sample an action index from the current policy for a single ``state``.

        Returns:
            The sampled action as a plain ``int``.
        """
        action_logits = self.actor_network(tf.convert_to_tensor([state]))
        # Feeding logits straight to the distribution is the same policy as
        # softmax -> probs, without the intermediate rounding.
        action_distribution = tfp.distributions.Categorical(logits=action_logits, dtype=tf.float32)
        action = action_distribution.sample()

        return int(action.numpy()[0])

    @staticmethod
    def _discounted_returns(rewards, discount_factor):
        """Return the discounted reward-to-go for every step, without touching ``rewards``."""
        running_return = 0.0
        returns = []
        for reward in reversed(rewards):
            running_return = reward + discount_factor * running_return
            returns.append(running_return)
        returns.reverse()
        return returns

    def learn(self):
        """Run one actor update and one critic update on the buffered episode.

        Does nothing when no transition has been buffered. The buffers are left
        untouched; clearing them is the caller's job. The resulting losses are
        appended to ``actor_losses`` / ``critic_losses``.
        """
        if not self.states:
            return

        returns = tf.convert_to_tensor(
            self._discounted_returns(self.rewards, self.hyper_parameters['discount_factor']),
            dtype=tf.float64)
        states = tf.convert_to_tensor(self.states)

        with tf.GradientTape() as actor_tape, tf.GradientTape() as critic_tape:
            action_logits = self.actor_network(states)
            # The critic returns shape (T, 1). Without the squeeze, subtracting it
            # from the length-T returns broadcasts to a (T, T) matrix and every
            # return gets compared with every state's value.
            state_values = tf.squeeze(self.critic_network(states), axis=-1)
            advantage = returns - state_values

            # log_softmax stays finite where log(softmax(x)) would produce
            # log(0) = -inf (and 0 * -inf = NaN in the entropy term).
            log_action_probabilities = tf.nn.log_softmax(action_logits)
            action_probabilities = tf.exp(log_action_probabilities)
            action_distributions = tfp.distributions.Categorical(logits=action_logits, dtype=tf.float32)
            log_probs = action_distributions.log_prob(self.actions)

            # Entropy summed over all steps and actions (same scale as before).
            entropy = -1 * tf.math.reduce_sum(action_probabilities * log_action_probabilities)
            # The advantage is a constant weight for the policy gradient.
            actor_loss = tf.math.reduce_mean(-1 * log_probs * tf.stop_gradient(advantage)) - self.hyper_parameters[
                'entropy_coefficient'] * entropy
            critic_loss = tf.math.reduce_mean(tf.math.square(advantage))

        actor_grads = actor_tape.gradient(actor_loss, self.actor_network.trainable_variables)
        critic_grads = critic_tape.gradient(critic_loss, self.critic_network.trainable_variables)
        self.actor_network.optimizer.apply_gradients(zip(actor_grads, self.actor_network.trainable_variables))
        self.critic_network.optimizer.apply_gradients(zip(critic_grads, self.critic_network.trainable_variables))
        self.actor_losses.append(actor_loss.numpy())
        self.critic_losses.append(critic_loss.numpy())

# Coordinator

In [None]:
class Coordinator:
    """Owns one environment plus one learner per environment agent.

    ``play`` runs a single episode, appends one CSV row per learner to
    ``'<environment_name>-<log_file_name>.csv'`` and lets every learner train on
    the episode it just experienced.
    """

    def __init__(self, environment_name, initial_hyper_parameters, coordinator_id, log_file_name):
        """
        Args:
            environment_name: Label used in the log file name only; the
                environment itself is always ``simple_adversary_v2``.
            initial_hyper_parameters: Hyper-parameter mapping handed to every Agent.
            coordinator_id: Identifier written to the log.
            log_file_name: Second half of the log file name.
        """
        self.environment_name = environment_name
        self.id = coordinator_id
        self.log_file_name = log_file_name
        self.environment = simple_adversary_v2.env(N=2, max_cycles=100)
        self.observation = self.environment.reset()
        self.number_of_agents = len(self.environment.agents)

        self.hyper_parameters = initial_hyper_parameters
        # One learner per environment agent; the learner id is the agent's name.
        self.agents = [Agent(self.environment.action_spaces[agent_name].n, self.hyper_parameters, id=agent_name)
                       for agent_name in self.environment.action_spaces.keys()]

        self.episode_finished = False
        self.episode_num = 0
        self.mean_scores = 0
        self.episode_number = 0
        self.episode_rewards = []
        self.total_reward = 0
        self.steps = 1

    def play(self, show_env=False):
        """Run one episode, log it, and train every learner on it.

        Args:
            show_env: Render the environment after every step when true.
        """
        self.episode_finished = False
        self.environment.reset()

        # Look learners up by agent name instead of assuming that the turn
        # order always matches the order of `self.agents`.
        learners = {learner.id: learner for learner in self.agents}
        for learner in self.agents:
            # Start from empty buffers even if an earlier episode was aborted.
            learner.states.clear()
            learner.actions.clear()
            learner.rewards.clear()

        # `agent_iter()` keeps yielding until the environment has no live agents
        # left, so one pass over it is the whole episode.
        for agent_name in self.environment.agent_iter():
            learner = learners[agent_name]
            agent_obs, agent_reward, agent_done, agent_info = self.environment.last()

            # Per the PettingZoo AEC API, `last()` reports the reward collected
            # since this agent last acted, i.e. the reward of its PREVIOUS action.
            # Credit it to that transition; on the first turn there is none yet.
            if learner.states:
                learner.rewards.append(agent_reward)

            if agent_done:
                # A finished agent must step with None; no decision is made here,
                # so nothing is recorded for training.
                self.environment.step(None)
            else:
                chosen_action = learner.choose_action(agent_obs)
                learner.states.append(agent_obs)
                learner.actions.append(chosen_action)
                self.environment.step(chosen_action)

            if show_env:
                self.environment.render(mode='human')

        for learner in self.agents:
            # Drop a trailing transition whose reward was never observed so the
            # three buffers always have equal length.
            observed = len(learner.rewards)
            del learner.states[observed:]
            del learner.actions[observed:]

        self.episode_num += 1
        total_episode_reward = 0
        # NOTE(review): only the FIRST learner's reward counts towards the episode
        # reward used for ranking. Kept as is; confirm this is intended.
        for agent in self.agents[:1]:
            total_episode_reward += np.sum(agent.rewards)
        # Was `coordinator.episode_rewards`, which only worked while a global
        # named `coordinator` happened to point at this very instance.
        self.episode_rewards.append(total_episode_reward)

        with open(f'{self.environment_name}-{self.log_file_name}.csv', 'a') as log_file:
            for agent in self.agents:
                log_file.write(
                    f'{self.id},{self.episode_num},{total_episode_reward},{agent.id},{np.sum(agent.rewards)},'
                    f'{agent.hyper_parameters["actor_learning_rate"]},{agent.hyper_parameters["critic_learning_rate"]},'
                    f'{agent.hyper_parameters["entropy_coefficient"]}\n')

        for agent in self.agents:
            if agent.states:
                agent.learn()
            agent.states.clear()
            agent.rewards.clear()
            agent.actions.clear()

# PBT Functions

In [None]:
def exploit(population, num_replaced=3):
    """Replace the worst coordinators with perturbed copies of the best ones.

    Coordinators are ranked by the mean of their ``episode_rewards``. Each of
    the ``num_replaced`` worst is removed from ``population`` and a deep copy of
    a randomly chosen top-``num_replaced`` coordinator is appended in its place.
    The copy keeps the replaced coordinator's ``id`` and ``episode_num`` and is
    passed to ``explore``. Finally every coordinator's ``episode_rewards`` is
    cleared so the next ranking only sees fresh episodes.

    Args:
        population: List of coordinators; modified in place.
        num_replaced: Size of both the "best" pool and the "worst" group. It is
            capped at half the population so the two groups never overlap.
    """

    def mean_reward(member):
        # A member without any finished episode (or with NaN rewards) has no
        # usable score; rank it last instead of feeding NaN to the sort, which
        # would make the ordering undefined.
        rewards = member.episode_rewards
        if len(rewards) == 0:
            return float('-inf')
        score = float(np.mean(rewards))
        return float('-inf') if np.isnan(score) else score

    sorted_population = sorted(population, key=mean_reward, reverse=True)
    group_size = max(0, min(num_replaced, len(sorted_population) // 2))

    if group_size:  # `[-0:]` would select the whole list
        best_coordinators = sorted_population[:group_size]
        worst_coordinators = sorted_population[-group_size:]

        for coordinator in worst_coordinators:
            worst_coordinator_id = coordinator.id
            worst_coordinator_episode = coordinator.episode_num
            new_coordinator = copy.deepcopy(random.choice(best_coordinators))
            print(f'Agent -> {new_coordinator.id} will replace {worst_coordinator_id}')
            # The copy takes over the identity of the member it replaces.
            new_coordinator.id = worst_coordinator_id
            new_coordinator.episode_num = worst_coordinator_episode
            population.remove(coordinator)
            population.append(new_coordinator)
            explore(new_coordinator)

    for coordinator in population:
        coordinator.episode_rewards.clear()


def explore(coordinator):
    """Randomly perturb the learning rates of every agent of ``coordinator``.

    Each agent's actor and critic learning rate is multiplied by an independent
    factor drawn uniformly from [0.8, 1.2]. The new values are pushed into the
    optimizers and recorded on the agent (``hyper_parameters`` and the
    ``actor_learning_rate`` / ``critic_learning_rate`` attributes).

    Args:
        coordinator: Object exposing an ``agents`` iterable.
    """

    def perturbed(rate):
        candidate = rate * random.uniform(0.8, 1.2)
        rounded = round(candidate, 6)
        # Rounding exists for tidy logs; never let it collapse a tiny rate to 0,
        # which would switch learning off for good.
        return rounded if rounded != 0 else candidate

    for agent in coordinator.agents:
        new_actor_learning_rate = perturbed(agent.hyper_parameters['actor_learning_rate'])
        new_critic_learning_rate = perturbed(agent.hyper_parameters['critic_learning_rate'])

        agent.actor_network.optimizer.learning_rate.assign(new_actor_learning_rate)
        agent.critic_network.optimizer.learning_rate.assign(new_critic_learning_rate)

        # Rebind to a fresh dict instead of mutating in place: agents may share
        # one hyper-parameter dict, in which case in-place writes made every
        # agent start from the previous agent's already-perturbed rate and left
        # all of them reporting the last agent's values.
        agent.hyper_parameters = dict(agent.hyper_parameters,
                                      actor_learning_rate=new_actor_learning_rate,
                                      critic_learning_rate=new_critic_learning_rate)
        agent.actor_learning_rate = new_actor_learning_rate
        agent.critic_learning_rate = new_critic_learning_rate

# Main

In [None]:
# Population based training driver: build the population, let every coordinator
# play one episode per round, and run exploit/explore at a fixed interval.
environment_name = 'Simple Adevrsary'
log_file_name = 'PBT-5'

POPULATION_SIZE = 16    # coordinators trained side by side
TOTAL_ROUNDS = 1500     # every coordinator plays one episode per round
EXPLOIT_INTERVAL = 100  # rounds between two exploit/explore steps
ELITE_COUNT = 3         # pool size a crashed coordinator's replacement is drawn from


def _mean_episode_reward(member):
    """Mean episode reward of ``member``; -inf when it has none (avoids NaN keys)."""
    rewards = member.episode_rewards
    return float(np.mean(rewards)) if len(rewards) else float('-inf')


population = []
for i in range(POPULATION_SIZE):
    population.append(Coordinator(environment_name,
                                  # Six decimals: rounding to four could turn any draw
                                  # below 5e-5 into a learning rate of exactly 0.0.
                                  {'actor_learning_rate': round(random.uniform(0.00001, 0.001), 6),
                                   'critic_learning_rate': round(random.uniform(0.00001, 0.001), 6),
                                   'entropy_coefficient': 0.0001,
                                   'critic_coefficient': 0.3,
                                   'discount_factor': 0.95,
                                   'unroll_length': 5,
                                   'minions_num': 5},
                                  coordinator_id=i, log_file_name=log_file_name))

with open(f'{environment_name}-{log_file_name}.csv', 'a') as f:
    f.write(
        f'Coordinator ID,Episode Number,Episode Reward,Agent ID,Agent Reward, Actor Learning Rate, Critic Learning Rate, '
        f'Entropy\n')

for j in range(1, TOTAL_ROUNDS + 1):
    # Walk by index and swap in place: removing/appending while iterating over
    # the list itself silently skipped the member after every failure.
    for slot in range(len(population)):
        coordinator = population[slot]
        try:
            coordinator.play(show_env=False)
        except Exception as error:
            # Best effort: a crashed coordinator is replaced by a copy of a top
            # performer, but the failure is reported instead of being hidden.
            print(f'Coordinator {coordinator.id} failed with {error!r}; replacing it')
            candidates = [member for member in population if member is not coordinator]
            if not candidates:
                raise
            elite = sorted(candidates, key=_mean_episode_reward, reverse=True)[:ELITE_COUNT]
            new_coordinator = copy.deepcopy(random.choice(elite))
            new_coordinator.id = coordinator.id
            new_coordinator.episode_num = coordinator.episode_num
            population[slot] = new_coordinator

    if j % EXPLOIT_INTERVAL == 0:
        for coordinator in population:
            print(f'{coordinator.id} --> {coordinator.episode_num} --> {_mean_episode_reward(coordinator)}')
        exploit(population)

files.download(f'{environment_name}-{log_file_name}.csv')

2 --> 98 --> -182.92588807425722
4 --> 99 --> -141.74025349893844
5 --> 100 --> -121.07847523389326
8 --> 98 --> -125.64035183082399
14 --> 97 --> -126.34846558292618
15 --> 100 --> -246.05297029953147
7 --> 100 --> -132.98553960969187
12 --> 100 --> -128.4342438942282
1 --> 100 --> -177.99126402328747
13 --> 99 --> -111.69903586673288
11 --> 97 --> -114.88035969430422
10 --> 99 --> -111.30940036858692
6 --> 99 --> -113.57753837084731
3 --> 100 --> -119.10459578604016
9 --> 100 --> -113.47350985590235
0 --> 100 --> -111.30940036858692
Agent -> 10 will replace 1
Agent -> 10 will replace 2
Agent -> 10 will replace 15
14 --> 196 --> -145.42891437622495
7 --> 200 --> -76.15401225376102
10 --> 197 --> -100.80445619195041
6 --> 199 --> -99.69169473720277
15 --> 195 --> -87.09201408161682
3 --> 199 --> -140.8816089336075
2 --> 197 --> -99.82345809960222
4 --> 198 --> -93.65217741905327
11 --> 195 --> -84.97706990667635
13 --> 199 --> -79.09314642830387
9 --> 197 --> -77.52250222001575
1 --> 1

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


14 --> 685 --> -64.0121013388919
5 --> 690 --> -70.21724541868552
8 --> 691 --> -59.19948180167986
9 --> 685 --> -61.091401292854805
7 --> 696 --> -59.04332508844286
0 --> 689 --> -71.61506819603667
10 --> 685 --> -61.66937221023762
13 --> 690 --> -58.12622760272357
2 --> 687 --> -63.99825025065179
11 --> 687 --> -59.371066833868234
15 --> 680 --> -60.46968906735388
4 --> 692 --> -57.772226937033764
3 --> 687 --> -59.25835808702907
12 --> 688 --> -58.550021326884654
1 --> 693 --> -58.47620738358729
6 --> 690 --> -58.35727226713245
Agent -> 6 will replace 14
Agent -> 13 will replace 5
Agent -> 13 will replace 0
8 --> 791 --> -70.73529348172531
9 --> 785 --> -69.65668477566642
7 --> 796 --> -67.45903244897605
4 --> 790 --> -63.736323015069814
12 --> 787 --> -77.62547384410715
5 --> 716 --> -56.37647253878333
0 --> 744 --> -56.37647253878333
3 --> 734 --> -56.37647253878333
6 --> 715 --> -56.37647253878333
15 --> 757 --> -56.37647253878333
10 --> 711 --> -56.37647253878333
2 --> 744 --> -