# Dependencies needed to make this notebook work

In [None]:
%%capture
# Clone ma-gym, install it in editable mode from the cloned directory, then install box2d-py.
!git clone https://github.com/koulanurag/ma-gym.git
%cd /content/ma-gym/
!pip install -e .
!pip3 install box2d-py

In [None]:
import tensorflow as tf
import os
import tensorflow_probability as tfp
import threading
import random
import numpy as np
import gym
import ma_gym
import copy
from tensorflow.keras import backend as K

# Actor Network

In [None]:
class ActorNetwork(tf.keras.Model):
    """Policy network that maps a state to unnormalised action logits.

    Three ReLU dense layers (2048, 1024 and 512 units) feed a linear output
    layer with ``output_dims`` units. Every layer is created with
    ``dtype=tf.float64``.

    Args:
        output_dims: Number of units of the logits layer (one per action).
        id: Identifier appended to the model name; it also determines the
            name of the checkpoint file.
    """

    def __init__(self, output_dims, id):
        super(ActorNetwork, self).__init__()
        self.output_dims = output_dims

        # Weights are checkpointed to "<cwd>/Agent Models/Actor <id>.h5".
        self.model_name = f'Actor {id}'
        self.checkpoint_dir = os.path.join(os.getcwd(), 'Agent Models')
        self.checkpoint_file = os.path.join(self.checkpoint_dir, self.model_name + '.h5')

        # Layer names are kept unique: the third layer used to reuse
        # 'Dense_Layer_1', which collides when the weights are written to the
        # HDF5 checkpoint (one group per layer name).
        self.dense_layer_1 = tf.keras.layers.Dense(units=2048, activation='relu', name='Dense_Layer_1',
                                                   dtype=tf.float64)
        self.dense_layer_2 = tf.keras.layers.Dense(units=1024, activation='relu', name='Dense_Layer_2',
                                                   dtype=tf.float64)
        self.dense_layer_3 = tf.keras.layers.Dense(units=512, activation='relu', name='Dense_Layer_3',
                                                   dtype=tf.float64)
        # No activation: the output is raw logits, the caller applies softmax.
        self.action_probs = tf.keras.layers.Dense(units=self.output_dims, activation=None, name='Action_Logits',
                                                  dtype=tf.float64)

    def call(self, state):
        """Return the action logits for a batch of states."""
        hidden = self.dense_layer_1(state)
        hidden = self.dense_layer_2(hidden)
        hidden = self.dense_layer_3(hidden)
        return self.action_probs(hidden)

# Critic Network

In [None]:
class CriticNetwork(tf.keras.Model):
    """Value network that maps a state to a scalar state-value estimate.

    Three ReLU dense layers (2048, 1024 and 512 units) feed a single linear
    output unit. Every layer is created with ``dtype=tf.float64``.

    Args:
        output_dims: Stored on the instance; the value head always has one
            unit regardless of this argument.
        id: Identifier appended to the model name; it also determines the
            name of the checkpoint file.
    """

    def __init__(self, output_dims, id):
        super(CriticNetwork, self).__init__()
        self.output_dims = output_dims

        # Weights are checkpointed to "<cwd>/Agent Models/Critic <id>.h5".
        self.model_name = f'Critic {id}'
        self.checkpoint_dir = os.path.join(os.getcwd(), 'Agent Models')
        self.checkpoint_file = os.path.join(self.checkpoint_dir, self.model_name + '.h5')

        # Layer names are kept unique: the third layer used to reuse
        # 'Dense_Layer_1', which collides when the weights are written to the
        # HDF5 checkpoint (one group per layer name).
        self.dense_layer_1 = tf.keras.layers.Dense(units=2048, activation='relu', name='Dense_Layer_1',
                                                   dtype=tf.float64)
        self.dense_layer_2 = tf.keras.layers.Dense(units=1024, activation='relu', name='Dense_Layer_2',
                                                   dtype=tf.float64)
        self.dense_layer_3 = tf.keras.layers.Dense(units=512, activation='relu', name='Dense_Layer_3',
                                                   dtype=tf.float64)
        self.state_value = tf.keras.layers.Dense(units=1, activation=None, name='State_Value',
                                                 dtype=tf.float64)

    def call(self, state):
        """Return the estimated value, shape ``(batch, 1)``, for a batch of states."""
        hidden = self.dense_layer_1(state)
        hidden = self.dense_layer_2(hidden)
        hidden = self.dense_layer_3(hidden)
        return self.state_value(hidden)

# Agent

In [None]:
class Agent:
    """Advantage actor-critic agent with separate actor and critic networks.

    The agent collects one rollout in ``states`` / ``actions`` / ``rewards``
    (filled by the caller) and performs a single gradient step on both
    networks when ``learn`` is called. The caller is responsible for
    clearing the rollout buffers afterwards.

    Args:
        output_dims: Number of discrete actions.
        initial_hyper_parameters: Mapping that provides at least
            ``'actor_learning_rate'``, ``'critic_learning_rate'``,
            ``'discount_factor'`` and ``'entropy_coefficient'``.
        id: Identifier used for logging and for the checkpoint file names.
    """

    def __init__(self, output_dims, initial_hyper_parameters, id):
        # Agent's parameters needed for logging
        self.id = id
        self.cum_sum = 0
        self.episode_num = 0

        # Keep a private copy: the same dict is handed to every agent of a
        # coordinator, and per-agent tuning must not leak between them.
        self.hyper_parameters = dict(initial_hyper_parameters)

        # These are the parameters we want to use with population based training
        self.actor_learning_rate = self.hyper_parameters['actor_learning_rate']
        self.critic_learning_rate = self.hyper_parameters['critic_learning_rate']

        self.actor_network = ActorNetwork(output_dims=output_dims, id=self.id)
        self.critic_network = CriticNetwork(output_dims=1, id=self.id)

        self.actor_network.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=self.actor_learning_rate))
        self.critic_network.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=self.critic_learning_rate))

        # Since Actor-Critic is an on-policy method, we will not use a replay buffer
        self.states = []
        self.actions = []
        self.rewards = []
        self.episode_rewards = []
        self.scores = []
        self.actor_losses = []
        self.critic_losses = []

    def save_models(self):
        """Write the actor and critic weights to their checkpoint files."""
        # The checkpoint directory is never created elsewhere, so make sure it
        # exists before writing into it.
        os.makedirs(self.actor_network.checkpoint_dir, exist_ok=True)
        os.makedirs(self.critic_network.checkpoint_dir, exist_ok=True)
        self.actor_network.save_weights(self.actor_network.checkpoint_file)
        self.critic_network.save_weights(self.critic_network.checkpoint_file)

    def load_models(self):
        """Load the actor and critic weights from their checkpoint files."""
        self.actor_network.load_weights(self.actor_network.checkpoint_file)
        self.critic_network.load_weights(self.critic_network.checkpoint_file)

    def choose_action(self, state):
        """Sample an action index for a single state from the current policy.

        Args:
            state: One observation (without batch dimension).

        Returns:
            The sampled action as a Python ``int``.
        """
        action_logits = self.actor_network(tf.convert_to_tensor([state]))
        # Building the distribution from logits avoids a separate softmax and
        # is numerically safer than passing probabilities.
        action_distribution = tfp.distributions.Categorical(logits=action_logits, dtype=tf.float32)
        action = action_distribution.sample()

        return int(action.numpy()[0])

    def learn(self):
        """Run one actor and one critic gradient step on the stored rollout.

        Computes discounted returns from ``self.rewards``, uses
        ``return - V(state)`` as the advantage, and appends the resulting
        actor and critic losses to ``actor_losses`` / ``critic_losses``.
        Does nothing when the rollout is empty. The rollout buffers are left
        untouched.
        """
        if not self.states:
            return

        discount = self.hyper_parameters['discount_factor']
        discounted_rewards = []
        running_return = 0
        # Walk the rewards backwards without mutating the buffer itself.
        for reward in reversed(self.rewards):
            running_return = reward + discount * running_return
            discounted_rewards.append(running_return)
        discounted_rewards.reverse()

        states = tf.convert_to_tensor(self.states)

        with tf.GradientTape() as tape1, tf.GradientTape() as tape2:
            action_logits = self.actor_network(states)
            # The critic returns shape (T, 1). Without the squeeze, subtracting
            # it from the (T,) returns broadcasts to a (T, T) matrix and pairs
            # every return with every state value.
            state_values = tf.squeeze(self.critic_network(states), axis=-1)
            returns = tf.convert_to_tensor(discounted_rewards, dtype=state_values.dtype)
            advantage = returns - state_values

            action_distributions = tfp.distributions.Categorical(logits=action_logits, dtype=tf.float32)
            log_probs = action_distributions.log_prob(self.actions)

            # log_softmax keeps the entropy finite when a probability
            # underflows to zero (0 * log(0) would otherwise be NaN).
            log_action_probabilities = tf.nn.log_softmax(action_logits)
            action_probabilities = tf.math.exp(log_action_probabilities)
            entropy = -1 * tf.math.reduce_sum(action_probabilities * log_action_probabilities)

            # The advantage is a constant for the policy-gradient term.
            actor_loss = tf.math.reduce_mean(-1 * log_probs * tf.stop_gradient(advantage)) - self.hyper_parameters[
                'entropy_coefficient'] * entropy
            critic_loss = tf.math.reduce_mean(advantage ** 2)

        actor_grads = tape1.gradient(actor_loss, self.actor_network.trainable_variables)
        critic_grads = tape2.gradient(critic_loss, self.critic_network.trainable_variables)
        self.actor_network.optimizer.apply_gradients(zip(actor_grads, self.actor_network.trainable_variables))
        self.critic_network.optimizer.apply_gradients(zip(critic_grads, self.critic_network.trainable_variables))
        self.actor_losses.append(actor_loss.numpy())
        self.critic_losses.append(critic_loss.numpy())

# Coordinator

In [None]:
class Coordinator:
    """Owns one environment instance and the agents acting in it.

    Args:
        environment_name: Id passed to ``gym.make``; also the prefix of the
            CSV log file name.
        initial_hyper_parameters: Hyper-parameter mapping handed to every
            agent.
        coordinator_id: Identifier written to the log.
        log_file_name: Suffix of the CSV log file name.
    """

    # Environment rewards that are rewritten before they reach the agents.
    # NOTE(review): presumably this re-weights the environment's two item
    # rewards (each combined with a -0.01 step cost) — confirm against the
    # environment's reward definition.
    _REWARD_REMAP = {-1.01: 4.99, 0.99: -5.01}

    def __init__(self, environment_name, initial_hyper_parameters, coordinator_id, log_file_name):
        self.environment_name = environment_name
        self.id = coordinator_id
        self.log_file_name = log_file_name
        self.environment = gym.make(environment_name)
        self.observation = self.environment.reset()

        # One entry per agent; each entry lists that agent's actions.
        action_meanings = self.environment.get_action_meanings()
        self.number_of_agents = len(action_meanings)
        self.output_dims = len(action_meanings[0])
        self.observation_dims = self.environment.observation_space[0].shape[0]

        self.hyper_parameters = initial_hyper_parameters
        self.agents = [Agent(self.output_dims, self.hyper_parameters, id=i) for i in range(self.number_of_agents)]
        self.episode_finished = False
        # Both counters are incremented once per finished episode;
        # ``episode_num`` is additionally overwritten from outside when a
        # coordinator is replaced by a copy of another one.
        self.episode_num = 0
        self.episode_number = 0
        self.mean_scores = 0
        self.episode_rewards = []
        self.total_reward = 0

    def play(self, show_env=False):
        """Play one full episode, log it, and let every agent learn from it.

        Args:
            show_env: Currently unused; kept for interface compatibility.
        """
        self.episode_finished = False

        while not self.episode_finished:
            actions = []
            for agent, observation in zip(self.agents, self.observation):
                agent.states.append(observation)
                chosen_action = agent.choose_action(observation)
                agent.actions.append(chosen_action)
                actions.append(chosen_action)

            next_state, rewards, dones, info = self.environment.step(actions)

            for agent, reward in zip(self.agents, rewards):
                reward = self._REWARD_REMAP.get(reward, reward)
                agent.rewards.append(reward)
                self.total_reward += reward

            self.observation = next_state

            if all(dones):
                self.episode_finished = True
                self.episode_number += 1
                self.observation = self.environment.reset()
                self.episode_rewards.append(self.total_reward)
                self.mean_scores = np.mean(self.episode_rewards)
                # Open the log once per episode; the context manager closes
                # the file even if a write fails.
                with open(f'{self.environment_name}-{self.log_file_name}.csv', 'a') as log_file:
                    for agent in self.agents:
                        log_file.write(f'{self.id},{self.episode_number},{agent.id},{np.sum(agent.rewards)},{self.total_reward},{agent.hyper_parameters["actor_learning_rate"]},{agent.hyper_parameters["critic_learning_rate"]},{agent.hyper_parameters["entropy_coefficient"]}\n')
                self.total_reward = 0
                self.episode_num += 1

        # On-policy update: learn from the finished episode, then drop it.
        for agent in self.agents:
            agent.learn()
            agent.states.clear()
            agent.rewards.clear()
            agent.actions.clear()

# PBT Functions

In [None]:
def exploit(population):
    """Replace the worst coordinators with perturbed copies of the best ones.

    Coordinators are ranked by the mean of their ``episode_rewards``. Up to
    three of the lowest-ranked ones are each replaced, in place in
    ``population``, by a deep copy of a randomly chosen top-ranked
    coordinator. The copy keeps the ``id`` and ``episode_num`` of the
    coordinator it replaces and is then passed to ``explore``. Finally the
    ``episode_rewards`` of every coordinator are cleared so the next ranking
    starts fresh.

    Args:
        population: List of coordinators; modified in place.
    """

    def mean_score(coordinator):
        # np.mean([]) is NaN, which makes the sort order meaningless; rank
        # coordinators without any finished episode last instead.
        rewards = coordinator.episode_rewards
        return np.mean(rewards) if len(rewards) else float('-inf')

    ranked = sorted(population, key=mean_score, reverse=True)
    # Never let the "best" and "worst" groups overlap in small populations.
    swap_count = min(3, len(ranked) // 2)
    best_coordinators = ranked[:swap_count]
    # Explicit start index: ranked[-0:] would select the whole list.
    worst_coordinators = ranked[len(ranked) - swap_count:]

    for coordinator in worst_coordinators:
        new_coordinator = copy.deepcopy(random.choice(best_coordinators))
        print(f'Agent -> {new_coordinator.id} will replace {coordinator.id}')
        new_coordinator.id = coordinator.id
        new_coordinator.episode_num = coordinator.episode_num
        population[population.index(coordinator)] = new_coordinator
        explore(new_coordinator)

    for coordinator in population:
        coordinator.episode_rewards.clear()


def explore(coordinator):
    """Randomly perturb the learning rates of every agent of ``coordinator``.

    Each agent's actor and critic learning rate is multiplied by an
    independent factor drawn uniformly from [0.8, 1.2] and rounded to six
    decimals. The new values are assigned to the optimizers and recorded in
    the agent's ``hyper_parameters``. All other hyper-parameters are left
    unchanged.

    Args:
        coordinator: Object whose ``agents`` are perturbed in place.
    """
    for agent in coordinator.agents:
        # Work on a private copy: agents of one coordinator may share a
        # single dict, in which case later agents would perturb an already
        # perturbed value and the recorded rates would stop matching the
        # optimizers of earlier agents.
        hyper_parameters = dict(agent.hyper_parameters)

        new_actor_learning_rate = round(hyper_parameters['actor_learning_rate'] * random.uniform(0.8, 1.2), 6)
        new_critic_learning_rate = round(hyper_parameters['critic_learning_rate'] * random.uniform(0.8, 1.2), 6)

        agent.actor_network.optimizer.learning_rate.assign(new_actor_learning_rate)
        agent.critic_network.optimizer.learning_rate.assign(new_critic_learning_rate)

        hyper_parameters['actor_learning_rate'] = new_actor_learning_rate
        hyper_parameters['critic_learning_rate'] = new_critic_learning_rate
        agent.hyper_parameters = hyper_parameters


# Main

In [None]:
# Population based training driver: 16 coordinators play one episode per
# sweep; every 100 sweeps the worst ones are replaced via exploit().
environment_name = 'Checkers-v0'
log_file_name = 'PBT 2'


def _mean_episode_reward(coordinator):
    """Return the coordinator's mean episode reward, or -inf if it has none."""
    rewards = coordinator.episode_rewards
    return np.mean(rewards) if len(rewards) else float('-inf')


population = []
for i in range(16):
    population.append(Coordinator(
        environment_name,
        {
            # Six decimals (the precision explore() uses): rounding to four
            # could turn a small draw into a learning rate of exactly 0.
            'actor_learning_rate': round(random.uniform(0.00001, 0.01), 6),
            'critic_learning_rate': round(random.uniform(0.00001, 0.01), 6),
            'entropy_coefficient': 0.001,
            'critic_coefficient': 0.3,
            'discount_factor': 0.99,
            'unroll_length': 5,
            'minions_num': 5,
        },
        coordinator_id=i,
        log_file_name=log_file_name,
    ))

# Header row of the CSV log the coordinators append to.
with open(f'{environment_name}-{log_file_name}.csv', 'a') as f:
    f.write('Coordinator ID,Episode Number,Agent ID,Agent Reward,Episode Reward, Actor Learning Rate, Critic Learning Rate, Entropy\n')

for j in range(1, 2000):
    # Iterate over a snapshot: a failing coordinator is swapped out of
    # `population` inside the loop, and mutating a list while iterating it
    # skips elements.
    for coordinator in list(population):
        try:
            coordinator.play()
        except Exception as error:
            # Best effort: keep training going, but do not hide the failure.
            print(f'Coordinator {coordinator.id} failed with {error!r}; replacing it')
            survivors = [other for other in population if other is not coordinator]
            if not survivors:
                raise
            top_survivors = sorted(survivors, key=_mean_episode_reward, reverse=True)[:3]
            new_coordinator = copy.deepcopy(random.choice(top_survivors))
            new_coordinator.id = coordinator.id
            new_coordinator.episode_num = coordinator.episode_num
            population[population.index(coordinator)] = new_coordinator

    if j % 100 == 0:
        for coordinator in population:
            print(f'{coordinator.id} --> {coordinator.episode_number} --> {np.mean(coordinator.episode_rewards)}')
        exploit(population)

In [None]:
# Colab only: download the CSV training log (same file name the coordinators
# append to) from the ma-gym working directory to the local machine.
from google.colab import files
files.download(f'/content/ma-gym/{environment_name}-{log_file_name}.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>