# Setup

In [None]:
!pip install torch numpy pettingzoo gymnasium pettingzoo[classic]
!pip install imageio[ffmpeg]

In [None]:
#Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import math

import logging
import pettingzoo
from pettingzoo.classic import tictactoe_v3, connect_four_v3, texas_holdem_no_limit_v6
import imageio
from tqdm import tqdm
from collections import deque
from copy import deepcopy

import matplotlib.pyplot as plt
import matplotlib.cm as cm
from IPython.display import clear_output

from abc import ABC, abstractmethod

from concurrent.futures import ProcessPoolExecutor, wait
import multiprocessing
import os

%matplotlib inline

In [None]:
# Only let messages of level ERROR or above through for the "pettingzoo" and
# "imageio_ffmpeg" loggers, so lower-level messages do not flood the cell output.
logging.getLogger("pettingzoo").setLevel(logging.ERROR)
logging.getLogger("imageio_ffmpeg").setLevel(logging.ERROR)

In [None]:
# Remove the videos (.mp4) and game logs (.txt) left in ./videos by previous runs.
!rm -rf ./videos/*.mp4
!rm -rf ./videos/*.txt

In [None]:
!mkdir ./videos

In [None]:
# Run selector; the accepted values are 'GMA' and 'EMS'.
# NOTE(review): presumably picks between config_GMA and config_EMS defined below;
# the code consuming this flag is not visible in this part of the notebook.
WHICH_TO_RUN = 'GMA' # GMA or EMS

In [None]:
# Hyper-parameters for the 'GMA' run. The keys match the constructor arguments of
# GeneticMultiTrainer (it is the only trainer here that accepts 'family_n_elites').
config_GMA = {
    'env_type': 'tictactoe_v3', # one of: tictactoe_v3, connect_four_v3, texas_holdem_no_limit_v6
    'n_families': 4,
    'family_size': 25,
    'initial_std_dev': 0.1,
    'min_std_dev': 0.001, # lower bound the decayed std_dev is clamped to
    'std_dev_decay': 0.99, # std_dev is multiplied by this after every generation; set to 1 to disable the decay
    'family_n_elites': 1, # must be smaller than family_size
    'n_generations': 1000,
    'gamma': 0.99, # per-step discount applied to training rewards
    'neg_multiplier': 1.2, # extra factor applied to negative training rewards
    'use_softmax': False, # True: sample actions from the softmax while training; False: always take the argmax
    'family_hof_size': 5, # hall-of-fame slots per family
    'use_action_mask': True,
    'plot_eval_freq': 1,
    'plot_eval_times': 50,
    'plot_eval_window': 20, # size of the moving-average window used in the plots
    'plot_path': './reward_plot_episode',
    'video_folder': "./videos",
    'parallelization_type': 'no', # one of: 'family', 'hof', 'no'
    'network_type': 'ClassicNet' # one of: 'ClassicNet', 'DeepNet'
}

In [None]:
# Hyper-parameters for the 'EMS' run.
# NOTE(review): the trainer consuming 'normalize_gradient' and 'learning_rate' is not
# visible in this part of the notebook; confirm their meaning against that class.
config_EMS = {
    'env_type': 'tictactoe_v3', # one of: tictactoe_v3, connect_four_v3, texas_holdem_no_limit_v6
    'n_families': 4,
    'family_size': 25,
    'initial_std_dev': 0.09,
    'min_std_dev': 0.01, # lower bound the decayed std_dev is clamped to
    'std_dev_decay': 0.995, # std_dev is multiplied by this after every generation; set to 1 to disable the decay
    'n_generations': 1000,
    'gamma': 0.99, # per-step discount applied to training rewards
    'neg_multiplier': 1.2, # extra factor applied to negative training rewards
    'normalize_gradient': False,
    'family_hof_size': 5, # hall-of-fame slots per family
    'learning_rate': 1,
    'use_action_mask': True,
    'plot_eval_freq': 1,
    'plot_eval_times': 50,
    'plot_eval_window': 20, # size of the moving-average window used in the plots
    'use_softmax': False, # True: sample actions from the softmax while training; False: always take the argmax
    'plot_path': './reward_plot_episode',
    'video_folder': "./videos",
    'parallelization_type': 'no', # one of: 'family', 'hof', 'no'
    'network_type': 'ClassicNet' # one of: 'ClassicNet', 'DeepNet'
}

# Utils

In [None]:
# cosine similarity between the vectors of weights of the models
def cosine_similarity(model1, model2):
    """Return the cosine similarity between the parameters of two models.

    Every parameter tensor of a model is flattened and concatenated, in the
    order yielded by ``parameters()``, into a single 1-D vector. The cosine
    similarity of the two resulting vectors is returned as a Python float.
    """

    def _as_vector(model):
        # one long 1-D tensor holding every parameter of the model
        return torch.cat([param.view(-1) for param in model.parameters()])

    similarity = F.cosine_similarity(_as_vector(model1), _as_vector(model2), dim=0)
    return similarity.item()

In [None]:
# pool of workers for multi-process training
class Pool:
    """Small wrapper around ``ProcessPoolExecutor`` that batches submitted tasks.

    Tasks are queued with ``submit_task`` and their results are retrieved, in
    submission order, with ``collect_results``.
    """

    def __init__(self, max_workers):
        """Create the executor with ``max_workers`` worker processes."""
        self.executor = ProcessPoolExecutor(max_workers=max_workers)
        self.futures = []


    def submit_task(self, func, *args, **kwargs):
        """Schedule ``func(*args, **kwargs)`` on the executor and remember its future."""
        future = self.executor.submit(func, *args, **kwargs)
        self.futures.append(future)


    def collect_results(self):
        """Wait for every pending task and return the results in submission order.

        The pending list is cleared *before* waiting, so that a task raising an
        exception (re-raised here by ``future.result()``) cannot leave stale
        futures behind that would be mixed into the next batch.
        """

        # Detach the current batch first: the pool is reusable even if a task failed
        pending, self.futures = self.futures, []

        # Ensure all futures are completed
        wait(pending)

        # Collect results from the completed futures in the order they were submitted
        return [future.result() for future in pending]


    def shutdown(self):
        """Shut the executor down, blocking until running tasks have finished."""
        self.executor.shutdown(wait=True)

# Plain Neural Network class

In [None]:
# a simple neural network adapted to the three explored environments
class ClassicNet(nn.Module):
    """Three-layer fully connected network (inputs -> 512 -> 32 -> n_actions).

    Args:
        input_shape: shape of a single observation. A 1-element shape is a flat
            vector; a 3-element shape contributes the product of its dimensions;
            a 2-element shape is treated as (height, width) with two planes.
            The default list is never mutated.
        n_actions: number of output logits.
        bias: whether the two hidden layers use a bias term (the output layer
            always has one).
    """

    def __init__(self, input_shape = [3,3,2], n_actions = 9, bias = True):
        super(ClassicNet, self).__init__()
        if len(input_shape) == 1: # observation space is a vector
            n_inputs = input_shape[0]
        elif len(input_shape) == 2:
            # only (height, width) given: keep the historical assumption of two planes
            n_inputs = 2 * input_shape[0] * input_shape[1]
        else:
            # use every dimension instead of hard-coding two planes, so that
            # observations with a different number of planes are sized correctly
            n_inputs = 1
            for dim in input_shape:
                n_inputs *= dim

        self.fc1 = nn.Linear(n_inputs, 512, bias=bias)
        self.fc2 = nn.Linear(512, 32, bias=bias)
        self.fc3 = nn.Linear(32, n_actions, bias=True) # the output layer always keeps its bias


    def forward(self, x):
        """Return the ``n_actions`` logits for a single observation.

        The input is flattened completely (batch dimension included), so only
        one observation at a time is supported.
        """
        x = torch.flatten(x)
        x = torch.relu(self.fc1(x))  # ReLU activation
        x = torch.relu(self.fc2(x))  # ReLU activation
        x = self.fc3(x)
        return x

In [None]:
# a simple deeper network
class DeepNet(nn.Module):
    """Six-layer fully connected network (inputs -> 64 -> 32 -> 32 -> 16 -> 16 -> n_actions).

    Args:
        input_shape: shape of a single observation. A 1-element shape is a flat
            vector; a 3-element shape contributes the product of its dimensions;
            a 2-element shape is treated as (height, width) with two planes.
            The default list is never mutated.
        n_actions: number of output logits.
        bias: whether the hidden layers use a bias term (the output layer
            always has one).
    """

    def __init__(self, input_shape = [3,3,2], n_actions = 9, bias = True):
        super(DeepNet, self).__init__()

        if len(input_shape) == 1: # observation space is a vector
            n_inputs = input_shape[0]
        elif len(input_shape) == 2:
            # only (height, width) given: keep the historical assumption of two planes
            n_inputs = 2 * input_shape[0] * input_shape[1]
        else:
            # use every dimension instead of hard-coding two planes, so that
            # observations with a different number of planes are sized correctly
            n_inputs = 1
            for dim in input_shape:
                n_inputs *= dim

        self.fc1 = nn.Linear(n_inputs, 64, bias = bias)
        self.fc2 = nn.Linear(64, 32, bias = bias)
        self.fc3 = nn.Linear(32, 32, bias = bias)
        self.fc4 = nn.Linear(32, 16, bias = bias)
        self.fc5 = nn.Linear(16, 16, bias = bias)
        self.fc6 = nn.Linear(16, n_actions, bias = True) # the output layer always keeps its bias


    def forward(self, x):
        """Return the ``n_actions`` logits for a single observation.

        The input is flattened completely (batch dimension included), so only
        one observation at a time is supported.
        """
        x = torch.flatten(x)
        for hidden in (self.fc1, self.fc2, self.fc3, self.fc4, self.fc5):
            x = torch.relu(hidden(x))
        return self.fc6(x)

# Agents

In [None]:
# agent using a neural network and working on classic environments of pettingzoo
class NeuroAgentClassic(nn.Module):
    """Agent that picks actions with a neural network whose weights are evolved.

    Gradients are disabled on the wrapped network: the weights are only changed
    through ``mutate`` / ``set_perturbable_weights``.

    Args:
        input_shape: observation shape forwarded to the network constructor.
        n_actions: number of actions (network outputs).
        use_softmax: if True, actions are sampled from the masked softmax while
            ``mode == 'training'``; otherwise the best valid action is taken.
        mode: one of 'training', 'evaluating', 'deploying'.
        network_type: network class, called as ``network_type(input_shape, n_actions)``.
    """

    def __init__(self, input_shape, n_actions, use_softmax, mode = 'training', network_type = ClassicNet):
        super(NeuroAgentClassic, self).__init__()
        assert mode in ['training', 'evaluating', 'deploying']
        self.input_shape = input_shape
        self.n_actions = n_actions
        # remembered so that save()/load() can rebuild the same architecture
        self.network_type = network_type
        self.model = network_type(input_shape, n_actions)
        self.use_softmax = use_softmax
        self.mode = mode

        # disable gradient for the model
        for param in self.model.parameters():
            param.requires_grad = False


    def save(self, filename):
        """Write the weights and the constructor arguments to ``filename``."""
        checkpoint = {
            'model_state_dict': self.state_dict(),
            'input_shape': self.input_shape,
            'n_actions': self.n_actions,
            'use_softmax': self.use_softmax,
            'mode': self.mode,
            # stored by name so the checkpoint does not pickle the class itself
            'network_type': self.network_type.__name__,
        }
        torch.save(checkpoint, filename)


    @classmethod
    def load(cls, filename):
        """Rebuild an agent from a checkpoint written by ``save``.

        Checkpoints without a 'network_type' entry (written before the
        architecture was recorded) are loaded as ``ClassicNet``.

        Raises:
            ValueError: if the checkpoint names an unknown network class.
        """
        # NOTE(security): weights_only=False unpickles arbitrary objects; only load
        # checkpoints that come from a trusted source.
        checkpoint = torch.load(filename, weights_only=False)

        known_networks = {'ClassicNet': ClassicNet, 'DeepNet': DeepNet}
        network_name = checkpoint.get('network_type', 'ClassicNet')
        if network_name not in known_networks:
            raise ValueError(f"Unknown network type in checkpoint: {network_name!r}")

        instance = cls(
            input_shape=checkpoint['input_shape'],
            n_actions=checkpoint['n_actions'],
            use_softmax=checkpoint['use_softmax'],
            mode=checkpoint['mode'],
            network_type=known_networks[network_name]
        )
        instance.load_state_dict(checkpoint['model_state_dict'])
        return instance


    def get_perturbable_layers(self):
        """Return the Linear and Conv2d layers of the model, in ``modules()`` order."""
        # isinstance must be given the layer classes themselves: type(nn.Conv2d) is
        # the metaclass `type`, which no layer instance is an instance of
        return [m for m in self.model.modules() if isinstance(m, (nn.Linear, nn.Conv2d))]


    def get_perturbable_weights(self):
        """Return all perturbable weights and biases as one flat numpy array.

        For each perturbable layer the weight comes first, followed by the bias
        when the layer has one.
        """
        weights = []
        for layer in self.get_perturbable_layers():
            weights.append(layer.weight.data.cpu().numpy().flatten())
            if hasattr(layer, 'bias') and layer.bias is not None:
                weights.append(layer.bias.data.cpu().numpy().flatten())
        return np.concatenate(weights)


    def set_perturbable_weights(self, flat_weights):
        """Load weights from a flat numpy array laid out as in ``get_perturbable_weights``."""
        idx = 0
        for layer in self.get_perturbable_layers():
            weight_size = layer.weight.numel()
            # keep the layer's dtype and device: a float64 input array would otherwise
            # turn the layer into float64 and break the forward pass on float32 inputs
            layer.weight.data = torch.tensor(
                flat_weights[idx: idx + weight_size].reshape(layer.weight.shape),
                dtype=layer.weight.dtype, device=layer.weight.device)
            idx += weight_size
            if hasattr(layer, 'bias') and layer.bias is not None:
                bias_size = layer.bias.numel()
                layer.bias.data = torch.tensor(
                    flat_weights[idx: idx + bias_size].reshape(layer.bias.shape),
                    dtype=layer.bias.dtype, device=layer.bias.device)
                idx += bias_size


    # mutate the model's weights by adding normally distributed noise
    def mutate(self, std_dev):
        """Add zero-mean Gaussian noise (standard deviation ``std_dev``) to the
        perturbable weights and return the float32 noise vector that was applied."""

        # get weights to mutate
        perturbable_weights = self.get_perturbable_weights()

        # generate the noise
        noise = np.random.normal(loc=0.0, scale=std_dev, size=perturbable_weights.shape).astype(np.float32)

        weights = perturbable_weights + noise

        # apply the noise
        self.set_perturbable_weights(weights)

        return noise


    # choose best action or sample according to model's logits
    def choose_action(self, inputs, action_mask):
        """Pick an action for the observation ``inputs``.

        Entries of ``action_mask`` equal to 0 mark invalid actions: their logits
        are set to -inf before the softmax.

        Returns:
            (chosen_action, logits, masked_logits, masked_probs), where
            ``chosen_action`` is a Python int.
        """

        self.model.eval()
        with torch.no_grad():

            # get action values
            logits = self.model(inputs).squeeze(0)
            masked_logits = logits.clone()
            masked_logits[action_mask == 0] = float('-inf')

            # get probabilities
            masked_probs = torch.nn.functional.softmax(masked_logits, dim=0)

            # choose action
            if self.mode == 'training' and self.use_softmax:
                chosen_action = torch.multinomial(masked_probs, 1).item()
            else:
                # mode = evaluating, mode = deploying and mode = training without softmax
                chosen_action = torch.argmax(masked_probs).item()

            return chosen_action, logits, masked_logits, masked_probs


    # get number of parameters
    def size(self):
        """Print and return the number of parameters of the model.

        The printed size in MB assumes 4 bytes per parameter.
        """
        num_params = sum(p.numel() for p in self.model.parameters())
        print("Number of parameters:", num_params)
        param_size_mb = num_params * 4 / (1024 ** 2)
        print(f"Model size: {param_size_mb:.2f} MB")
        return num_params

In [None]:
# simple agent choosing random actions working on classic environments of pettingzoo
# (used as a baseline for evaluations)
class DummyAgent(nn.Module):
    """Baseline agent that picks uniformly at random among the valid actions.

    It exposes the same ``choose_action`` interface and ``mode`` attribute as
    the neural agent, so it can be used as an opponent in evaluations.
    """

    def __init__(self, n_actions):
        super().__init__()
        self.n_actions = n_actions
        self.mode = 'evaluating'


    def choose_action(self, inputs, action_mask):
        """Return (action, logits, masked_logits, masked_probs) for a random valid move.

        ``inputs`` is ignored. Logits are all zero, masked logits are -inf on
        invalid actions, and the probabilities are uniform over valid actions.
        """
        legal_actions = np.where(action_mask == 1)[0]
        illegal = action_mask == 0

        # every action gets the same (zero) score
        logits = torch.zeros(self.n_actions)

        masked_logits = torch.zeros(self.n_actions)
        masked_logits[illegal] = float('-inf')

        # uniform distribution over the valid actions only
        masked_probs = torch.ones(self.n_actions) / len(legal_actions)
        masked_probs[illegal] = 0

        picked = np.random.choice(legal_actions)
        return picked, logits, masked_logits, masked_probs

# MultiTrainer

In [None]:
# generic base class implementing the methods shared by both approaches (it supports
# several 'families' and can evaluate them using multiple parallel processes)
class MultiTrainer(ABC):

    def __init__(self, env_type, n_families, family_size, n_generations, gamma, neg_multiplier,
                 use_softmax, family_hof_size, initial_std_dev,
                 min_std_dev, std_dev_decay, plot_eval_times, plot_eval_freq,
                 plot_eval_window, use_action_mask, plot_path, video_folder, parallelization_type, network_type):
        """Store the hyper-parameters and derive the environment-dependent settings.

        ``network_type`` ('ClassicNet' or 'DeepNet') is resolved to the network
        class; ``env_type`` determines the observation shape, the number of
        actions and the player names; ``parallelization_type`` must be one of
        'family', 'hof' or 'no'.
        """

        # validate the categorical options first
        assert network_type in ['ClassicNet', 'DeepNet']
        assert env_type in ['tictactoe_v3', 'connect_four_v3', 'texas_holdem_no_limit_v6']
        assert parallelization_type in ['family', 'hof', 'no']

        # environment / execution settings
        self.env_type = env_type
        self.parallelization_type = parallelization_type
        self.use_action_mask = use_action_mask
        self.render_mode = 'rgb_array' # rendering mode

        # population layout
        self.n_families = n_families
        self.family_size = family_size
        self.family_hof_size = family_hof_size
        self.n_generations = n_generations

        # reward shaping and action selection
        self.gamma = gamma
        self.neg_multiplier = neg_multiplier
        self.use_softmax = use_softmax

        # mutation noise schedule
        self.initial_std_dev = initial_std_dev
        self.min_std_dev = min_std_dev
        self.std_dev_decay = std_dev_decay

        # choice of the network
        if network_type == 'DeepNet':
            self.network_type = DeepNet
        elif network_type == 'ClassicNet':
            self.network_type = ClassicNet

        # parameters depending on the environment: (input shape, n_actions, player names)
        env_specs = {
            'tictactoe_v3': ([3,3,2], 9, ['player_1', 'player_2']),
            'connect_four_v3': ([6,7,2], 7, ['player_0', 'player_1']),
            'texas_holdem_no_limit_v6': ([54], 5, ['player_0', 'player_1']),
        }
        if self.env_type in env_specs:
            self.input_shape, self.n_actions, self.players = env_specs[self.env_type]

        # plot parameters (the dummy agent is the evaluation baseline)
        self.dummy = DummyAgent(self.n_actions)
        self.plot_eval_window = plot_eval_window
        self.plot_eval_times = plot_eval_times
        self.plot_eval_freq = plot_eval_freq
        self.plot_path = plot_path

        # video parameters
        self.video_folder = video_folder

        # initialize variables
        self.families_eval_rewards = [[] for _ in range(self.n_families)]
        self.families_train_rewards = [[] for _ in range(self.n_families)]
        self.families_mean_eval_rewards = [[] for _ in range(self.n_families)]
        self.step_count = 0
        self.std_dev = self.initial_std_dev
        self.winner = None
        self.family_winners = [None] * self.n_families


    def transform_obs(self, obs):
        if self.env_type == 'tictactoe_v3':
            obs = obs.permute(2, 0, 1).unsqueeze(0)
        elif self.env_type == 'connect_four_v3':
            obs = obs.permute(2, 0, 1).unsqueeze(0)
        elif self.env_type == 'texas_holdem_no_limit_v6':
            obs = obs.unsqueeze(0)
        return obs


    def initialize_env(self):
        """Create the PettingZoo environment selected by ``env_type``, reset it and return it."""
        # pick the environment module, then build it with the configured render mode
        if self.env_type == 'tictactoe_v3':
            env_module = tictactoe_v3
        elif self.env_type == 'connect_four_v3':
            env_module = connect_four_v3
        elif self.env_type == 'texas_holdem_no_limit_v6':
            env_module = texas_holdem_no_limit_v6

        env = env_module.env(render_mode=self.render_mode)
        env.reset()
        return env


    # for parallel training at family level
    def evaluate_family(self, family):

        rewards = np.zeros(self.family_size)
        for j in range(self.family_size):

            for k in range(self.hof_size):
                hof_index = -1-k
                result = self.evaluate_agent(family[j], self.hof[hof_index], True)
                rewards[j] += result

        rewards /= self.hof_size
        return rewards


    # for parallel training at agent level
    def evaluate_against_hof(self, agent):

        reward = 0
        for k in range(self.hof_size):
            hof_index = -1-k
            result = self.evaluate_agent(agent, self.hof[hof_index], True)
            reward += result

        reward /= self.hof_size
        return reward


    def schedule_parallel_training(self, families_population):

      if self.parallelization_type == 'family':
        for i in range(self.n_families):
            WORKERS.submit_task(self.evaluate_family, families_population[i])
        families_rewards = WORKERS.collect_results()

      elif self.parallelization_type == 'hof':
        for i in range(self.n_families):
            for j in range(self.family_size):
                WORKERS.submit_task(self.evaluate_against_hof, families_population[i][j])
        families_rewards = WORKERS.collect_results()
        families_rewards = np.reshape(families_rewards, (self.n_families, self.family_size))

      elif self.parallelization_type == 'no':
        families_rewards = []
        for i in range(self.n_families):
            for j in range(self.family_size):
                reward = self.evaluate_against_hof(families_population[i][j])
                families_rewards.append(reward)
        families_rewards = np.reshape(families_rewards, (self.n_families, self.family_size))

      return families_rewards


    def play_game(self, agent1, agent2, save_video = False):

        env = self.initialize_env()

        if save_video:
            path = self.video_folder + "/epoch_"+str(self.step_count)
            frames = []
            self.start_log(path)

        total_rewards = [0, 0]
        agents = [agent1, agent2]
        steps = 0

        for player in env.agent_iter():  # AEC mode!

            observation, reward, termination, truncation, _ = env.last()
            done = termination or truncation
            steps += 1

            player_id = self.players.index(player)
            total_rewards[player_id] += reward

            if done:
                action = None

            else:

                mask = torch.tensor(observation["action_mask"], dtype=torch.uint8)
                obs = torch.tensor(observation['observation'], dtype=torch.float32)
                obs = self.transform_obs(obs)

                # an agent can do the wrong action in 'training'
                if not self.use_action_mask and agents[player_id].mode == 'training':
                    mask = torch.ones_like(mask)

                action, logits, mlogits, probs = agents[player_id].choose_action(obs, mask)

            env.step(action)

            # save the rendered frame for the video and write log
            if save_video:
                frame = env.render()
                frames.append(frame)
                self.write_log(path, agents[player_id], player, action, logits, mlogits, probs)

        env.close()

        if save_video:
            self.compose_video(path, frames)

        return total_rewards, steps-2


    def custom_reward(self, reward, steps):
        reward = reward * (self.gamma ** steps)
        if reward < 0:
            reward = reward * self.neg_multiplier
        return reward


    # evaluate the agent by playing against the evaluator
    def evaluate_agent(self, agent, evaluator, use_custom_reward, times = 1):

        total_reward = 0
        agent.mode = 'training'
        evaluator.mode = 'evaluating'

        for i in range(times):

            rewards, steps = self.play_game(evaluator, agent)
            reward = self.custom_reward(rewards[1], steps) if use_custom_reward else rewards[1]
            total_reward += reward

            rewards, steps = self.play_game(agent, evaluator)
            reward = self.custom_reward(rewards[0], steps) if use_custom_reward else rewards[0]
            total_reward += reward

        return total_reward / times


    def record_play(self, agent1, agent2):
        agent1.mode = 'deploying'
        agent2.mode = 'deploying'
        self.play_game(agent1, agent2, save_video = True)


    # make the human play against the agent
    def play_against(self, agent, start_first = True):
        """Let a human play one interactive game against ``agent``.

        The board is rendered before every move; the human's move is read from
        standard input and validated against the action mask. The agent's moves
        are logged to './play_against.txt' and the frames shown before each move
        are written to './play_against.mp4'.

        Args:
            agent: the opponent; its ``mode`` is set to 'deploying'.
            start_first: whether the human moves first.

        Returns:
            (total_rewards, steps): the accumulated reward of each player and the
            number of loop iterations minus 2.
        """

        env = self.initialize_env()

        agent.mode = 'deploying'

        path = './play_against'
        frames = []
        total_rewards = [0, 0]
        steps = 0
        user_position = 0 if start_first else 1

        # the environment is closed even if the game is interrupted by an error
        try:

            self.start_log(path)

            for player in env.agent_iter():  # AEC mode!

                observation, reward, termination, truncation, _ = env.last()
                done = termination or truncation
                steps += 1

                player_id = self.players.index(player)
                total_rewards[player_id] += reward

                # visualize current state
                frame = env.render()
                clear_output(wait=True)
                fig, ax = plt.subplots(figsize=(6, 6))
                ax.imshow(frame)
                ax.axis('off')
                plt.show()

                if done:
                    action = None

                else:

                    mask = torch.tensor(observation["action_mask"], dtype=torch.uint8)
                    obs = torch.tensor(observation['observation'], dtype=torch.float32)
                    obs = self.transform_obs(obs)

                    # the state has not changed since the frame rendered above
                    frames.append(frame)

                    # human's turn
                    if user_position == player_id:

                        # choose action
                        valid_actions = [i for i, valid in enumerate(mask) if valid]
                        print("Your Turn! Choose an action between 0 and", self.n_actions - 1)
                        print("Valid actions:", valid_actions)
                        while True:
                            try:
                                print("Enter your action: ")
                                action = int(input())
                                if action in valid_actions:
                                    break
                                else:
                                    print("Invalid action. Please choose a valid action.")
                            except ValueError:
                                print("Invalid input. Please enter an integer.")

                    # agent's turn
                    else:

                        # choose action
                        action, logits, mlogits, probs = agent.choose_action(obs, mask)
                        self.write_log(path, agent, player, action, logits, mlogits, probs)

                env.step(action)

            # visualize final state
            frame = env.render()
            clear_output(wait=True)
            fig, ax = plt.subplots(figsize=(6, 6))
            ax.imshow(frame)
            ax.axis('off')
            plt.show()

        finally:
            env.close()

        print("Game over!")
        # equal totals are a draw, not a loss for the human
        user_total = total_rewards[user_position]
        agent_total = total_rewards[1-user_position]
        if user_total > agent_total:
            print("You won!")
        elif user_total < agent_total:
            print("You lose!")
        else:
            print("It's a draw!")

        self.compose_video(path, frames)

        return total_rewards, steps-2


    # make the agent evaluate each configuration of a game
    def evaluate_with_agent(self, agent):
        """Let a human play both sides while the agent scores every position.

        Before each move the agent's choice and its logits/probabilities are
        written to './evaluate_with_agent.txt'; the move actually played is the
        one typed by the human. The frames shown before each move are written
        to './evaluate_with_agent.mp4'.
        """

        def _show(image):
            # replace the previous cell output with the given frame
            clear_output(wait=True)
            fig, ax = plt.subplots(figsize=(6, 6))
            ax.imshow(image)
            ax.axis('off')
            plt.show()

        env = self.initialize_env()

        agent.mode = 'deploying'

        path = './evaluate_with_agent'
        self.start_log(path)
        frames = []

        for player in env.agent_iter():  # AEC mode!

            observation, _, termination, truncation, _ = env.last()

            # visualize current state
            _show(env.render())

            if termination or truncation:
                # finished agents must step with None
                env.step(None)
                continue

            mask = torch.tensor(observation["action_mask"], dtype=torch.uint8)
            obs = self.transform_obs(torch.tensor(observation['observation'], dtype=torch.float32))

            frames.append(env.render())

            # agent evaluation (logged only, the chosen action is overridden below)
            action, logits, mlogits, probs = agent.choose_action(obs, mask)
            self.write_log(path, agent, player, action, logits, mlogits, probs)

            # human chooses action
            valid_actions = [i for i, valid in enumerate(mask) if valid]
            print("Your Turn! Choose an action between 0 and", self.n_actions - 1)
            print("Valid actions:", valid_actions)
            while True:
                try:
                    print("Enter your action: ")
                    action = int(input())
                except ValueError:
                    print("Invalid input. Please enter an integer.")
                    continue
                if action in valid_actions:
                    break
                print("Invalid action. Please choose a valid action.")

            env.step(action)

        # visualize final state
        _show(env.render())

        env.close()

        print("Game over!")

        self.compose_video(path, frames)


    @abstractmethod
    def train_step(self):
        pass

    @abstractmethod
    def initialize_train(self):
        pass


    def train(self):
        """Run the whole training loop for ``n_generations`` generations.

        Each generation performs one ``train_step``, evaluates every family
        winner against the dummy agent (raw rewards, ``plot_eval_times``
        rounds), updates the metrics, refreshes the plots when due and decays
        the mutation standard deviation down to ``min_std_dev``.
        """

        # reset the metrics and the schedule, then let the subclass set up its state
        self.families_eval_rewards = [[] for _ in range(self.n_families)]
        self.families_train_rewards = [[] for _ in range(self.n_families)]
        self.families_mean_eval_rewards = [[] for _ in range(self.n_families)]
        self.step_count = 0
        self.std_dev = self.initial_std_dev
        self.initialize_train()

        # training loop
        for _ in tqdm(range(self.n_generations)):

            # train
            self.train_step()

            # evaluation of each family's winner against the random baseline
            rewards = [
                self.evaluate_agent(self.family_winners[family_idx], self.dummy, False, self.plot_eval_times)
                for family_idx in range(self.n_families)
            ]
            self.update_metrics(rewards)

            # plots are refreshed every plot_eval_freq generations, except the very first one
            if self.step_count % self.plot_eval_freq == 0 and self.step_count != 0:
                self.plot_rewards()
                self.plot_collected_rewards()

            self.step_count += 1
            decayed_std_dev = self.std_dev * self.std_dev_decay
            self.std_dev = max(self.min_std_dev, decayed_std_dev)


    def update_metrics(self, rewards):
        for i in range(self.n_families):
            self.families_eval_rewards[i].append(rewards[i])
            n_elems = min(len(self.families_eval_rewards[i]), self.plot_eval_window)
            reward_window = self.families_eval_rewards[i][-n_elems:]
            self.families_mean_eval_rewards[i].append(np.mean(reward_window))


    def plot_rewards(self):
        """Save the reward plots of every family as PNG files next to ``plot_path``.

        Three kinds of images are written: one plot per family
        ('<plot_path>_family_<k>.png'), one grid with a subplot per family
        ('<plot_path>_subplots.png') and one plot comparing the moving averages
        of all families ('<plot_path>_all_families.png').
        """
        # Ensure the colors are defined based on the number of families.
        # plt.get_cmap is used because matplotlib.cm.get_cmap was removed in Matplotlib 3.9;
        # it accepts the same (name, lut) arguments.
        colors = cm.tab10.colors if self.n_families <= 10 else plt.get_cmap('tab20', self.n_families).colors

        family_curves = list(zip(self.families_eval_rewards, self.families_train_rewards, self.families_mean_eval_rewards))

        # Create individual plots for each family
        for i, (eval_rewards, train_rewards, mean_eval_rewards) in enumerate(family_curves):
            plt.figure()
            plt.clf()
            plt.title(f'Family {i+1} Rewards - {self.step_count} steps')
            plt.xlabel('Episode')
            plt.ylabel('Reward')

            color = colors[i % len(colors)]
            plt.plot(eval_rewards, label=f'Episode Reward (current)', color=color, linestyle='-')
            plt.plot(train_rewards, label=f" Train HOF Episodes Reward (current)", color=color, linestyle=':')
            plt.plot(mean_eval_rewards, label=f"{self.plot_eval_window}-Episodes Reward (window)", color=color, linestyle='--')

            plt.legend()
            plt.grid()

            # Save the individual plot for this family
            plt.savefig(f"{self.plot_path}_family_{i+1}.png", bbox_inches="tight")
            plt.close()  # Close the figure to free memory


        # Create a single plot with individual subplots for each family
        num_rows = 2
        num_cols = max(1, math.ceil(self.n_families / num_rows))

        # squeeze=False always yields a 2-D array of axes, whatever the grid size
        fig, axes = plt.subplots(num_rows, num_cols, figsize=(7*num_cols, 10), squeeze=False)
        axes = axes.flatten()

        for i, (eval_rewards, train_rewards, mean_eval_rewards) in enumerate(family_curves):
            ax = axes[i]
            ax.set_title(f'Family {i+1} Rewards - {self.step_count} steps')
            ax.set_xlabel('Episode')
            ax.set_ylabel('Reward')

            color = colors[i % len(colors)]
            ax.plot(eval_rewards, label=f'Episode Reward (current)', color=color, linestyle='-')
            ax.plot(train_rewards, label=f"Train HOF Episodes Reward (current)", color=color, linestyle=':')
            ax.plot(mean_eval_rewards, label=f"{self.plot_eval_window}-Episodes Reward (window)", color=color, linestyle='--')

            ax.legend()
            ax.grid()

        # remove the unused cells of the grid (does not rely on the loop variable above)
        for unused_ax in axes[len(family_curves):]:
            fig.delaxes(unused_ax)

        plt.tight_layout()
        plt.savefig(f"{self.plot_path}_subplots.png", bbox_inches="tight")
        plt.close()

        # Create a combined plot for all average rewards
        plt.figure()
        plt.clf()
        plt.title(f'All Families Average Rewards (window {self.plot_eval_window}) - {self.step_count} steps')
        plt.xlabel('Episode')
        plt.ylabel('Reward')

        for i, mean_eval_rewards in enumerate(self.families_mean_eval_rewards):
            color = colors[i % len(colors)]
            plt.plot(mean_eval_rewards, label=f"Family {i+1}", color=color, linestyle='--')

        plt.legend()
        plt.grid()

        # Save the combined plot
        plt.savefig(f"{self.plot_path}_all_families.png", bbox_inches="tight")
        plt.close()  # Close the figure to free memory


    def plot_collected_rewards(self):
        """Save '<plot_path>_collected.png' with the per-generation average and
        maximum, taken across families, of the evaluation rewards (solid lines)
        and of their moving averages (dashed lines)."""
        plt.figure(1)
        plt.clf()
        plt.title(f'{self.step_count} steps - Average Rewards')
        plt.xlabel('Episode')
        plt.ylabel('Reward')

        # rows are families, columns are generations
        eval_matrix = np.array(self.families_eval_rewards)
        window_matrix = np.array(self.families_mean_eval_rewards)

        # (values, legend label or None, color, line style)
        series = [
            (np.mean(eval_matrix, axis=0), 'Average Episode Reward', 'blue', '-'),
            (np.mean(window_matrix, axis=0), None, 'blue', '--'),
            (np.max(eval_matrix, axis=0), 'Max Episode Reward', 'red', '-'),
            (np.max(window_matrix, axis=0), None, 'red', '--'),
        ]
        for values, label, color, linestyle in series:
            if label is None:
                plt.plot(values, color=color, linestyle=linestyle)
            else:
                plt.plot(values, label=label, color=color, linestyle=linestyle)

        plt.legend()
        plt.grid()

        plt.savefig(self.plot_path + '_collected.png', bbox_inches="tight")
        plt.close()  # Close the figure to free memory


    def start_log(self, path):
        path = path + '.txt'
        with open(path, 'w') as file:
            file.write('Starting game:\n\n')


    def write_log(self, path, agent, player, action, logits, mlogits, probs):

        path = path + '.txt'
        last_layer = list(agent.model.children())[-1]
        has_bias = hasattr(last_layer, 'bias') and last_layer.bias is not None

        with open(path, 'a') as file:
            file.write("-"*91 + "\n")
            file.write(f"{'Agent':<10} {player}\n")
            file.write(f"{'Action':<10} {action}\n")
            file.write("-"*91 + "\n")
            file.write(f"{' ':<10} {' '.join(f'{val:>8}' for val in range(self.n_actions))}\n")
            file.write(f"{'MLogits':<10} {' '.join(f'{val:>8.4f}' for val in mlogits.flatten())}\n")
            file.write(f"{'Logits':<10} {' '.join(f'{val:>8.4f}' for val in logits.flatten())}\n")
            if has_bias:
                file.write(f"{'Bias':<10} {' '.join(f'{val:>8.4f}' for val in last_layer.bias.flatten())}\n")
                differences = logits.flatten() - last_layer.bias.flatten()
                file.write(f"{'Diff':<10} {' '.join(f'{val:>8.4f}' for val in differences)}\n")
            file.write(f"{'Probs':<10} {' '.join(f'{val:>8.4f}' for val in probs.flatten())}\n")
            file.write("-"*91 + "\n\n\n")


    def compose_video(self, path, frames):
        """Write ``frames`` to the video '<path>.mp4'.

        The video runs at 15 fps and every frame is appended 15 times, so each
        frame stays on screen for one second.
        """
        video_path = path + '.mp4'
        repeats_per_frame = 15
        with imageio.get_writer(video_path, fps=15) as writer:
            for frame in frames:
                for _ in range(repeats_per_frame):
                    writer.append_data(np.array(frame))


    def save_winner(self, filename):
        self.winner.save(filename)
        return True


    def save_winners(self, filename):
        for i in range(self.n_families):
            name = filename.replace('.pt', f'{i}.pt')
            self.family_winners[i].save(name)
        return True

# Genetic MultiTrainer

In [None]:
# specific trainer for genetic algorithm approach
class GeneticMultiTrainer(MultiTrainer):
    """Genetic-algorithm trainer working on several independent families.

    Every generation each family's population is made of its current elites
    plus mutated deep copies of them. The population is scored through
    ``schedule_parallel_training``; the best member of each family is copied
    into ``family_winners`` and the hall of fame, and the top
    ``family_n_elites`` members become the elites of the next generation.

    Attributes such as ``std_dev``, ``input_shape``, ``n_actions`` and
    ``families_train_rewards`` are read here but not assigned in this class;
    presumably ``MultiTrainer`` provides them (confirm in the base class).
    """

    def __init__(self, env_type, n_families, family_size, n_generations, gamma, neg_multiplier,
                 use_softmax, family_hof_size, family_n_elites,
                 initial_std_dev, min_std_dev, std_dev_decay, plot_eval_times,
                 plot_eval_freq, plot_eval_window, use_action_mask, plot_path, video_folder, parallelization_type, network_type):
        """Forward the shared settings to ``MultiTrainer`` and store the GA-specific ones.

        Args:
            family_n_elites: Number of elites kept per family; must satisfy
                ``0 < family_n_elites < family_size``.
            (all other arguments are passed unchanged to ``MultiTrainer.__init__``)

        Raises:
            AssertionError: If ``family_n_elites`` is outside the valid range.
        """
        super().__init__(env_type, n_families, family_size, n_generations, gamma, neg_multiplier,
                         use_softmax, family_hof_size, initial_std_dev,
                         min_std_dev, std_dev_decay, plot_eval_times, plot_eval_freq,
                         plot_eval_window, use_action_mask, plot_path, video_folder, parallelization_type, network_type)

        # At least one elite is required (it is the parent of every mutation,
        # selected with `j % family_n_elites`), and at least one slot must be
        # left in the family for a mutated child.
        assert 0 < family_n_elites < family_size, \
            "family_n_elites must be at least 1 and smaller than family_size"

        self.family_n_elites = family_n_elites
        self.hof_size = self.family_hof_size * self.n_families

    def initialize_train(self):
        """Create random elites, fill the hall of fame with copies of them and pick initial winners."""

        # randomly initialized elites (they are initialized with training but each time they are used we set the mode)
        self.elites = [
            [
                NeuroAgentClassic(self.input_shape, self.n_actions, self.use_softmax, network_type = self.network_type)
                for _ in range(self.family_n_elites)
            ]
            for _ in range(self.n_families)
        ]

        # hall of fame initially filled with copies of the elites, one per
        # family per round, cycling through each family's elites
        self.hof = deque([], maxlen = self.hof_size)
        round_id = 0
        while len(self.hof) < self.hof_size:
            for i in range(self.n_families):
                self.hof.append(deepcopy(self.elites[i][round_id % self.family_n_elites]))
            round_id += 1

        # winner is the last from the hall of fame
        self.winner = self.hof[-1]

        # The last `n_families` entries of the hall of fame were appended in
        # family order, so family i sits at offset i inside that tail.
        self.family_winners = [self.hof[i - self.n_families] for i in range(self.n_families)]

    def train_step(self):
        """Run one generation: build populations, score them, update winners, hall of fame and elites."""

        # the first members of the population are the elites (the slice makes
        # a new list, so appending children does not touch self.elites)
        families_population = [
            self.elites[i][:self.family_n_elites] for i in range(self.n_families)
        ]

        # the others are mutations of the elites, parents taken round-robin
        n_children = self.family_size - self.family_n_elites
        for i in range(self.n_families):
            for j in range(n_children):
                child = deepcopy(self.elites[i][j % self.family_n_elites])
                child.mutate(self.std_dev)
                families_population[i].append(child)

        # compute score for each member of the population
        families_rewards = self.schedule_parallel_training(families_population=families_population)

        # update winner and hall of fame
        best_rewards = np.zeros(self.n_families)
        for i in range(self.n_families):
            best_id = np.argmax(families_rewards[i])
            best_rewards[i] = np.max(families_rewards[i])
            # deep copy so the stored winner is independent of the population object
            self.family_winners[i] = deepcopy(families_population[i][best_id])
            self.families_train_rewards[i].append(best_rewards[i])
            # the deque has a maxlen, so the oldest entry is dropped when full
            self.hof.append(self.family_winners[i])

        best_family = np.argmax(best_rewards)
        self.winner = self.family_winners[best_family]
        self.record_play(self.winner, self.winner)

        # update the elites: argsort is ascending, so the tail holds the
        # highest-reward members (best one last)
        for i in range(self.n_families):
            new_elite_ids = np.argsort(families_rewards[i])[-self.family_n_elites:]
            for j, member_id in enumerate(new_elite_ids):
                self.elites[i][j] = families_population[i][member_id]

In [None]:
# Run the genetic-algorithm experiment only when it is the selected mode.
if WHICH_TO_RUN == 'GMA':
    trainer = GeneticMultiTrainer(**config_GMA)
    # A worker pool is created only when parallelization is enabled.
    # NOTE(review): `Pool` is not among the imports in the notebook's import
    # cell (only ProcessPoolExecutor is imported there) - confirm it is defined
    # in an earlier cell. WORKERS is presumably read as a global by the trainer;
    # nothing in this cell shuts the pool down.
    if config_GMA['parallelization_type'] != 'no':
        WORKERS = Pool(max_workers = os.cpu_count())
    trainer.train()
    # one file per family (./winner0.pt, ./winner1.pt, ...) plus the overall best
    trainer.save_winners("./winner.pt")
    trainer.save_winner("./best.pt")

# Evolutionary MultiTrainer

In [None]:
# specific trainer for evolutionary strategy approach
class EvolutionMultiTrainer(MultiTrainer):
    """Evolution-strategy trainer working on several independent families.

    Each family keeps a single current agent. Every generation that agent is
    copied and mutated ``family_size`` times, the mutants are scored through
    ``schedule_parallel_training``, and the agent's perturbable weights are
    moved along the reward-weighted sum of the mutation noises.

    Attributes such as ``std_dev``, ``input_shape``, ``n_actions`` and
    ``families_train_rewards`` are read here but not assigned in this class;
    presumably ``MultiTrainer`` provides them (confirm in the base class).
    """

    def __init__(self, env_type, n_families, family_size, n_generations, gamma, neg_multiplier, normalize_gradient,
                 use_softmax, family_hof_size, initial_std_dev, min_std_dev,
                 std_dev_decay, learning_rate, plot_eval_times, plot_eval_freq,
                 plot_eval_window, use_action_mask, plot_path, video_folder, parallelization_type, network_type):
        """Forward the shared settings to ``MultiTrainer`` and store the ES-specific ones.

        Args:
            normalize_gradient: When true, each family's rewards are centred on
                the family mean before the gradient is computed.
            learning_rate: Step size applied to the estimated gradient.
            (all other arguments are passed unchanged to ``MultiTrainer.__init__``)
        """
        super().__init__(env_type, n_families, family_size, n_generations, gamma, neg_multiplier,
                         use_softmax, family_hof_size, initial_std_dev,
                         min_std_dev, std_dev_decay, plot_eval_times, plot_eval_freq,
                         plot_eval_window, use_action_mask, plot_path, video_folder, parallelization_type, network_type)

        self.learning_rate = learning_rate
        self.hof_size = self.n_families * self.family_hof_size
        self.normalize_gradient = normalize_gradient

    def initialize_train(self):
        """Create one random starting agent per family and fill the hall of fame with copies."""

        # randomly initialized starting agent (initialized with training but each time they are used we set the mode)
        self.family_winners = [
            NeuroAgentClassic(self.input_shape, self.n_actions, self.use_softmax, network_type = self.network_type)
            for _ in range(self.n_families)
        ]

        self.winner = self.family_winners[0]

        # hall of fame filled with copies of the starting agents
        self.hof = deque([], maxlen = self.hof_size)
        while len(self.hof) < self.hof_size:
            for i in range(self.n_families):
                self.hof.append(deepcopy(self.family_winners[i]))

    def train_step(self):
        """Run one generation: sample mutants, score them, and update each family's agent."""

        families_population = []
        families_noises = []

        for i in range(self.n_families):

            family_population = []
            family_noises = []

            # populate with mutations of the current agent; the value returned
            # by mutate() is kept as that mutant's noise (presumably the
            # perturbation applied to the perturbable weights - confirm in
            # NeuroAgentClassic)
            for _ in range(self.family_size):
                mutant = deepcopy(self.family_winners[i])
                family_noises.append(mutant.mutate(self.std_dev))
                family_population.append(mutant)

            families_population.append(family_population)
            families_noises.append(family_noises)

        # compute score for each member of the population
        families_rewards = self.schedule_parallel_training(families_population=families_population)

        mean_family_rewards = np.array([np.mean(rewards) for rewards in families_rewards])
        for i in range(self.n_families):
            self.families_train_rewards[i].append(mean_family_rewards[i])

        # Per-family weights used in the gradient. Centring builds new float
        # arrays instead of using an in-place `-=`: the container type returned
        # by schedule_parallel_training is not visible here, and an in-place
        # subtraction fails on plain lists or integer arrays.
        gradient_weights = [families_rewards[i] for i in range(self.n_families)]
        if self.normalize_gradient:
            gradient_weights = [
                np.asarray(gradient_weights[i], dtype=float) - mean_family_rewards[i]
                for i in range(self.n_families)
            ]

        # compute gradients: reward-weighted sum of the noises, scaled by
        # learning_rate / (family_size * std_dev); every member contributes,
        # including those with a negative weight
        gradients = []
        for i in range(self.n_families):
            gradient = np.zeros_like(families_noises[i][0])
            for j in range(self.family_size):
                gradient += families_noises[i][j] * gradient_weights[i][j]
            gradient *= self.learning_rate / (self.family_size * self.std_dev)
            gradients.append(gradient)

        # update weights on a fresh copy, so agents already stored in the hall
        # of fame are not modified
        for i in range(self.n_families):
            self.family_winners[i] = deepcopy(self.family_winners[i])
            new_weights = self.family_winners[i].get_perturbable_weights() + gradients[i]
            self.family_winners[i].set_perturbable_weights(new_weights)
            self.hof.append(self.family_winners[i])

        # estimate the best family from the mean reward of its mutants
        best_family = np.argmax(mean_family_rewards)
        self.winner = self.family_winners[best_family]
        self.record_play(self.winner, self.winner)

In [None]:
# Run the evolution-strategy experiment only when it is the selected mode.
if WHICH_TO_RUN == 'EMS':
    trainer = EvolutionMultiTrainer(**config_EMS)
    # A worker pool is created only when parallelization is enabled.
    # NOTE(review): `Pool` is not among the imports in the notebook's import
    # cell (only ProcessPoolExecutor is imported there) - confirm it is defined
    # in an earlier cell. WORKERS is presumably read as a global by the trainer;
    # nothing in this cell shuts the pool down.
    if config_EMS['parallelization_type'] != 'no':
        WORKERS = Pool(max_workers = os.cpu_count())
    trainer.train()
    # one file per family (./winner0.pt, ./winner1.pt, ...) plus the overall best
    trainer.save_winners("./winner.pt")
    trainer.save_winner("./best.pt")