In [1]:
import numpy as np
import torch
import gymnasium as gym
from gymnasium import spaces

from agilerl.modules.cnn import EvolvableCNN
from agilerl.modules.multi_input import EvolvableMultiInput
from agilerl.modules.mlp import EvolvableMLP
from agilerl.hpo.mutation import Mutations, get_return_type

  from .autonotebook import tqdm as notebook_tqdm


#### Defining an EvolvableComposed

In [None]:
from gymnasium import spaces

# Create a sample Dict observation space with 3 image spaces and 2 vector spaces
# (images: uint8, shape (3, 255, 255) — presumably channels-first; vectors: float32, length 3)
observation_space = spaces.Dict({
    "image1": spaces.Box(low=0, high=255, shape=(3, 255, 255), dtype=np.uint8),
    "image2": spaces.Box(low=0, high=255, shape=(3, 255, 255), dtype=np.uint8),
    "image3": spaces.Box(low=0, high=255, shape=(3, 255, 255), dtype=np.uint8),
    "vector1": spaces.Box(low=-1, high=1, shape=(3,), dtype=np.float32),
    "vector2": spaces.Box(low=-1, high=1, shape=(3,), dtype=np.float32),
})

# Settings handed to the network constructor below (most of them positionally).
latent_dim = 10
num_outputs = 10
num_atoms = 51
channel_size = (8, 16, 32)
kernel_size = (3, 3, 3)
stride_size = (1, 1, 1)
hidden_size = [64, 64]
v_min = -10
v_max = 10
# num_atoms evenly spaced values from v_min to v_max (inclusive)
support = torch.linspace(v_min, v_max, num_atoms)
# NOTE(review): `EvolvableComposed` is not imported anywhere in the visible notebook
# (the import cell brings in `EvolvableMultiInput` instead), so on a fresh kernel this
# line presumably raises NameError — confirm the intended class and its argument order.
net = EvolvableComposed(observation_space, channel_size, kernel_size, stride_size, hidden_size, latent_dim, num_outputs, rainbow=True, num_atoms=num_atoms, support=support)
# Bare expression: shows the network's repr as the cell output
net

In [5]:
get_return_type(net.add_cnn_layer)

dict

#### Probe Environment

In [6]:
from typing import Dict, List, Optional, Tuple, Union

class SimpleMultiObsEnv(gym.Env):
    """
    GridWorld-based multi-observation probe environment on a 4x4 grid world.

    .. code-block:: text

        ____________
       | 0  1  2   3|
       | 4|¯5¯¯6¯| 7|
       | 8|_9_10_|11|
       |12 13  14 15|
       ¯¯¯¯¯¯¯¯¯¯¯¯¯¯

    start is 0
    states 5, 6, 9, and 10 are blocked
    goal is 15
    actions are = [left, down, right, up]

    Cells are numbered row by row. Every cell is observed as a dict holding a vector
    and an image: each column is represented by a random vector and each row is
    represented by a random image, both sampled once at creation time.

    The transition tables built by ``init_possible_transitions`` are hard-coded for
    the default 4x4 layout shown above.

    :param num_col: Number of columns in the grid
    :param num_row: Number of rows in the grid
    :param random_start: If true, each episode starts in a random cell that is neither
        the goal nor a blocked cell
    :param discrete_actions: If true the action space is ``Discrete(4)``; otherwise it is
        a 4-dimensional ``Box`` and the index of the largest component selects the move
    :param channel_last: If true, the image will be channel last, else it will be channel first
    """

    def __init__(
        self,
        num_col: int = 4,
        num_row: int = 4,
        random_start: bool = True,
        discrete_actions: bool = True,
        channel_last: bool = True,
    ):
        super().__init__()

        self.vector_size = 5
        if channel_last:
            self.img_size = [64, 64, 1]
        else:
            self.img_size = [1, 64, 64]

        self.random_start = random_start
        self.discrete_actions = discrete_actions
        if discrete_actions:
            self.action_space = spaces.Discrete(4)
        else:
            self.action_space = spaces.Box(0, 1, (4,))

        self.observation_space = spaces.Dict(
            spaces={
                "vec": spaces.Box(0, 1, (self.vector_size,), dtype=np.float64),
                "img": spaces.Box(0, 255, self.img_size, dtype=np.uint8),
            }
        )
        self.count = 0
        # Timeout
        self.max_count = 100
        self.log = ""
        self.state = 0
        self.action2str = ["left", "down", "right", "up"]
        self.init_possible_transitions()

        self.num_col = num_col
        self.num_row = num_row
        self.state_mapping: List[Dict[str, np.ndarray]] = []
        self.init_state_mapping(num_col, num_row)

        # The last cell of the grid is the goal.
        self.max_state = len(self.state_mapping) - 1

        # Cells from which at least one move is possible, excluding the goal. Random
        # starts are drawn from these so an episode can never begin inside a blocked
        # cell (where every action is a no-op until the timeout).
        movable = (
            set(self.left_possible)
            | set(self.down_possible)
            | set(self.right_possible)
            | set(self.up_possible)
        )
        self._start_states: List[int] = sorted(s for s in movable if 0 <= s < self.max_state)

    def init_state_mapping(self, num_col: int, num_row: int) -> None:
        """
        Initializes the state_mapping array which holds the observation values for each state

        Entries are appended in row-major order so that the list index equals the
        cell id used by ``step`` (``row * num_col + col``).

        :param num_col: Number of columns.
        :param num_row: Number of rows.
        """
        # Each column is represented by a random vector
        col_vecs = np.random.random((num_col, self.vector_size))
        # Each row is represented by a random image
        row_imgs = np.random.randint(0, 255, (num_row, 64, 64), dtype=np.uint8)

        # Rows in the outer loop: iterating columns first would transpose the table,
        # tying the vector to the row and the image to the column.
        for row in range(num_row):
            row_img = row_imgs[row].reshape(self.img_size)
            for col in range(num_col):
                self.state_mapping.append({"vec": col_vecs[col], "img": row_img})

    def get_state_mapping(self) -> Dict[str, np.ndarray]:
        """
        Uses the state to get the observation mapping.

        :return: observation dict {'vec': ..., 'img': ...}
        """
        return self.state_mapping[self.state]

    def init_possible_transitions(self) -> None:
        """
        Initializes the transitions of the environment
        The environment exploits the cardinal directions of the grid by noting that
        they correspond to simple addition and subtraction from the cell id within the grid

        - up => means moving up a row => means subtracting the number of columns
        - down => means moving down a row => means adding the number of columns
        - left => means moving left by one => means subtracting 1
        - right => means moving right by one => means adding 1

        Thus one only needs to specify in which states each action is possible
        in order to define the transitions of the environment.
        The lists below describe the default 4x4 layout only.
        """
        self.left_possible = [1, 2, 3, 13, 14, 15]
        self.down_possible = [0, 4, 8, 3, 7, 11]
        self.right_possible = [0, 1, 2, 12, 13, 14]
        self.up_possible = [4, 8, 12, 7, 11, 15]

    def step(self, action: Union[int, np.ndarray]) -> Tuple[Dict[str, np.ndarray], float, bool, bool, Dict]:
        """
        Run one timestep of the environment's dynamics. When end of
        episode is reached, you are responsible for calling `reset()`
        to reset this environment's state.
        Accepts an action and returns a tuple (observation, reward, terminated, truncated, info).

        An action that is not possible in the current cell leaves the state unchanged.
        The reward is 1 on reaching the goal and -0.1 otherwise.

        :param action: index into [left, down, right, up]; with continuous actions,
            the index of the largest component is used
        :return: tuple (observation, reward, terminated, truncated, info).
        """
        if not self.discrete_actions:
            action = np.argmax(action)  # type: ignore[assignment]

        self.count += 1

        prev_state = self.state

        reward = -0.1
        # define state transition
        if self.state in self.left_possible and action == 0:  # left
            self.state -= 1
        elif self.state in self.down_possible and action == 1:  # down
            self.state += self.num_col
        elif self.state in self.right_possible and action == 2:  # right
            self.state += 1
        elif self.state in self.up_possible and action == 3:  # up
            self.state -= self.num_col

        got_to_end = self.state == self.max_state
        reward = 1 if got_to_end else reward
        truncated = self.count > self.max_count
        terminated = got_to_end

        self.log = f"Went {self.action2str[action]} in state {prev_state}, got to state {self.state}"

        return self.get_state_mapping(), reward, terminated, truncated, {"got_to_end": got_to_end}

    def render(self, mode: str = "human") -> None:
        """
        Prints the log of the environment.

        :param mode: unused; kept for call compatibility
        """
        print(self.log)

    def reset(self, *, seed: Optional[int] = None, options: Optional[Dict] = None) -> Tuple[Dict[str, np.ndarray], Dict]:
        """
        Resets the environment state and step count and returns reset observation.

        :param seed: seeds the environment's own random generator, which is also the
            generator used to pick the random start cell
        :param options: unused
        :return: tuple (observation dict {'vec': ..., 'img': ...}, empty info dict)
        """
        # The base class only re-seeds ``self.np_random`` when a seed is given.
        super().reset(seed=seed)
        self.count = 0
        if self.random_start and self._start_states:
            # Draw from the environment's generator (not the global numpy RNG) so that
            # ``seed`` actually makes the start cell reproducible.
            self.state = int(self.np_random.choice(self._start_states))
        else:
            self.state = 0
        return self.state_mapping[self.state], {}

In [None]:
# Build the probe environment with its default arguments.
env = SimpleMultiObsEnv()

# reset() returns (observation, info); the info dict is discarded here.
obs, _ = env.reset()
# Bare expression: displays the observation dict as the cell output
obs

In [5]:
from agilerl.algorithms import MATD3

# NOTE(review): `device` is a plain string here, while a later cell rebinds it to a
# torch.device; `accelerator` is likewise rebound later. Cell order matters.
device = "cuda" if torch.cuda.is_available() else "cpu"
compile_mode = "default"
accelerator = None
# Single-agent MATD3 instance over one (3, 32, 32) Box observation space and a
# two-way discrete action space.
matd3 = MATD3(
    observation_spaces=[spaces.Box(0, 1, shape=(3, 32, 32))],
    action_spaces=[spaces.Discrete(2)],
    one_hot=False,
    n_agents=1,
    agent_ids=["agent_0"],
    max_action=[(1,)],
    min_action=[(-1,)],
    discrete_actions=True,
    device=device,
    torch_compiler=compile_mode,
    accelerator=accelerator,
)

In [25]:
from typing import List
import fastrand

from agilerl.algorithms.base import RLAlgorithm
from agilerl.components.replay_buffer import ReplayBuffer
from agilerl.hpo.mutation import Mutations
from agilerl.hpo.tournament import TournamentSelection
from agilerl.modules.mlp import EvolvableMLP
from agilerl.training.train_off_policy import train_off_policy
from agilerl.utils.utils import (
    create_population,
    make_vect_envs,
    observation_space_channels_to_first,
    print_hyperparams
)

import yaml
import torch

# Load the run configuration; the path is relative to the kernel's working directory.
# NOTE(review): this reads the PPO config while the cell imports `train_off_policy`
# and a later saved output lists DQN agents — confirm the intended config file.
with open("configs/training/ppo.yaml") as file:
    config = yaml.safe_load(file)

# Top-level sections of the YAML file used throughout the rest of the notebook.
INIT_HP = config["INIT_HP"]
MUTATION_PARAMS = config["MUTATION_PARAMS"]
NET_CONFIG = config["NET_CONFIG"]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("============ AgileRL ============")
print(f"DEVICE: {device}")

# Rebinds `env`, replacing the probe environment created in an earlier cell.
env = make_vect_envs(INIT_HP["ENV_NAME"], num_envs=INIT_HP["NUM_ENVS"])

# Per-environment (unbatched) spaces of the vectorised env.
observation_space = env.single_observation_space
action_space = env.single_action_space
if INIT_HP["CHANNELS_LAST"]:
    observation_space = observation_space_channels_to_first(observation_space)

# Built from the TOURN_SIZE, ELITISM, POP_SIZE and EVAL_LOOP settings, in that order.
tournament = TournamentSelection(
    INIT_HP["TOURN_SIZE"],
    INIT_HP["ELITISM"],
    INIT_HP["POP_SIZE"],
    INIT_HP["EVAL_LOOP"],
)

def init_objs(INIT_HP, MUTATION_PARAMS, device):
    """Create a fresh agent population together with the mutation operator for it.

    Besides its arguments, this function reads the notebook-level names
    ``NET_CONFIG``, ``observation_space`` and ``action_space``, so the
    configuration cell must have been run first.

    :param INIT_HP: initial hyperparameter mapping (``ALGO``, ``POP_SIZE``, ``NUM_ENVS`` are read here)
    :param MUTATION_PARAMS: mutation settings mapping (probabilities, limits, ``RAND_SEED``)
    :param device: device handed to both the mutation operator and the population
    :return: tuple ``(pop, mutations)``
    """
    mutations = Mutations(
        algo=INIT_HP["ALGO"],
        no_mutation=MUTATION_PARAMS["NO_MUT"],
        architecture=MUTATION_PARAMS["ARCH_MUT"],
        new_layer_prob=MUTATION_PARAMS["NEW_LAYER"],
        parameters=MUTATION_PARAMS["PARAMS_MUT"],
        activation=MUTATION_PARAMS["ACT_MUT"],
        rl_hp=MUTATION_PARAMS["RL_HP_MUT"],
        rl_hp_selection=MUTATION_PARAMS["RL_HP_SELECTION"],
        mutation_sd=MUTATION_PARAMS["MUT_SD"],
        min_lr=MUTATION_PARAMS["MIN_LR"],
        max_lr=MUTATION_PARAMS["MAX_LR"],
        # The lower bound must come from MIN_BATCH_SIZE; reading MAX_BATCH_SIZE for
        # both bounds collapses the batch-size range to a single value.
        min_batch_size=MUTATION_PARAMS["MIN_BATCH_SIZE"],
        max_batch_size=MUTATION_PARAMS["MAX_BATCH_SIZE"],
        min_learn_step=MUTATION_PARAMS["MIN_LEARN_STEP"],
        max_learn_step=MUTATION_PARAMS["MAX_LEARN_STEP"],
        arch=NET_CONFIG["arch"],
        rand_seed=MUTATION_PARAMS["RAND_SEED"],
        device=device,
    )

    pop = create_population(
        algo=INIT_HP["ALGO"],
        observation_space=observation_space,
        action_space=action_space,
        net_config=NET_CONFIG,
        INIT_HP=INIT_HP,
        actor_network=None,
        critic_network=None,
        population_size=INIT_HP["POP_SIZE"],
        num_envs=INIT_HP["NUM_ENVS"],
        device=device,
    )

    return pop, mutations

def get_mutation_choices(N: int, mutations: Mutations, pop: List[RLAlgorithm]) -> List[List[str]]:
    """Record the mutation label of every agent over one pre-training round plus ``N`` regular rounds.

    :param N: number of regular mutation rounds applied after the pre-training round
    :param mutations: object whose ``mutation(pop, ...)`` returns the mutated population
    :param pop: population to mutate; each returned individual must expose ``mut``
    :return: ``N + 1`` lists, one per round, holding each individual's ``mut`` value
    """

    def _labels(population):
        # Snapshot of the label each individual carries after the latest round.
        return [agent.mut for agent in population]

    # Round 0 uses the pre-training mutation options.
    pop = mutations.mutation(pop, pre_training_mut=True)
    history = [_labels(pop)]

    rounds_left = N
    while rounds_left > 0:
        pop = mutations.mutation(pop)
        history.append(_labels(pop))
        rounds_left -= 1

    return history

def set_seed(seed: int) -> None:
    """Seed every random number source this notebook touches.

    Seeds Python's built-in ``random``, NumPy's global generator, PyTorch (CPU and,
    when available, all CUDA devices) and ``fastrand``.

    :param seed: integer seed applied to all generators
    """
    # Local import: the notebook's import cells do not bring in `random`.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    fastrand.pcg32_seed(seed)

set_seed(42)

DEVICE: cuda


In [None]:
# One entry per repetition; each entry is the list of per-round mutation labels
# returned by get_mutation_choices.
repr_choices = []

# Number of repetitions: re-seed, rebuild population and mutation operator, and
# record the choices so repetitions can be compared for reproducibility.
R = 4
for _ in range(R):
    # NOTE(review): the seed is read from INIT_HP here, whereas init_objs reads
    # MUTATION_PARAMS["RAND_SEED"] — confirm the key exists in INIT_HP.
    set_seed(INIT_HP["RAND_SEED"])
    pop, mutations = init_objs(INIT_HP, MUTATION_PARAMS, device)
    # 10 regular mutation rounds after the pre-training round
    repr_choices.append(get_mutation_choices(10, mutations, pop))
    print("\n")

Mutation choices: ['rl', 'param', 'rl', 'act']
Mutation choices: ['None', 'arch', 'None', 'arch']
Mutation choices: ['None', 'None', 'None', 'param']
Mutation choices: ['act', 'None', 'None', 'param']
Mutation choices: ['None', 'rl', 'None', 'None']
Mutation choices: ['None', 'rl', 'param', 'None']
Mutation choices: ['None', 'arch', 'param', 'arch']
Mutation choices: ['rl', 'rl', 'rl', 'None']
Mutation choices: ['None', 'param', 'act', 'None']
Mutation choices: ['rl', 'None', 'act', 'act']
Mutation choices: ['None', 'act', 'rl', 'act']


Mutation choices: ['rl', 'param', 'rl', 'act']
Mutation choices: ['None', 'arch', 'None', 'arch']
Mutation choices: ['None', 'None', 'None', 'param']
Mutation choices: ['act', 'None', 'rl', 'None']
Mutation choices: ['None', 'None', 'param', 'rl']
Mutation choices: ['rl', 'act', 'arch', 'act']
Mutation choices: ['None', 'act', 'act', 'None']
Mutation choices: ['param', 'param', 'rl', 'param']
Mutation choices: ['act', 'None', 'rl', 'arch']
Mutation cho

In [23]:
first_choice

[['lr', 'param', 'None', 'None'],
 ['None', 'arch', 'None', 'arch'],
 ['None', 'None', 'None', 'param'],
 ['None', 'None', 'bs', 'None'],
 ['bs', 'None', 'None', 'arch'],
 ['None', 'bs', 'lr', 'param'],
 ['None', 'None', 'None', 'bs'],
 ['param', 'None', 'None', 'arch'],
 ['lr', 'arch', 'None', 'arch'],
 ['param', 'arch', 'arch', 'bs'],
 ['None', 'None', 'None', 'lr']]

In [18]:
repr_choices[1]

[['lr', 'param', 'None', 'None'],
 ['None', 'arch', 'None', 'arch'],
 ['None', 'None', 'None', 'param'],
 ['None', 'None', 'None', 'param'],
 ['None', 'bs', 'None', 'None'],
 ['None', 'lr', 'param', 'None'],
 ['None', 'arch', 'param', 'arch'],
 ['lr', 'None', 'None', 'None'],
 ['None', 'param', 'None', 'None'],
 ['bs', 'None', 'None', 'None'],
 ['None', 'None', 'None', 'None']]

In [22]:
# Reproducibility check: every repetition is compared, round by round, with the
# first repetition; any round whose labels do not all match is reported.
first_choice = repr_choices[0]
for i in range(1, R):
    choice_N = repr_choices[i]
    for j, mut_choice in enumerate(choice_N):
        # Element-wise comparison against the same round of the first repetition.
        if any(not (choice == first_choice[j][k]) for k, choice in enumerate(mut_choice)):
            print(f"Choice {i}/{j} is different")
            print(first_choice[j])
            print(mut_choice)

Choice 1/3 is different
['None', 'None', 'bs', 'None']
['param', 'None', 'param', 'None']
Choice 1/4 is different
['bs', 'None', 'None', 'arch']
['None', 'param', 'arch', 'param']
Choice 1/5 is different
['None', 'bs', 'lr', 'param']
['param', 'None', 'None', 'None']
Choice 1/7 is different
['param', 'None', 'None', 'arch']
['None', 'bs', 'lr', 'None']
Choice 1/8 is different
['lr', 'arch', 'None', 'arch']
['param', 'bs', 'None', 'arch']
Choice 1/9 is different
['param', 'arch', 'arch', 'bs']
['None', 'param', 'param', 'None']
Choice 1/10 is different
['None', 'None', 'None', 'lr']
['bs', 'None', 'None', 'None']
Choice 3/3 is different
['None', 'None', 'bs', 'None']
['None', 'None', 'lr', 'None']
Choice 3/4 is different
['bs', 'None', 'None', 'arch']
['None', 'None', 'param', 'lr']
Choice 3/5 is different
['None', 'bs', 'lr', 'param']
['None', 'None', 'arch', 'None']
Choice 3/6 is different
['None', 'None', 'None', 'bs']
['None', 'None', 'None', 'None']
Choice 3/7 is different
['param'

In [5]:
# Apply the pre-training mutation round and show the label each agent received.
# NOTE(review): the "Mutation choices: [...]" lines in the saved output are not
# printed by this cell — presumably emitted inside mutations.mutation; confirm.
pop = mutations.mutation(pop, pre_training_mut=True)
print([ind.mut for ind in pop])

# N further regular mutation rounds, printing the labels after each one.
N = 10
for _ in range(N):
    pop = mutations.mutation(pop)
    print([ind.mut for ind in pop])

Mutation choices: ['rl', 'param', 'rl', 'act']
['lr', 'param', 'None', 'None']
Mutation choices: ['None', 'arch', 'None', 'arch']
['None', 'arch', 'None', 'arch']
Mutation choices: ['None', 'None', 'None', 'param']
['None', 'None', 'None', 'param']
Mutation choices: ['act', 'None', 'rl', 'None']
['None', 'None', 'bs', 'None']
Mutation choices: ['rl', 'None', 'None', 'arch']
['bs', 'None', 'None', 'arch']
Mutation choices: ['act', 'rl', 'rl', 'param']
['None', 'bs', 'lr', 'param']
Mutation choices: ['act', 'act', 'arch', 'param']
['None', 'None', 'arch', 'param']
Mutation choices: ['rl', 'None', 'None', 'param']
['bs', 'None', 'None', 'param']
Mutation choices: ['arch', 'arch', 'param', 'None']
['arch', 'arch', 'param', 'None']
Mutation choices: ['arch', 'None', 'None', 'rl']
['arch', 'None', 'None', 'None']
Mutation choices: ['None', 'arch', 'rl', 'param']
['None', 'arch', 'bs', 'param']


In [19]:
def get_choices(mutations, population, pre_training_mut: bool = False):
    """Sample one mutation kind per agent and return the kinds as short labels.

    Nothing is mutated: this only draws from ``mutations.rng`` using the option and
    probability lists stored on ``mutations``.

    :param mutations: object exposing ``rng``, the option/probability lists and the
        five mutation methods used as lookup keys below
    :param population: only its length is used (one draw per member)
    :param pre_training_mut: if true, use the pre-training option/probability lists
    :return: list of labels among "None", "arch", "param", "act", "rl"
    """
    if pre_training_mut:
        options = mutations.pretraining_mut_options
        probabilities = mutations.pretraining_mut_proba
    else:
        options = mutations.mut_options
        probabilities = mutations.mut_proba

    # One weighted draw per member of the population.
    sampled = mutations.rng.choice(options, len(population), p=probabilities)

    # Bound methods compare and hash by (instance, function), so they can key a dict.
    label_of = {
        mutations.no_mutation: "None",
        mutations.architecture_mutate: "arch",
        mutations.parameter_mutation: "param",
        mutations.activation_mutation: "act",
        mutations.rl_hyperparam_mutation: "rl",
    }

    labels = []
    for picked in sampled:
        labels.append(label_of[picked])
    return labels

In [None]:
# Draw the pre-training choices once (third argument True selects the pre-training
# option lists); get_choices only samples, `pop` is used solely for its length.
print(get_choices(mutations, pop, True))

# Then N draws from the regular option lists.
N = 10
for i in range(N):
    print(get_choices(mutations, pop, False))

['rl', 'param', 'rl', 'act']
['None', 'rl', 'act', 'act']
['None', 'arch', 'arch', 'rl']
['param', 'act', 'arch', 'None']
['param', 'None', 'act', 'param']
['act', 'arch', 'rl', 'rl']
['act', 'None', 'arch', 'None']
['None', 'act', 'act', 'rl']
['None', 'arch', 'arch', 'None']
['None', 'arch', 'None', 'act']
['arch', 'act', 'act', 'None']


In [20]:
# Same statements as the previous cell, run again to compare the sampled sequences.
# Pre-training draw first (third argument True) ...
print(get_choices(mutations, pop, True))

# ... followed by N regular draws.
N = 10
for i in range(N):
    print(get_choices(mutations, pop, False))

['rl', 'param', 'rl', 'act']
['None', 'rl', 'act', 'act']
['None', 'arch', 'arch', 'rl']
['param', 'act', 'arch', 'None']
['param', 'None', 'act', 'param']
['act', 'arch', 'rl', 'rl']
['act', 'None', 'arch', 'None']
['None', 'act', 'act', 'rl']
['None', 'arch', 'arch', 'None']
['None', 'arch', 'None', 'act']
['arch', 'act', 'act', 'None']


In [8]:
import torch.optim as optim

# Re-create the agent from its constructor arguments without accelerator wrapping,
# then copy networks and optimizers over by hand.
wrap = False
input_args = ind.inspect_attributes(input_args_only=True)
input_args["wrap"] = wrap
clone = type(ind)(**input_args)

# Take one snapshot of the evolvable attributes and split it by kind.
evolvable_attrs = ind.evolvable_attributes()
module_names = [n for n, m in evolvable_attrs.items() if isinstance(m, torch.nn.Module)]
optimizer_names = [n for n, o in evolvable_attrs.items() if isinstance(o, optim.Optimizer)]

# Clone every network that knows how to clone itself.
for name in module_names:
    module = evolvable_attrs[name]
    if hasattr(module, "clone"):
        setattr(clone, name, module.clone())

# Map each original parameter to its counterpart in the clone. Parameters are paired
# by position, which assumes the cloned module has the same architecture.
param_to_clone = {}
for name in module_names:
    paired = zip(evolvable_attrs[name].parameters(), getattr(clone, name).parameters())
    for original_param, cloned_param in paired:
        param_to_clone[id(original_param)] = cloned_param

# Clone all optimizers and update their parameters to point to cloned modules.
# optimizer.state_dict()["param_groups"][i]["params"] holds integer indices, not
# names, so the live `param_groups` are used to find the tensors instead.
for name in optimizer_names:
    optimizer = evolvable_attrs[name]
    print(optimizer.state_dict()["param_groups"][0]["params"])
    # Rebuild every group with the cloned tensors so that the group layout matches
    # what load_state_dict expects. A KeyError here means the optimizer tracks a
    # parameter that belongs to none of the evolvable modules.
    cloned_groups = [
        {"params": [param_to_clone[id(p)] for p in group["params"]]}
        for group in optimizer.param_groups
    ]
    cloned_optimizer = optim.Adam(
        cloned_groups, lr=optimizer.defaults.get("lr", 0.001)
    )
    # Restores per-group hyperparameters and the accumulated optimizer state.
    cloned_optimizer.load_state_dict(optimizer.state_dict())
    setattr(clone, name, cloned_optimizer)

# Handle accelerator wrapping
if ind.accelerator is not None and wrap:
    names_to_wrap = module_names + optimizer_names
    wrapped = ind.accelerator.prepare(*[getattr(clone, n) for n in names_to_wrap])
    for name, prepared in zip(names_to_wrap, wrapped):
        setattr(clone, name, prepared)

[0, 1, 2, 3, 4, 5]


In [7]:
ind.evolvable_attributes()

{'actor': EvolvableMLP(
   (feature_net): Sequential(
     (mlp_linear_layer_0): Linear(in_features=4, out_features=64, bias=True)
     (mlp_activation_0): ReLU()
     (mlp_linear_layer_1): Linear(in_features=64, out_features=64, bias=True)
     (mlp_activation_1): ReLU()
     (mlp_linear_layer_output): Linear(in_features=64, out_features=2, bias=True)
   )
 ),
 'actor_target': EvolvableMLP(
   (feature_net): Sequential(
     (mlp_linear_layer_0): Linear(in_features=4, out_features=64, bias=True)
     (mlp_activation_0): ReLU()
     (mlp_linear_layer_1): Linear(in_features=64, out_features=64, bias=True)
     (mlp_activation_1): ReLU()
     (mlp_linear_layer_output): Linear(in_features=64, out_features=2, bias=True)
   )
 ),
 'optimizer': Adam (
 Parameter Group 0
     amsgrad: False
     betas: (0.9, 0.999)
     capturable: False
     differentiable: False
     eps: 1e-08
     foreach: None
     fused: None
     lr: 0.001
     maximize: False
     weight_decay: 0
 )}

In [15]:
# Profile step method
import cProfile

steps = 10000
profiler = cProfile.Profile()

# reset() returns (observation, info). It runs before the profiler is enabled so
# that only env.step is measured.
obs, _ = env.reset()

profiler.enable()
try:
    for _ in range(steps):
        # Random actions: the goal is to time stepping, not to solve the task.
        action = env.action_space.sample()
        next_state, reward, done, trunc, info = env.step(action)
finally:
    # Always stop collecting, even if a step raises.
    profiler.disable()

profiler.print_stats(sort="cumtime")

         6573424 function calls in 11.548 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        3    0.000    0.000   11.574    3.858 interactiveshell.py:3541(run_code)
        3    0.000    0.000   11.574    3.858 {built-in method builtins.exec}
    10000    0.019    0.000   11.453    0.001 vector_env.py:166(step)
    10000    0.484    0.000    5.887    0.001 async_vector_env.py:290(step_wait)
    10000    0.238    0.000    5.547    0.001 async_vector_env.py:265(step_async)
   160016    0.313    0.000    5.273    0.000 connection.py:202(send)
   160016    0.292    0.000    5.090    0.000 connection.py:246(recv)
   160016    0.227    0.000    3.726    0.000 connection.py:429(_recv_bytes)
   320032    0.519    0.000    3.431    0.000 connection.py:390(_recv)
   320032    2.788    0.000    2.788    0.000 {built-in method posix.read}
   160016    0.369    0.000    2.624    0.000 reduction.py:48(dumps)
   160016    0.243   

In [None]:
# Epsilon-greedy schedule: take the configured value when the key is present,
# otherwise fall back to the defaults given here.
epsilon_schedule = {
    "eps_start": INIT_HP.get("EPS_START", 1.0),
    "eps_end": INIT_HP.get("EPS_END", 0.01),
    "eps_decay": INIT_HP.get("EPS_DECAY", 0.999),
}

# NOTE(review): `agent_pop` and `memory` are not defined in any visible cell —
# presumably left over from an earlier kernel session; confirm before re-running.
trained_pop, pop_fitnesses = train_off_policy(
    env,
    INIT_HP["ENV_NAME"],
    INIT_HP["ALGO"],
    agent_pop,
    memory=memory,
    INIT_HP=INIT_HP,
    MUT_P=MUTATION_PARAMS,
    swap_channels=INIT_HP["CHANNELS_LAST"],
    max_steps=INIT_HP["MAX_STEPS"],
    evo_steps=INIT_HP["EVO_STEPS"],
    eval_steps=INIT_HP["EVAL_STEPS"],
    eval_loop=INIT_HP["EVAL_LOOP"],
    learning_delay=INIT_HP["LEARNING_DELAY"],
    target=INIT_HP["TARGET_SCORE"],
    tournament=tournament,
    mutation=mutations,
    wb=INIT_HP["WANDB"],
    **epsilon_schedule,
)

print_hyperparams(trained_pop)
# plot_population_score(trained_pop)

# Release cached GPU memory once training is over.
if str(device) == "cuda":
    torch.cuda.empty_cache()

env.close()

[<agilerl.algorithms.dqn.DQN at 0x7fe7ad5f8450>,
 <agilerl.algorithms.dqn.DQN at 0x7fe7af4e78d0>,
 <agilerl.algorithms.dqn.DQN at 0x7fe7ae9ea450>,
 <agilerl.algorithms.dqn.DQN at 0x7fe7ae9d6e90>]

#### Racecar Gym

In [None]:
from agilerl.modules.cnn import EvolvableCNN
from agilerl.algorithms.ppo import PPO
from accelerate import Accelerator

# Synthetic spaces for this experiment: a (3, 32, 32) uint8 image observation and a
# two-way discrete action. These rebind the notebook-level `observation_space` and
# `action_space` set from the vectorised env in an earlier cell.
observation_space = spaces.Box(low=0, high=255, shape=(3, 32, 32), dtype=np.uint8)
action_space = spaces.Discrete(2)
one_hot = False
# Network configuration with a single convolutional layer and one hidden layer of size 8.
net_config_cnn = {
    "arch": "cnn",
    "hidden_size": [8],
    "channel_size": [3],
    "kernel_size": [3],
    "stride_size": [1],
    "normalize": False,
}
# PPO hyperparameters, forwarded one-to-one as keyword arguments below.
batch_size = 64
lr = 1e-4
gamma = 0.99
gae_lambda = 0.95
mut = None
action_std_init = 0.6
clip_coef = 0.2
ent_coef = 0.01
vf_coef = 0.5
max_grad_norm = 0.5
target_kl = None
update_epochs = 4
# No custom networks are supplied.
actor_network = None
critic_network = None
# Rebinds `accelerator` (None in an earlier cell) to a real Accelerator, with wrap enabled.
accelerator = Accelerator()
wrap = True

ppo = PPO(
    observation_space=observation_space,
    action_space=action_space,
    one_hot=one_hot,
    discrete_actions=True,
    net_config=net_config_cnn,
    batch_size=batch_size,
    lr=lr,
    gamma=gamma,
    gae_lambda=gae_lambda,
    mut=mut,
    action_std_init=action_std_init,
    clip_coef=clip_coef,
    ent_coef=ent_coef,
    vf_coef=vf_coef,
    max_grad_norm=max_grad_norm,
    target_kl=target_kl,
    update_epochs=update_epochs,
    actor_network=actor_network,
    critic_network=critic_network,
    accelerator=accelerator,
    wrap=wrap
)


In [4]:
ppo.optim_to_modules

{AcceleratedOptimizer (
 Parameter Group 0
     amsgrad: False
     betas: (0.9, 0.999)
     capturable: False
     differentiable: False
     eps: 1e-08
     foreach: None
     fused: None
     lr: 0.0001
     maximize: False
     weight_decay: 0
 
 Parameter Group 1
     amsgrad: False
     betas: (0.9, 0.999)
     capturable: False
     differentiable: False
     eps: 1e-08
     foreach: None
     fused: None
     lr: 0.0001
     maximize: False
     weight_decay: 0
 ): ['critic', 'actor']}

In [8]:
ppo.evolvable_attributes().keys()

dict_keys(['actor', 'critic', 'optimizer'])

In [4]:
ppo._identify_param_to_modules()

In [5]:
# Walk every parameter tracked by the agent's optimizer and print the
# `_evol_module` tag attached to it.
optimizer = ppo.optimizer

for group in optimizer.param_groups:
    tracked_params = group["params"]
    for parameter in tracked_params:
        print(parameter._evol_module)

{'actor'}
{'actor'}
{'actor'}
{'actor'}
{'actor'}
{'actor'}
{'critic'}
{'critic'}
{'critic'}
{'critic'}
{'critic'}
{'critic'}


In [None]:
# NOTE(review): `swap_channels`, `state` and `is_vectorised` are not defined in any
# visible cell. The two snippets below look like alternatives for different `state`
# types (array vs. dict of arrays) rather than sequential steps — confirm.

# On-policy
# Move the last axis to third-from-last, i.e. (..., H, W, C) -> (..., C, H, W).
if swap_channels:
    state = np.moveaxis(state, [-1], [-3])

# Multi-agent
# Same axis move applied per agent; here `state` is a dict keyed by agent id.
if swap_channels:
    if not is_vectorised:
        # Add a leading axis of size 1 before moving the channel axis.
        state = {
            agent_id: np.moveaxis(np.expand_dims(s, 0), [-1], [-3])
            for agent_id, s in state.items()
        }
    else:
        state = {
            agent_id: np.moveaxis(s, [-1], [-3])
            for agent_id, s in state.items()
        }

        

[2]