# Evolving a Lunar Lander with differentiable Genetic Programming

## Installation
To install the required libraries run the command:

In [1]:
#!pip install -r requirements.txt

## Imports
Imports from the standard genepro-multi library are done here. Any adjustments (e.g. different operators) should be made in the notebook. For example:

```
class SmoothOperator(Node):
  def __init__(self):
    super(SmoothOperator,self).__init__()
    self.arity = 1
    self.symb = "SmoothOperator"

  def _get_args_repr(self, args):
    return self._get_typical_repr(args,'before')

  def get_output(self, X):
    c_outs = self._get_child_outputs(X)
    return np.smoothOperation(c_outs[0])

  def get_output_pt(self, X):
    c_outs = self._get_child_outputs_pt(X)
    return torch.smoothOperation(c_outs[0])
```

In [2]:
import gymnasium as gym

from genepro.node_impl import *
from genepro.evo import Evolution
from genepro.node_impl import Constant

import torch
import torch.optim as optim

import random
import os
import copy
from collections import namedtuple, deque

import matplotlib.pyplot as plt
from matplotlib import animation

## Reinforcement Learning Setup
Here we first set up the Gymnasium environment. Please see https://gymnasium.farama.org/environments/box2d/lunar_lander/ for more information on the environment.

Then a memory buffer is made. This is a buffer in which state transitions are stored. When the buffer reaches its maximum capacity old transitions are replaced by new ones.

A frame buffer is also initialised; it is used later to store animation frames of the environment.

In [3]:
# Module-level environment; fitness_function_pt below reads this global directly.
# NOTE(review): render_mode="rgb_array" presumably makes env.render() return image frames — confirm against the Gymnasium docs.
env = gym.make("LunarLander-v2", render_mode="rgb_array")

In [4]:
# One recorded environment step: the state, the action taken, the state that followed and the reward.
Transition = namedtuple("Transition", ("state", "action", "next_state", "reward"))


class ReplayMemory(object):
    """Bounded FIFO buffer of ``Transition`` records.

    The buffer is a ``deque`` with ``maxlen=capacity``, so once it is full each
    new transition pushes out the oldest one.
    """

    def __init__(self, capacity):
        """Create an empty buffer that keeps at most ``capacity`` transitions."""
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        """Save a transition built from ``(state, action, next_state, reward)``."""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        """Return ``batch_size`` distinct transitions chosen at random.

        Raises:
            ValueError: if ``batch_size`` exceeds the number of stored transitions.
        """
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.memory)

    def __iadd__(self, other):
        """Append every transition of ``other`` to this buffer in place."""
        self.memory += other.memory
        return self

    def __add__(self, other):
        """Return a new buffer holding this buffer's transitions followed by ``other``'s.

        The result keeps this buffer's capacity. Neither operand is modified:
        binary ``+`` must not behave like ``+=``.
        """
        merged = ReplayMemory(self.memory.maxlen)
        merged.memory.extend(self.memory)
        merged.memory.extend(other.memory)
        return merged

## Fitness Function

Here you get to be creative. The default setup evaluates 5 episodes of at most 300 frames each. Think about which action to pick and which fitness function to use. The multi-tree takes an input of shape $n \times d$, where $n$ is the batch size (here 1) and $d$ is the number of observation features.

In [5]:


def fitness_function_pt(multitree, num_episodes=5, episode_duration=300, ignore_done=False, render=False):
    """Roll out ``multitree`` as a greedy policy on the module-level ``env`` (PyTorch path).

    At every step the observation is reshaped to a ``1 x d`` float tensor, fed to
    ``multitree.get_output_pt`` and the index of the largest output is used as
    the action. Every step is also recorded in a fresh ``ReplayMemory``.

    Args:
        multitree: individual exposing ``get_output_pt``.
        num_episodes: number of episodes to play.
        episode_duration: maximum number of steps per episode.
        ignore_done: when True, keep stepping after ``terminated``/``truncated``.
        render: when True, collect ``env.render()`` before every step.

    Returns:
        ``(fitness, memory)`` or, when ``render`` is True,
        ``(fitness, memory, frames)``. ``fitness`` is a list with the summed
        reward of each episode (not an average).
    """
    memory = ReplayMemory(10000)
    episode_rewards = []
    frames = [] if render else None

    for _ in range(num_episodes):
        # The first element of the reset() result is the initial observation.
        observation = env.reset()[0]
        rewards = []
        for _ in range(episode_duration):
            if render:
                frames.append(env.render())
            input_sample = torch.from_numpy(observation.reshape((1, -1))).float()
            # Pure inference: only the arg-max index is used, so skip autograd bookkeeping.
            with torch.no_grad():
                action = torch.argmax(multitree.get_output_pt(input_sample)).item()
            observation, reward, terminated, truncated, info = env.step(action)
            rewards.append(reward)
            output_sample = torch.from_numpy(observation.reshape((1, -1))).float()
            # NOTE(review): the next state is stored even when the episode just ended;
            # a consumer that masks terminal transitions with `next_state is None`
            # would never see one — confirm this is intended.
            memory.push(input_sample, torch.tensor([[action]]), output_sample, torch.tensor([reward]))
            if (terminated or truncated) and not ignore_done:
                break
        episode_rewards.append(np.sum(rewards))

    # One summed reward per episode; aggregation is left to the caller.
    fitness = episode_rewards
    if render:
        return fitness, memory, frames
    return fitness, memory

def fitness_function(multitree, num_episodes=5, episode_duration=300, ignore_done=False, render=False):
    """Roll out ``multitree`` as a greedy policy on a private environment (NumPy path).

    At every step the observation is reshaped to ``1 x d``, fed to
    ``multitree.get_output`` and the index of the largest output is used as
    the action.

    Args:
        multitree: individual exposing ``get_output``.
        num_episodes: number of episodes to play.
        episode_duration: maximum number of steps per episode.
        ignore_done: when True, keep stepping after ``terminated``/``truncated``.
        render: when True, collect ``env.render()`` before every step.

    Returns:
        ``(fitness, [])`` or, when ``render`` is True, ``(fitness, [], frames)``.
        ``fitness`` is a NumPy array with the summed reward of each episode
        (not an average); the empty list stands in for the replay memory that
        the PyTorch variant returns.
    """
    # A fresh environment per call instead of the module-level one —
    # presumably so parallel workers do not share state; TODO confirm.
    env = gym.make("LunarLander-v2", render_mode="rgb_array")

    episode_rewards = []
    frames = [] if render else None

    try:
        for _ in range(num_episodes):
            # The first element of the reset() result is the initial observation.
            observation = env.reset()[0]
            rewards = []
            for _ in range(episode_duration):
                if render:
                    frames.append(env.render())
                input_sample = observation.reshape((1, -1))
                action = np.argmax(multitree.get_output(input_sample))
                observation, reward, terminated, truncated, info = env.step(action.item())
                rewards.append(reward)
                if (terminated or truncated) and not ignore_done:
                    break
            episode_rewards.append(np.sum(rewards))
    finally:
        # Release the environment even if a rollout raises; it was previously never closed.
        env.close()

    # One summed reward per episode; aggregation is left to the caller.
    fitness = np.array(episode_rewards)
    if render:
        return fitness, [], frames
    return fitness, []



In [6]:
### USED TO STORE THE EXPERIMENT DICTIONARY
import inspect
import itertools
import pickle


def _serialize_value(value):
    """Return the serialisable form of one settings value (dicts and lists are edited in place)."""
    if inspect.isfunction(value) or inspect.ismethod(value):
        return value.__name__
    if isinstance(value, dict):
        return serialize_functions_in_dict(value)
    if isinstance(value, list):
        # Recurse so lists nested inside lists are handled too.
        for i, item in enumerate(value):
            value[i] = _serialize_value(item)
        return value
    if isinstance(value, Node):
        return value.symb
    return value


def serialize_functions_in_dict(dictionary):
    """Replace functions and nodes in a settings dictionary by their names.

    Functions and methods become their ``__name__``, ``Node`` instances become
    their ``symb``, and nested dictionaries and lists are processed
    recursively at any depth. Every other value is left untouched.

    The dictionary (and any nested dict/list) is modified in place; pass a
    copy if the original must be preserved.

    Args:
        dictionary: settings dictionary to convert.

    Returns:
        The same ``dictionary`` object, after conversion.
    """
    # Only existing keys are reassigned, so the dict size never changes while iterating.
    for key, value in dictionary.items():
        dictionary[key] = _serialize_value(value)
    return dictionary

### USED TO CREATE THE EXPERIMENT DICTIONARY
def grid_search_params(params_dict):
    """
    Expand a hyperparameter dictionary into every grid-search combination.

    A value that is a list is treated as the set of alternatives for its key;
    any other value is treated as a single fixed choice. One dictionary is
    yielded per element of the Cartesian product, with the last key varying
    fastest.
    """
    names = params_dict.keys()
    axes = [
        candidate if isinstance(candidate, list) else [candidate]
        for candidate in params_dict.values()
    ]
    yield from (dict(zip(names, picks)) for picks in itertools.product(*axes))

In [7]:
# Save the gen as a pickle file in the gens folder
def save_and_evaluate_evo_generations(evo, fitness_function, experiment_name, num_episodes=10, dir_name="experiments"):
    """Re-evaluate and store the best individual of each generation.

    For every entry of ``evo.best_of_gens`` except the first, the individual is
    evaluated with ``fitness_function``, a summary line is printed, and the
    individual is pickled to ``./{dir_name}/{experiment_name}/gen/``. The
    collected fitness lists are saved as ``.npy`` files in
    ``./{dir_name}/{experiment_name}/``.

    Args:
        evo: object exposing ``best_of_gens``; each entry must expose ``fitnesses``.
        fitness_function: callable ``(individual, num_episodes=...)`` returning a
            2-tuple whose first element is the per-episode rewards.
        experiment_name: sub-folder name for this experiment.
        num_episodes: number of test episodes per individual.
        dir_name: root output folder, relative to the working directory.

    Returns:
        ``(generation_evo_fitnesses, generation_test_fitnesses)``: two lists
        with one entry per evaluated generation.
    """
    experiment_dir = f"./{dir_name}/{experiment_name}"
    gen_dir = f"{experiment_dir}/gen/"
    # Create the folders once, before the loop, so the np.save calls below
    # succeed even when no generation is evaluated.
    os.makedirs(gen_dir, exist_ok=True)

    generation_evo_fitnesses = []
    generation_test_fitnesses = []
    for i, gen in enumerate(evo.best_of_gens):
        # Index 0 is skipped — presumably the initial population's best; TODO confirm.
        if i == 0:
            continue

        episode_rewards, _ = fitness_function(gen, num_episodes=num_episodes)
        evo_fitness_mean, evo_fitness_std = round(np.mean(gen.fitnesses), 3), round(np.std(gen.fitnesses), 3)
        test_fitness_mean, test_fitness_std = round(np.mean(episode_rewards), 3), round(np.std(episode_rewards), 3)
        print(f"Best of Generation {i}: evo fitness:{evo_fitness_mean}+/-{evo_fitness_std} \t test_fitness:{test_fitness_mean}+/-{test_fitness_std}")

        generation_evo_fitnesses.append(gen.fitnesses)
        generation_test_fitnesses.append(episode_rewards)
        with open(f"{gen_dir}gen_{i}_{evo_fitness_mean}_{test_fitness_mean}.pickle", "wb") as f:
            pickle.dump(gen, f)

    np.save(f"{experiment_dir}/generation_evo_fitnesses.npy", generation_evo_fitnesses)
    np.save(f"{experiment_dir}/generation_test_fitnesses.npy", generation_test_fitnesses)
    return generation_evo_fitnesses, generation_test_fitnesses

def plot_evo_test_fitnesses(evo_fitnesses, test_fitnesses, experiment_name, dir_name="experiments"):
    """Plot mean +/- one standard deviation of evolution and test fitness per generation.

    Each element of ``evo_fitnesses`` / ``test_fitnesses`` is the collection of
    fitness values for one generation. The figure is written to
    ``./{dir_name}/{experiment_name}/{experiment_name}.png`` and then closed;
    the target folder is not created here.
    """
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.set_title(f"Fitnesses: {experiment_name}")
    ax.set_xlabel("Generation")
    ax.set_ylabel("Fitness")

    curves = (
        (evo_fitnesses, "evo_fitness", 'tab:blue'),
        (test_fitnesses, "test_fitness", 'tab:orange'),
    )
    for series, label, colour in curves:
        xs = np.arange(len(series))
        means = [np.mean(gen) for gen in series]
        spreads = [np.std(gen) for gen in series]
        lower = [m - s for m, s in zip(means, spreads)]
        upper = [m + s for m, s in zip(means, spreads)]
        # Mean line first, then the shaded +/- one-std band in the same colour.
        ax.plot(xs, means, label=label, color=colour)
        ax.fill_between(xs, lower, upper, alpha=0.2, color=colour)

    ax.legend()
    plt.savefig(f"./{dir_name}/{experiment_name}/{experiment_name}.png")
    plt.close()

## Evolution Setup
Here the leaf and internal nodes are defined. Think about the odds of sampling a constant in this default configuration. Also think about any operators that could be useful and add them here.

Adjust the population size (multiple of 8 if you want to use the standard tournament selection), max generations and max tree size to taste. Be aware that each of these settings can increase the runtime.

#### BASELINE

In [14]:
from copy import deepcopy
import json
from genepro.selection import elitism_selection, tournament_selection, roulette_selection, rank_selection, boltzmann_selection
from genepro.variation import coeff_mutation, subtree_crossover, subtree_mutation

# Prefix of every per-run output folder name built in hpo_evolve.
# NOTE(review): the label says "elitism_selection" but "selection" below is set to
# roulette_selection — confirm which selection scheme this run is meant to use.
experiment_name = "elitism_selection"

# Root output folder; hpo_evolve reads this module-level name directly.
dir_name = "selection_experiments"

# Size of the first axis of the observation space; one Feature leaf is created per index.
num_features = env.observation_space.shape[0]
# grid_search_params treats every list value as a set of alternatives to sweep over, so
# parameters that are themselves lists (node sets, crossovers, mutations, coeff_opts)
# are wrapped in an outer single-element list to be passed through as one choice.
evo_settings = {
    "fitness_function": fitness_function,
    "internal_nodes": [[Plus(), Minus(), Times(), Div(), Sin(), Cos(), Log(), Sqrt(), Square(), Max(), Min()]],
    "leaf_nodes": [[Feature(i) for i in range(num_features)] + [Constant()]],
    "n_trees": 4,
    "pop_size": 32,
    "max_gens": 500,
    "init_max_depth": 4,
    "max_tree_size": 256,
    "crossovers": [[{"fun": subtree_crossover, "rate": 0.5}]],
    "mutations": [[{"fun": subtree_mutation, "rate": 0.5}]],
    "coeff_opts": [[{"fun": coeff_mutation, "rate": 0.5}]],
    # Alternative selection scheme, kept for reference:
    # "selection": {"fun": tournament_selection, "kwargs": {"tournament_size": 4}},
    "selection": {"fun": roulette_selection},
    "n_jobs": 8,
    "verbose": True
}

def hpo_evolve(evo_settings, experiment_name):
    """Run one evolution per hyperparameter combination and store its results.

    ``evo_settings`` is expanded with ``grid_search_params``. All resulting
    configurations are printed first. Then, for each one: the serialised
    settings are written as JSON, an ``Evolution`` is built and evolved, the
    evolved object is pickled, and the per-generation fitnesses are
    re-evaluated, saved and plotted.

    Output goes to ``./{dir_name}/{run name}/``, where ``dir_name`` is the
    module-level variable and the run name is ``experiment_name`` followed by
    the population size, generation count, tree size and variation rates.
    The re-evaluation uses the module-level ``fitness_function``.
    """
    grid = list(grid_search_params(evo_settings))

    # Echo every configuration before any (long) run starts.
    for candidate in grid:
        print(serialize_functions_in_dict(deepcopy(candidate)))

    for candidate in grid:
        run_name = experiment_name + f"_pops{candidate['pop_size']}_gens{candidate['max_gens']}_mts{candidate['max_tree_size']}_cor{candidate['crossovers'][0]['rate']}_mutr{candidate['mutations'][0]['rate']}_coeffr{candidate['coeff_opts'][0]['rate']}"
        run_dir = f"./{dir_name}/{run_name}"
        os.makedirs(run_dir, exist_ok=True)

        # Serialise a deep copy: the conversion edits its argument in place and
        # the untouched settings are still needed to build the Evolution.
        with open(f"{run_dir}/evo_settings.json", "w") as settings_file:
            json.dump(serialize_functions_in_dict(deepcopy(candidate)), settings_file)

        evolution = Evolution(**candidate)
        evolution.evolve()

        with open(f"{run_dir}/evolution_class.pickle", "wb") as evolution_file:
            pickle.dump(evolution, evolution_file)

        evo_curve, test_curve = save_and_evaluate_evo_generations(
            evolution, fitness_function, run_name, num_episodes=5, dir_name=dir_name
        )
        plot_evo_test_fitnesses(evo_curve, test_curve, run_name, dir_name=dir_name)
        
# Start the sweep: one full evolution run per combination produced from evo_settings.
hpo_evolve(evo_settings, experiment_name=experiment_name)

{'fitness_function': 'fitness_function', 'internal_nodes': ['+', '-', '*', '/', 'sin', 'cos', 'log', 'sqrt', '**2', 'max', 'min'], 'leaf_nodes': ['x_0', 'x_1', 'x_2', 'x_3', 'x_4', 'x_5', 'x_6', 'x_7', 'const?'], 'n_trees': 4, 'pop_size': 32, 'max_gens': 500, 'init_max_depth': 4, 'max_tree_size': 256, 'crossovers': [{'fun': 'subtree_crossover', 'rate': 0.5}], 'mutations': [{'fun': 'subtree_mutation', 'rate': 0.5}], 'coeff_opts': [{'fun': 'coeff_mutation', 'rate': 0.5}], 'selection': {'fun': 'elitism_selection'}, 'n_jobs': 8, 'verbose': True}
gen: 1,	best of gen fitness: -98.591+/-71.847,	best of gen size: 12
gen: 2,	best of gen fitness: -98.501+/-55.786,	best of gen size: 15
gen: 3,	best of gen fitness: -105.869+/-57.754,	best of gen size: 12
gen: 4,	best of gen fitness: -92.125+/-66.089,	best of gen size: 14
gen: 5,	best of gen fitness: -103.874+/-60.821,	best of gen size: 31
gen: 6,	best of gen fitness: -105.135+/-57.642,	best of gen size: 31
gen: 7,	best of gen fitness: -105.960+/-6

  return np.square(c_outs[0])


gen: 145,	best of gen fitness: -15.512+/-70.038,	best of gen size: 236


  return np.square(c_outs[0])


gen: 146,	best of gen fitness: -22.623+/-72.348,	best of gen size: 146


  return np.square(c_outs[0])
  return np.square(c_outs[0])


gen: 147,	best of gen fitness: -24.460+/-25.410,	best of gen size: 243


  return np.square(c_outs[0])


gen: 148,	best of gen fitness: -30.309+/-12.332,	best of gen size: 145
gen: 149,	best of gen fitness: -20.014+/-11.021,	best of gen size: 143


  return np.square(c_outs[0])


gen: 150,	best of gen fitness: -19.057+/-24.136,	best of gen size: 143


  return np.square(c_outs[0])
  return np.square(c_outs[0])


gen: 151,	best of gen fitness: -31.358+/-22.467,	best of gen size: 120
gen: 152,	best of gen fitness: -12.282+/-84.460,	best of gen size: 138
gen: 153,	best of gen fitness: -25.538+/-8.517,	best of gen size: 120
gen: 154,	best of gen fitness: -13.006+/-16.029,	best of gen size: 137
gen: 155,	best of gen fitness: -29.025+/-12.815,	best of gen size: 125
gen: 156,	best of gen fitness: -25.590+/-30.233,	best of gen size: 250
gen: 157,	best of gen fitness: -34.211+/-13.511,	best of gen size: 142
gen: 158,	best of gen fitness: -28.902+/-24.595,	best of gen size: 204
gen: 159,	best of gen fitness: -26.905+/-14.230,	best of gen size: 255
gen: 160,	best of gen fitness: -27.220+/-15.360,	best of gen size: 194
gen: 161,	best of gen fitness: -28.827+/-13.621,	best of gen size: 177
gen: 162,	best of gen fitness: -38.047+/-11.036,	best of gen size: 130
gen: 163,	best of gen fitness: -21.095+/-20.809,	best of gen size: 139
gen: 164,	best of gen fitness: -31.456+/-15.854,	best of gen size: 172
gen: 16

  protected_div = sign_b * c_outs[0] / (1e-9 + np.abs(c_outs[1]))


gen: 197,	best of gen fitness: -29.614+/-10.913,	best of gen size: 191


  protected_div = sign_b * c_outs[0] / (1e-9 + np.abs(c_outs[1]))
  protected_div = sign_b * c_outs[0] / (1e-9 + np.abs(c_outs[1]))


gen: 198,	best of gen fitness: -31.587+/-17.973,	best of gen size: 181


  protected_div = sign_b * c_outs[0] / (1e-9 + np.abs(c_outs[1]))


gen: 199,	best of gen fitness: -22.904+/-22.451,	best of gen size: 191
gen: 200,	best of gen fitness: -32.739+/-16.053,	best of gen size: 181
gen: 201,	best of gen fitness: -8.317+/-61.185,	best of gen size: 213
gen: 202,	best of gen fitness: -22.318+/-20.211,	best of gen size: 181
gen: 203,	best of gen fitness: -32.603+/-19.760,	best of gen size: 134
gen: 204,	best of gen fitness: -28.884+/-27.031,	best of gen size: 145
gen: 205,	best of gen fitness: -22.102+/-70.950,	best of gen size: 247
gen: 206,	best of gen fitness: -12.968+/-70.027,	best of gen size: 166
gen: 207,	best of gen fitness: -28.062+/-23.514,	best of gen size: 218
gen: 208,	best of gen fitness: -23.481+/-15.036,	best of gen size: 210
gen: 209,	best of gen fitness: -24.213+/-11.458,	best of gen size: 228
gen: 210,	best of gen fitness: -19.752+/-22.275,	best of gen size: 215
gen: 211,	best of gen fitness: -29.687+/-17.112,	best of gen size: 132
gen: 212,	best of gen fitness: -31.787+/-18.315,	best of gen size: 197
gen: 21

  protected_div = sign_b * c_outs[0] / (1e-9 + np.abs(c_outs[1]))


gen: 216,	best of gen fitness: -26.597+/-14.114,	best of gen size: 181
gen: 217,	best of gen fitness: -35.451+/-24.852,	best of gen size: 228
gen: 218,	best of gen fitness: -21.986+/-14.550,	best of gen size: 133
gen: 219,	best of gen fitness: -29.988+/-23.511,	best of gen size: 148
gen: 220,	best of gen fitness: -24.900+/-22.980,	best of gen size: 167
gen: 221,	best of gen fitness: -29.998+/-20.425,	best of gen size: 228


  protected_div = sign_b * c_outs[0] / (1e-9 + np.abs(c_outs[1]))


gen: 222,	best of gen fitness: -33.008+/-16.848,	best of gen size: 167
gen: 223,	best of gen fitness: -24.454+/-29.574,	best of gen size: 201
gen: 224,	best of gen fitness: -22.459+/-11.345,	best of gen size: 184
gen: 225,	best of gen fitness: -31.740+/-23.697,	best of gen size: 165
gen: 226,	best of gen fitness: -29.644+/-27.560,	best of gen size: 173
gen: 227,	best of gen fitness: -14.706+/-55.196,	best of gen size: 184


  return np.multiply(c_outs[0], c_outs[1])


gen: 228,	best of gen fitness: -22.454+/-9.832,	best of gen size: 184
gen: 229,	best of gen fitness: -21.345+/-19.563,	best of gen size: 149
gen: 230,	best of gen fitness: -32.865+/-9.836,	best of gen size: 148
gen: 231,	best of gen fitness: -29.329+/-18.254,	best of gen size: 184
gen: 232,	best of gen fitness: -29.258+/-38.990,	best of gen size: 159
gen: 233,	best of gen fitness: -17.328+/-13.214,	best of gen size: 187
gen: 234,	best of gen fitness: -26.481+/-15.650,	best of gen size: 185
gen: 235,	best of gen fitness: -21.247+/-20.537,	best of gen size: 191
gen: 236,	best of gen fitness: -27.461+/-14.241,	best of gen size: 96
gen: 237,	best of gen fitness: -31.567+/-10.211,	best of gen size: 192
gen: 238,	best of gen fitness: -29.558+/-15.983,	best of gen size: 192
gen: 239,	best of gen fitness: -21.727+/-18.869,	best of gen size: 192
gen: 240,	best of gen fitness: -23.777+/-12.216,	best of gen size: 185
gen: 241,	best of gen fitness: -28.456+/-16.051,	best of gen size: 221
gen: 242,

  protected_div = sign_b * c_outs[0] / (1e-9 + np.abs(c_outs[1]))


gen: 249,	best of gen fitness: 4.489+/-110.079,	best of gen size: 228
gen: 250,	best of gen fitness: -15.827+/-35.251,	best of gen size: 228
gen: 251,	best of gen fitness: -7.798+/-70.965,	best of gen size: 228
gen: 252,	best of gen fitness: -25.674+/-17.747,	best of gen size: 227
gen: 253,	best of gen fitness: -23.542+/-15.428,	best of gen size: 171
gen: 254,	best of gen fitness: -31.582+/-4.963,	best of gen size: 254
gen: 255,	best of gen fitness: -27.026+/-33.241,	best of gen size: 254
gen: 256,	best of gen fitness: -11.890+/-11.230,	best of gen size: 131
gen: 257,	best of gen fitness: -29.448+/-17.210,	best of gen size: 184
gen: 258,	best of gen fitness: -32.326+/-20.421,	best of gen size: 226
gen: 259,	best of gen fitness: -29.168+/-17.975,	best of gen size: 131
gen: 260,	best of gen fitness: -23.189+/-16.964,	best of gen size: 105
gen: 261,	best of gen fitness: -28.423+/-7.166,	best of gen size: 184
gen: 262,	best of gen fitness: -0.105+/-70.773,	best of gen size: 243
gen: 263,	b



gen: 395,	best of gen fitness: -19.552+/-28.328,	best of gen size: 196
gen: 396,	best of gen fitness: -16.715+/-12.777,	best of gen size: 222
gen: 397,	best of gen fitness: -23.348+/-14.651,	best of gen size: 255
gen: 398,	best of gen fitness: -27.302+/-25.172,	best of gen size: 189
gen: 399,	best of gen fitness: -17.866+/-20.333,	best of gen size: 245
gen: 400,	best of gen fitness: -14.596+/-13.645,	best of gen size: 245
gen: 401,	best of gen fitness: -16.024+/-55.820,	best of gen size: 255
gen: 402,	best of gen fitness: -10.496+/-18.388,	best of gen size: 176
gen: 403,	best of gen fitness: -23.131+/-10.833,	best of gen size: 177
gen: 404,	best of gen fitness: -24.092+/-15.740,	best of gen size: 255
gen: 405,	best of gen fitness: -14.841+/-18.585,	best of gen size: 255
gen: 406,	best of gen fitness: -10.160+/-65.552,	best of gen size: 205
gen: 407,	best of gen fitness: -12.444+/-14.957,	best of gen size: 195
gen: 408,	best of gen fitness: -16.562+/-8.627,	best of gen size: 248
gen: 40

  return np.square(c_outs[0])


gen: 460,	best of gen fitness: -6.936+/-4.348,	best of gen size: 234
gen: 461,	best of gen fitness: -0.160+/-24.774,	best of gen size: 238
gen: 462,	best of gen fitness: -3.201+/-58.298,	best of gen size: 240
gen: 463,	best of gen fitness: 2.656+/-23.025,	best of gen size: 238
gen: 464,	best of gen fitness: -15.291+/-30.980,	best of gen size: 218
gen: 465,	best of gen fitness: -11.747+/-19.665,	best of gen size: 240
gen: 466,	best of gen fitness: -11.382+/-10.292,	best of gen size: 240
gen: 467,	best of gen fitness: -9.078+/-21.766,	best of gen size: 219
gen: 468,	best of gen fitness: -12.394+/-29.896,	best of gen size: 231
gen: 469,	best of gen fitness: -11.653+/-21.639,	best of gen size: 215
gen: 470,	best of gen fitness: -7.509+/-27.422,	best of gen size: 252
gen: 471,	best of gen fitness: 0.333+/-13.548,	best of gen size: 227
gen: 472,	best of gen fitness: -7.840+/-29.777,	best of gen size: 184
gen: 473,	best of gen fitness: -14.539+/-17.424,	best of gen size: 209
gen: 474,	best of

In [9]:
# import pickle

# dir_name = 'selection_experiments'
# specific_experiment_name = "new_elitist_pops64_gens50_mts32_cor0.5_mutr0.5_coeffr0.5"


# with open(f"./{dir_name}/{specific_experiment_name}/evolution_class.pickle", "rb") as f:
#     evo_baseline = pickle.load(f)

#     generation_evo_fitnesses, generation_test_fitnesses = save_and_evaluate_evo_generations(evo_baseline, fitness_function_pt, specific_experiment_name, num_episodes=5, dir_name=dir_name)
#     plot_evo_test_fitnesses(generation_evo_fitnesses, generation_test_fitnesses, specific_experiment_name, dir_name)

### Run multiple experiments and plot

In [10]:
# NUM_EXPERIMENTS = 2
# SAVE_DIR = "./plots/plot_data"
# experiment_name = "plot_test"

# evo_settings = {
#     "fitness_function": fitness_function_pt,
#     "internal_nodes": [Plus(), Minus(), Times(), Div()],
#     "leaf_nodes": [Feature(i) for i in range(num_features)] + [Constant()],
#     "n_trees": 4,
#     "pop_size": 8,
#     "max_gens": 10,
#     "init_max_depth": 4,
#     "max_tree_size": 32,
#     "crossovers": [{"fun": subtree_crossover, "rate": 0.5}],
#     "mutations": [{"fun": subtree_mutation, "rate": 0.5}],
#     "coeff_opts": [{"fun": coeff_mutation, "rate": 0.5}],
#     "selection": {"fun": tournament_selection, "kwargs": {"tournament_size": 8}},
#     "n_jobs": 8,
#     "verbose": True
# }

# os.makedirs(f"./experiments/{experiment_name}", exist_ok=True)
# with open(f"./experiments/{experiment_name}/evo_settings.json", "w") as f:
#     serialized_dict = serialize_functions_in_dict(deepcopy(evo_settings))
#     json.dump(serialized_dict, f)

# def run_multiple_evolutions(num_experiments, save_path):
#     fitnesses = []
    
#     for _ in range(num_experiments):
#         evo_baseline = Evolution(**evo_settings)
#         evo_baseline.evolve()
#         fitness = save_and_evaluate_evo_generations(evo_baseline, fitness_function_pt)
#         fitnesses.append(fitness)
    
#     np.save(save_path, fitnesses)

# save_path = f"{SAVE_DIR}/{experiment_name}.npy"

# run_multiple_evolutions(NUM_EXPERIMENTS, save_path)

## Evolve
Running this cell will use all the settings above as parameters

# Test

## Make an animation
Here the best evolved individual is selected and one episode is rendered. Make sure to save your lunar landers over time to track progress and make comparisons.

In [11]:
# # gist to save gif from https://gist.github.com/botforge/64cbb71780e6208172bbf03cd9293553
# def save_frames_as_gif(frames, path="./", filename="evolved_lander.gif"):
#     plt.figure(figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0), dpi=72)
#     patch = plt.imshow(frames[0])
#     plt.axis("off")

#     def animate(i):
#         patch.set_data(frames[i])

#     anim = animation.FuncAnimation(plt.gcf(), animate, frames=len(frames), interval=50)
#     anim.save(path + filename, writer="imagemagick", fps=60)


# frames = []
# avg_fitness, frames = get_test_score(evo.best_of_gens[-1], num_episodes=5, episode_duration=300, seed=5, render=True)
# print("Average fitness of the render is: ", avg_fitness)
# env.close()
# save_frames_as_gif(frames)

## Play animation

<img src="evolved_lander.gif" width="750">

## Optimisation
The coefficients in the multi-tree aren't optimised. Here Q-learning (taken from https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html) is used to optimise the weights further. Incorporate coefficient optimisation in training your agent(s). Coefficient Optimisation can be expensive. Think about how often you want to optimise, when, which individuals etc.

In [12]:
# batch_size = 128
# GAMMA = 0.99
# # best = evo.best_of_gens[-1]
# constants = best.get_subtrees_consts()

# if len(constants) > 0:
#     optimizer = optim.AdamW(constants, lr=1e-3, amsgrad=True)

# for _ in range(500):
#     if len(constants) > 0 and len(evo.memory) > batch_size:
#         target_tree = copy.deepcopy(best)

#         transitions = evo.memory.sample(batch_size)
#         batch = Transition(*zip(*transitions))

#         non_final_mask = torch.tensor(
#             tuple(map(lambda s: s is not None, batch.next_state)), dtype=torch.bool
#         )

#         non_final_next_states = torch.cat(
#             [s for s in batch.next_state if s is not None]
#         )
#         state_batch = torch.cat(batch.state)
#         action_batch = torch.cat(batch.action)
#         reward_batch = torch.cat(batch.reward)

#         state_action_values = best.get_output_pt(state_batch).gather(1, action_batch)
#         next_state_values = torch.zeros(batch_size, dtype=torch.float)
#         with torch.no_grad():
#             next_state_values[non_final_mask] = (
#                 target_tree.get_output_pt(non_final_next_states).max(1)[0].float()
#             )

#         expected_state_action_values = (next_state_values * GAMMA) + reward_batch

#         criterion = nn.SmoothL1Loss()
#         loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

#         # Optimize the model
#         optimizer.zero_grad()
#         loss.backward()
#         torch.nn.utils.clip_grad_value_(constants, 100)
#         optimizer.step()

# print(best.get_readable_repr())
# print(get_test_score(best))

In [13]:
# frames = []
# fitness_function_pt(
#     best, num_episodes=1, episode_duration=500, render=True, ignore_done=False
# )
# env.close()
# save_frames_as_gif(frames, filename="evolved_lander_RL.gif")

<img src="evolved_lander_RL.gif" width="750">