# Finding short superpermutations for n=5

In [1]:
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv
from GymPermutationsEnv import GymPermutationEnv

In [2]:
import permutation_utils

alphabet_size = 5

# 153 is the reference length the result at the end of this notebook is compared against.
# It is specific to alphabet_size == 5 and must be changed together with alphabet_size.
# NOTE(review): the second argument is presumably the target superpermutation length -
# confirm against permutation_utils.get_max_possible_reward.
max_reward_unscaled = permutation_utils.get_max_possible_reward(alphabet_size, 153)

# Used below as the reward threshold that stops training.
# NOTE(review): dividing by alphabet_size presumably mirrors the reward scaling done
# inside the environment - confirm against GymPermutationEnv.
max_reward_scaled = max_reward_unscaled / alphabet_size

In [3]:
import math

# n * n!: the length obtained by writing all n! permutations back to back with no overlap.
alphabet_size * math.factorial(alphabet_size)

600

In [4]:
import sys
from typing import Any, Dict, Tuple, Union
import mlflow
import numpy as np

from stable_baselines3.common.logger import HumanOutputFormat, KVWriter, Logger

class MLflowOutputFormat(KVWriter):
    """
    Dumps key/value pairs into MLflow's numeric format.

    Only scalar, non-string values are forwarded, each through
    ``mlflow.log_metric``; everything else is silently skipped.
    """

    def write(
        self,
        key_values: Dict[str, Any],
        key_excluded: Dict[str, Union[str, Tuple[str, ...]]],
        step: int = 0,
    ) -> None:
        """
        Log every eligible entry of ``key_values`` as an MLflow metric.

        :param key_values: mapping of metric name to recorded value
        :param key_excluded: mapping of metric name to the output format(s)
            the metric must not be written to (a single name or a tuple of names)
        :param step: step passed to ``mlflow.log_metric`` for every logged metric
        """
        for key, value in sorted(key_values.items()):
            # Look the exclusion up by key instead of zipping two sorted dicts:
            # zipping silently pairs the wrong entries if the key sets ever differ.
            excluded = key_excluded.get(key)

            # A bare string would make `in` a substring test; normalise to a tuple
            # so that only an exact "mlflow" entry excludes the metric.
            if isinstance(excluded, str):
                excluded = (excluded,)

            if excluded is not None and "mlflow" in excluded:
                continue

            # np.ScalarType also contains str, which cannot be logged as a metric.
            if isinstance(value, np.ScalarType) and not isinstance(value, str):
                mlflow.log_metric(key, value, step)


# Point MLflow at the local tracking server.
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

# Logger handed to the model later on: writes to stdout and forwards metrics to MLflow.
loggers = Logger(
    output_formats=[
        HumanOutputFormat(sys.stdout),
        MLflowOutputFormat(),
    ],
    folder=None,
)

In [6]:
from stable_baselines3.common.callbacks import BaseCallback, CallbackList

class PermutationLogCallback(BaseCallback):
    """
    Tracks the shortest superpermutation reported through the step ``infos``.

    After every step, each info dict carrying both ``"superpermutation"`` and
    ``"superpermutation_length"`` is compared against the best seen so far, and
    the current best length and sequence are recorded on the logger.
    """

    def __init__(self, verbose=0):
        super().__init__(verbose)
        # -1 marks "nothing found yet"; the sequence becomes a space-separated
        # string once the first superpermutation is seen.
        self.best_permutation_length = -1
        self.best_permutation = []

    def _on_step(self) -> bool:
        """Update the running best from this step's infos; always continue training."""
        for step_info in self.locals.get("infos", []):
            # Skip infos that do not carry a complete superpermutation report.
            if "superpermutation" not in step_info or "superpermutation_length" not in step_info:
                continue

            nothing_found_yet = self.best_permutation_length == -1
            if nothing_found_yet or step_info["superpermutation_length"] < self.best_permutation_length:
                self.best_permutation_length = step_info["superpermutation_length"]
                self.best_permutation = " ".join(
                    str(symbol) for symbol in step_info["superpermutation"]
                )

            self.logger.record(
                "superpermutation/best_superpermutation_length",
                self.best_permutation_length,
            )
            self.logger.record(
                "superpermutation/best_superpermutation",
                self.best_permutation,
            )
        return True


*Note: the logs from the run below were produced before I fixed a typo (`permutation` → `superpermutation`).*

In [7]:
# Training dependencies
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

# Environments: 16 subprocess workers for rollouts, plus a separate evaluation env
vec_env = make_vec_env(
    GymPermutationEnv,
    n_envs=16,
    env_kwargs={"alphabet_size": alphabet_size},
    vec_env_cls=SubprocVecEnv,
)
eval_env = make_vec_env(
    GymPermutationEnv,
    env_kwargs={"alphabet_size": alphabet_size},
    vec_env_cls=SubprocVecEnv,
)

# Hyperparameters - deliberately conservative to counter policy collapse
hp_policy_type = "MlpPolicy"
hp_learning_rate = 5e-5
hp_clip_range = 0.1
hp_batch_size = 128
hp_n_steps = 4096
hp_seed = 42  # Not really a hyperparameter, unless we're extremely unlucky...
hp_training_timesteps = 2e8

# Callbacks: stop once evaluation reaches the maximum scaled reward, and keep
# track of the shortest superpermutation seen during training
callback_on_best = StopTrainingOnRewardThreshold(
    reward_threshold=max_reward_scaled,
    verbose=1,
)
eval_callback = EvalCallback(
    eval_env,
    callback_on_new_best=callback_on_best,
    n_eval_episodes=20,
    deterministic=False,
    verbose=1,
)
permutation_metrics_callback = PermutationLogCallback()
callback_list = CallbackList([eval_callback, permutation_metrics_callback])

with mlflow.start_run():
    # Record the run configuration, one parameter at a time, in a fixed order
    for _param_name, _param_value in (
        ("alphabet_size", alphabet_size),
        ("policy_type", hp_policy_type),
        ("learning_rate", hp_learning_rate),
        ("clip_range", hp_clip_range),
        ("batch_size", hp_batch_size),
        ("n_steps", hp_n_steps),
        ("seed", hp_seed),
        ("training_timesteps", hp_training_timesteps),
    ):
        mlflow.log_param(_param_name, _param_value)

    model = PPO(
        hp_policy_type,
        vec_env,
        learning_rate=hp_learning_rate,
        n_steps=hp_n_steps,
        batch_size=hp_batch_size,
        clip_range=hp_clip_range,
        seed=hp_seed,
        verbose=1,
    )
    # Route training logs through the custom stdout + MLflow logger
    model.set_logger(loggers)
    # hp_training_timesteps is a float (2e8), hence the int() conversion
    model.learn(int(hp_training_timesteps), callback=callback_list)


Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 154      |
|    ep_rew_mean     | -48.4    |
| time/              |          |
|    fps             | 2092     |
|    iterations      | 1        |
|    time_elapsed    | 31       |
|    total_timesteps | 65536    |
| train/             |          |
|    learning_rate   | 5e-05    |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 154          |
|    ep_rew_mean          | -48.7        |
| time/                   |              |
|    fps                  | 1273         |
|    iterations           | 2            |
|    time_elapsed         | 102          |
|    total_timesteps      | 131072       |
| train/                  |              |
|    approx_kl            | 0.0024733394 |
|    clip_fraction        | 0.0759       |
|    clip_range           | 0.1          |
|    entropy_los

KeyboardInterrupt: 

In [8]:
# Shortest superpermutation seen during training: its length, then its symbols.
print(
    permutation_metrics_callback.best_permutation_length,
    permutation_metrics_callback.best_permutation,
    sep="\n",
)

179
2 3 5 4 1 2 5 3 4 1 2 5 4 3 1 2 4 5 3 1 4 2 5 3 1 4 5 2 3 1 5 2 4 1 3 2 5 4 1 3 5 2 4 1 5 3 2 4 1 3 5 4 2 1 3 2 4 5 1 3 4 2 5 1 3 4 5 2 1 4 3 5 2 1 4 2 3 5 1 4 3 2 5 1 4 3 5 1 2 4 3 5 1 4 2 3 1 5 4 2 3 1 4 5 3 2 1 4 5 3 2 1 5 4 3 2 1 5 2 4 3 1 5 2 3 4 1 5 2 3 1 4 5 2 1 3 4 5 1 2 3 4 5 1 3 2 4 1 5 3 4 2 1 5 3 4 2 1 3 5 4 1 2 3 5 4 5 3 1 2 4 3 1 2 5 4 1 3 2


After ~6 hours of training, the model appears to converge to a sub-optimal solution: 179 characters, compared with the 153-character target.

Interestingly, regardless of the settings tested (the PPO clip range and the learning rate), the model exhibits policy-collapse-like behavior soon after reaching what appears to be the highest mean episodic reward for that particular training run. After that, judging from the training graphs, the model cyclically recovers and then degrades again, with each degradation marked by a sharp drop in mean reward.