# Finding short superpermutations for n=5

In [1]:
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv
from GymPermutationsEnv import GymPermutationEnv

In [2]:
import permutation_utils

# Number of distinct symbols; every other quantity in this notebook derives from it.
alphabet_size = 5

# NOTE(review): 153 is presumably the target superpermutation length for n=5
# (153 = 1! + 2! + 3! + 4! + 5!) -- confirm against
# permutation_utils.get_max_possible_reward. It is specific to alphabet_size == 5
# and must be updated together with it.
target_length = 153

# Use alphabet_size rather than repeating the literal 5 so the reward bound
# cannot silently drift from the environment configuration.
max_reward_unscaled = permutation_utils.get_max_possible_reward(alphabet_size, target_length)
max_reward_scaled = max_reward_unscaled / alphabet_size

In [3]:
import math

# n! permutations of n symbols each: the length of the plain concatenation of
# all permutations, i.e. a trivial upper bound on a superpermutation's length.
alphabet_size * math.factorial(alphabet_size)

600

In [4]:
import sys
from typing import Any, Dict, Tuple, Union
import mlflow
import numpy as np

from stable_baselines3.common.logger import HumanOutputFormat, KVWriter, Logger

class MLflowOutputFormat(KVWriter):
    """
    Dumps key/value pairs into MLflow's numeric format.

    Only scalar, non-string values are forwarded (via ``mlflow.log_metric``);
    everything else is silently skipped.
    """

    def write(
        self,
        key_values: Dict[str, Any],
        key_excluded: Dict[str, Union[str, Tuple[str, ...]]],
        step: int = 0,
    ) -> None:
        """
        Log every eligible entry of ``key_values`` as an MLflow metric.

        :param key_values: logger entries, keyed by metric name
        :param key_excluded: for each key, the output formats that must not
            receive it; an entry listing ``"mlflow"`` is skipped here
        :param step: step value passed on to ``mlflow.log_metric``
        """
        for key, value in sorted(key_values.items()):
            # Look the exclusion up by key instead of zipping two independently
            # sorted dicts: a positional zip silently misaligns (or truncates)
            # as soon as the two key sets differ.
            excluded = key_excluded.get(key)
            if excluded is not None and "mlflow" in excluded:
                continue

            # np.ScalarType also contains ``str``, which is not a metric value.
            if isinstance(value, str) or not isinstance(value, np.ScalarType):
                continue

            mlflow.log_metric(key, value, step)

    def close(self) -> None:
        """
        Nothing to release: this writer owns no file handle or connection.

        NOTE(review): defined so that closing the owning ``Logger`` does not hit
        the base-class ``KVWriter.close`` (which raises ``NotImplementedError``
        in stable-baselines3) -- confirm against the installed version.
        """


# Send MLflow traffic to the tracking server on localhost, port 8080.
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

# Logger with two sinks: a human-readable table on stdout and the MLflow
# writer defined above. No log folder is configured.
loggers = Logger(
    folder=None,
    output_formats=[
        HumanOutputFormat(sys.stdout),
        MLflowOutputFormat(),
    ],
)

In [5]:
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

# Instantiate the env
# Training uses 16 subprocess workers; evaluation gets its own vectorised env
# built the same way, with n_envs left at make_vec_env's default.
vec_env = make_vec_env(
    GymPermutationEnv,
    n_envs=16,
    env_kwargs={"alphabet_size": alphabet_size},
    vec_env_cls=SubprocVecEnv,
)
eval_env = make_vec_env(
    GymPermutationEnv,
    env_kwargs={"alphabet_size": alphabet_size},
    vec_env_cls=SubprocVecEnv,
)

# Set up hyperparameters
# More conservative settings to counter policy collapse
hp_policy_type = "MlpPolicy"
hp_learning_rate = 5e-5
hp_clip_range = 0.1
hp_batch_size = 128
hp_n_steps = 4096  # per env: 16 envs * 4096 steps = 65536 transitions per PPO iteration
hp_seed = 42  # Not really a hyperparameter, unless we're extremely unlucky...
hp_training_timesteps = 2e8  # float literal; converted with int() when passed to learn()

# Evaluation callback: whenever a new best mean reward is found, the nested
# callback stops training if that reward has reached max_reward_scaled.
callback_on_best = StopTrainingOnRewardThreshold(
    reward_threshold=max_reward_scaled,
    verbose=1,
)
eval_callback = EvalCallback(
    eval_env,
    callback_on_new_best=callback_on_best,
    verbose=1,
    n_eval_episodes=20,
    deterministic=False,
)

# Train the agent
with mlflow.start_run():
    # Record the run configuration, one parameter at a time, in a fixed order.
    for param_name, param_value in (
        ("alphabet_size", alphabet_size),
        ("policy_type", hp_policy_type),
        ("learning_rate", hp_learning_rate),
        ("clip_range", hp_clip_range),
        ("batch_size", hp_batch_size),
        ("n_steps", hp_n_steps),
        ("seed", hp_seed),
        ("training_timesteps", hp_training_timesteps),
    ):
        mlflow.log_param(param_name, param_value)

    model = PPO(
        policy=hp_policy_type,
        env=vec_env,
        learning_rate=hp_learning_rate,
        n_steps=hp_n_steps,
        batch_size=hp_batch_size,
        clip_range=hp_clip_range,
        seed=hp_seed,
        verbose=1,
    )
    # Set custom logger
    model.set_logger(loggers)
    model.learn(total_timesteps=int(hp_training_timesteps), callback=eval_callback)


Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 154      |
|    ep_rew_mean     | -48.4    |
| time/              |          |
|    fps             | 1985     |
|    iterations      | 1        |
|    time_elapsed    | 33       |
|    total_timesteps | 65536    |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 154          |
|    ep_rew_mean          | -48.7        |
| time/                   |              |
|    fps                  | 1256         |
|    iterations           | 2            |
|    time_elapsed         | 104          |
|    total_timesteps      | 131072       |
| train/                  |              |
|    approx_kl            | 0.0024733394 |
|    clip_fraction        | 0.0759       |
|    clip_range           | 0.1          |
|    entropy_loss         | -4.79        |
|    explained_variance   | -0.0165      

KeyboardInterrupt: 

In [None]:
# Test the trained agent
# Roll out a single episode with the deterministic policy in a fresh one-env VecEnv.
vec_env = make_vec_env(GymPermutationEnv, n_envs=1, env_kwargs=dict(alphabet_size=alphabet_size))
obs = vec_env.reset()

# Step budget. The previous budget of n! (120 for n=5) was shorter than the
# ~154-step episodes reported during training, so the loop always ran out
# before `done` was ever seen. n! * n is the length of the plain concatenation
# of all permutations, so no superpermutation needs more symbols than that;
# the `break` below still stops the loop as soon as the episode ends.
# NOTE(review): assumes one env step appends one symbol -- confirm in GymPermutationEnv.
n_steps = math.factorial(alphabet_size) * alphabet_size
for step in range(n_steps):
    action, _ = model.predict(obs, deterministic=True)
    print(f"Step {step + 1}")
    print("Action: ", action)
    # VecEnv API: every returned value is batched (length-1 arrays/lists here).
    obs, reward, done, info = vec_env.step(action)
    print("obs=", obs, "reward=", reward, "done=", done)
    vec_env.render()
    if done:
        # Note that the VecEnv resets automatically
        # when a done signal is encountered
        # NOTE(review): `done` may also signal a step-limit truncation, in which
        # case this message overstates the outcome -- confirm in GymPermutationEnv.
        print("Goal reached!", "reward=", reward)
        break