In [None]:
import os
import time

import optuna
from optuna.trial import Trial

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import (
    DummyVecEnv,
    SubprocVecEnv,
    VecNormalize,
    sync_envs_normalization,
)
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback, EvalCallback, ProgressBarCallback
from stable_baselines3.common.logger import HParam
from stable_baselines3.common.evaluation import evaluate_policy

from FleetRL.fleet_env.fleet_environment import FleetEnv

In [None]:
# Tag embedded in the names of this run's output directories.
# NOTE(review): the tag says "clip5", but the VecNormalize wrappers in this
# notebook are created with clip_reward=10.0 -- confirm the tag still matches
# the configuration being studied.
run_name = "ppo_full_vecnorm_clip5_hp_study_aux"

In [None]:
# Stamp the output folders with the current Unix time (seconds) and the run
# tag so that separate runs write to separate directories.
time_now = int(time.time())
trained_agents_dir = f"./trained/vec_ppo-{time_now}-{run_name}"
logs_dir = f"./logs/vec_ppo-{time_now}-{run_name}"

# exist_ok=True makes directory creation idempotent and removes the
# check-then-create race of the `if not os.path.exists(...)` pattern.
os.makedirs(trained_agents_dir, exist_ok=True)
os.makedirs(logs_dir, exist_ok=True)

In [None]:
# Number of parallel training environments; SubprocVecEnv gives each one its
# own worker process.
n_cpu = 4

# Vectorised FleetEnv used for training. The keyword arguments below are
# forwarded unchanged to the FleetEnv constructor.
train_vec_env = make_vec_env(
    FleetEnv,
    n_envs=n_cpu,
    vec_env_cls=SubprocVecEnv,
    env_kwargs=dict(
        schedule_name="lmd_sched_single.csv",
        building_name="load_lmd.csv",
        include_building=True,
        include_pv=True,
        time_picker="random",
        deg_emp=False,
        include_price=True,
        ignore_price_reward=False,
        ignore_invalid_penalty=False,
        ignore_overcharging_penalty=False,
        ignore_overloading_penalty=False,
        episode_length=48,
        normalize_in_env=False,  # normalisation is delegated to VecNormalize
        verbose=0,
        aux=True,
        use_case="lmd",
    ),
)

In [None]:
# Wrap the training envs so observations and rewards are normalised with
# running statistics that keep updating while training (training=True).
vec_norm_train_env = VecNormalize(
    train_vec_env,
    training=True,
    norm_obs=True,
    norm_reward=True,
    clip_reward=10.0,
)

In [None]:
# Single-environment FleetEnv used for scoring trained agents. Its settings
# mirror the training environment except for time_picker, which is "eval".
eval_vec_env = make_vec_env(
    FleetEnv,
    n_envs=1,
    vec_env_cls=SubprocVecEnv,
    env_kwargs=dict(
        schedule_name="lmd_sched_single.csv",
        building_name="load_lmd.csv",
        include_building=True,
        include_pv=True,
        time_picker="eval",
        deg_emp=False,
        include_price=True,
        ignore_price_reward=False,
        ignore_invalid_penalty=False,
        ignore_overcharging_penalty=False,
        ignore_overloading_penalty=False,
        episode_length=48,
        normalize_in_env=False,  # normalisation is delegated to VecNormalize
        verbose=0,
        aux=True,
        use_case="lmd",
    ),
)

In [None]:
# Evaluation wrapper. It must not learn its own running statistics
# (training=False), and reward normalisation is not needed at evaluation time
# (norm_reward=False). Previously both flags were True, so the policy was
# scored on observations scaled differently from the ones it was trained on.
vec_norm_eval_env = VecNormalize(venv=eval_vec_env,
                                 training=False,
                                 norm_obs=True,
                                 norm_reward=False,
                                 clip_reward=10.0)

# Reuse the training wrapper's observation statistics object. The training
# wrapper updates that object in place, so the evaluation env always
# normalises with the statistics the policy is currently trained under.
vec_norm_eval_env.obs_rms = vec_norm_train_env.obs_rms

In [None]:
class HyperParamCallback(BaseCallback):
    """
    Callback that records the model's hyperparameters once, when training
    starts, so that they are written to the TensorBoard log.
    """

    def _on_training_start(self) -> None:
        """Collect the hyperparameters from the model and record them."""
        model = self.model

        # Values shown as the hyperparameters of this run.
        hyperparameters = {
            "algorithm": model.__class__.__name__,
            "learning rate": model.learning_rate,
            "gamma": model.gamma,
            "gae lambda": model.gae_lambda,
            "batch size": model.batch_size,
            "ent_coef": model.ent_coef,
            "vf_coef": model.vf_coef,
            "max_grad_norm": model.max_grad_norm,
            "n_steps": model.n_steps
        }

        # Tags of the metrics to associate with the hyperparameters; the
        # numbers given here are only initial placeholder values.
        tracked_metrics = {
            "rollout/ep_len_mean": 0,
            "train/value_loss": 0.0
        }

        # HParam entries are only meaningful for TensorBoard, so every other
        # output format is excluded.
        self.logger.record(
            "hparams",
            HParam(hyperparameters, tracked_metrics),
            exclude=("stdout", "log", "json", "csv"),
        )

    def _on_step(self) -> bool:
        """Never interrupt training."""
        return True


In [None]:
# Callback instances created once at notebook level.
# hp_callback is the one handed to model.learn() inside objective().
hp_callback = HyperParamCallback()
# NOTE(review): progress_bar_callback is not referenced by any cell visible
# here -- confirm whether it should be passed to model.learn() or removed.
progress_bar_callback = ProgressBarCallback()

In [None]:
def objective(trial: Trial) -> float:
    """
    Optuna objective: train a PPO agent with sampled hyperparameters and
    return its evaluation reward.

    A PPO model is built on the shared ``vec_norm_train_env`` with the values
    suggested by ``trial``, trained for 50,000 timesteps, and then scored for
    one deterministic episode on ``vec_norm_eval_env``.

    :param trial: Optuna trial used to sample the hyperparameters.
    :return: Mean episode reward reported by ``evaluate_policy``.

    NOTE(review): ``vec_norm_train_env`` is shared by all trials, so its
    running normalisation statistics carry over from one trial to the next --
    confirm this is intended.
    """
    gamma = trial.suggest_categorical('gamma', [0.9, 0.91, 0.95, 0.97, 0.99])
    learning_rate = trial.suggest_categorical('learning_rate', [1e-5, 5e-5, 8e-5, 1e-4, 5e-4, 8e-4, 1e-3, 3e-3, 5e-3, 1e-2, 3e-2, 5e-2, 1e-1])
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256, 512, 1024])
    n_epochs = trial.suggest_int('n_epochs', 5, 50)
    gae_lambda = trial.suggest_categorical('gae_lambda', [0.9, 0.92, 0.95, 0.97, 0.99])
    clip_range = trial.suggest_categorical('clip_range', [0.1, 0.2, 0.3, 0.4, 0.5])
    clip_range_vf = trial.suggest_categorical('clip_range_vf', [None, 0.1, 0.2, 0.3, 0.4, 0.5])
    normalize_advantage = trial.suggest_categorical('normalize_advantage', [False, True])
    ent_coef = trial.suggest_categorical('ent_coef', [0.0, 1e-6, 5e-6, 1e-5, 5e-5, 1e-4, 5e-4])
    vf_coef = trial.suggest_categorical('vf_coef', [0.1, 0.2, 0.5, 0.6, 0.7])
    max_grad_norm = trial.suggest_float('max_grad_norm', 0.3, 0.7)
    n_steps = trial.suggest_categorical('n_steps', [128, 256, 512, 1024, 2048, 4096, 8192])

    model = PPO(policy="MlpPolicy",
                env=vec_norm_train_env,
                verbose=0,
                learning_rate=learning_rate,
                gamma=gamma,
                batch_size=batch_size,
                tensorboard_log="./tb_log_hp",
                n_epochs=n_epochs,
                gae_lambda=gae_lambda,
                clip_range=clip_range,
                clip_range_vf=clip_range_vf,
                normalize_advantage=normalize_advantage,
                ent_coef=ent_coef,
                vf_coef=vf_coef,
                max_grad_norm=max_grad_norm,
                n_steps=n_steps
                )

    model.learn(50000, callback=hp_callback)

    # Score the policy under the normalisation it was trained with: freeze the
    # evaluation wrapper so it does not update statistics of its own, disable
    # reward normalisation, and copy the training wrapper's running
    # statistics into it.
    vec_norm_eval_env.training = False
    vec_norm_eval_env.norm_reward = False
    sync_envs_normalization(vec_norm_train_env, vec_norm_eval_env)

    mean_reward, _ = evaluate_policy(model, env=vec_norm_eval_env, n_eval_episodes=1, deterministic=True)
    return mean_reward


In [None]:
# Run the hyperparameter search: 50 trials, keeping the trial with the
# highest objective value (direction='maximize').
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Report the winning hyperparameters, their score, and the full trial record.
for best_result in (study.best_params, study.best_value, study.best_trial):
    print(best_result)