In [1]:
from optuna.trial import Trial
import optuna
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv, VecNormalize
from stable_baselines3.common.env_util import make_vec_env
from pink import PinkNoiseDist, PinkActionNoise
import multiprocessing
import FleetRL
from FleetRL.fleet_env.fleet_environment import FleetEnv
import optuna
import gymnasium as gym
from stable_baselines3 import PPO
import FleetRL
from stable_baselines3.common.evaluation import evaluate_policy
import time
import os

In [2]:
run_name = "ppo_full_vecnorm_clip5_hp_study_aux"

In [3]:
time_now = int(time.time())
trained_agents_dir = f"./trained/vec_ppo-{time_now}-{run_name}"
logs_dir = f"./logs/vec_ppo-{time_now}-{run_name}"

if not os.path.exists(trained_agents_dir):
    os.makedirs(trained_agents_dir)

if not os.path.exists(logs_dir):
    os.makedirs(logs_dir)

In [4]:
n_cpu = 4
train_vec_env = make_vec_env(FleetEnv,
                             n_envs=n_cpu,
                             vec_env_cls=SubprocVecEnv,
                             env_kwargs={
                                 "schedule_name": "lmd_sched_single.csv",
                                 "building_name": "load_lmd.csv",
                                 "include_building": True,
                                 "include_pv": True,
                                 "static_time_picker": False,
                                 "deg_emp": False,
                                 "include_price": True,
                                 "ignore_price_reward": False,
                                 "ignore_invalid_penalty": False,
                                 "ignore_overcharging_penalty": False,
                                 "ignore_overloading_penalty": False,
                                 "episode_length": 48,
                                 "normalize_in_env": False,
                                 "verbose": 0,
                                 "aux": True,
                                 "use_case": "lmd"
                             })

In [5]:
vec_norm_train_env = VecNormalize(venv=train_vec_env, training=True, norm_obs=True, norm_reward=True)

In [6]:
n_actions = vec_norm_train_env.action_space.shape[-1]
param_noise = None
noise_scale = 0.3
seq_len = 48 * 4
action_noise = PinkActionNoise(noise_scale, seq_len, n_actions)

In [None]:
def objective(trial: Trial):
    env = vec_norm_train_env
    gamma = trial.suggest_categorical('gamma', [0.9, 0.91, 0.95, 0.97, 0.99])
    learning_rate = trial.suggest_categorical('learning_rate', [1e-5, 5e-5, 8e-5, 1e-4, 5e-4, 8e-4, 1e-3, 3e-3, 5e-3, 1e-2, 3e-2, 5e-2, 1e-1])
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256, 512, 1024])
    n_epochs = trial.suggest_int('n_epochs', 5, 50)
    gae_lamba = trial.suggest_categorical('gae_lambda', [0.9, 0.92, 0.95, 0.97, 0.99])
    clip_range = trial.suggest_categorical('clip_range', [0.1, 0.2, 0.3, 0.4, 0.5])
    clip_range_vf = trial.suggest_categorical('clip_range_vf', [None, 0.1, 0.2, 0.3, 0.4, 0.5])
    normalize_advantage = trial.suggest_categorical('normalize_advantage', [False, True])
    ent_coef = trial.suggest_categorical('ent_coef', [0.0, 1e-6, 5e-6, 1e-5, 5e-5, 1e-4, 5e-4])
    vf_coef = trial.suggest_categorical('vf_coef', [0.1, 0.2, 0.5, 0.6, 0.7])
    max_grad_norm = trial.suggest_float('max_grad_norm', 0.3, 0.7)
    n_steps = trial.suggest_categorical('n_steps', [128, 256, 512, 1024, 2048, 4096, 8192])
    
    model = PPO(policy="MlpPolicy",
                env=vec_norm_train_env,
                verbose=0,
                learning_rate=learning_rate,
                gamma=gamma,
                batch_size=batch_size,
                tensorboard_log="./tb_log",
                n_epochs=n_epochs,
                gae_lambda=gae_lamba,
                clip_range=clip_range,
                clip_range_vf=clip_range_vf,
                normalize_advantage=normalize_advantage,
                ent_coef=ent_coef,
                vf_coef=vf_coef,
                max_grad_norm=max_grad_norm,
                n_steps=n_steps
                )

    model.learn(50000)

    mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=10, deterministic=True)
    return mean_reward

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

print(study.best_params)
print(study.best_value)
print(study.best_trial)


[I 2023-06-19 14:29:11,388] A new study created in memory with name: no-name-5d762c57-565a-44a5-a5f6-59751362998e
[I 2023-06-19 14:44:44,264] Trial 0 finished with value: -11.6647562 and parameters: {'gamma': 0.91, 'learning_rate': 8e-05, 'batch_size': 32, 'n_epochs': 18, 'gae_lambda': 0.95, 'clip_range': 0.3, 'clip_range_vf': 0.2, 'normalize_advantage': True, 'ent_coef': 1e-05, 'vf_coef': 0.7, 'max_grad_norm': 0.5755110175519809, 'n_steps': 128}. Best is trial 0 with value: -11.6647562.
[I 2023-06-19 15:00:46,471] Trial 1 finished with value: -13.155219800000001 and parameters: {'gamma': 0.95, 'learning_rate': 0.0001, 'batch_size': 128, 'n_epochs': 47, 'gae_lambda': 0.95, 'clip_range': 0.4, 'clip_range_vf': 0.2, 'normalize_advantage': False, 'ent_coef': 5e-06, 'vf_coef': 0.1, 'max_grad_norm': 0.3411990748696677, 'n_steps': 2048}. Best is trial 0 with value: -11.6647562.
[I 2023-06-19 15:16:48,119] Trial 2 finished with value: -10.8012141 and parameters: {'gamma': 0.9, 'learning_rate':