# This code demonstrates the value of hyperparameter tuning (example Gym environment: CartPole-v1)
![TuningReward.JPG](attachment:TuningReward.JPG)

### Code written by Chirag Mirani, March 4, 2022. 

In [11]:

# importing library
import numpy as np
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3 import PPO
import optuna
import gym
from torch import nn as nn
import joblib


In [12]:
# callback class for Optuna early stopping
# Reference: https://medium.com/analytics-vidhya/hyperparameter-tuning-using-optuna-for-finrl-8a49506d2741
class LoggingCallback:
    """Optuna callback that stops a study once its best value stops changing.

    After each trial the study's current best value is compared with the best
    value stored by the previous invocation (kept in the study user attribute
    ``previous_best_value``). Trials for which the change is smaller than
    ``threshold`` are collected in ``cb_list``; once more than ``patience``
    such trials have accumulated, ``study.stop()`` is called.
    """

    def __init__(self, threshold, trial_number, patience):
        """Store the early-stopping settings.

        threshold: minimum absolute change in the study's best value that
            counts as progress; a smaller change counts as a plateau.
        trial_number: plateau checks only apply to trials whose number is
            greater than this value.
        patience: number of plateau trials tolerated; the study is stopped
            when more than this many have been recorded.
        """
        self.threshold = threshold
        self.trial_number = trial_number
        self.patience = patience
        self.cb_list = []  # numbers of the trials counted as plateau trials

    def __call__(self, study: "optuna.study.Study",
                 frozen_trial: "optuna.trial.FrozenTrial"):
        """Record the current best value and stop the study on a plateau.

        study: the running study; its ``best_value``, ``user_attrs``,
            ``set_user_attr`` and ``stop`` are used.
        frozen_trial: the trial that just finished; its ``number`` and
            ``value`` are used.
        """
        try:
            current_best = study.best_value
        except ValueError:
            # NOTE(review): Optuna is expected to raise ValueError here while
            # no trial has completed (e.g. the first trials all failed);
            # there is nothing to compare yet.
            return

        # Read the value stored by the previous call *before* overwriting it;
        # otherwise the comparison below would always see two equal numbers.
        previous_best_value = study.user_attrs.get("previous_best_value", None)
        study.set_user_attr("previous_best_value", current_best)

        if previous_best_value is None:
            # First call for this study: no earlier best value to compare to.
            return
        if frozen_trial.number <= self.trial_number:
            # Still within the minimum number of trials.
            return
        if previous_best_value * current_best < 0:
            # The best value changed sign, which is treated as progress.
            return
        if abs(previous_best_value - current_best) >= self.threshold:
            return

        self.cb_list.append(frozen_trial.number)
        # Stop once the plateau has lasted longer than the allowed patience.
        if len(self.cb_list) > self.patience:
            print('The study stops now...')
            print('With number', frozen_trial.number, 'and value ', frozen_trial.value)
            print('The previous and current best values are {} and {} respectively'
                  .format(previous_best_value, current_best))
            study.stop()


In [13]:
# Worker count for a SubprocVecEnv-based setup. The training code visible in
# this file builds a single gym environment instead (the SubprocVecEnv line is
# commented out), so this value is not currently used.
n_cpu = 8

## define PPO hyperparameters

def optimize_ppo(trial):
    """Sample one set of PPO hyperparameters from an Optuna trial.

    trial: object exposing ``suggest_categorical`` and ``suggest_float``
        (an Optuna trial).

    Returns a dict of keyword arguments that is unpacked into the ``PPO``
    constructor by the callers in this file. The network size and activation
    are sampled as string labels and mapped to the actual layer lists /
    ``torch.nn`` classes before being placed under ``policy_kwargs``.
    """
    # Sample the labels first (same order as before, so a seeded sampler
    # draws the parameters in the same sequence).
    net_arch_name = trial.suggest_categorical("net_arch", ["small", "medium"])
    activation_fn_name = trial.suggest_categorical("activation_fn", ["tanh", "relu"])

    net_arch = {
        "small": [dict(pi=[64, 64], vf=[64, 64])],
        "medium": [dict(pi=[256, 256], vf=[256, 256])],
    }[net_arch_name]
    # Only "tanh" and "relu" are sampled above; the other entries are kept so
    # that labels coming from elsewhere still resolve.
    activation_fn = {
        "tanh": nn.Tanh,
        "relu": nn.ReLU,
        "elu": nn.ELU,
        "leaky_relu": nn.LeakyReLU,
    }[activation_fn_name]
    ortho_init = False

    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform,
    # and plain suggest_float replaces the deprecated suggest_uniform.
    return {
        'gamma': trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999]),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1., log=True),
        'n_steps': trial.suggest_categorical("n_steps", [8, 16, 32, 64, 128, 256, 512, 1024, 2048]),
        'n_epochs': trial.suggest_categorical("n_epochs", [1, 5, 10, 20]),
        'batch_size': trial.suggest_categorical("batch_size", [8, 16, 32, 64, 128, 256, 512]),
        'clip_range': trial.suggest_categorical("clip_range", [0.1, 0.2, 0.3, 0.4]),
        'vf_coef': trial.suggest_float("vf_coef", 0, 1),
        'ent_coef': trial.suggest_float("ent_coef", 0.00000001, 0.1, log=True),
        'max_grad_norm': trial.suggest_categorical("max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5]),
        'gae_lambda': trial.suggest_categorical("gae_lambda", [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0]),
        "policy_kwargs": dict(
            net_arch=net_arch,
            activation_fn=activation_fn,
            ortho_init=ortho_init,
        ),
    }

# define optimization agent for Optuna

def optimize_agent(trial, total_timesteps=10000, n_eval_episodes=4):
    """Optuna objective: train PPO on CartPole-v1 and return its mean reward.

    The reward is returned as-is (not negated), so the study using this
    objective must be created with ``direction='maximize'``.

    trial: Optuna trial used to sample the hyperparameters (via
        ``optimize_ppo``) and to name the saved model file.
    total_timesteps: number of environment steps passed to ``model.learn``.
    n_eval_episodes: number of deterministic evaluation episodes that are
        averaged into the returned score.

    Returns the mean of the per-episode reward sums.
    Raises ValueError if ``n_eval_episodes`` is smaller than 1.

    Side effect: the trained model is saved as ``PPO_<trial.number>.pth`` in
    the current working directory.
    """
    if n_eval_episodes < 1:
        raise ValueError("n_eval_episodes must be at least 1")

    model_params = optimize_ppo(trial)
    # To collect rollouts in parallel, a vectorised env could be used instead:
    # env = SubprocVecEnv([lambda: gym.make('CartPole-v1') for i in range(n_cpu)])
    env = gym.make('CartPole-v1')
    try:
        model = PPO('MlpPolicy', env, verbose=0, **model_params)
        model.learn(total_timesteps)
        model.save('PPO_{}.pth'.format(trial.number))

        rewards = []
        reward_sum = 0.0

        # Uses the gym API in which reset() returns only the observation and
        # step() returns a 4-tuple, matching the rest of this file.
        obs = env.reset()
        while len(rewards) < n_eval_episodes:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, done, _ = env.step(action)
            reward_sum += reward

            if done:
                rewards.append(reward_sum)
                reward_sum = 0.0
                obs = env.reset()
    finally:
        # Release the environment even when training or evaluation fails.
        env.close()

    return np.mean(rewards)


In [14]:
# Early-stopping callback: plateau threshold 1e-5, checks start after trial 5,
# patience of 30 plateau trials.
# NOTE(review): the optimize call in this file runs only 10 trials, so a
# patience of 30 looks unreachable - confirm the intended values.
logging_callback = LoggingCallback(threshold=1e-5, patience=30, trial_number=5)

# TPE sampler with a fixed seed for the hyperparameter draws.
sampler = optuna.samplers.TPESampler(seed=42)

# In-memory study that maximises the objective.
# NOTE(review): a HyperbandPruner is attached, but the objective in this file
# never reports intermediate values, so the pruner presumably never triggers.
study = optuna.create_study(
    study_name="PPO_study",
    direction='maximize',
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)


[32m[I 2022-03-07 06:52:20,715][0m A new study created in memory with name: PPO_study[0m


In [15]:
# Run ten trials of the objective. A ValueError raised inside a trial is
# caught by Optuna instead of aborting the study, and the early-stopping
# callback is invoked after every trial.
study.optimize(
    optimize_agent,
    n_trials=10,
    catch=(ValueError,),
    callbacks=[logging_callback],
)


[32m[I 2022-03-07 07:01:11,328][0m Trial 0 finished with value: 9.5 and parameters: {'net_arch': 'medium', 'activation_fn': 'tanh', 'gamma': 0.99, 'learning_rate': 0.7072114131472232, 'n_steps': 8, 'n_epochs': 20, 'batch_size': 8, 'clip_range': 0.3, 'vf_coef': 0.3046137691733707, 'ent_coef': 4.827305651975693e-08, 'max_grad_norm': 0.9, 'gae_lambda': 0.95}. Best is trial 0 with value: 9.5.[0m
[32m[I 2022-03-07 07:02:14,266][0m Trial 1 finished with value: 500.0 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.995, 'learning_rate': 0.005169879442298541, 'n_steps': 64, 'n_epochs': 5, 'batch_size': 16, 'clip_range': 0.3, 'vf_coef': 0.1195942459383017, 'ent_coef': 0.0009833622008382904, 'max_grad_norm': 0.6, 'gae_lambda': 0.95}. Best is trial 1 with value: 500.0.[0m
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=16 and n_envs=1)
[32m[I 2022-03-07 07:02:34,988][0m Trial 2 finished with value: 51.0 and parameters: {'n

In [16]:
# Print the best trial (number, reward, hyperparameters) and pickle the study.
trial = study.best_trial
print(trial.number)
print('Reward: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))
# NOTE(review): the file name says "ddpg" although this study tunes PPO.
joblib.dump(study, "final_ddpg_study__.pkl")

1
Reward: 500.0
Best hyperparameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.995, 'learning_rate': 0.005169879442298541, 'n_steps': 64, 'n_epochs': 5, 'batch_size': 16, 'clip_range': 0.3, 'vf_coef': 0.1195942459383017, 'ent_coef': 0.0009833622008382904, 'max_grad_norm': 0.6, 'gae_lambda': 0.95}


['final_ddpg_study__.pkl']

In [17]:
# Reload the model saved by the best trial and score it over five rendered,
# deterministic episodes.
env = gym.make('CartPole-v1')
# The objective saved one model per trial as PPO_<trial number>.pth.
tuned_model_PPO = PPO.load('PPO_{}.pth'.format(study.best_trial.number), env=env)

rewards = []
n_episodes, reward_sum = 0, 0.0

try:
    obs = env.reset()
    while n_episodes < 5:
        env.render()
        action, _ = tuned_model_PPO.predict(obs, deterministic=True)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward

        if done:
            rewards.append(reward_sum)
            reward_sum = 0.0
            n_episodes += 1
            obs = env.reset()
finally:
    # Close the environment exactly once, even if an episode fails.
    env.close()

# Mean of the per-episode reward sums.
last_reward = np.mean(rewards)
print(last_reward)

500.0


In [18]:
# Baseline: train PPO with its default hyperparameters for the same number of
# steps (10,000) that each tuning trial used.
env = gym.make('CartPole-v1')
nontuned_model_PPO = PPO(policy='MlpPolicy', env=env, verbose=0)
nontuned_model_PPO.learn(total_timesteps=10000)
    


<stable_baselines3.ppo.ppo.PPO at 0x20d96e29730>

In [19]:
# Score the untuned model over five rendered, deterministic episodes, reusing
# the environment created when the model was trained.
rewards = []
n_episodes, reward_sum = 0, 0.0

obs = env.reset()
env.render()
while n_episodes < 5:
    env.render()
    action, _ = nontuned_model_PPO.predict(obs, deterministic=True)
    obs, reward, done, _ = env.step(action)
    reward_sum += reward

    if not done:
        continue

    # Episode finished: store its total and start the next one.
    rewards.append(reward_sum)
    reward_sum = 0.0
    n_episodes += 1
    obs = env.reset()

env.close()
# Mean of the per-episode reward sums.
last_reward = np.mean(rewards)
print(last_reward)


347.4


In [21]:
# Reference score: five rendered episodes with actions sampled uniformly from
# the action space instead of coming from a trained model.
# NOTE(review): `env` was already closed at the end of the previous cell and
# is reset again here without being re-created.
rewards = []
n_episodes, reward_sum = 0, 0.0

obs = env.reset()
env.render()
while n_episodes < 5:
    env.render()
    random_action = env.action_space.sample()
    obs, reward, done, _ = env.step(random_action)
    reward_sum += reward

    if not done:
        continue

    # Episode finished: store its total and start the next one.
    rewards.append(reward_sum)
    reward_sum = 0.0
    n_episodes += 1
    obs = env.reset()
env.close()
# Mean of the per-episode reward sums.
last_reward = np.mean(rewards)
print(last_reward)


28.2


In [None]:
# Retrain a fresh PPO model using the best trial's hyperparameters
# (10,000 steps) and score it over five rendered, deterministic episodes.
env = gym.make('CartPole-v1')
trial = study.best_trial
print(trial.number)
# Rebuild the PPO keyword arguments from the finished best trial.
# NOTE(review): presumably the suggest_* calls on a finished trial return its
# recorded values rather than sampling new ones - confirm against Optuna docs.
model_params = optimize_ppo(trial)

rewards = []
n_episodes, reward_sum = 0, 0.0

try:
    best_trial_parameters_model = PPO('MlpPolicy', env, verbose=0, **model_params)
    best_trial_parameters_model.learn(10000)

    obs = env.reset()
    while n_episodes < 5:
        env.render()
        action, _ = best_trial_parameters_model.predict(obs, deterministic=True)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward

        if done:
            rewards.append(reward_sum)
            reward_sum = 0.0
            n_episodes += 1
            obs = env.reset()
finally:
    # Close the environment exactly once, even if training or an episode fails.
    env.close()

# Mean of the per-episode reward sums.
last_reward = np.mean(rewards)
print(last_reward)

In [22]:
# Retrain a fresh PPO model using the best trial's hyperparameters for a
# longer run (30,000 steps) and score it over five rendered, deterministic
# episodes.
env = gym.make('CartPole-v1')
trial = study.best_trial
print(trial.number)
# Rebuild the PPO keyword arguments from the finished best trial.
# NOTE(review): presumably the suggest_* calls on a finished trial return its
# recorded values rather than sampling new ones - confirm against Optuna docs.
model_params = optimize_ppo(trial)

rewards = []
n_episodes, reward_sum = 0, 0.0

try:
    best_trial_parameters_model = PPO('MlpPolicy', env, verbose=0, **model_params)
    best_trial_parameters_model.learn(30000)

    obs = env.reset()
    while n_episodes < 5:
        env.render()
        action, _ = best_trial_parameters_model.predict(obs, deterministic=True)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward

        if done:
            rewards.append(reward_sum)
            reward_sum = 0.0
            n_episodes += 1
            obs = env.reset()
finally:
    # Close the environment exactly once, even if training or an episode fails.
    env.close()

# Mean of the per-episode reward sums.
last_reward = np.mean(rewards)
print(last_reward)

1
154.6
