# This code demonstrates value of hyperparameter tuning (Example Gym: Anytrading Gym Environment)

![TradingReturns-2.JPG](attachment:TradingReturns-2.JPG)
### Code written by Chirag Mirani, March 7, 2022. 

In [1]:

# importing library
import numpy as np
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3 import PPO
import optuna
import gym
from torch import nn as nn
import joblib
import gym_anytrading
import yfinance as yf
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from matplotlib import pyplot as plt



In [2]:
# callback class for Optuna early stopping
# Reference: https://medium.com/analytics-vidhya/hyperparameter-tuning-using-optuna-for-finrl-8a49506d2741
class LoggingCallback:
    def __init__(self,threshold,trial_number,patience):
      '''
      threshold:int tolerance for increase in reward
      trial_number: int Prune after minimum number of trials
      patience: int patience for the threshold
      '''
      self.threshold = threshold
      self.trial_number  = trial_number
      self.patience = patience
      self.cb_list = [] #Trials list for which threshold is reached
    def __call__(self,study:optuna.study, frozen_trial:optuna.Trial):
      #Setting the best value in the current trial
      study.set_user_attr("previous_best_value", study.best_value)
      
      #Checking if the minimum number of trials have pass
      if frozen_trial.number >self.trial_number:
          previous_best_value = study.user_attrs.get("previous_best_value",None)
          #Checking if the previous and current objective values have the same sign
          if previous_best_value * study.best_value >=0:
              #Checking for the threshold condition
              if abs(previous_best_value-study.best_value) < self.threshold: 
                  self.cb_list.append(frozen_trial.number)
                  #If threshold is achieved for the patience amount of time
                  if len(self.cb_list)>self.patience:
                      print('The study stops now...')
                      print('With number',frozen_trial.number ,'and value ',frozen_trial.value)
                      print('The previous and current best values are {} and {} respectively'
                              .format(previous_best_value, study.best_value))
                      study.stop()


In [3]:
#read spy data
df = yf.download('SPY', start='2018-01-01', end='2022-02-06', progress=False)
df.drop("Close", axis=1, inplace=True)
df.rename(columns = {'Adj Close':'Close'}, inplace = True)
df.head()
df.fillna(0,inplace=True)
df.drop(df.tail(1).index,inplace=True) # drop last n rows
df.shape

(1031, 5)

In [4]:
# create environment
window_size = 1
start_index = window_size
end_index = df.shape[0]
env = gym.make('stocks-v0', df=df, frame_bound=(start_index,end_index), window_size=window_size)
env_maker = lambda: env
env = DummyVecEnv([env_maker])
env = VecNormalize(env, norm_obs=True, norm_reward=True)


In [5]:
n_cpu = 8

## define PPO hyperparameters

def optimize_ppo(trial):
    """ Learning hyperparamters we want to optimise"""
    net_arch = trial.suggest_categorical("net_arch", ["small", "medium"])
    activation_fn = trial.suggest_categorical("activation_fn", ["tanh", "relu"])
    net_arch = { "small": [dict(pi=[64, 64], vf=[64, 64])], "medium": [dict(pi=[256, 256], vf=[256, 256])],}[net_arch]

    activation_fn = {"tanh": nn.Tanh, "relu": nn.ReLU, "elu": nn.ELU, "leaky_relu": nn.LeakyReLU}[activation_fn]
    ortho_init = False
    return {
         'gamma': trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999]),
         'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1.),
          'n_steps': trial.suggest_categorical("n_steps", [8, 16, 32, 64, 128, 256, 512, 1024, 2048]),
           'n_epochs': trial.suggest_categorical("n_epochs", [1, 5, 10, 20]),
            'batch_size': trial.suggest_categorical("batch_size", [8, 16, 32, 64, 128, 256, 512]),
            'clip_range': trial.suggest_categorical("clip_range", [0.1, 0.2, 0.3, 0.4]),
            'vf_coef': trial.suggest_uniform("vf_coef", 0, 1),
            'ent_coef': trial.suggest_loguniform("ent_coef", 0.00000001, 0.1),
            'max_grad_norm': trial.suggest_categorical("max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5]),
            'gae_lambda': trial.suggest_categorical("gae_lambda", [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0]),
            "policy_kwargs": dict(net_arch=net_arch,activation_fn=activation_fn, ortho_init=ortho_init,)
         }

# define optimization agent for Optuna

def optimize_agent(trial):
    """ Train the model and optimise
        Optuna maximises the negative log likelihood, so we
        need to negate the reward here
    """
    model_params = optimize_ppo(trial)
    #env = SubprocVecEnv([lambda: gym.make('CartPole-v1') for i in range(n_cpu)])
    #env = gym.make('CartPole-v1')
    #env = gym.make('stocks-v0', df=df, frame_bound=(start_index,end_index), window_size=window_size)

    model = PPO('MlpPolicy', env, verbose=0, **model_params, seed = 42)
    model.learn(1000)
    model.save('PPO_{}.pth'.format(trial.number))

    rewards = []
    n_episodes, reward_sum = 0, 0.0

    obs = env.reset()
    while n_episodes < 1:
        
        action, _ = model.predict(obs, deterministic = True)
        obs, reward, done, info = env.step(action)
        #print("Observation:", obs)
        #print("Action:", action)
        #print("Reward:", reward)
        #print(type(info[0]["total_profit"]))
        #print("Info:Total Profit: ", info[0]["total_profit"])
        #reward_sum += reward
        #print(reward)
        #print(reward_sum)

        if done:
            rewards = (info[0]["total_profit"]-1)*100
            print(info)
            #print(rewards)
            n_episodes += 1
            obs = env.reset()
    #env.render()
    print(rewards)
    last_reward = rewards
    #trial.report(-1 * last_reward)
    return last_reward


In [6]:
# set an optuna sampler, TPE is the best
sampler = optuna.samplers.TPESampler(seed=42)

# setup study environment for Optuna
study = optuna.create_study(study_name="PPO_study",direction='maximize',
                            sampler = sampler, pruner=optuna.pruners.HyperbandPruner())

# setup early callback
logging_callback = LoggingCallback(threshold=1e-5,patience=30,trial_number=5)


[32m[I 2022-03-07 06:51:55,616][0m A new study created in memory with name: PPO_study[0m


In [7]:
# run optuna for ten trials
study.optimize(optimize_agent, n_trials=100,catch=(ValueError,),callbacks=[logging_callback])


[32m[I 2022-03-07 06:52:35,805][0m Trial 0 finished with value: -40.786509067903154 and parameters: {'net_arch': 'medium', 'activation_fn': 'tanh', 'gamma': 0.99, 'learning_rate': 0.7072114131472232, 'n_steps': 8, 'n_epochs': 20, 'batch_size': 8, 'clip_range': 0.3, 'vf_coef': 0.3046137691733707, 'ent_coef': 4.827305651975693e-08, 'max_grad_norm': 0.9, 'gae_lambda': 0.95}. Best is trial 0 with value: -40.786509067903154.[0m


[{'total_reward': 22.570205688476562, 'total_profit': 0.5921349093209685, 'position': 0, 'terminal_observation': array([[2.0134435, 1.1063392]], dtype=float32)}]
-40.786509067903154


[32m[I 2022-03-07 06:52:44,824][0m Trial 1 finished with value: -47.50928994060901 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.995, 'learning_rate': 0.005169879442298541, 'n_steps': 64, 'n_epochs': 5, 'batch_size': 16, 'clip_range': 0.3, 'vf_coef': 0.1195942459383017, 'ent_coef': 0.0009833622008382904, 'max_grad_norm': 0.6, 'gae_lambda': 0.95}. Best is trial 0 with value: -40.786509067903154.[0m


[{'total_reward': 195.58587646484375, 'total_profit': 0.5249071005939099, 'position': 0, 'terminal_observation': array([[1.9815803, 1.107508 ]], dtype=float32)}]
-47.50928994060901


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=16 and n_envs=1)
[32m[I 2022-03-07 06:52:50,075][0m Trial 2 finished with value: -93.43252224079994 and parameters: {'net_arch': 'small', 'activation_fn': 'tanh', 'gamma': 0.9, 'learning_rate': 0.004974062174968407, 'n_steps': 16, 'n_epochs': 1, 'batch_size': 512, 'clip_range': 0.1, 'vf_coef': 0.2848404943774676, 'ent_coef': 1.8122104544785752e-08, 'max_grad_norm': 5, 'gae_lambda': 0.92}. Best is trial 0 with value: -40.786509067903154.[0m


[{'total_reward': 78.60812377929688, 'total_profit': 0.06567477759200059, 'position': 1, 'terminal_observation': array([[1.9840415, 1.1068772]], dtype=float32)}]
-93.43252224079994


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=128 and n_envs=1)
[32m[I 2022-03-07 06:52:54,655][0m Trial 3 finished with value: -8.336663587792792 and parameters: {'net_arch': 'small', 'activation_fn': 'relu', 'gamma': 0.995, 'learning_rate': 0.00013566845397879244, 'n_steps': 128, 'n_epochs': 1, 'batch_size': 256, 'clip_range': 0.4, 'vf_coef': 0.8870864242651173, 'ent_coef': 0.002878252039052746, 'max_grad_norm': 0.7, 'gae_lambda': 0.99}. Best is trial 3 with value: -8.336663587792792.[0m


[{'total_reward': -6.63128662109375, 'total_profit': 0.9166333641220721, 'position': 0, 'terminal_observation': array([[1.975669 , 1.1073258]], dtype=float32)}]
-8.336663587792792


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=8 and n_envs=1)
[32m[I 2022-03-07 06:53:36,641][0m Trial 4 finished with value: 0.0 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.9999, 'learning_rate': 0.0009236099934536367, 'n_steps': 8, 'n_epochs': 20, 'batch_size': 512, 'clip_range': 0.1, 'vf_coef': 0.8511366715168569, 'ent_coef': 1.6536897559229541e-06, 'max_grad_norm': 2, 'gae_lambda': 0.9}. Best is trial 4 with value: 0.0.[0m


[{'total_reward': 0.0, 'total_profit': 1.0, 'position': 0, 'terminal_observation': array([[1.9831537, 1.1071886]], dtype=float32)}]
0.0


[32m[I 2022-03-07 06:53:45,769][0m Trial 5 finished with value: 0.0 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.9999, 'learning_rate': 0.0004897515106470033, 'n_steps': 1024, 'n_epochs': 5, 'batch_size': 16, 'clip_range': 0.4, 'vf_coef': 0.5163003483011953, 'ent_coef': 1.822616554559885e-06, 'max_grad_norm': 0.9, 'gae_lambda': 0.98}. Best is trial 4 with value: 0.0.[0m


[{'total_reward': 0.0, 'total_profit': 1.0, 'position': 0, 'terminal_observation': array([[1.9777151, 1.1074357]], dtype=float32)}]
0.0


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=8 and n_envs=1)
[32m[I 2022-03-07 06:53:56,359][0m Trial 6 finished with value: -59.06697771806999 and parameters: {'net_arch': 'small', 'activation_fn': 'relu', 'gamma': 0.99, 'learning_rate': 1.5982474183666166e-05, 'n_steps': 8, 'n_epochs': 5, 'batch_size': 32, 'clip_range': 0.3, 'vf_coef': 0.3881699262065219, 'ent_coef': 0.0003184326045652435, 'max_grad_norm': 0.8, 'gae_lambda': 0.99}. Best is trial 4 with value: 0.0.[0m


[{'total_reward': 9.098602294921875, 'total_profit': 0.40933022281930004, 'position': 0, 'terminal_observation': array([[1.9827738, 1.1073221]], dtype=float32)}]
-59.06697771806999


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=8 and n_envs=1)
[32m[I 2022-03-07 06:54:12,321][0m Trial 7 finished with value: 0.0 and parameters: {'net_arch': 'small', 'activation_fn': 'relu', 'gamma': 0.95, 'learning_rate': 0.10818665579547869, 'n_steps': 8, 'n_epochs': 10, 'batch_size': 256, 'clip_range': 0.3, 'vf_coef': 0.6318372121697993, 'ent_coef': 1.3775857884996336e-05, 'max_grad_norm': 0.8, 'gae_lambda': 0.92}. Best is trial 4 with value: 0.0.[0m


[{'total_reward': 0.0, 'total_profit': 1.0, 'position': 0, 'terminal_observation': array([[1.9865925, 1.1072367]], dtype=float32)}]
0.0


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=32 and n_envs=1)
[32m[I 2022-03-07 06:54:17,114][0m Trial 8 finished with value: -5.472059910746763 and parameters: {'net_arch': 'small', 'activation_fn': 'relu', 'gamma': 0.999, 'learning_rate': 0.0007131990160433632, 'n_steps': 32, 'n_epochs': 1, 'batch_size': 256, 'clip_range': 0.1, 'vf_coef': 0.22359583851945264, 'ent_coef': 0.055278694270483, 'max_grad_norm': 0.9, 'gae_lambda': 1.0}. Best is trial 4 with value: 0.0.[0m


[{'total_reward': 8.76397705078125, 'total_profit': 0.9452794008925324, 'position': 0, 'terminal_observation': array([[1.9825628, 1.1073962]], dtype=float32)}]
-5.472059910746763


[32m[I 2022-03-07 06:54:25,718][0m Trial 9 finished with value: -88.08606566466939 and parameters: {'net_arch': 'medium', 'activation_fn': 'tanh', 'gamma': 0.95, 'learning_rate': 2.4472209666467123e-05, 'n_steps': 16, 'n_epochs': 5, 'batch_size': 16, 'clip_range': 0.1, 'vf_coef': 0.9699143978146032, 'ent_coef': 0.007849327721966518, 'max_grad_norm': 2, 'gae_lambda': 0.92}. Best is trial 4 with value: 0.0.[0m


[{'total_reward': 70.540283203125, 'total_profit': 0.11913934335330617, 'position': 1, 'terminal_observation': array([[1.9832027, 1.1072181]], dtype=float32)}]
-88.08606566466939


[32m[I 2022-03-07 06:54:31,605][0m Trial 10 finished with value: -63.02076820492783 and parameters: {'net_arch': 'medium', 'activation_fn': 'tanh', 'gamma': 0.9999, 'learning_rate': 0.022716651949429716, 'n_steps': 512, 'n_epochs': 20, 'batch_size': 512, 'clip_range': 0.2, 'vf_coef': 0.7446608994507358, 'ent_coef': 4.3577081283834707e-07, 'max_grad_norm': 1, 'gae_lambda': 0.9}. Best is trial 4 with value: 0.0.[0m


[{'total_reward': 182.80718994140625, 'total_profit': 0.3697923179507217, 'position': 1, 'terminal_observation': array([[1.9802263, 1.1073503]], dtype=float32)}]
-63.02076820492783


[32m[I 2022-03-07 06:54:41,031][0m Trial 11 finished with value: -86.42968429474497 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.9999, 'learning_rate': 0.0003903293879717587, 'n_steps': 1024, 'n_epochs': 20, 'batch_size': 64, 'clip_range': 0.4, 'vf_coef': 0.5767234382767683, 'ent_coef': 3.927122850317062e-06, 'max_grad_norm': 2, 'gae_lambda': 0.98}. Best is trial 4 with value: 0.0.[0m


[{'total_reward': 37.275909423828125, 'total_profit': 0.13570315705255026, 'position': 1, 'terminal_observation': array([[1.9777559, 1.1074603]], dtype=float32)}]
-86.42968429474497


[32m[I 2022-03-07 06:54:50,875][0m Trial 12 finished with value: -92.97708445403224 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.9999, 'learning_rate': 0.000517602590626729, 'n_steps': 2048, 'n_epochs': 10, 'batch_size': 128, 'clip_range': 0.4, 'vf_coef': 0.7748987978452406, 'ent_coef': 1.8753881601924394e-06, 'max_grad_norm': 0.5, 'gae_lambda': 0.98}. Best is trial 4 with value: 0.0.[0m


[{'total_reward': 84.44039916992188, 'total_profit': 0.07022915545967771, 'position': 0, 'terminal_observation': array([[1.9750757, 1.107309 ]], dtype=float32)}]
-92.97708445403224


[32m[I 2022-03-07 06:54:59,590][0m Trial 13 finished with value: -87.88218324046984 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.98, 'learning_rate': 9.542121513663582e-05, 'n_steps': 1024, 'n_epochs': 5, 'batch_size': 16, 'clip_range': 0.2, 'vf_coef': 0.4723219616672015, 'ent_coef': 5.955657355304248e-05, 'max_grad_norm': 0.3, 'gae_lambda': 0.8}. Best is trial 4 with value: 0.0.[0m


[{'total_reward': 36.399810791015625, 'total_profit': 0.12117816759530158, 'position': 0, 'terminal_observation': array([[1.973399 , 1.1074029]], dtype=float32)}]
-87.88218324046984


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=256 and n_envs=1)
[32m[I 2022-03-07 06:55:06,133][0m Trial 14 finished with value: -82.8252918880469 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.9999, 'learning_rate': 0.0020933048526842564, 'n_steps': 256, 'n_epochs': 20, 'batch_size': 512, 'clip_range': 0.1, 'vf_coef': 0.00319973164351095, 'ent_coef': 2.4781349225601106e-07, 'max_grad_norm': 2, 'gae_lambda': 0.9}. Best is trial 4 with value: 0.0.[0m


[{'total_reward': 93.42726135253906, 'total_profit': 0.17174708111953102, 'position': 1, 'terminal_observation': array([[1.9719421, 1.1074847]], dtype=float32)}]
-82.8252918880469


[32m[I 2022-03-07 06:55:13,262][0m Trial 15 finished with value: 73.59473975795929 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.9999, 'learning_rate': 0.022328246493424832, 'n_steps': 1024, 'n_epochs': 5, 'batch_size': 32, 'clip_range': 0.4, 'vf_coef': 0.6896272954056348, 'ent_coef': 3.767106110926481e-05, 'max_grad_norm': 0.9, 'gae_lambda': 0.98}. Best is trial 15 with value: 73.59473975795929.[0m


[{'total_reward': 0.0, 'total_profit': 1.735947397579593, 'position': 1, 'terminal_observation': array([[1.9706644, 1.1075565]], dtype=float32)}]
73.59473975795929


[32m[I 2022-03-07 06:55:28,967][0m Trial 16 finished with value: 0.0 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.98, 'learning_rate': 0.024134734737327433, 'n_steps': 256, 'n_epochs': 20, 'batch_size': 32, 'clip_range': 0.4, 'vf_coef': 0.7227447764014598, 'ent_coef': 0.0001221038877619964, 'max_grad_norm': 0.3, 'gae_lambda': 0.9}. Best is trial 15 with value: 73.59473975795929.[0m


[{'total_reward': 0.0, 'total_profit': 1.0, 'position': 0, 'terminal_observation': array([[1.9695349, 1.10762  ]], dtype=float32)}]
0.0


[32m[I 2022-03-07 06:55:38,402][0m Trial 17 finished with value: 0.0 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.98, 'learning_rate': 0.033939257228253514, 'n_steps': 256, 'n_epochs': 10, 'batch_size': 32, 'clip_range': 0.4, 'vf_coef': 0.6840393025192109, 'ent_coef': 8.976619520111053e-05, 'max_grad_norm': 0.3, 'gae_lambda': 1.0}. Best is trial 15 with value: 73.59473975795929.[0m


[{'total_reward': 0.0, 'total_profit': 1.0, 'position': 0, 'terminal_observation': array([[1.968529 , 1.1076767]], dtype=float32)}]
0.0


[32m[I 2022-03-07 06:55:47,495][0m Trial 18 finished with value: -12.166286450847608 and parameters: {'net_arch': 'medium', 'activation_fn': 'tanh', 'gamma': 0.98, 'learning_rate': 0.2213588828347631, 'n_steps': 256, 'n_epochs': 10, 'batch_size': 32, 'clip_range': 0.4, 'vf_coef': 0.6681360170151939, 'ent_coef': 2.884077253783115e-05, 'max_grad_norm': 0.3, 'gae_lambda': 1.0}. Best is trial 15 with value: 73.59473975795929.[0m


[{'total_reward': 237.8302459716797, 'total_profit': 0.8783371354915239, 'position': 0, 'terminal_observation': array([[1.9676275, 1.1077276]], dtype=float32)}]
-12.166286450847608


[32m[I 2022-03-07 06:56:02,344][0m Trial 19 finished with value: 0.0 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.999, 'learning_rate': 0.020081837318828715, 'n_steps': 32, 'n_epochs': 20, 'batch_size': 32, 'clip_range': 0.4, 'vf_coef': 0.9703798049018052, 'ent_coef': 0.00021163661095732641, 'max_grad_norm': 0.6, 'gae_lambda': 0.8}. Best is trial 15 with value: 73.59473975795929.[0m


[{'total_reward': 0.0, 'total_profit': 1.0, 'position': 0, 'terminal_observation': array([[1.9668151, 1.1077735]], dtype=float32)}]
0.0


[32m[I 2022-03-07 06:56:09,497][0m Trial 20 finished with value: 73.59473975795929 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.999, 'learning_rate': 0.12251081913489154, 'n_steps': 512, 'n_epochs': 5, 'batch_size': 32, 'clip_range': 0.4, 'vf_coef': 0.9677532509378732, 'ent_coef': 0.01635095290637031, 'max_grad_norm': 0.6, 'gae_lambda': 0.8}. Best is trial 15 with value: 73.59473975795929.[0m


[{'total_reward': 0.0, 'total_profit': 1.735947397579593, 'position': 1, 'terminal_observation': array([[1.9660791, 1.1078151]], dtype=float32)}]
73.59473975795929


[32m[I 2022-03-07 06:56:18,852][0m Trial 21 finished with value: 73.59473975795929 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.999, 'learning_rate': 0.07538665522109651, 'n_steps': 512, 'n_epochs': 10, 'batch_size': 32, 'clip_range': 0.4, 'vf_coef': 0.8291885190386503, 'ent_coef': 0.07165302054986454, 'max_grad_norm': 0.6, 'gae_lambda': 0.8}. Best is trial 15 with value: 73.59473975795929.[0m


[{'total_reward': 0.0, 'total_profit': 1.735947397579593, 'position': 1, 'terminal_observation': array([[1.9654093, 1.107853 ]], dtype=float32)}]
73.59473975795929


[32m[I 2022-03-07 06:56:26,029][0m Trial 22 finished with value: 73.59473975795929 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.999, 'learning_rate': 0.11215172756473296, 'n_steps': 512, 'n_epochs': 5, 'batch_size': 32, 'clip_range': 0.4, 'vf_coef': 0.8464410324413747, 'ent_coef': 0.06990654338786134, 'max_grad_norm': 0.6, 'gae_lambda': 0.8}. Best is trial 15 with value: 73.59473975795929.[0m


[{'total_reward': 0.0, 'total_profit': 1.735947397579593, 'position': 1, 'terminal_observation': array([[1.964797 , 1.1078877]], dtype=float32)}]
73.59473975795929


[32m[I 2022-03-07 06:56:33,179][0m Trial 23 finished with value: 73.59473975795929 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.999, 'learning_rate': 0.6452858587142962, 'n_steps': 512, 'n_epochs': 5, 'batch_size': 32, 'clip_range': 0.4, 'vf_coef': 0.8209691961011336, 'ent_coef': 0.05865843267834231, 'max_grad_norm': 0.6, 'gae_lambda': 0.8}. Best is trial 15 with value: 73.59473975795929.[0m


[{'total_reward': 0.0, 'total_profit': 1.735947397579593, 'position': 1, 'terminal_observation': array([[1.9642353, 1.1079196]], dtype=float32)}]
73.59473975795929


[32m[I 2022-03-07 06:56:39,577][0m Trial 24 finished with value: -51.17863701901136 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.999, 'learning_rate': 0.8610738357138681, 'n_steps': 512, 'n_epochs': 10, 'batch_size': 128, 'clip_range': 0.2, 'vf_coef': 0.9875628911262899, 'ent_coef': 0.012131780334775757, 'max_grad_norm': 0.6, 'gae_lambda': 0.8}. Best is trial 15 with value: 73.59473975795929.[0m


[{'total_reward': 7.0600738525390625, 'total_profit': 0.48821362980988636, 'position': 0, 'terminal_observation': array([[1.963718 , 1.1079489]], dtype=float32)}]
-51.17863701901136


[32m[I 2022-03-07 06:56:52,557][0m Trial 25 finished with value: 0.0 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.999, 'learning_rate': 0.10923333568383242, 'n_steps': 512, 'n_epochs': 5, 'batch_size': 8, 'clip_range': 0.4, 'vf_coef': 0.8988729597870736, 'ent_coef': 0.09399575447454882, 'max_grad_norm': 0.6, 'gae_lambda': 0.8}. Best is trial 15 with value: 73.59473975795929.[0m


[{'total_reward': 0.0, 'total_profit': 1.0, 'position': 0, 'terminal_observation': array([[1.9632401, 1.1079761]], dtype=float32)}]
0.0


[32m[I 2022-03-07 06:56:59,993][0m Trial 26 finished with value: -58.79506217577397 and parameters: {'net_arch': 'small', 'activation_fn': 'tanh', 'gamma': 0.999, 'learning_rate': 0.35009678160781055, 'n_steps': 512, 'n_epochs': 10, 'batch_size': 32, 'clip_range': 0.4, 'vf_coef': 0.8183311850727886, 'ent_coef': 0.002663675272446221, 'max_grad_norm': 0.6, 'gae_lambda': 0.8}. Best is trial 15 with value: 73.59473975795929.[0m


[{'total_reward': -1.8119659423828125, 'total_profit': 0.41204937824226023, 'position': 0, 'terminal_observation': array([[1.9627973, 1.1080012]], dtype=float32)}]
-58.79506217577397


[32m[I 2022-03-07 06:57:09,984][0m Trial 27 finished with value: 0.0 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.9, 'learning_rate': 0.009924329778190915, 'n_steps': 2048, 'n_epochs': 5, 'batch_size': 64, 'clip_range': 0.4, 'vf_coef': 0.5957468788354443, 'ent_coef': 0.024009924442051717, 'max_grad_norm': 5, 'gae_lambda': 0.98}. Best is trial 15 with value: 73.59473975795929.[0m


[{'total_reward': 0.0, 'total_profit': 1.0, 'position': 0, 'terminal_observation': array([[1.9623396, 1.107903 ]], dtype=float32)}]
0.0


[32m[I 2022-03-07 06:57:17,258][0m Trial 28 finished with value: 73.59473975795929 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.999, 'learning_rate': 0.06152274523752976, 'n_steps': 512, 'n_epochs': 5, 'batch_size': 32, 'clip_range': 0.4, 'vf_coef': 0.8245176197957264, 'ent_coef': 0.053060936633462695, 'max_grad_norm': 0.6, 'gae_lambda': 0.8}. Best is trial 15 with value: 73.59473975795929.[0m


[{'total_reward': 0.0, 'total_profit': 1.735947397579593, 'position': 1, 'terminal_observation': array([[1.9619641, 1.1079284]], dtype=float32)}]
73.59473975795929


[32m[I 2022-03-07 06:57:29,589][0m Trial 29 finished with value: -79.43063202182609 and parameters: {'net_arch': 'medium', 'activation_fn': 'tanh', 'gamma': 0.99, 'learning_rate': 0.402455972195077, 'n_steps': 512, 'n_epochs': 5, 'batch_size': 8, 'clip_range': 0.2, 'vf_coef': 0.9228597661007635, 'ent_coef': 0.004266633728126567, 'max_grad_norm': 0.5, 'gae_lambda': 0.95}. Best is trial 15 with value: 73.59473975795929.[0m


[{'total_reward': 40.78425598144531, 'total_profit': 0.2056936797817391, 'position': 0, 'terminal_observation': array([[1.9616132, 1.1079522]], dtype=float32)}]
-79.43063202182609


[32m[I 2022-03-07 06:57:36,838][0m Trial 30 finished with value: 73.59473975795929 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.995, 'learning_rate': 0.17451468945377374, 'n_steps': 64, 'n_epochs': 5, 'batch_size': 32, 'clip_range': 0.3, 'vf_coef': 0.4849709621354268, 'ent_coef': 0.0007088525381379704, 'max_grad_norm': 0.9, 'gae_lambda': 0.98}. Best is trial 15 with value: 73.59473975795929.[0m


[{'total_reward': 0.0, 'total_profit': 1.735947397579593, 'position': 1, 'terminal_observation': array([[1.9612843, 1.1079746]], dtype=float32)}]
73.59473975795929


[32m[I 2022-03-07 06:57:44,030][0m Trial 31 finished with value: 73.59473975795929 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.999, 'learning_rate': 0.9709533007752007, 'n_steps': 128, 'n_epochs': 5, 'batch_size': 32, 'clip_range': 0.4, 'vf_coef': 0.7846641670827885, 'ent_coef': 0.029709878105960683, 'max_grad_norm': 0.7, 'gae_lambda': 0.8}. Best is trial 15 with value: 73.59473975795929.[0m


[{'total_reward': 0.0, 'total_profit': 1.735947397579593, 'position': 1, 'terminal_observation': array([[1.9609754, 1.1079956]], dtype=float32)}]
73.59473975795929


[32m[I 2022-03-07 06:57:51,549][0m Trial 32 finished with value: 73.59473975795929 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.999, 'learning_rate': 0.053480391202355246, 'n_steps': 1024, 'n_epochs': 5, 'batch_size': 32, 'clip_range': 0.4, 'vf_coef': 0.7159325067850761, 'ent_coef': 0.0012892508915112347, 'max_grad_norm': 0.6, 'gae_lambda': 0.8}. Best is trial 15 with value: 73.59473975795929.[0m


[{'total_reward': 0.0, 'total_profit': 1.735947397579593, 'position': 1, 'terminal_observation': array([[1.9606849, 1.1080154]], dtype=float32)}]
73.59473975795929


[32m[I 2022-03-07 06:57:58,759][0m Trial 33 finished with value: 0.0 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.995, 'learning_rate': 0.3364653032447293, 'n_steps': 64, 'n_epochs': 5, 'batch_size': 32, 'clip_range': 0.3, 'vf_coef': 0.46557270815641805, 'ent_coef': 0.0007485716459753419, 'max_grad_norm': 1, 'gae_lambda': 0.8}. Best is trial 15 with value: 73.59473975795929.[0m


[{'total_reward': 0.0, 'total_profit': 1.0, 'position': 0, 'terminal_observation': array([[1.9604112, 1.108034 ]], dtype=float32)}]
0.0


[32m[I 2022-03-07 06:58:05,942][0m Trial 34 finished with value: 0.0 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.995, 'learning_rate': 0.842995052117955, 'n_steps': 128, 'n_epochs': 5, 'batch_size': 32, 'clip_range': 0.3, 'vf_coef': 0.783956851846054, 'ent_coef': 0.026997979364218692, 'max_grad_norm': 0.7, 'gae_lambda': 0.95}. Best is trial 15 with value: 73.59473975795929.[0m


[{'total_reward': 0.0, 'total_profit': 1.0, 'position': 0, 'terminal_observation': array([[1.9601526, 1.1080517]], dtype=float32)}]
0.0


[32m[I 2022-03-07 06:58:10,935][0m Trial 35 finished with value: 0.0 and parameters: {'net_arch': 'small', 'activation_fn': 'relu', 'gamma': 0.995, 'learning_rate': 0.057039231141482724, 'n_steps': 64, 'n_epochs': 1, 'batch_size': 64, 'clip_range': 0.3, 'vf_coef': 0.3975472111807058, 'ent_coef': 0.0010704870093514464, 'max_grad_norm': 0.7, 'gae_lambda': 0.99}. Best is trial 15 with value: 73.59473975795929.[0m


[{'total_reward': 0.0, 'total_profit': 1.0, 'position': 0, 'terminal_observation': array([[1.9599082, 1.1080682]], dtype=float32)}]
0.0


[32m[I 2022-03-07 06:58:16,656][0m Trial 36 finished with value: 0.0 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.9, 'learning_rate': 0.008267981042409136, 'n_steps': 128, 'n_epochs': 5, 'batch_size': 128, 'clip_range': 0.3, 'vf_coef': 0.37343334257083427, 'ent_coef': 0.001501751103587757, 'max_grad_norm': 0.7, 'gae_lambda': 0.8}. Best is trial 15 with value: 73.59473975795929.[0m


[{'total_reward': 0.0, 'total_profit': 1.0, 'position': 0, 'terminal_observation': array([[1.9596767, 1.1080841]], dtype=float32)}]
0.0
The study stops now...
With number 36 and value  0.0
The previous and current best values are 73.59473975795929 and 73.59473975795929 respectively


In [8]:
# record and best parameters
trial = study.best_trial
print(study.best_trial.number)
print('Reward: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))
#joblib.dump(study, "final_ddpg_study__.pkl")

15
Reward: 73.59473975795929
Best hyperparameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.9999, 'learning_rate': 0.022328246493424832, 'n_steps': 1024, 'n_epochs': 5, 'batch_size': 32, 'clip_range': 0.4, 'vf_coef': 0.6896272954056348, 'ent_coef': 3.767106110926481e-05, 'max_grad_norm': 0.9, 'gae_lambda': 0.98}


In [9]:
window_size = 1
start_index = window_size
end_index = df.shape[0]
env = gym.make('stocks-v0', df=df, frame_bound=(start_index,end_index), window_size=window_size)
env_maker = lambda: env
env = DummyVecEnv([env_maker])
env = VecNormalize(env, norm_obs=True, norm_reward=True)


In [10]:
#now use the best model and check your score
#env = gym.make('CartPole-v1')
#env = gym.make('stocks-v0', df=df, frame_bound=(start_index,end_index), window_size=window_size)

#use saved model: study.best_trial.number
#print(study.best_trial.number)
tuned_model_PPO = PPO.load('PPO_{}.pth'.format(study.best_trial.number),env=env)

rewards = []
n_episodes, reward_sum = 0, 0.0

obs = env.reset()
#env.render()
while n_episodes < 1:
    #env.render()
    action, _ = tuned_model_PPO.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    if(action[0])==0:
        print(obs)
        print("SELL")
    #reward_sum += reward
    #print(action)

    if done:
        rewards = (info[0]["total_profit"]-1)*100
        n_episodes += 1
        obs = env.reset()


env.close()
last_reward = rewards
#trial.report(-1 * last_reward)
print(last_reward)


73.59473975795929


In [11]:
# Now train your own model without tuning
#env = gym.make('CartPole-v1')
#env = gym.make('stocks-v0', df=df, frame_bound=(start_index,end_index), window_size=window_size)

#study.best_trial.number
#tuned_model_PPO = PPO.load('PPO_{}.pth'.format(study.best_trial.number),env=env)
nontuned_model_PPO = PPO('MlpPolicy', env, verbose=0)
nontuned_model_PPO.learn(1000)
    


<stable_baselines3.ppo.ppo.PPO at 0x1e64441ca00>

In [12]:
# compare the scores
rewards = []
n_episodes, reward_sum = 0, 0.0

obs = env.reset()
#env.render()
while n_episodes < 1:
    
    action, _ = nontuned_model_PPO.predict(obs, deterministic=True)
    #print(action)
    obs, reward, done, info = env.step(action)
    #reward_sum += reward

    if done:
        rewards = (info[0]["total_profit"]-1)*100
        n_episodes += 1
        obs = env.reset()


env.close()
last_reward = rewards
#trial.report(-1 * last_reward)
print(last_reward)

-64.1997601642524


In [13]:
# check score for random sample...
rewards = []
n_episodes, reward_sum = 0, 0.0
#env = gym.make('stocks-v0', df=df, frame_bound=(start_index,end_index), window_size=window_size)

obs = env.reset()
#env.render()
while n_episodes < 1:
    #action, _ = env.action_space.sample()
    #action, _ = tuned_model_PPO.predict(env.action_space.sample())
    action = env.action_space.sample()
    #print(action)
    obs, reward, done, info = env.step([action])
    #reward_sum += reward

    if done:
        rewards = (info[0]["total_profit"]-1)*100
        n_episodes += 1
        obs = env.reset()


env.close()
last_reward = rewards
#trial.report(-1 * last_reward)
print(last_reward)


-96.55088433560411


In [14]:
#Now build your own model from the parameters and run it
#env = gym.make('CartPole-v1')
#env = gym.make('stocks-v0', df=df, frame_bound=(start_index,end_index), window_size=window_size)

trial = study.best_trial
#print(trial.number)
model_params = optimize_ppo(trial)
best_trial_parameters_model = PPO('MlpPolicy', env, verbose=0, **model_params)
best_trial_parameters_model.learn(1000)
rewards = []
n_episodes, reward_sum = 0, 0.0

obs = env.reset()
#env.render()
while n_episodes < 1:
    #env.render()
    action, _ = best_trial_parameters_model.predict(obs, deterministic = True)
    #print(action)
    obs, reward, done, info = env.step(action)
    #reward_sum += reward

    if done:
        rewards = (info[0]["total_profit"]-1)*100
        n_episodes += 1
        obs = env.reset()


env.close()
last_reward = rewards
#trial.report(-1 * last_reward)
print(last_reward)

-46.25124461654533


In [15]:
#Now build your own model from the parameters and run it for longer time
#env = gym.make('CartPole-v1')
#env = gym.make('stocks-v0', df=df, frame_bound=(start_index,end_index), window_size=window_size)

trial = study.best_trial
#print(trial.number)
model_params = optimize_ppo(trial)
best_trial_parameters_model = PPO('MlpPolicy', env, verbose=0, **model_params)
best_trial_parameters_model.learn(10000)
rewards = []
n_episodes, reward_sum = 0, 0.0

obs = env.reset()
#env.render()
while n_episodes < 1:
    #env.render()
    action, _ = best_trial_parameters_model.predict(obs, deterministic=True)
      
    obs, reward, done, info = env.step(action)
    #print(action)
    #reward_sum += reward

    if done:
        rewards = (info[0]["total_profit"]-1)*100
        n_episodes += 1
        obs = env.reset()


env.close()
last_reward = rewards
#trial.report(-1 * last_reward)
print(last_reward)

73.59473975795929
