# This code demonstrates value of hyperparameter tuning (Example Gym: Anytrading Gym Environment)

### Code written by Chirag Mirani, March 4, 2022. 

In [18]:

# importing library
import numpy as np
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3 import PPO
import optuna
import gym
from torch import nn as nn
import joblib
import gym_anytrading
import yfinance as yf
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from matplotlib import pyplot as plt



In [19]:
# callback class for Optuna early stopping
# Reference: https://medium.com/analytics-vidhya/hyperparameter-tuning-using-optuna-for-finrl-8a49506d2741
class LoggingCallback:
    def __init__(self,threshold,trial_number,patience):
      '''
      threshold:int tolerance for increase in reward
      trial_number: int Prune after minimum number of trials
      patience: int patience for the threshold
      '''
      self.threshold = threshold
      self.trial_number  = trial_number
      self.patience = patience
      self.cb_list = [] #Trials list for which threshold is reached
    def __call__(self,study:optuna.study, frozen_trial:optuna.Trial):
      #Setting the best value in the current trial
      study.set_user_attr("previous_best_value", study.best_value)
      
      #Checking if the minimum number of trials have pass
      if frozen_trial.number >self.trial_number:
          previous_best_value = study.user_attrs.get("previous_best_value",None)
          #Checking if the previous and current objective values have the same sign
          if previous_best_value * study.best_value >=0:
              #Checking for the threshold condition
              if abs(previous_best_value-study.best_value) < self.threshold: 
                  self.cb_list.append(frozen_trial.number)
                  #If threshold is achieved for the patience amount of time
                  if len(self.cb_list)>self.patience:
                      print('The study stops now...')
                      print('With number',frozen_trial.number ,'and value ',frozen_trial.value)
                      print('The previous and current best values are {} and {} respectively'
                              .format(previous_best_value, study.best_value))
                      study.stop()


In [20]:
#read spy data
df = yf.download('SPY', start='2018-01-01', end='2022-02-06', progress=False)
df.drop("Close", axis=1, inplace=True)
df.rename(columns = {'Adj Close':'Close'}, inplace = True)
df.head()
df.fillna(0,inplace=True)
df.drop(df.tail(1).index,inplace=True) # drop last n rows
df.shape

(1031, 5)

In [4]:
# create environment
window_size = 1
start_index = window_size
end_index = df.shape[0]
env = gym.make('stocks-v0', df=df, frame_bound=(start_index,end_index), window_size=window_size)
env_maker = lambda: env
env = DummyVecEnv([env_maker])
env = VecNormalize(env, norm_obs=True, norm_reward=True)


In [21]:
n_cpu = 8

## define PPO hyperparameters

def optimize_ppo(trial):
    """ Learning hyperparamters we want to optimise"""
    net_arch = trial.suggest_categorical("net_arch", ["small", "medium"])
    activation_fn = trial.suggest_categorical("activation_fn", ["tanh", "relu"])
    net_arch = { "small": [dict(pi=[64, 64], vf=[64, 64])], "medium": [dict(pi=[256, 256], vf=[256, 256])],}[net_arch]

    activation_fn = {"tanh": nn.Tanh, "relu": nn.ReLU, "elu": nn.ELU, "leaky_relu": nn.LeakyReLU}[activation_fn]
    ortho_init = False
    return {
         'gamma': trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999]),
         'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1.),
          'n_steps': trial.suggest_categorical("n_steps", [8, 16, 32, 64, 128, 256, 512, 1024, 2048]),
           'n_epochs': trial.suggest_categorical("n_epochs", [1, 5, 10, 20]),
            'batch_size': trial.suggest_categorical("batch_size", [8, 16, 32, 64, 128, 256, 512]),
            'clip_range': trial.suggest_categorical("clip_range", [0.1, 0.2, 0.3, 0.4]),
            'vf_coef': trial.suggest_uniform("vf_coef", 0, 1),
            'ent_coef': trial.suggest_loguniform("ent_coef", 0.00000001, 0.1),
            'max_grad_norm': trial.suggest_categorical("max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5]),
            'gae_lambda': trial.suggest_categorical("gae_lambda", [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0]),
            "policy_kwargs": dict(net_arch=net_arch,activation_fn=activation_fn, ortho_init=ortho_init,)
         }

# define optimization agent for Optuna

def optimize_agent(trial):
    """ Train the model and optimise
        Optuna maximises the negative log likelihood, so we
        need to negate the reward here
    """
    model_params = optimize_ppo(trial)
    #env = SubprocVecEnv([lambda: gym.make('CartPole-v1') for i in range(n_cpu)])
    #env = gym.make('CartPole-v1')
    #env = gym.make('stocks-v0', df=df, frame_bound=(start_index,end_index), window_size=window_size)

    model = PPO('MlpPolicy', env, verbose=0, **model_params, seed = 42)
    model.learn(1000)
    model.save('PPO_{}.pth'.format(trial.number))

    rewards = []
    n_episodes, reward_sum = 0, 0.0

    obs = env.reset()
    while n_episodes < 1:
        
        action, _ = model.predict(obs, deterministic = True)
        obs, reward, done, info = env.step(action)
        #print("Observation:", obs)
        #print("Action:", action)
        #print("Reward:", reward)
        #print(type(info[0]["total_profit"]))
        #print("Info:Total Profit: ", info[0]["total_profit"])
        #reward_sum += reward
        #print(reward)
        #print(reward_sum)

        if done:
            rewards = (info[0]["total_profit"]-1)*100
            print(info)
            #print(rewards)
            n_episodes += 1
            obs = env.reset()
    #env.render()
    print(rewards)
    last_reward = rewards
    #trial.report(-1 * last_reward)
    return last_reward


In [22]:
# set an optuna sampler, TPE is the best
sampler = optuna.samplers.TPESampler(seed=42)

# setup study environment for Optuna
study = optuna.create_study(study_name="PPO_study",direction='maximize',
                            sampler = sampler, pruner=optuna.pruners.HyperbandPruner())

# setup early callback
logging_callback = LoggingCallback(threshold=1e-5,patience=30,trial_number=5)


[32m[I 2022-03-07 06:26:39,175][0m A new study created in memory with name: PPO_study[0m


In [23]:
# run optuna for ten trials
study.optimize(optimize_agent, n_trials=100,catch=(ValueError,),callbacks=[logging_callback])


[32m[I 2022-03-07 06:27:26,564][0m Trial 0 finished with value: -68.33952322928909 and parameters: {'net_arch': 'medium', 'activation_fn': 'tanh', 'gamma': 0.99, 'learning_rate': 0.7072114131472232, 'n_steps': 8, 'n_epochs': 20, 'batch_size': 8, 'clip_range': 0.3, 'vf_coef': 0.3046137691733707, 'ent_coef': 4.827305651975693e-08, 'max_grad_norm': 0.9, 'gae_lambda': 0.95}. Best is trial 0 with value: -68.33952322928909.[0m


[{'total_reward': 25.944869995117188, 'total_profit': 0.31660476770710916, 'position': 1, 'terminal_observation': array([[1.945664 , 1.1041499]], dtype=float32)}]
-68.33952322928909


[32m[I 2022-03-07 06:27:35,308][0m Trial 1 finished with value: -95.77865029403108 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.995, 'learning_rate': 0.005169879442298541, 'n_steps': 64, 'n_epochs': 5, 'batch_size': 16, 'clip_range': 0.3, 'vf_coef': 0.1195942459383017, 'ent_coef': 0.0009833622008382904, 'max_grad_norm': 0.6, 'gae_lambda': 0.95}. Best is trial 0 with value: -68.33952322928909.[0m


[{'total_reward': 99.90126037597656, 'total_profit': 0.04221349705968921, 'position': 1, 'terminal_observation': array([[1.945717 , 1.1041933]], dtype=float32)}]
-95.77865029403108


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=16 and n_envs=1)
[32m[I 2022-03-07 06:27:40,557][0m Trial 2 finished with value: -96.69935394242083 and parameters: {'net_arch': 'small', 'activation_fn': 'tanh', 'gamma': 0.9, 'learning_rate': 0.004974062174968407, 'n_steps': 16, 'n_epochs': 1, 'batch_size': 512, 'clip_range': 0.1, 'vf_coef': 0.2848404943774676, 'ent_coef': 1.8122104544785752e-08, 'max_grad_norm': 5, 'gae_lambda': 0.92}. Best is trial 0 with value: -68.33952322928909.[0m


[{'total_reward': 99.75746154785156, 'total_profit': 0.033006460575791687, 'position': 0, 'terminal_observation': array([[1.9461172, 1.1042067]], dtype=float32)}]
-96.69935394242083


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=128 and n_envs=1)
[32m[I 2022-03-07 06:27:45,191][0m Trial 3 finished with value: -8.336677629897359 and parameters: {'net_arch': 'small', 'activation_fn': 'relu', 'gamma': 0.995, 'learning_rate': 0.00013566845397879244, 'n_steps': 128, 'n_epochs': 1, 'batch_size': 256, 'clip_range': 0.4, 'vf_coef': 0.8870864242651173, 'ent_coef': 0.002878252039052746, 'max_grad_norm': 0.7, 'gae_lambda': 0.99}. Best is trial 3 with value: -8.336677629897359.[0m


[{'total_reward': -6.6313323974609375, 'total_profit': 0.9166332237010264, 'position': 0, 'terminal_observation': array([[1.9461648, 1.1042486]], dtype=float32)}]
-8.336677629897359


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=8 and n_envs=1)
[32m[I 2022-03-07 06:28:29,341][0m Trial 4 finished with value: -94.30500710620305 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.9999, 'learning_rate': 0.0009236099934536367, 'n_steps': 8, 'n_epochs': 20, 'batch_size': 512, 'clip_range': 0.1, 'vf_coef': 0.8511366715168569, 'ent_coef': 1.6536897559229541e-06, 'max_grad_norm': 2, 'gae_lambda': 0.9}. Best is trial 3 with value: -8.336677629897359.[0m


[{'total_reward': 74.55955505371094, 'total_profit': 0.056949928937969546, 'position': 1, 'terminal_observation': array([[1.9467695, 1.1042706]], dtype=float32)}]
-94.30500710620305


[32m[I 2022-03-07 06:28:38,826][0m Trial 5 finished with value: 0.0 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.9999, 'learning_rate': 0.0004897515106470033, 'n_steps': 1024, 'n_epochs': 5, 'batch_size': 16, 'clip_range': 0.4, 'vf_coef': 0.5163003483011953, 'ent_coef': 1.822616554559885e-06, 'max_grad_norm': 0.9, 'gae_lambda': 0.98}. Best is trial 5 with value: 0.0.[0m


[{'total_reward': 0.0, 'total_profit': 1.0, 'position': 0, 'terminal_observation': array([[1.9468102, 1.1043113]], dtype=float32)}]
0.0


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=8 and n_envs=1)
[32m[I 2022-03-07 06:28:49,686][0m Trial 6 finished with value: -62.668072910900044 and parameters: {'net_arch': 'small', 'activation_fn': 'relu', 'gamma': 0.99, 'learning_rate': 1.5982474183666166e-05, 'n_steps': 8, 'n_epochs': 5, 'batch_size': 32, 'clip_range': 0.3, 'vf_coef': 0.3881699262065219, 'ent_coef': 0.0003184326045652435, 'max_grad_norm': 0.8, 'gae_lambda': 0.99}. Best is trial 5 with value: 0.0.[0m


[{'total_reward': -12.587234497070312, 'total_profit': 0.3733192708909996, 'position': 0, 'terminal_observation': array([[1.9473984, 1.1043322]], dtype=float32)}]
-62.668072910900044


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=8 and n_envs=1)
[32m[I 2022-03-07 06:29:06,553][0m Trial 7 finished with value: 0.0 and parameters: {'net_arch': 'small', 'activation_fn': 'relu', 'gamma': 0.95, 'learning_rate': 0.10818665579547869, 'n_steps': 8, 'n_epochs': 10, 'batch_size': 256, 'clip_range': 0.3, 'vf_coef': 0.6318372121697993, 'ent_coef': 1.3775857884996336e-05, 'max_grad_norm': 0.8, 'gae_lambda': 0.92}. Best is trial 5 with value: 0.0.[0m


[{'total_reward': 0.0, 'total_profit': 1.0, 'position': 0, 'terminal_observation': array([[1.9479764, 1.1043528]], dtype=float32)}]
0.0


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=32 and n_envs=1)
[32m[I 2022-03-07 06:29:11,794][0m Trial 8 finished with value: 0.0 and parameters: {'net_arch': 'small', 'activation_fn': 'relu', 'gamma': 0.999, 'learning_rate': 0.0007131990160433632, 'n_steps': 32, 'n_epochs': 1, 'batch_size': 256, 'clip_range': 0.1, 'vf_coef': 0.22359583851945264, 'ent_coef': 0.055278694270483, 'max_grad_norm': 0.9, 'gae_lambda': 1.0}. Best is trial 5 with value: 0.0.[0m


[{'total_reward': 0.0, 'total_profit': 1.0, 'position': 0, 'terminal_observation': array([[1.9480052, 1.1043917]], dtype=float32)}]
0.0


[32m[I 2022-03-07 06:29:20,994][0m Trial 9 finished with value: -70.27508580816706 and parameters: {'net_arch': 'medium', 'activation_fn': 'tanh', 'gamma': 0.95, 'learning_rate': 2.4472209666467123e-05, 'n_steps': 16, 'n_epochs': 5, 'batch_size': 16, 'clip_range': 0.1, 'vf_coef': 0.9699143978146032, 'ent_coef': 0.007849327721966518, 'max_grad_norm': 2, 'gae_lambda': 0.92}. Best is trial 5 with value: 0.0.[0m


[{'total_reward': -44.647064208984375, 'total_profit': 0.29724914191832946, 'position': 1, 'terminal_observation': array([[1.9483612, 1.1044024]], dtype=float32)}]
-70.27508580816706


[32m[I 2022-03-07 06:29:27,718][0m Trial 10 finished with value: -78.74798563819054 and parameters: {'net_arch': 'medium', 'activation_fn': 'tanh', 'gamma': 0.9999, 'learning_rate': 0.022716651949429716, 'n_steps': 1024, 'n_epochs': 10, 'batch_size': 128, 'clip_range': 0.2, 'vf_coef': 0.6268836449065676, 'ent_coef': 4.573945793542014e-07, 'max_grad_norm': 1, 'gae_lambda': 0.98}. Best is trial 5 with value: 0.0.[0m


[{'total_reward': 150.04478454589844, 'total_profit': 0.21252014361809451, 'position': 1, 'terminal_observation': array([[1.9483861, 1.1044401]], dtype=float32)}]
-78.74798563819054


[32m[I 2022-03-07 06:29:38,556][0m Trial 11 finished with value: 0.0 and parameters: {'net_arch': 'small', 'activation_fn': 'relu', 'gamma': 0.95, 'learning_rate': 0.16605541619992314, 'n_steps': 256, 'n_epochs': 10, 'batch_size': 16, 'clip_range': 0.4, 'vf_coef': 0.6089301093467249, 'ent_coef': 9.659744481588579e-06, 'max_grad_norm': 0.5, 'gae_lambda': 0.98}. Best is trial 5 with value: 0.0.[0m


[{'total_reward': 0.0, 'total_profit': 1.0, 'position': 0, 'terminal_observation': array([[1.9484105, 1.1044772]], dtype=float32)}]
0.0


[32m[I 2022-03-07 06:29:51,548][0m Trial 12 finished with value: 0.0 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.98, 'learning_rate': 0.03629852778670356, 'n_steps': 2048, 'n_epochs': 10, 'batch_size': 64, 'clip_range': 0.4, 'vf_coef': 0.542053462327127, 'ent_coef': 3.959248529689912e-05, 'max_grad_norm': 0.8, 'gae_lambda': 0.8}. Best is trial 5 with value: 0.0.[0m


[{'total_reward': 0.0, 'total_profit': 1.0, 'position': 0, 'terminal_observation': array([[1.9484829, 1.1044983]], dtype=float32)}]
0.0


[32m[I 2022-03-07 06:29:56,609][0m Trial 13 finished with value: -80.91636151157564 and parameters: {'net_arch': 'small', 'activation_fn': 'relu', 'gamma': 0.95, 'learning_rate': 0.00021105599979314102, 'n_steps': 1024, 'n_epochs': 5, 'batch_size': 256, 'clip_range': 0.2, 'vf_coef': 0.7240627860494586, 'ent_coef': 3.26105829923321e-05, 'max_grad_norm': 0.3, 'gae_lambda': 0.92}. Best is trial 5 with value: 0.0.[0m


[{'total_reward': 66.16636657714844, 'total_profit': 0.1908363848842436, 'position': 1, 'terminal_observation': array([[1.948506, 1.104534]], dtype=float32)}]
-80.91636151157564


[32m[I 2022-03-07 06:30:04,462][0m Trial 14 finished with value: 73.59469794835299 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.9999, 'learning_rate': 0.02985542629098708, 'n_steps': 512, 'n_epochs': 10, 'batch_size': 64, 'clip_range': 0.4, 'vf_coef': 0.45967128174320143, 'ent_coef': 4.242212297923568e-07, 'max_grad_norm': 0.8, 'gae_lambda': 0.98}. Best is trial 14 with value: 73.59469794835299.[0m


[{'total_reward': 0.0, 'total_profit': 1.7359469794835298, 'position': 1, 'terminal_observation': array([[1.9485286, 1.1045692]], dtype=float32)}]
73.59469794835299


[32m[I 2022-03-07 06:30:11,490][0m Trial 15 finished with value: 73.59469794835299 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.9999, 'learning_rate': 0.018064469462475835, 'n_steps': 512, 'n_epochs': 5, 'batch_size': 64, 'clip_range': 0.4, 'vf_coef': 0.4360998335722802, 'ent_coef': 3.630435477831549e-07, 'max_grad_norm': 0.9, 'gae_lambda': 0.98}. Best is trial 14 with value: 73.59469794835299.[0m


[{'total_reward': 0.0, 'total_profit': 1.7359469794835298, 'position': 1, 'terminal_observation': array([[1.9485509, 1.1046038]], dtype=float32)}]
73.59469794835299


[32m[I 2022-03-07 06:30:20,411][0m Trial 16 finished with value: 73.59469794835299 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.9999, 'learning_rate': 0.016198292710606937, 'n_steps': 512, 'n_epochs': 10, 'batch_size': 64, 'clip_range': 0.4, 'vf_coef': 0.4167930794026621, 'ent_coef': 1.0347851772445623e-07, 'max_grad_norm': 0.3, 'gae_lambda': 0.98}. Best is trial 14 with value: 73.59469794835299.[0m


[{'total_reward': 0.0, 'total_profit': 1.7359469794835298, 'position': 1, 'terminal_observation': array([[1.9485729, 1.1046379]], dtype=float32)}]
73.59469794835299


[32m[I 2022-03-07 06:30:28,069][0m Trial 17 finished with value: 73.59469794835299 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.9999, 'learning_rate': 0.6388652168436925, 'n_steps': 512, 'n_epochs': 10, 'batch_size': 64, 'clip_range': 0.4, 'vf_coef': 0.10191865141240852, 'ent_coef': 9.17618647312694e-08, 'max_grad_norm': 0.3, 'gae_lambda': 0.98}. Best is trial 14 with value: 73.59469794835299.[0m


[{'total_reward': 0.0, 'total_profit': 1.7359469794835298, 'position': 1, 'terminal_observation': array([[1.9485945, 1.1046712]], dtype=float32)}]
73.59469794835299


[32m[I 2022-03-07 06:30:37,741][0m Trial 18 finished with value: -92.65194688787486 and parameters: {'net_arch': 'medium', 'activation_fn': 'tanh', 'gamma': 0.9, 'learning_rate': 0.5435984496143086, 'n_steps': 512, 'n_epochs': 20, 'batch_size': 64, 'clip_range': 0.4, 'vf_coef': 0.022229343900321746, 'ent_coef': 1.1043693768086409e-08, 'max_grad_norm': 0.3, 'gae_lambda': 0.8}. Best is trial 14 with value: 73.59469794835299.[0m


[{'total_reward': 92.48072814941406, 'total_profit': 0.07348053112125141, 'position': 0, 'terminal_observation': array([[1.9486157, 1.1047041]], dtype=float32)}]
-92.65194688787486


[32m[I 2022-03-07 06:30:44,191][0m Trial 19 finished with value: -65.00073829727175 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.999, 'learning_rate': 0.007923388950115623, 'n_steps': 512, 'n_epochs': 5, 'batch_size': 64, 'clip_range': 0.4, 'vf_coef': 0.3981599677135844, 'ent_coef': 1.2140502431479977e-07, 'max_grad_norm': 0.6, 'gae_lambda': 0.9}. Best is trial 14 with value: 73.59469794835299.[0m


[{'total_reward': -8.847503662109375, 'total_profit': 0.34999261702728246, 'position': 1, 'terminal_observation': array([[1.9486365, 1.1047364]], dtype=float32)}]
-65.00073829727175


[32m[I 2022-03-07 06:31:08,153][0m Trial 20 finished with value: 0.0 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.98, 'learning_rate': 0.12251081913489154, 'n_steps': 512, 'n_epochs': 10, 'batch_size': 8, 'clip_range': 0.2, 'vf_coef': 0.14568467020352482, 'ent_coef': 7.893474837998366e-08, 'max_grad_norm': 0.3, 'gae_lambda': 1.0}. Best is trial 14 with value: 73.59469794835299.[0m


[{'total_reward': 0.0, 'total_profit': 1.0, 'position': 0, 'terminal_observation': array([[1.948657 , 1.1047683]], dtype=float32)}]
0.0


[32m[I 2022-03-07 06:31:15,812][0m Trial 21 finished with value: -67.90892223171538 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.9999, 'learning_rate': 0.0016590803917197602, 'n_steps': 512, 'n_epochs': 10, 'batch_size': 64, 'clip_range': 0.4, 'vf_coef': 0.0023576827932617617, 'ent_coef': 3.9011825062254985e-07, 'max_grad_norm': 0.3, 'gae_lambda': 0.98}. Best is trial 14 with value: 73.59469794835299.[0m


[{'total_reward': 33.31388854980469, 'total_profit': 0.32091077768284626, 'position': 1, 'terminal_observation': array([[1.9486773, 1.1047996]], dtype=float32)}]
-67.90892223171538


[32m[I 2022-03-07 06:31:24,786][0m Trial 22 finished with value: 73.59469794835299 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.9999, 'learning_rate': 0.016337683570929507, 'n_steps': 512, 'n_epochs': 10, 'batch_size': 64, 'clip_range': 0.4, 'vf_coef': 0.35776043590341927, 'ent_coef': 2.7203755254739597e-06, 'max_grad_norm': 0.3, 'gae_lambda': 0.98}. Best is trial 14 with value: 73.59469794835299.[0m


[{'total_reward': 0.0, 'total_profit': 1.7359469794835298, 'position': 1, 'terminal_observation': array([[1.9486971, 1.1048304]], dtype=float32)}]
73.59469794835299


[32m[I 2022-03-07 06:31:32,224][0m Trial 23 finished with value: 73.59469794835299 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.9999, 'learning_rate': 0.23692307973820131, 'n_steps': 512, 'n_epochs': 10, 'batch_size': 64, 'clip_range': 0.4, 'vf_coef': 0.1770351516204024, 'ent_coef': 3.6262016550408997e-06, 'max_grad_norm': 0.3, 'gae_lambda': 0.98}. Best is trial 14 with value: 73.59469794835299.[0m


[{'total_reward': 0.0, 'total_profit': 1.7359469794835298, 'position': 1, 'terminal_observation': array([[1.9487166, 1.1048607]], dtype=float32)}]
73.59469794835299


[32m[I 2022-03-07 06:31:38,600][0m Trial 24 finished with value: 0.0 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.9999, 'learning_rate': 0.28523396620868585, 'n_steps': 128, 'n_epochs': 10, 'batch_size': 128, 'clip_range': 0.4, 'vf_coef': 0.16871355577549824, 'ent_coef': 0.00011953381573004873, 'max_grad_norm': 0.3, 'gae_lambda': 0.98}. Best is trial 14 with value: 73.59469794835299.[0m


[{'total_reward': 0.0, 'total_profit': 1.0, 'position': 0, 'terminal_observation': array([[1.948736 , 1.1048906]], dtype=float32)}]
0.0


[32m[I 2022-03-07 06:31:48,525][0m Trial 25 finished with value: 0.0 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.9999, 'learning_rate': 0.04863410605632118, 'n_steps': 32, 'n_epochs': 10, 'batch_size': 32, 'clip_range': 0.4, 'vf_coef': 0.261044357894621, 'ent_coef': 4.545848873621501e-08, 'max_grad_norm': 0.3, 'gae_lambda': 0.98}. Best is trial 14 with value: 73.59469794835299.[0m


[{'total_reward': 0.0, 'total_profit': 1.0, 'position': 0, 'terminal_observation': array([[1.9487549, 1.1049199]], dtype=float32)}]
0.0


[32m[I 2022-03-07 06:31:56,115][0m Trial 26 finished with value: -36.3293634711159 and parameters: {'net_arch': 'medium', 'activation_fn': 'tanh', 'gamma': 0.9999, 'learning_rate': 0.9828413906493645, 'n_steps': 64, 'n_epochs': 10, 'batch_size': 64, 'clip_range': 0.4, 'vf_coef': 0.0734382508662927, 'ent_coef': 2.239282546454522e-06, 'max_grad_norm': 1, 'gae_lambda': 0.98}. Best is trial 14 with value: 73.59469794835299.[0m


[{'total_reward': 208.90902709960938, 'total_profit': 0.6367063652888411, 'position': 0, 'terminal_observation': array([[1.9487735, 1.1049489]], dtype=float32)}]
-36.3293634711159


[32m[I 2022-03-07 06:32:08,337][0m Trial 27 finished with value: 73.59469794835299 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.9999, 'learning_rate': 0.06297023204700801, 'n_steps': 2048, 'n_epochs': 10, 'batch_size': 64, 'clip_range': 0.4, 'vf_coef': 0.48225126391254913, 'ent_coef': 6.667273063320414e-07, 'max_grad_norm': 5, 'gae_lambda': 0.98}. Best is trial 14 with value: 73.59469794835299.[0m


[{'total_reward': 0.0, 'total_profit': 1.7359469794835298, 'position': 1, 'terminal_observation': array([[1.9488332, 1.1049621]], dtype=float32)}]
73.59469794835299


[32m[I 2022-03-07 06:32:20,043][0m Trial 28 finished with value: 73.59469794835299 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.99, 'learning_rate': 0.0706581208639715, 'n_steps': 2048, 'n_epochs': 10, 'batch_size': 64, 'clip_range': 0.2, 'vf_coef': 0.7089610050789534, 'ent_coef': 1.6392711388960482e-07, 'max_grad_norm': 5, 'gae_lambda': 0.95}. Best is trial 14 with value: 73.59469794835299.[0m


[{'total_reward': 0.0, 'total_profit': 1.7359469794835298, 'position': 1, 'terminal_observation': array([[1.9488916, 1.104975 ]], dtype=float32)}]
73.59469794835299


[32m[I 2022-03-07 06:33:02,369][0m Trial 29 finished with value: 2.488647108404307 and parameters: {'net_arch': 'medium', 'activation_fn': 'tanh', 'gamma': 0.9, 'learning_rate': 0.31227885207164496, 'n_steps': 256, 'n_epochs': 20, 'batch_size': 8, 'clip_range': 0.3, 'vf_coef': 0.3279542071404764, 'ent_coef': 5.243905972480589e-06, 'max_grad_norm': 0.5, 'gae_lambda': 0.8}. Best is trial 14 with value: 73.59469794835299.[0m


[{'total_reward': 13.039413452148438, 'total_profit': 1.024886471084043, 'position': 0, 'terminal_observation': array([[1.9489087, 1.1050026]], dtype=float32)}]
2.488647108404307


[32m[I 2022-03-07 06:33:14,487][0m Trial 30 finished with value: 73.59469794835299 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.995, 'learning_rate': 0.2107865858330359, 'n_steps': 2048, 'n_epochs': 10, 'batch_size': 64, 'clip_range': 0.4, 'vf_coef': 0.4661278352418101, 'ent_coef': 6.683635875420621e-07, 'max_grad_norm': 5, 'gae_lambda': 0.99}. Best is trial 14 with value: 73.59469794835299.[0m


[{'total_reward': 0.0, 'total_profit': 1.7359469794835298, 'position': 1, 'terminal_observation': array([[1.9489652, 1.1050149]], dtype=float32)}]
73.59469794835299


[32m[I 2022-03-07 06:33:24,365][0m Trial 31 finished with value: 0.0 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.99, 'learning_rate': 0.07122324668699942, 'n_steps': 2048, 'n_epochs': 5, 'batch_size': 64, 'clip_range': 0.2, 'vf_coef': 0.7593315332327361, 'ent_coef': 2.4429268907339624e-07, 'max_grad_norm': 5, 'gae_lambda': 0.95}. Best is trial 14 with value: 73.59469794835299.[0m


[{'total_reward': 0.0, 'total_profit': 1.0, 'position': 0, 'terminal_observation': array([[1.9490204, 1.1050268]], dtype=float32)}]
0.0


[32m[I 2022-03-07 06:33:32,675][0m Trial 32 finished with value: 74.32916713943986 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.99, 'learning_rate': 0.011294044865017215, 'n_steps': 2048, 'n_epochs': 1, 'batch_size': 64, 'clip_range': 0.2, 'vf_coef': 0.7017323677464816, 'ent_coef': 9.827840340332135e-07, 'max_grad_norm': 5, 'gae_lambda': 0.95}. Best is trial 32 with value: 74.32916713943986.[0m


[{'total_reward': 164.70816040039062, 'total_profit': 1.7432916713943987, 'position': 1, 'terminal_observation': array([[1.9490744, 1.1050386]], dtype=float32)}]
74.32916713943986


[32m[I 2022-03-07 06:33:40,112][0m Trial 33 finished with value: -67.2046938934364 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.99, 'learning_rate': 0.003971722299503035, 'n_steps': 2048, 'n_epochs': 1, 'batch_size': 512, 'clip_range': 0.2, 'vf_coef': 0.7240065768658887, 'ent_coef': 3.6017298237154104e-08, 'max_grad_norm': 5, 'gae_lambda': 0.95}. Best is trial 32 with value: 74.32916713943986.[0m


[{'total_reward': 105.43492126464844, 'total_profit': 0.32795306106563604, 'position': 0, 'terminal_observation': array([[1.9491272, 1.1050501]], dtype=float32)}]
-67.2046938934364


[32m[I 2022-03-07 06:33:45,047][0m Trial 34 finished with value: -82.11575911420827 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.99, 'learning_rate': 0.009116568337682042, 'n_steps': 512, 'n_epochs': 1, 'batch_size': 64, 'clip_range': 0.2, 'vf_coef': 0.5804493781052737, 'ent_coef': 3.232533146076215e-08, 'max_grad_norm': 0.9, 'gae_lambda': 0.95}. Best is trial 32 with value: 74.32916713943986.[0m


[{'total_reward': 5.5502777099609375, 'total_profit': 0.17884240885791722, 'position': 0, 'terminal_observation': array([[1.9491417, 1.1050758]], dtype=float32)}]
-82.11575911420827


[32m[I 2022-03-07 06:33:50,114][0m Trial 35 finished with value: 0.0 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.995, 'learning_rate': 0.5011363353675061, 'n_steps': 64, 'n_epochs': 1, 'batch_size': 32, 'clip_range': 0.1, 'vf_coef': 0.20415225832191877, 'ent_coef': 6.740185280747185e-06, 'max_grad_norm': 0.3, 'gae_lambda': 0.95}. Best is trial 32 with value: 74.32916713943986.[0m


[{'total_reward': 0.0, 'total_profit': 1.0, 'position': 0, 'terminal_observation': array([[1.949156 , 1.1051012]], dtype=float32)}]
0.0


[32m[I 2022-03-07 06:33:57,536][0m Trial 36 finished with value: 0.0 and parameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.995, 'learning_rate': 0.4990762895828212, 'n_steps': 2048, 'n_epochs': 1, 'batch_size': 128, 'clip_range': 0.2, 'vf_coef': 0.8372315492125351, 'ent_coef': 9.556612216206681e-07, 'max_grad_norm': 5, 'gae_lambda': 0.99}. Best is trial 32 with value: 74.32916713943986.[0m


[{'total_reward': 0.0, 'total_profit': 1.0, 'position': 0, 'terminal_observation': array([[1.9492068, 1.1051117]], dtype=float32)}]
0.0
The study stops now...
With number 36 and value  0.0
The previous and current best values are 74.32916713943986 and 74.32916713943986 respectively


In [24]:
# record and best parameters
trial = study.best_trial
print(study.best_trial.number)
print('Reward: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))
#joblib.dump(study, "final_ddpg_study__.pkl")

32
Reward: 74.32916713943986
Best hyperparameters: {'net_arch': 'medium', 'activation_fn': 'relu', 'gamma': 0.99, 'learning_rate': 0.011294044865017215, 'n_steps': 2048, 'n_epochs': 1, 'batch_size': 64, 'clip_range': 0.2, 'vf_coef': 0.7017323677464816, 'ent_coef': 9.827840340332135e-07, 'max_grad_norm': 5, 'gae_lambda': 0.95}


In [25]:
window_size = 1
start_index = window_size
end_index = df.shape[0]
env = gym.make('stocks-v0', df=df, frame_bound=(start_index,end_index), window_size=window_size)
env_maker = lambda: env
env = DummyVecEnv([env_maker])
env = VecNormalize(env, norm_obs=True, norm_reward=True)


In [32]:
#now use the best model and check your score
#env = gym.make('CartPole-v1')
#env = gym.make('stocks-v0', df=df, frame_bound=(start_index,end_index), window_size=window_size)

#use saved model: study.best_trial.number
#print(study.best_trial.number)
tuned_model_PPO = PPO.load('PPO_{}.pth'.format(study.best_trial.number),env=env)

rewards = []
n_episodes, reward_sum = 0, 0.0

obs = env.reset()
#env.render()
while n_episodes < 1:
    #env.render()
    action, _ = tuned_model_PPO.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    if(action[0])==0:
        print(obs)
        print("SELL")
    #reward_sum += reward
    #print(action)

    if done:
        rewards = (info[0]["total_profit"]-1)*100
        n_episodes += 1
        obs = env.reset()


env.close()
last_reward = rewards
#trial.report(-1 * last_reward)
print(last_reward)


[[[-1.2148602  0.4212632]]]
SELL
[[[-0.42750472 -2.3082156 ]]]
SELL
[[[-0.39673233 -2.7032747 ]]]
SELL
[[[-0.78618604 -3.6354692 ]]]
SELL
[[[-1.2644706 -7.559255 ]]]
SELL
[[[-1.262366 -3.311979]]]
SELL
[[[-1.1480645  0.8813498]]]
SELL
[[[-1.0556145 -2.0443993]]]
SELL
[[[-0.8922935   0.01791388]]]
SELL
[[[-0.51798254 -0.8274086 ]]]
SELL
[[[ 0.86923254 -0.84000885]]]
SELL
74.32920826957432


In [27]:
# Now train your own model without tuning
#env = gym.make('CartPole-v1')
#env = gym.make('stocks-v0', df=df, frame_bound=(start_index,end_index), window_size=window_size)

#study.best_trial.number
#tuned_model_PPO = PPO.load('PPO_{}.pth'.format(study.best_trial.number),env=env)
nontuned_model_PPO = PPO('MlpPolicy', env, verbose=0)
nontuned_model_PPO.learn(1000)
    


<stable_baselines3.ppo.ppo.PPO at 0x164be5f2640>

In [28]:
# compare the scores
rewards = []
n_episodes, reward_sum = 0, 0.0

obs = env.reset()
#env.render()
while n_episodes < 1:
    
    action, _ = nontuned_model_PPO.predict(obs, deterministic=True)
    #print(action)
    obs, reward, done, info = env.step(action)
    #reward_sum += reward

    if done:
        rewards = (info[0]["total_profit"]-1)*100
        n_episodes += 1
        obs = env.reset()


env.close()
last_reward = rewards
#trial.report(-1 * last_reward)
print(last_reward)

-74.18015927060861


In [29]:
# check score for random sample...
rewards = []
n_episodes, reward_sum = 0, 0.0
#env = gym.make('stocks-v0', df=df, frame_bound=(start_index,end_index), window_size=window_size)

obs = env.reset()
#env.render()
while n_episodes < 1:
    #action, _ = env.action_space.sample()
    #action, _ = tuned_model_PPO.predict(env.action_space.sample())
    action = env.action_space.sample()
    #print(action)
    obs, reward, done, info = env.step([action])
    #reward_sum += reward

    if done:
        rewards = (info[0]["total_profit"]-1)*100
        n_episodes += 1
        obs = env.reset()


env.close()
last_reward = rewards
#trial.report(-1 * last_reward)
print(last_reward)


-97.19096131520675


In [30]:
#Now build your own model from the parameters and run it
#env = gym.make('CartPole-v1')
#env = gym.make('stocks-v0', df=df, frame_bound=(start_index,end_index), window_size=window_size)

trial = study.best_trial
#print(trial.number)
model_params = optimize_ppo(trial)
best_trial_parameters_model = PPO('MlpPolicy', env, verbose=0, **model_params)
best_trial_parameters_model.learn(1000)
rewards = []
n_episodes, reward_sum = 0, 0.0

obs = env.reset()
#env.render()
while n_episodes < 1:
    #env.render()
    action, _ = best_trial_parameters_model.predict(obs, deterministic = True)
    #print(action)
    obs, reward, done, info = env.step(action)
    #reward_sum += reward

    if done:
        rewards = (info[0]["total_profit"]-1)*100
        n_episodes += 1
        obs = env.reset()


env.close()
last_reward = rewards
#trial.report(-1 * last_reward)
print(last_reward)

-74.3899732578935


In [33]:
#Now build your own model from the parameters and run it for longer time
#env = gym.make('CartPole-v1')
#env = gym.make('stocks-v0', df=df, frame_bound=(start_index,end_index), window_size=window_size)

trial = study.best_trial
#print(trial.number)
model_params = optimize_ppo(trial)
best_trial_parameters_model = PPO('MlpPolicy', env, verbose=0, **model_params)
best_trial_parameters_model.learn(10000)
rewards = []
n_episodes, reward_sum = 0, 0.0

obs = env.reset()
#env.render()
while n_episodes < 1:
    #env.render()
    action, _ = best_trial_parameters_model.predict(obs, deterministic=True)
      
    obs, reward, done, info = env.step(action)
    #print(action)
    #reward_sum += reward

    if done:
        rewards = (info[0]["total_profit"]-1)*100
        n_episodes += 1
        obs = env.reset()


env.close()
last_reward = rewards
#trial.report(-1 * last_reward)
print(last_reward)

11.982803354415728
