# Train Agents

## Import libraries

In [1]:
from stable_baselines3 import PPO
from gymnasium.envs.registration import register
from stable_baselines3.common.env_util import make_vec_env
import sys 
sys.path.append('/Applications/Studium/Master/Masterarbeit/AADRL/')
from train_agents.eval_callback import EvaluationCallback
from sb3_contrib import RecurrentPPO
import pandas as pd
import gymnasium as gym
from util_functions import sample_recurrent_ppo_params,evaluate_policy
import optuna
from optuna.samplers import RandomSampler
from torch import nn as nn
import copy

## Define Constants & Register the Environment

In [2]:
# Notebook-wide configuration.
# NOTE: the two timestep budgets are integers (not floats such as 5e5) because they
# are step counts that are handed to `learn(total_timesteps=...)`.
LOG_DIR                  = "./training_eval" # Where to store the results from training
NUMBER_ENVS              = 4                 # How many environments are run at the same time
N_STEP_VAL               = 2_500             # Number of time steps after which the policy is evaluated
N_EPISODE_VAL            = 3                 # Number of episodes to run every N_STEP_VAL steps
TIMESTEPS_LEARNING       = 500_000           # How many timesteps the agent should perform
SEED                     = 42                # To make the results reproducible
PATH_ENV_DATA            = '/Applications/Studium/Master/Masterarbeit/AADRL/data/agent_environment_data.csv' # Path where the final environment data is stored
PATH_TURBULENCE_INDX     = '/Applications/Studium/Master/Masterarbeit/AADRL/data/turbulence_index.csv'       # Where the file containing the turbulence index is stored
STARTING_DATE_VALIDATION = '2014-10-01'      # Date from which the validation dataset should start
STARTING_DATE_TEST       = '2016-01-04'      # Date from which the test dataset should start
NUM_TRAILS_PARAMTUNING   = 50                # How many trials one wants to run for the hyperparameter estimation
TIMESTEPS_TRAIN_TUNING   = 100_000           # Number of timesteps during each trial of hyperparameter estimation (lower than TIMESTEPS_LEARNING due to restricted computational resources)

In [3]:
# Register the environment as a gym environment.
# After registration the environment can be created by its id "Trading-v4"
# (as done below via make_vec_env and gym.make); the entry point names the
# module and the class that implement it.
register(
    id="Trading-v4", 
    entry_point="env.multistock_trading_v4:MultiStockTrading"
    )

## Definition of Train, Validation and Test Dataset

In [4]:
# Load the complete data set containing the market data, technical indicators and
# sentiments. The first column is used as index and the first two rows form a
# two-level column header.
agent_environment_data = pd.read_csv(PATH_ENV_DATA, header=[0, 1], index_col=0)

# Load the turbulence index dataframe and replace missing values by 0
# (the first turbulence index value is NaN)
turbulence_index = pd.read_csv(PATH_TURBULENCE_INDX, index_col=0).fillna(0)

In [5]:
# Definition of the datasets.
# The boundary labels are looked up in the index first; indexing with [0] raises an
# IndexError if the requested start date does not occur in the data.
_all_dates = agent_environment_data.index

# Training data: everything before the first validation date. A `.loc` slice includes
# its end label, therefore the last row (the first validation row) is dropped again.
_validation_start = _all_dates[_all_dates == STARTING_DATE_VALIDATION][0]
training_data = agent_environment_data.loc[:_validation_start].iloc[:-1]

# Validation data: from the first validation date up to (excluding) the first test date
_test_start = _all_dates[_all_dates == STARTING_DATE_TEST][0]
validation_data = agent_environment_data.loc[_validation_start:_test_start].iloc[:-1]

# Testing data: from the first test date until the end
testing_data = agent_environment_data.loc[_test_start:]

# Split the turbulence index by row position into chunks of the same lengths
# (assumes its rows are aligned one-to-one with the market data -- TODO confirm)
_n_training = len(training_data)
_n_training_validation = _n_training + len(validation_data)
turbulence_training = turbulence_index.iloc[:_n_training]
turbulence_validation = turbulence_index.iloc[_n_training:_n_training_validation]
turbulence_testing = turbulence_index.iloc[_n_training_validation:]

## Train the Agents

Four agents will be trained in total. The configuration of each agent will be as follows:

| Agent    | Network Architecture | Sentiments Used | Reward Function                           |
|----------|---------------------:|:---------------:|:-----------------------------------------:|
| 1        |        MLP           |       ✗         |  Absolute Portfolio Change                |
| 2        |        MLP           |       ✓         |  Absolute Portfolio Change                |
| 3        |        MLP           |       ✓         |  Portfolio Return  & Sharpe Ratio         |
| 4        |        MLP + LSTM    |       ✓         |  Portfolio Return  & Sharpe Ratio         |

Note that the 4th agent represents the final agent containing all model changes. To find the hyperparameters for which the agent performs best, a hyperparameter optimization using Optuna is carried out. This is only done for the fourth agent; the other agents are then trained with the same hyperparameters.

### Hyperparameter estimation (using Optuna) regarding the setup of the 4th agent
Note that because training takes a long time, each trial of the hyperparameter optimization is trained for only 100k timesteps (instead of the full 500k that will be used later). Afterwards, the best configuration is trained for the full 500k timesteps.

In [6]:
def optimize_agent(trial):
    """ 
    Objective function of the hyperparameter estimation using Optuna 

    For the given trial, the RecurrentPPO hyperparameters and the sharpe ratio window
    are sampled, an agent is trained on the training dataset for
    TIMESTEPS_TRAIN_TUNING timesteps and the stored best model is evaluated
    on the validation dataset.

    Please note that this method is based on the comment:
    https://github.com/araffin/rl-baselines-zoo/issues/29#issuecomment-646888620

    Args:
        trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        The negated mean reward returned by evaluate_policy. The value is negated so
        that a study which minimizes its objective prefers higher rewards.
    """
    # Sample the algorithm specific hyperparameters
    model_params = sample_recurrent_ppo_params(trial)
    
    # Sample the window for the sharpe ratio (number of actions, note that after performing w actions, 
    # there are w+1 portfolio values to calculate the sharpe ratio)
    n_step_sharpe = trial.suggest_categorical("n_step_sharpe", [99,199,299])

    # Define the Training environment (NUMBER_ENVS is also used for eval_freq below,
    # so both stay consistent when the number of environments is changed)
    env = make_vec_env('Trading-v4', 
                        n_envs=NUMBER_ENVS,
                        env_kwargs = {"market_data":training_data, 
                                      "turbulence_index":turbulence_training,
                                      "consider_sentiments":True, 
                                      "n_step_sharpe":n_step_sharpe},
                        seed=SEED)

    # Define the environment to evaluate the agent on (validation dataset will be used)
    eval_env = gym.make(
                        'Trading-v4', 
                        market_data=validation_data, 
                        turbulence_index=turbulence_validation,
                        consider_sentiments=True, 
                        n_step_sharpe=n_step_sharpe,
                        seed=SEED
                        )

    try:
        # Define the recurrent ppo model with the sampled hyperparameters
        model = RecurrentPPO(
                        'MlpLstmPolicy', 
                        env, 
                        verbose=0, 
                        seed=SEED,
                        **model_params
                        )
        
        # Define the evaluation callback to evaluate the policy 
        eval_callback_params_tuning = EvaluationCallback(
                                            eval_env,
                                            eval_freq=N_STEP_VAL*NUMBER_ENVS,
                                            n_eval_episodes=N_EPISODE_VAL, 
                                            save_every_nstep=None, # Don't save intermediate agents to save memory
                                            log_path=LOG_DIR + '/fourth_agent_hpt_2',
                                            verbose=0
                                            )

        # Train the agent (the number of timesteps is a count, hence passed as an integer)
        model.learn(int(TIMESTEPS_TRAIN_TUNING),callback=eval_callback_params_tuning)    

        # Load the model that achieved the highest rewards on the validation dataset.
        # NOTE(review): this absolute path has to point to the same directory as the
        # `log_path` of the callback above (presumably where the callback stores
        # best_model.zip) -- otherwise a stale model would be evaluated. Confirm.
        model = RecurrentPPO.load(
            "/Applications/Studium/Master/Masterarbeit/AADRL/train_agents/training_eval/fourth_agent_hpt_2/best_model.zip", 
            env=eval_env
            )
        
        # Run the policy 5 times over the validation dataset and calculate the mean episodic return
        _, _, _, mean_reward, _ = evaluate_policy(eval_env, model,n_eval_episodes=5,deterministic=False)
    finally:
        # Release the environments of this trial; otherwise every trial of the study
        # would leave its environments open
        env.close()
        eval_env.close()

    return -mean_reward

In [7]:
# Random search over the hyperparameter space with a fixed seed for reproducibility.
# The study minimizes the objective (Optuna's default direction, spelled out here);
# optimize_agent returns the negated mean reward.
study = optuna.create_study(
    direction="minimize",
    sampler=RandomSampler(seed=SEED),
)
study.optimize(
    optimize_agent,
    n_trials=NUM_TRAILS_PARAMTUNING,
    show_progress_bar=True,
)

[I 2024-09-02 16:17:20,233] A new study created in memory with name: no-name-640b807a-164d-471c-a967-8cb7daf96258


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2024-09-02 16:40:46,307] Trial 0 finished with value: -3.1962892368447253 and parameters: {'batch_size': 128, 'n_steps': 128, 'gamma': 0.99, 'learning_rate': 0.0003, 'clip_range': 0.1, 'n_epochs': 5, 'gae_lambda': 0.99, 'max_grad_norm': 0.3, 'net_arch': 'small', 'lstm_hidden_size': 256, 'n_step_sharpe': 99}. Best is trial 0 with value: -3.1962892368447253.
[I 2024-09-02 16:54:04,063] Trial 1 finished with value: -3.745568366083293 and parameters: {'batch_size': 64, 'n_steps': 128, 'gamma': 0.999, 'learning_rate': 0.0001, 'clip_range': 0.2, 'n_epochs': 15, 'gae_lambda': 0.98, 'max_grad_norm': 0.3, 'net_arch': 'small', 'lstm_hidden_size': 64, 'n_step_sharpe': 299}. Best is trial 1 with value: -3.745568366083293.
[I 2024-09-02 17:48:34,220] Trial 2 finished with value: 1.598771836458777 and parameters: {'batch_size': 128, 'n_steps': 256, 'gamma': 0.99, 'learning_rate': 0.0001, 'clip_range': 0.1, 'n_epochs': 15, 'gae_lambda': 0.95, 'max_grad_norm': 0.3, 'net_arch': 'small', 'lstm_hidden

In [8]:
# Display the hyperparameters of the best trial of the study
study.best_params

{'batch_size': 128,
 'n_steps': 128,
 'gamma': 0.999,
 'learning_rate': 0.0005,
 'clip_range': 0.2,
 'n_epochs': 5,
 'gae_lambda': 0.98,
 'max_grad_norm': 0.3,
 'net_arch': 'small',
 'lstm_hidden_size': 128,
 'n_step_sharpe': 299}

In [9]:
# Hyperparameters of the best trial (read once and reused below)
tuned_params = study.best_params

# Keyword arguments for the model constructors: the tuned scalar hyperparameters ...
best_params = {
    name: tuned_params[name]
    for name in (
        'batch_size',
        'n_steps',
        'gamma',
        'learning_rate',
        'clip_range',
        'n_epochs',
        'gae_lambda',
        'max_grad_norm',
    )
}

# ... plus the network architecture: the sampled label 'small' selects the
# [64, 64] layout, any other label selects the [256, 64] layout
if tuned_params['net_arch'] == 'small':
    network_layout = dict(pi=[64, 64], vf=[64, 64])
else:
    network_layout = dict(pi=[256, 64], vf=[256, 64])
best_params['policy_kwargs'] = {'net_arch': network_layout}

# Parameters for RecurrentPPO: a deep copy is taken so that adding the LSTM size
# does not alter the nested policy_kwargs of best_params
recurrent_params = copy.deepcopy(best_params)
recurrent_params['policy_kwargs']['lstm_hidden_size'] = tuned_params['lstm_hidden_size']

# Window of the sharpe ratio found by the hyperparameter estimation
N_STEP_SHARPE = tuned_params['n_step_sharpe']

# Store the best hyperparameters as a one-row table in a CSV file
best_params_csv = pd.DataFrame.from_dict(tuned_params, orient='index').T
best_params_csv.to_csv(r'/Applications/Studium/Master/Masterarbeit/AADRL/train_agents/training_eval/best_params_hyperparameteropt_2.csv')

### Train the Agents 

Multiple random seeds are used to get a more reliable picture of the agents' performance.

In [10]:
# Define the combinations of whether to consider sentiments and sharpe ratio into reward function.
# Both lists are iterated in parallel in the training loop: the entries at position i
# configure agent i+1.
consider_sentiments = [False, True, True, True]
consider_sharpes = [None, None, N_STEP_SHARPE, N_STEP_SHARPE] # When None, the sharpe ratio will not be considered 

In [11]:
# Every agent configuration is trained once per seed in this list
random_seeds = [42, 7, 25, 14]  # Random seeds used for training (chosen arbitrarily but must be fixed to ensure reproducible results)

In [15]:
# Loop over the random seeds and execute the training.
# For every seed, all four agent configurations are trained for TIMESTEPS_LEARNING
# timesteps; the evaluation callback evaluates each agent on the validation dataset
# and logs to LOG_DIR/agent_<number>_seed_<seed>.
for seed in random_seeds:
    # Loop over the different configurations and train the agent
    for agent_num, (consider_sentiment, consider_sharpe) in enumerate(zip(consider_sentiments, consider_sharpes)):

        # Define the training environment (NUMBER_ENVS is also used for eval_freq below,
        # so both stay consistent when the number of environments is changed)
        env = make_vec_env('Trading-v4', 
                            n_envs=NUMBER_ENVS,
                            env_kwargs = {"market_data":training_data, 
                                        "turbulence_index":turbulence_training,
                                        "consider_sentiments":consider_sentiment, 
                                        "n_step_sharpe":consider_sharpe}
                            ) # Seed does not need to be specified (has no effect) -> needs to be specified in model definition

        # Define the environment to evaluate the agent on the validation dataset
        eval_env_val = gym.make(
                            'Trading-v4', 
                            market_data=validation_data, 
                            turbulence_index=turbulence_validation,
                            consider_sentiments=consider_sentiment, 
                            n_step_sharpe=consider_sharpe,
                            )

        try:
            # Define the agent: Use RecurrentPPO if it is the fourth agent (index 3), otherwise PPO
            if agent_num != 3:
                model_agent = PPO(
                    'MlpPolicy',
                    env,
                    seed=seed,
                    verbose=0,
                    **best_params
                )
            else:
                model_agent = RecurrentPPO(
                    'MlpLstmPolicy',
                    env,
                    seed=seed,
                    verbose=0,
                    **recurrent_params
                )

            # Define the evaluation callback (to save the best performing agent and saving the agent every n-th step)
            eval_callback_validation = EvaluationCallback(
                                                eval_env_val,
                                                eval_freq=N_STEP_VAL*NUMBER_ENVS,
                                                n_eval_episodes=N_EPISODE_VAL, 
                                                save_every_nstep=True, 
                                                saving_freq=10000,
                                                log_path=LOG_DIR + f'/agent_{agent_num + 1}_seed_{seed}',
                                                verbose=0,
                                                additional_name_eval_file='validation'
                                            )

            # Train the agent (the number of timesteps is a count, hence passed as an integer)
            model_agent.learn(
                total_timesteps=int(TIMESTEPS_LEARNING),
                callback=eval_callback_validation,
                progress_bar=False)
        finally:
            # Release the environments of this run; otherwise every combination of seed
            # and agent would leave its environments open
            env.close()
            eval_env_val.close()


In [16]:
# Write every split to its CSV file. Each pair holds the dataframe and its destination;
# the files are written in the order training, validation, test (market data first,
# then the turbulence index).
for split_frame, destination in (
    # Training data
    (training_data, '/Applications/Studium/Master/Masterarbeit/AADRL/data/training/market_data.csv'),
    (turbulence_training, '/Applications/Studium/Master/Masterarbeit/AADRL/data/training/turbulence_index.csv'),
    # Validation data
    (validation_data, '/Applications/Studium/Master/Masterarbeit/AADRL/data/validation/market_data.csv'),
    (turbulence_validation, '/Applications/Studium/Master/Masterarbeit/AADRL/data/validation/turbulence_index.csv'),
    # Test data
    (testing_data, '/Applications/Studium/Master/Masterarbeit/AADRL/data/testing/market_data.csv'),
    (turbulence_testing, '/Applications/Studium/Master/Masterarbeit/AADRL/data/testing/turbulence_index.csv'),
):
    split_frame.to_csv(destination)