# Train Model

This notebook is used to train the models. It performs all the necessary setup (environment configuration, environment setup, hyperparameter setup) for training.
Models can be trained either by specifying the hyperparameters explicitly or through random search.

In [1]:
%load_ext autoreload
%autoreload 2
import logging
import gymnasium as gym
from typing import List
from environment import MarketEnv
import constants
import json, glob, os, pickle, datetime
from visualizer import Visualizer
from util import get_datasets, get_train_validate_test_datasets
import os 
from pytorch_lightning import loggers
import ppo
from constants import *
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from environment import Config
import torch
import numpy as np

# Show log records of level INFO and above in the notebook output.
logging.basicConfig(level=logging.INFO)
# Select PyTorch's 'medium' float32 matmul precision setting
# (a speed/precision trade-off; see the PyTorch documentation for details).
torch.set_float32_matmul_precision('medium')

# Dataset

In [2]:
# Load the datasets
datasets = get_datasets()
# Split them into train / validation / test subsets; only `train_sets` is used
# by the environments created in this notebook.
train_sets, validation_sets, test_sets = get_train_validate_test_datasets(datasets)

# Environment

In [3]:
# Environment configuration for the single fixed-hyperparameter training run.
# The meaning of each field is defined by `environment.Config` (not shown in
# this notebook) -- consult that class before changing values.
env_configs = Config(
    MEAN_RUN_DURATION = 100,
    STD_RUN_DURATION = 10,
    START_BALANCE = 1000,
    MAX_BUY_LIMIT = 10,
    CONTINUOUS_MODEL = False,
    TRUNCATION_PENALTY = 0,
    STOCK_HOLDING_REWARD=0,
    RESTING_PENALTY=1,
    RESTING_PENALTY_START=3
)

In [4]:
# Register MarketEnv under the id "MarketEnv-v0" so it can be created via gym.make.
gym.register("MarketEnv-v0", entry_point=MarketEnv)
# Instantiate the environment on the training datasets with the config defined above.
env = gym.make("MarketEnv-v0", datasets=train_sets, config=env_configs)

# Model Training

In [5]:
# Fixed hyperparameter set for a single training run. The dict is passed to
# `train_model`, which forwards it as keyword arguments to `ppo.PPO`.
hyperparameters = {
    'gamma': 0.8277782036668777,
    'lam': 0.8425986928387932,
    'lr_actor': 0.0002484147755421103, 
    'lr_critic': 0.001950332927924421, 
    'max_episode_len': 1000, 
    'batch_size': 64, 
    'steps_per_epoch': 2048, 
    'nb_optim_iters': 16, 
    'clip_ratio': 0.13837575923666506, 
    'rec_hidden_size': 128, 
    'fc_hidden_sizes': [16, 16, 16], 
    'rec_num_layers': 2, 
    'rec_nonlinearity': 'tanh', 
    'fc_nonlinearity': 'tanh', 
    'rnn_type': 'LSTM', 
    'dropout': 0.01
}

In [6]:
def train_model(env, env_configs, hyperparameters, epochs=100, accelerator="gpu"):
    """Train a freshly constructed PPO model and checkpoint its best epochs.

    Args:
        env: Environment instance handed to ``ppo.PPO``.
        env_configs: Environment configuration, passed to ``ppo.PPO`` as ``config``.
        hyperparameters: Dict forwarded to ``ppo.PPO`` as keyword arguments.
        epochs: Maximum number of training epochs (``Trainer(max_epochs=...)``).
        accelerator: Lightning accelerator to train on. Defaults to ``"gpu"``
            (the previously hard-coded value); pass e.g. ``"cpu"`` or ``"auto"``
            to run on a machine without a GPU.

    Returns:
        None. Logs and checkpoints are written below ``../models/generated``.
    """
    # Define the model
    model = ppo.PPO(env=env, config=env_configs, **hyperparameters)

    # Set up the TensorBoard logger; its `log_dir` is reused as checkpoint directory
    # so that logs and checkpoints of one run end up in the same folder.
    log_root = os.path.join("..", "models", "generated")
    tb_logger = loggers.TensorBoardLogger(log_root)

    # Keep the five checkpoints with the highest logged `avg_reward`
    checkpoint_callback = ModelCheckpoint(
        monitor='avg_reward',
        dirpath=tb_logger.log_dir,
        filename='model-{epoch:02d}-{avg_reward:.5f}',
        save_top_k=5,
        mode='max',
    )

    # Define the trainer
    trainer = Trainer(
        max_epochs=epochs,
        accelerator=accelerator,
        logger=tb_logger,
        callbacks=[checkpoint_callback],
    )

    # Train the model
    trainer.fit(model)

In [7]:
# Train with the fixed hyperparameters for 1000 epochs (overriding the default of 100).
train_model(env, env_configs, hyperparameters, epochs=1000)

  logger.warn(f"{pre} is not within the observation space.")
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
c:\Python311\Lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:654: Checkpoint directory C:\Users\Daniel\OneDrive - TU Wien\Uni\7. Semester\ADL\repo\adl\models\generated\lightning_logs\version_69 exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type             | Params | Mode 
----------------------------------------------------
0 | critic | RNNModel         | 203 K  | train
1 | actor  | ActorCategorical | 203 K  | train
----------------------------------------------------
407 K     Trainable params
0         Non-trainable params
407 K     Total params
1.629     Total estimated model params size (MB)
21        Modules in train mode
0         Modules in eval mode
c:\Python311\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'train_d

Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1000` reached.


# Randomized search

In [7]:
def get_hyperparameters(
    gamma: float,
    lam: float,
    lr_actor: float,
    lr_critic: float,
    clip_ratio: float,
    rec_hidden_size: int,
    fc_hidden_sizes: List[int],
    rec_num_layers: int,
    rec_nonlinearity: str,
    fc_nonlinearity: str,
    rnn_type: str,
    dropout: float,
):
    """Assemble a complete hyperparameter dict from the searchable values.

    The arguments are the values varied during the search; the episode length,
    batch size, steps per epoch and number of optimisation iterations are
    fixed constants filled in here.

    Returns:
        dict mapping hyperparameter name to value.
    """
    params = dict(
        gamma=gamma,
        lam=lam,
        lr_actor=lr_actor,
        lr_critic=lr_critic,
        # Values that are held constant across the search
        max_episode_len=1000,
        batch_size=64,
        steps_per_epoch=2048,
        nb_optim_iters=16,
        clip_ratio=clip_ratio,
        rec_hidden_size=rec_hidden_size,
        fc_hidden_sizes=fc_hidden_sizes,
        rec_num_layers=rec_num_layers,
        rec_nonlinearity=rec_nonlinearity,
        fc_nonlinearity=fc_nonlinearity,
        rnn_type=rnn_type,
        dropout=dropout,
    )
    return params


In [None]:
# Randomized search: keep sampling hyperparameters / environment configs and
# training a model for each sample until the cell is interrupted manually
# (KeyboardInterrupt is not an `Exception`, so it still stops the loop).
#
# All values drawn via np.random are cast to builtin Python types so that the
# model receives the same kind of values (int / float / str / bool) as in the
# fixed-hyperparameter run, rather than numpy scalars.
while True:
    try:
        hyperparameters = get_hyperparameters(
            gamma = float(np.random.uniform(0.7, 0.99)),
            lam = float(np.random.uniform(0.7, 0.99)),
            lr_actor = float(np.random.uniform(0.0001, 0.001)),
            lr_critic = float(np.random.uniform(0.0005, 0.005)),
            clip_ratio = float(np.random.uniform(0.1, 0.3)),
            # Layer widths are powers of two between 16 and 256
            rec_hidden_size = int(2**np.random.randint(4, 9)),
            fc_hidden_sizes = [int(2**np.random.randint(4, 9)) for _ in range(np.random.randint(1, 4))],
            rec_num_layers = int(np.random.randint(1, 4)),
            rec_nonlinearity = str(np.random.choice(["tanh", "relu"])),
            fc_nonlinearity = str(np.random.choice(["tanh", "relu", "sigmoid"])),
            rnn_type = str(np.random.choice(["GRU", "LSTM", "RNN"])),
            # Negative draws are clamped to 0, so about half of the samples use no dropout
            dropout = max(0.0, float(np.random.uniform(-0.7, 0.7)))
        )
        env_configs = Config(
            MEAN_RUN_DURATION = 100,
            STD_RUN_DURATION = 10,
            START_BALANCE = 1000,
            MAX_BUY_LIMIT = 10,
            CONTINUOUS_MODEL = bool(np.random.choice([False, True])),
            # Weighted choices: 0 is the most likely truncation penalty, 1 the most likely holding reward
            TRUNCATION_PENALTY = float(np.random.choice([0, 0.1, 0.5, 1, 5, 10], p=[0.5, 0.1, 0.1, 0.1, 0.1, 0.1])),
            STOCK_HOLDING_REWARD = float(np.random.choice([0, 0.1, 0.5, 1, 5, 10], p=[0.1, 0.1, 0.1, 0.5, 0.1, 0.1]))
        )
        env = gym.make("MarketEnv-v0", datasets=train_sets, config=env_configs)
        train_model(env, env_configs, hyperparameters)
    except Exception:
        # Best-effort search: record the failing sample together with the full
        # traceback and move on to the next one.
        # NOTE: if sampling itself failed, `hyperparameters` / `env_configs`
        # still hold the values of the previous iteration.
        logging.exception("ERROR WITH %s %s", hyperparameters, env_configs)


# Further Training of Selected Models

The models selected for further training were versions 10, 40, 41 and 17. For each of them, the hyperparameters are extracted from its checkpoint and a new model is trained for 1000 epochs instead of 100.

In [6]:
# Version numbers of the 4 most promising models from the random search
selected_models = ["10", "40", "41", "17"]

In [7]:
def extract_hyperparameters(model):
    """Read the environment config and hyperparameters off a model object.

    Args:
        model: Object exposing a ``config`` attribute and one attribute per
            hyperparameter name listed below.

    Returns:
        Tuple ``(env_configs, hyperparameters)`` where ``hyperparameters`` is a
        dict keyed by hyperparameter name.
    """
    hyperparameter_names = (
        'gamma',
        'lam',
        'lr_actor',
        'lr_critic',
        'max_episode_len',
        'batch_size',
        'steps_per_epoch',
        'nb_optim_iters',
        'clip_ratio',
        'rec_hidden_size',
        'fc_hidden_sizes',
        'rec_num_layers',
        'rec_nonlinearity',
        'fc_nonlinearity',
        'rnn_type',
        'dropout',
    )

    env_configs = model.config
    # Copy each attribute into the dict under the same name
    hyperparameters = {name: getattr(model, name) for name in hyperparameter_names}
    return env_configs, hyperparameters

In [8]:
def get_model_path(model_version, model_root=None):
    """Return the path of a checkpoint file for the given model version.

    Checkpoints are searched in
    ``<model_root>/generated/lightning_logs/version_<model_version>/*.ckpt``.
    ``glob.glob`` returns matches in arbitrary, file-system-dependent order, so
    the matches are sorted and the lexicographically last one is returned; this
    makes the selection deterministic across platforms.

    Args:
        model_version: Version identifier (the ``N`` in ``version_N``).
        model_root: Base model directory. Defaults to the module-level
            ``MODEL_PATH`` when omitted.

    Returns:
        Path (str) of the selected ``.ckpt`` file.

    Raises:
        IndexError: If the version directory contains no ``.ckpt`` file.
    """
    root = MODEL_PATH if model_root is None else model_root
    pattern = os.path.join(root, "generated", "lightning_logs", f"version_{model_version}", "*.ckpt")
    print(pattern)

    checkpoints = sorted(glob.glob(pattern))
    if not checkpoints:
        # Same exception type as the bare `[-1]` lookup, but with a useful message
        raise IndexError(f"no checkpoint found matching {pattern}")
    return checkpoints[-1]

In [None]:
# Retrain each selected model version for 1000 epochs.
for model_version in selected_models:
    path = get_model_path(model_version)
    # The checkpoint is loaded only to recover its config and hyperparameters;
    # the loaded `model` object itself is not passed to `train_model`.
    model = ppo.PPO.load_from_checkpoint(path)
    config, hyperparameters = extract_hyperparameters(model)
    # Override the reward-shaping settings of the recovered config for all retrained models
    config.TRUNCATION_PENALTY = 10
    config.STOCK_HOLDING_REWARD = 1
    env = gym.make("MarketEnv-v0", datasets=train_sets, config=config)
    train_model(env, config, hyperparameters, epochs=1000)