# Another Example

### Import external modules

In [1]:
import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import itertools

from stable_baselines3 import PPO, A2C
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import VecMonitor

### Add mbt-gym to path

In [2]:
import sys
sys.path.append("../")

In [None]:
from mbt_gym.agents.BaselineAgents import CarteaJaimungalMmAgent
from mbt_gym.gym.helpers.generate_trajectory import generate_trajectory
from mbt_gym.gym.StableBaselinesTradingEnvironment import StableBaselinesTradingEnvironment
from mbt_gym.gym.TradingEnvironment import TradingEnvironment
from mbt_gym.gym.wrappers import *
from mbt_gym.rewards.RewardFunctions import PnL, CjMmCriterion, RunningInventoryPenalty
from mbt_gym.stochastic_processes.midprice_models import BrownianMotionMidpriceModel, BrownianMotionJumpMidpriceModel
from mbt_gym.stochastic_processes.arrival_models import PoissonArrivalModel, HawkesArrivalModel
from mbt_gym.stochastic_processes.fill_probability_models import ExponentialFillFunction
from mbt_gym.gym.ModelDynamics import LimitOrderModelDynamics

### Create market making environment

In [None]:
# Episode horizon and the arrival-rate figure used to size the time grid.
terminal_time, arrival_rate = 1.0, 10.0
# Number of discrete time steps per episode (passed to the environment as n_steps).
n_steps = int(10 * terminal_time * arrival_rate)
# Inventory-aversion coefficients fed to RunningInventoryPenalty:
# phi is the per-step coefficient, alpha the terminal one.
phi, alpha = 0.5, 0.001

### Get the environment

In [None]:
def get_new_env(num_trajectories:int = 1):
    """Build a limit-order market-making ``TradingEnvironment``.

    The environment combines a jump Brownian-motion midprice model, a Hawkes
    arrival model and an exponential fill function, and is rewarded through
    ``RunningInventoryPenalty``.

    Reads the module-level ``n_steps``, ``terminal_time``, ``phi`` and
    ``alpha`` at call time.

    Args:
        num_trajectories: Number of trajectories forwarded to every model and
            to the environment itself.

    Returns:
        A ``TradingEnvironment`` with un-normalised action and observation
        spaces and ``max_inventory`` equal to ``n_steps``.
    """
    fill_exponent = 1
    sigma = 0.1
    # presumably a (low, high) range for the starting inventory - confirm
    # against TradingEnvironment.
    initial_inventory = (-4, 5)
    # Single source of truth for the time increment shared by all models.
    step_size = 1 / n_steps

    midprice_model = BrownianMotionJumpMidpriceModel(
        volatility=sigma,
        step_size=step_size,
        jump_size=1,
        num_trajectories=num_trajectories,
    )
    arrival_model = HawkesArrivalModel(
        step_size=step_size,
        num_trajectories=num_trajectories,
    )
    fill_probability_model = ExponentialFillFunction(
        fill_exponent=fill_exponent,
        step_size=step_size,
        num_trajectories=num_trajectories,
    )
    LOtrader = LimitOrderModelDynamics(
        midprice_model=midprice_model,
        arrival_model=arrival_model,
        fill_probability_model=fill_probability_model,
        num_trajectories=num_trajectories,
    )
    reward_function = RunningInventoryPenalty(
        per_step_inventory_aversion=phi,
        terminal_inventory_aversion=alpha,
    )
    env_params = dict(
        terminal_time=terminal_time,
        n_steps=n_steps,
        initial_inventory=initial_inventory,
        model_dynamics=LOtrader,
        max_inventory=n_steps,
        normalise_action_space=False,
        normalise_observation_space=False,
        reward_function=reward_function,
        num_trajectories=num_trajectories,
    )
    return TradingEnvironment(**env_params)

In [None]:
def get_new_env(
    drift=0.0, 
    volatility=0.1, 
    jump_size=1.0, 
    initial_price=100, 
    step_size=1/n_steps, 
    num_trajectories=1, 
    initial_inventory = (-4,5),
    inventory_exponent=2.0, 
    baseline_arrival_rate=np.array([[10.0, 10.0]]), 
    arrival_jump_size=40.0, 
    mean_reversion_speed=60.0,
    timestamps = np.linspace(0, terminal_time, n_steps + 1)
):
    """Build a configurable limit-order market-making ``TradingEnvironment``.

    Defaults for ``step_size`` and ``timestamps`` are evaluated once, when
    this function is defined, from the module-level ``n_steps`` and
    ``terminal_time``. The module-level ``terminal_time``, ``phi`` and
    ``alpha`` are read at call time.

    Args:
        drift: Forwarded to ``BrownianMotionJumpMidpriceModel``.
        volatility: Forwarded to ``BrownianMotionJumpMidpriceModel``.
        jump_size: Midprice jump size, forwarded to the midprice model.
        initial_price: Forwarded to the midprice model.
        step_size: Time increment shared by all models; also determines the
            number of environment steps.
        num_trajectories: Number of trajectories forwarded to every model and
            to the environment.
        initial_inventory: Forwarded to ``TradingEnvironment``.
        inventory_exponent: Forwarded to ``RunningInventoryPenalty``.
        baseline_arrival_rate: Forwarded to ``HawkesArrivalModel``. The
            default array is shared between calls and is not modified here.
        arrival_jump_size: Passed to ``HawkesArrivalModel`` as ``jump_size``.
        mean_reversion_speed: Forwarded to ``HawkesArrivalModel``.
        timestamps: Unused; kept so existing callers that pass it still work.

    Returns:
        A ``TradingEnvironment`` with un-normalised action and observation
        spaces and ``max_inventory`` fixed at 100.
    """
    # Round before converting: a bare int() truncates, so a quotient that
    # floating-point error leaves just below a whole number (for example
    # 1 / (1 / 49)) would silently lose one step.
    n_steps = int(round(terminal_time / step_size))

    # Simulate midprice using Brownian motion with jumps
    midprice_model = BrownianMotionJumpMidpriceModel(
        drift=drift,
        volatility=volatility,
        jump_size=jump_size,
        initial_price=initial_price,
        terminal_time=terminal_time,
        step_size=step_size,
        num_trajectories=num_trajectories,
    )

    # Simulate order arrivals using Hawkes process
    arrival_model = HawkesArrivalModel(
        baseline_arrival_rate=baseline_arrival_rate,
        step_size=step_size,
        jump_size=arrival_jump_size,
        mean_reversion_speed=mean_reversion_speed,
        terminal_time=terminal_time,
        num_trajectories=num_trajectories,
    )

    # Simulate fill probability for limit orders
    fill_probability_model = ExponentialFillFunction(
        fill_exponent=1,
        step_size=step_size,
        num_trajectories=num_trajectories,
    )

    # Combine models to create market dynamics
    LOtrader = LimitOrderModelDynamics(
        midprice_model=midprice_model,
        arrival_model=arrival_model,
        fill_probability_model=fill_probability_model,
        num_trajectories=num_trajectories,
    )

    # Define the reward function based on inventory levels
    reward_function = RunningInventoryPenalty(
        per_step_inventory_aversion=phi,  # TO TEST
        terminal_inventory_aversion=alpha,  # TO TEST
        inventory_exponent=inventory_exponent,  # TO TEST
    )

    # Create the trading environment
    env_params = dict(
        terminal_time=terminal_time,
        n_steps=n_steps,
        initial_inventory=initial_inventory,
        model_dynamics=LOtrader,
        max_inventory=100,
        normalise_action_space=False,  # TO CHECK
        normalise_observation_space=False,  # TO CHECK
        reward_function=reward_function,
        num_trajectories=num_trajectories,
    )

    return TradingEnvironment(**env_params)

In [None]:
# Number of trajectories simulated in parallel by the environment.
num_trajectories = 1000
# Pass by keyword: the most recently defined get_new_env takes `drift` as its
# first positional parameter, so a positional 1000 would set the midprice
# drift and leave num_trajectories at its default of 1.
env = ReduceStateSizeWrapper(get_new_env(num_trajectories=num_trajectories))
# Adapt the trading environment to the interface Stable-Baselines3 trains on.
sb_env = StableBaselinesTradingEnvironment(trading_env=env)

In [None]:
# Output locations: best-model checkpoint and TensorBoard logs.
best_model_path = "./SB_models/PPO-best-CJ"
tensorboard_logdir = "./tensorboard/PPO-learning-CJ/"
# Wrap the vectorised environment in a VecMonitor before training.
sb_env = VecMonitor(sb_env)

### Define the Policy: A2C

The simplicity and synchronous nature of A2C may be advantageous in high-frequency trading scenarios, where decisions need to be made quickly. This makes it a reasonable choice for dynamic trading environments.

In [None]:

# Candidate values for each A2C hyperparameter explored by the grid search.
learning_rate_list = [0.0001, 0.001, 0.01]
n_steps_list = [5, 10, 20]
gamma_list = [0.95, 0.99]
gae_lambda_list = [0.9, 0.95, 1.0]
ent_coef_list = [0.01, 0.05, 0.1]
vf_coef_list = [0.25, 0.5, 0.75]
normalize_advantage_list = [True, False]

# Policy network architecture: separate two-layer (256, 256) networks for the
# policy (pi) and the value function (vf). Stable-Baselines3 >= 1.8 expects
# the dictionary directly; the older list-wrapped form
# net_arch=[dict(pi=..., vf=...)] is deprecated and triggers a warning.
policy_kwargs = dict(net_arch=dict(pi=[256, 256], vf=[256, 256]))

In [None]:
# Evaluation callback: evaluates on sb_env with deterministic actions and
# writes the best model found so far under best_model_path.
callback_params = dict(eval_env=sb_env, n_eval_episodes=2048,
                       best_model_save_path=best_model_path,
                       deterministic=True)

callback = EvalCallback(**callback_params)

# Iterate over all combinations of A2C parameter values.
# The rollout-length loop variable is named a2c_n_steps so that it does not
# overwrite the module-level n_steps used by the environment builders.
for learning_rate, a2c_n_steps, gamma, gae_lambda, ent_coef, vf_coef, normalize_advantage in itertools.product(
        learning_rate_list, n_steps_list, gamma_list, gae_lambda_list, ent_coef_list, vf_coef_list, normalize_advantage_list):

    # Print current combination of parameters
    print(f"Training with: learning_rate={learning_rate}, n_steps={a2c_n_steps}, gamma={gamma}, gae_lambda={gae_lambda}, "
          f"ent_coef={ent_coef}, vf_coef={vf_coef}, normalize_advantage={normalize_advantage}")

    A2C_params = {
        "policy": 'MlpPolicy',
        "env": sb_env,
        "verbose": 1,
        "policy_kwargs": policy_kwargs,
        "tensorboard_log": tensorboard_logdir,
        "learning_rate": learning_rate,
        "gamma": gamma,
        "gae_lambda": gae_lambda,
        "ent_coef": ent_coef,
        "vf_coef": vf_coef,
        "max_grad_norm": 0.5,
        "rms_prop_eps": 1e-5,
        "n_steps": int(a2c_n_steps),
        "normalize_advantage": normalize_advantage,
    }

    # Initialize the A2C model with the current set of parameters
    model = A2C(**A2C_params, device="cpu")

    # Train the model. The callback must be handed to learn(); otherwise no
    # evaluation runs and nothing is written to best_model_path.
    # NOTE(review): the same EvalCallback instance is reused for every
    # combination - confirm that tracking the best model across the whole
    # grid (rather than per combination) is the intended behaviour.
    model.learn(total_timesteps=10_000_000, callback=callback)


### Define the Agent

In [None]:
from mbt_gym.agents.SbAgent import SbAgent
# Wrap the trained Stable-Baselines3 model as an mbt_gym agent. `model` is
# whatever the training loop assigned last, i.e. the model trained on the
# final hyperparameter combination, not necessarily the best one.
A2C_agent = SbAgent(model)

### Test

In [None]:
class HistoricalDataEnv(gym.Env):
    """Replay environment that steps through a DataFrame of historical data.

    Each observation is a window of ``window_size`` consecutive rows of
    ``data`` followed by one extra row in which every column holds the current
    inventory, giving a shape of ``(window_size + 1, len(data.columns))``.

    Actions are discrete: 0 holds, 1 buys one unit, 2 sells one unit. The
    reward is the cash flow of the trade at the current ``mid_price``
    (negative for a buy, positive for a sell, zero for a hold).

    ``reset`` returns only the observation and ``step`` returns the 4-tuple
    ``(obs, reward, done, info)``; the backtest loop in this notebook unpacks
    them in that form.

    Args:
        data: Historical data; must contain a ``mid_price`` column and is
            expected to be fully numeric.
        initial_inventory: Inventory restored on every ``reset``.
        max_inventory: Stored on the instance; not enforced by ``step``.
        window_size: Number of data rows included in each observation.
    """

    def __init__(self, data: pd.DataFrame, initial_inventory=0, max_inventory=100, window_size=10):
        super().__init__()
        self.data = data
        self.initial_inventory = initial_inventory
        self.max_inventory = max_inventory
        self.window_size = window_size
        self.current_step = 0
        self.inventory = initial_inventory

        # Define action and observation space
        self.action_space = gym.spaces.Discrete(3)  # 0: Hold, 1: Bid (buy), 2: Ask (sell)
        # The shape includes the appended inventory row, and the bounds are
        # open because neither prices nor inventory are confined to [0, 1].
        self.observation_space = gym.spaces.Box(
            low=-np.inf,
            high=np.inf,
            shape=(window_size + 1, len(data.columns)),
            dtype=np.float32,
        )

    def reset(self):
        """Rewind to the first row, restore the initial inventory, return the first observation."""
        self.current_step = 0
        self.inventory = self.initial_inventory
        return self._next_observation()

    def _next_observation(self):
        """Return the current data window with the inventory row appended, as float32."""
        window = self.data.iloc[self.current_step:self.current_step + self.window_size].values
        inventory_row = [[self.inventory] * len(self.data.columns)]
        obs = np.append(window, inventory_row, axis=0)
        # Match the dtype declared in observation_space.
        return obs.astype(np.float32)

    def step(self, action):
        """Apply ``action`` at the current mid price and advance one row.

        Returns:
            ``(obs, reward, done, info)``; ``done`` becomes True once fewer
            than ``window_size`` rows would remain after the current position.
        """
        current_price = self.data.iloc[self.current_step]['mid_price']
        reward = 0

        if action == 1:  # Buy
            self.inventory += 1
            reward -= current_price
        elif action == 2:  # Sell
            self.inventory -= 1
            reward += current_price

        self.current_step += 1
        done = self.current_step >= len(self.data) - self.window_size

        return self._next_observation(), reward, done, {}

    def render(self, mode='human', close=False):
        """No-op; this environment has no visual output."""
        pass

# Load historical data
# The frame must contain a 'mid_price' column: HistoricalDataEnv.step reads it
# to price each trade.
data = pd.read_csv('historical_data.csv')  

# Create the environment with historical data
# Note: this rebinds `env`, which previously referred to the simulated
# training environment.
env = HistoricalDataEnv(data)

In [None]:
# Load the trained model
# NOTE(review): no visible cell saves a model under "a2c_mbt_gym_model";
# confirm the file exists or load the checkpoint written to best_model_path.
# NOTE(review): the model was trained on the simulated mbt_gym environment;
# verify its observation and action spaces match HistoricalDataEnv.
model = A2C.load("a2c_mbt_gym_model")

# Reset the environment
obs = env.reset()

# Run the backtest
# One iteration per available step; the environment reports done after the
# same number of steps, so the break is a safeguard.
for _ in range(len(data) - env.window_size):
    # NOTE(review): predict() samples actions unless deterministic=True is
    # passed - confirm whether a reproducible backtest is wanted.
    action, _states = model.predict(obs)
    # Only the latest reward is kept; accumulate it here if totals are needed
    # for the analysis section.
    obs, rewards, done, info = env.step(action)
    if done:
        break

### Result Analysis & Visualization