In [1]:
# Data Handling
import pandas as pd
import numpy as np

# Generic
from collections import defaultdict
from tqdm.auto import tqdm

# Data Visualization
import matplotlib.pyplot as plt 

# Reinforcement Learning
import gym
from data.data_gen import DataGenerator
from Environment.tiles3 import IHT, tiles

# Custom Modules
from Environment.market_making import MarketMakerEnv

# suppress warnings
import warnings
warnings.filterwarnings("ignore")

# Set the seed for reproducibility
SEED = 42
np.random.seed(SEED)

## Setup the training environment

In [2]:
lob_data = DataGenerator._generator('data/lob.csv', levels=1)
lob_data = lob_data.head(3000).values

In [3]:
# Create the environment
env = MarketMakerEnv(lob_data, 2500)
initial_state = env.reset()
display(initial_state)

array([ 2.23990000e+02,  1.00000000e+02,  2.23750000e+02,  7.40000000e+01,
        2.23870000e+02,  2.40000000e-01, -5.00000000e-03, -1.49425287e-01,
       -2.60000000e+01, -2.23338917e-05,  2.40100065e-05,  4.16666667e+01])

In [4]:
# Tile coding parameters
num_tilings = 8
tile_width = 8
iht_size = 4096  # Size of the index hash table

# Create the index hash table (IHT)
iht = IHT(iht_size)

def get_tile_coding(state, num_tilings, tile_width):
    scaled_state = [state[i] * tile_width for i in range(len(state))]
    tile_indices = tiles(iht, num_tilings, scaled_state)
    return tile_indices


In [5]:
alpha = 1e-2 / num_tilings  # Learning rate
gamma = 0.99  # Discount factor
lmbda = 0.9  # Trace decay parameter
epsilon = 0.1  # Exploration rate
num_episodes = 500  # Number of episodes for training


In [6]:
num_actions = env.action_space.n
weights = np.zeros(iht_size)
eligibility_trace = np.zeros(iht_size)

def get_q_value(state, action):
    tile_indices = get_tile_coding(state + [action], num_tilings, tile_width)
    q_value = np.sum(weights[tile_indices])
    return q_value, tile_indices

def select_action(state):
    if np.random.random() < epsilon:
        return env.action_space.sample()  # Explore
    else:
        q_values = [get_q_value(state, action)[0] for action in range(num_actions)]
        return np.argmax(q_values)  # Exploit


In [7]:
for episode in tqdm(range(num_episodes)):
    state = env.reset()
    done = False
    total_reward = 0

    # Select the initial action using the epsilon-greedy policy
    action = select_action(state)

    while not done:
        next_state, reward, done, info = env.step(action)
        next_action = select_action(next_state)
        
        # Get Q-value and tile indices for the current state-action pair
        q_value, tile_indices = get_q_value(state, action)
        
        # Get Q-value for the next state-action pair
        next_q_value, _ = get_q_value(next_state, next_action)

        # Calculate the TD error
        td_error = reward + gamma * next_q_value - q_value

        # Update eligibility traces
        eligibility_trace *= gamma * lmbda
        eligibility_trace[tile_indices] += 1

        # Update weights using TD error and eligibility traces
        weights += alpha * td_error * eligibility_trace

        # Move to the next state and action
        state = next_state
        action = next_action
        total_reward += reward

    # Decay epsilon after each episode for exploration-exploitation balance
    epsilon = max(0.01, epsilon * 0.995)

    if episode % 100 == 0:
        print(f"Episode {episode}, Total Reward: {total_reward}")

  0%|          | 0/500 [00:00<?, ?it/s]

Episode 0, Total Reward: -1306219.3899987824
Episode 100, Total Reward: -502965.5499995865
Episode 200, Total Reward: -542114.6699995514
Episode 300, Total Reward: -135729.67999997275
Episode 400, Total Reward: -595848.2899994311


In [9]:
epsilon = 0  # No exploration, only exploitation
state = env.reset()
done = False
total_reward = 0

while not done:
    action = select_action(state)
    state, reward, done, info = env.step(action)
    total_reward += reward

print(f"Total Reward: {total_reward}")

Total Reward: -170648.96499983507
