In [13]:
# Step 1: Define RL Environment (custom OpenAI Gym environment)
import gym
from gym import spaces
import numpy as np

class MarketMakingEnv(gym.Env):
    def __init__(self, initial_cash=100000, max_steps=1000):
        super(MarketMakingEnv, self).__init__()

        self.initial_cash = initial_cash
        self.max_steps = max_steps
        self.current_step = 0

        # Define action space and observation space
        self.action_space = spaces.Discrete(3)  # Example: 3 discrete actions (buy, sell, hold)
        self.observation_space = spaces.Box(low=0, high=np.inf, shape=(5,), dtype=np.float32)  # Example: 5-dimensional observation space

        # Initialize other variables and state
        self.cash = initial_cash
        self.inventory = 0
        self.price_history = np.zeros(max_steps)  # Placeholder for price history
        self.reward_range = (-np.inf, np.inf)

    def reset(self):
        # Reset environment to initial state
        self.current_step = 0
        self.cash = self.initial_cash
        self.inventory = 0
        self.price_history = np.zeros(self.max_steps)  # Reset price history
        return self._next_observation()

    def step(self, action):
        # Take action in the environment
        self.current_step += 1

        # Example: Simulate price movement (replace with your market simulation logic)
        current_price = 100 + np.sin(self.current_step / 10) * 10

        # Example: Execute action (replace with your trading logic)
        if action == 0:  # Buy
            self.inventory += 1
            self.cash -= current_price
        elif action == 1:  # Sell
            self.inventory -= 1
            self.cash += current_price
        elif action == 2:  # Hold
            pass

        # Update price history
        self.price_history[self.current_step - 1] = current_price

        # Calculate reward (replace with your reward function)
        reward = self._calculate_reward()

        # Check if done
        done = self.current_step >= self.max_steps

        # Prepare next state
        next_state = self._next_observation()

        return next_state, reward, done, {}

    def _next_observation(self):
        # Example: Construct observation (replace with your state representation)
        observation = np.array([self.cash, self.inventory, self.price_history[self.current_step - 1], 0, 0])
        return observation

    def _calculate_reward(self):
        # Example: Simple reward function (replace with your own reward logic)
        return self.cash + self.inventory * self.price_history[self.current_step - 1]

    def render(self, mode='human'):
        # Render environment state (optional)
        pass

# Test the environment
if __name__ == "__main__":
    env = MarketMakingEnv()
    observation = env.reset()
    for _ in range(10):
        action = env.action_space.sample()  # Random action for testing
        observation, reward, done, _ = env.step(action)
        print(f"Action: {action}, Reward: {reward}, Done: {done}, Cash: {env.cash}, Inventory: {env.inventory}")


Action: 0, Reward: 100000.0, Done: False, Cash: 99899.00166583354, Inventory: 1
Action: 0, Reward: 100000.9883591415, Done: False, Cash: 99797.01497252559, Inventory: 2
Action: 2, Reward: 100002.92537665882, Done: False, Cash: 99797.01497252559, Inventory: 2
Action: 0, Reward: 100004.80333937176, Done: False, Cash: 99693.1207891025, Inventory: 3
Action: 2, Reward: 100007.50355526063, Done: False, Cash: 99693.1207891025, Inventory: 3
Action: 2, Reward: 100010.06006330435, Done: False, Cash: 99693.1207891025, Inventory: 3
Action: 1, Reward: 100012.44731971963, Done: False, Cash: 99799.56296597488, Inventory: 2
Action: 1, Reward: 100013.91008779287, Done: False, Cash: 99906.73652688388, Inventory: 1
Action: 2, Reward: 100014.56979598015, Done: False, Cash: 99906.73652688388, Inventory: 1
Action: 1, Reward: 100015.15123673195, Done: False, Cash: 100015.15123673195, Inventory: 0


In [None]:
pip install stable_baselines3

Collecting gymnasium<0.30,>=0.28.1 (from stable_baselines3)
  Using cached gymnasium-0.29.1-py3-none-any.whl (953 kB)
Installing collected packages: gymnasium
  Attempting uninstall: gymnasium
    Found existing installation: gymnasium 1.0.0a2
    Uninstalling gymnasium-1.0.0a2:
      Successfully uninstalled gymnasium-1.0.0a2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
shimmy 2.0.0 requires gymnasium>=1.0.0a1, but you have gymnasium 0.29.1 which is incompatible.[0m[31m
[0mSuccessfully installed gymnasium-0.29.1


In [None]:
pip install 'shimmy>=0.2.1'

Collecting gymnasium>=1.0.0a1 (from shimmy>=0.2.1)
  Using cached gymnasium-1.0.0a2-py3-none-any.whl (954 kB)
Installing collected packages: gymnasium
  Attempting uninstall: gymnasium
    Found existing installation: gymnasium 0.29.1
    Uninstalling gymnasium-0.29.1:
      Successfully uninstalled gymnasium-0.29.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
stable-baselines3 2.3.2 requires gymnasium<0.30,>=0.28.1, but you have gymnasium 1.0.0a2 which is incompatible.[0m[31m
[0mSuccessfully installed gymnasium-1.0.0a2


In [None]:
# Step 3: Evaluate and Deploy RL Agent
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

# Optionally, deploy the model for live trading or further evaluation
# Example: Load the model and interact with the environment
loaded_model = PPO.load("market_making_model")

# Test the loaded model
obs = env.reset()
for _ in range(10):
    action, _states = loaded_model.predict(obs, deterministic=True)
    obs, rewards, dones, info = env.step(action)
    env.render()

# Optionally, save the model for deployment
loaded_model.save("market_making_model_final")


In [7]:
import gym
from gym import spaces
import numpy as np
import pandas as pd

class MarketMakingEnv(gym.Env):
    def __init__(self, data):
        super(MarketMakingEnv, self).__init__()
        self.data = data.reset_index(drop=True)  # Reset index for consistent access
        self.current_step = 0
        self.max_steps = len(data) - 1

        # Action space: discrete actions (buy, sell, hold)
        self.action_space = spaces.Discrete(3)

        # Observation space: includes bid/ask prices, volumes, trade volumes
        self.observation_space = spaces.Box(low=0, high=np.inf, shape=(7,), dtype=np.float32)

    def reset(self):
        self.current_step = 0
        return self._next_observation()

    def step(self, action):
        self.current_step += 1

        # Execute action: For simplicity, actions are placeholders
        # Implement actual logic to interact with data and execute trades
        reward = 0.0  # Placeholder reward function

        # Calculate reward based on action and current state

        done = self.current_step >= self.max_steps
        info = {}

        return self._next_observation(), reward, done, info

    def _next_observation(self):
        # Construct observation based on current step in data
        obs = self.data.iloc[self.current_step].values
        return obs

    def render(self, mode='human', close=False):
        pass


In [23]:
# Step 2: Define and Train RL Agent (PPO example)
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

# Create environment
env = DummyVecEnv([lambda: MarketMakingEnv()])

# Initialize and train RL model
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=10000)

# Save the trained model





Using cpu device
-----------------------------
| time/              |      |
|    fps             | 942  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
---------------------------------------
| time/                   |           |
|    fps                  | 591       |
|    iterations           | 2         |
|    time_elapsed         | 6         |
|    total_timesteps      | 4096      |
| train/                  |           |
|    approx_kl            | 0.0       |
|    clip_fraction        | 0         |
|    clip_range           | 0.2       |
|    entropy_loss         | -1.1      |
|    explained_variance   | 0         |
|    learning_rate        | 0.0003    |
|    loss                 | 1.35e+12  |
|    n_updates            | 10        |
|    policy_gradient_loss | -2.83e-06 |
|    value_loss           | 2.73e+12  |
---------------------------------------
---------------------------------------
| time/      

<stable_baselines3.ppo.ppo.PPO at 0x7a9e62fa59f0>

In [9]:
pip install -U stable-baselines3


Collecting gymnasium<0.30,>=0.28.1 (from stable-baselines3)
  Using cached gymnasium-0.29.1-py3-none-any.whl (953 kB)
Installing collected packages: gymnasium
  Attempting uninstall: gymnasium
    Found existing installation: gymnasium 1.0.0a2
    Uninstalling gymnasium-1.0.0a2:
      Successfully uninstalled gymnasium-1.0.0a2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
shimmy 2.0.0 requires gymnasium>=1.0.0a1, but you have gymnasium 0.29.1 which is incompatible.[0m[31m
[0mSuccessfully installed gymnasium-0.29.1


In [10]:
pip install gymnasium




In [18]:
def evaluate_model(model, env, num_episodes=10):
    episode_rewards = []
    for _ in range(num_episodes):
        obs = env.reset()
        episode_reward = 0.0
        done = False
        while not done:
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
        episode_rewards.append(episode_reward)
    mean_reward = np.mean(episode_rewards)
    std_reward = np.std(episode_rewards)
    return mean_reward, std_reward

# Evaluate the trained RL model
mean_reward, std_reward = evaluate_model(model, env, num_episodes=10)
print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")


Mean reward: 99822384.00 +/- 0.00


In [22]:
import pandas as pd

# Load your dataset
data = pd.read_csv('/content/random_market_data.csv')

# Assuming the previous steps for defining environment, training model, etc.

# Evaluate RL agent
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

# Optionally, deploy the trained RL agent for live trading or further evaluation
# Example: Load the trained model and interact with the environment
loaded_model = PPO.load("market_making_model")

# Test the loaded model
obs = env.reset()
for _ in range(len(data) - 1):
    action, _states = loaded_model.predict(obs, deterministic=True)
    obs, rewards, dones, info = env.step(action)
    env.render()

# Optionally, save the final model for deployment





Mean reward: 99822349.33 +/- 0.00


