In [1]:
import numpy as np

class PriceModel():
    '''
    Note:This is a stateless class, gathering price evolution models in one place
    '''
    def price_model_1(current_price, current_action, tau, vol_matrix, perm_impact_matrix, random_vector):
        return current_price + np.sqrt(tau) * (vol_matrix@random_vector) - perm_impact_matrix @ current_action
    def price_model_2(current_price):
        pass

In [2]:
import gym
from gym import spaces
import pandas as pd


class LiquidationEnv(gym.Env):
    metadata = {'render.modes': ['human']}
    
    def __init__(self, 
                 n_assets=3, 
                 initial_shares=100, 
                 initial_prices=50, 
                 max_steps = 5,
                 price_model=PriceModel,
                 tau = 1,
                 temp_price_matrix = np.identity(3)*0.1,
                 vol_matrix = np.identity(3)*.4,
                 perm_impact_matrix = np.identity(3)*0.052
                 ):
        super(LiquidationEnv, self).__init__()
        
        # Environment parameters
        self.n_assets = n_assets
        self.initial_shares = np.full(n_assets, initial_shares, dtype=np.float32)
        self.initial_prices = np.full(n_assets, initial_prices, dtype=np.float32)
        self.max_steps = max_steps
        self.price_generator = price_model.price_model_1
        self.temp_price_matrix = temp_price_matrix
        self.tau = tau
        self.vol_matrix = vol_matrix
        self.perm_impact_matrix = perm_impact_matrix
        
        # Define action and observation spaces
        self.action_space = spaces.Box(
            low=0,
            high= 1,
            shape=(n_assets,),
            dtype=np.float32
        )
        
        self.observation_space = spaces.Dict({
            "prices": spaces.Box(low = -np.inf, high=np.inf, shape=(n_assets,), dtype=np.float32),
            "remaining": spaces.Box(low = 0, high=1, shape=(n_assets,), dtype=np.float32)
            #"acc_revenue": spaces.Box(low = -np.inf, high=np.inf, shape=(1,), dtype=np.float32)
        })
        
        # Initialize state
        self.state = None
        self.current_step = 0
        self.reset()

    def _get_obs(self):
        return {
            "prices": self.state['prices'].copy().astype(np.float32)/self.initial_prices,
            "remaining": self.state['remaining'].copy().astype(np.float32)/self.initial_shares
            #"acc_revenue": np.array([self.state['acc_revenue']], dtype=np.float32)
        }

    def _next_price(self, current_price , current_action, tau, vol_matrix, perm_impact_matrix, random_vector):
        # actual_action = self.state['remaining'] * current_action
        return self.price_generator(current_price, current_action, tau, vol_matrix, perm_impact_matrix, random_vector)

    def reset(self):
        # Reset initial prices (customize with your price initialization)
        self.state = {
            'prices': self.initial_prices.copy()/self.initial_prices,
            'remaining': self.initial_shares.copy()/self.initial_shares
            #'acc_revenue': 0.0
        }
        self.current_step = 0
        return self._get_obs()
    
    def _get_reward(self, state, action, temp_price_matrix):
        '''
        The function to calculate the reward
        '''
        # actual_action = action * state['remaining']
        reward = action.dot(state['prices']*self.initial_prices - temp_price_matrix.dot(action))/(np.dot(self.initial_prices, self.initial_shares))
        return reward

    def step(self, action):
        # TODO: need a better way than clipping
        if self.current_step == self.max_steps - 1:
            actual_action = self.state['remaining']*self.initial_shares
        else:
            actual_action = np.minimum(self.state['remaining']*self.initial_shares, action * self.initial_shares)
        
        reward = self._get_reward(self.state, actual_action, self.temp_price_matrix)
        # Update state
        self.state['remaining'] = (self.state['remaining']*self.initial_shares - actual_action)/self.initial_shares
        # step_revenue = np.sum(actual_action * (self.state['prices'] - self.temp_price_matrix.dot(actual_action))) # Calculate revenue from current prices
        # self.state['acc_revenue'] += step_revenue # TODO: what's the third part of the state? what's the formula to calculate it?

        random_vector = np.random.normal(size = self.n_assets)
        self.state['prices'] = self._next_price(self.state['prices']*self.initial_prices , actual_action, self.tau, self.vol_matrix, self.perm_impact_matrix, random_vector)/self.initial_prices
        
        # Update step counter
        
        self.current_step += 1
        
        # Check termination conditions
        done = (np.sum(self.state['remaining']) <= 0) or (self.current_step >= self.max_steps)
            
        return self._get_obs(), reward, done, {}

    def render(self, mode='human'):
        print(f"Step: {self.current_step}")
        print(f"Prices: {self.state['prices']}")
        print(f"Remaining: {self.state['remaining']}")
        #print(f"Accumulated Revenue: {self.state['acc_revenue']:.2f}\n")
        
    def close(self):
        pass

In [3]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.callbacks import EvalCallback
env = LiquidationEnv(n_assets=3, initial_shares=100)
check_env(env)




In [4]:
env.reset()
env.render()

Step: 0
Prices: [1. 1. 1.]
Remaining: [1. 1. 1.]


In [10]:
action = np.ones(1)*0.1
env.step(action)

env.render()

Step: 6
Prices: [0.86499436 0.89342401 0.88711833]
Remaining: [0. 0. 0.]


In [11]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.callbacks import EvalCallback


env = LiquidationEnv(n_assets=3, initial_shares = 100)
env_eval = LiquidationEnv(n_assets=3, initial_shares=100)
eval_callback = EvalCallback(env_eval, 
                            best_model_save_path=f"../log/model/test/",
                            eval_freq=100,
                            deterministic=True, 
                            render=False,
                            verbose=0)
model = PPO("MultiInputPolicy", env, gamma=0.9999999, tensorboard_log="../log/tensorboard_test/", verbose=0)
model.learn(int(8e5), progress_bar=True, callback=eval_callback)


Output()

KeyboardInterrupt: 

In [22]:
env = LiquidationEnv(n_assets=5, initial_shares = 100, 
                     vol_matrix= np.identity(5),
                     perm_impact_matrix= 0.05* np.identity(5), temp_price_matrix= 0.1*np.identity(5))
env_eval = LiquidationEnv(n_assets=5, initial_shares =  100, 
                     vol_matrix= np.identity(5),
                     perm_impact_matrix= 0.05* np.identity(5), temp_price_matrix= 0.1*np.identity(5))
eval_callback = EvalCallback(env_eval, 
                            best_model_save_path=f"../log/model/test/",
                            eval_freq=100,
                            deterministic=True, 
                            render=False,
                            verbose=0)
model = PPO("MultiInputPolicy", env, gamma=0.9999999999, tensorboard_log="../log/tensorboard_test/", verbose=0)
model.learn(int(8e5), progress_bar=True, callback=eval_callback)

Output()

<stable_baselines3.ppo.ppo.PPO at 0x7c727b4c9550>

In [23]:
from stable_baselines3 import PPO

# Load the best model from EvalCallback's auto-saved checkpoints
best_model = PPO.load("../log/model/test/best_model.zip")

In [24]:
def get_liquidation_actions(model, env, n_episodes=5):
    all_episode_actions = []
    all_remaining_shares = []

    for _ in range(n_episodes):
        obs = env.reset()
        done = False
        episode_actions = []
        remaining_shares = [env.state['remaining'].copy()]
        prices = [env.initial_prices.copy()]
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            episode_actions.append(action)
            obs, _, done, _ = env.step(action)
            remaining_shares.append(env.state['remaining'].copy())
            prices.append(env.state['prices'].copy())
        all_episode_actions.append(episode_actions)
        all_remaining_shares.append(np.array(remaining_shares))
    average_actions = np.mean(all_episode_actions, axis=0)
    average_remaining = np.mean(all_remaining_shares, axis =0)
    return average_actions, average_remaining

In [27]:
episode_actions, remaining_shares = get_liquidation_actions(
    best_model, 
    env_eval,  # Use your evaluation environment
    n_episodes=50# For example, get one episode sample
)


In [28]:
episode_actions

array([[0.17651263, 0.18419257, 0.18629867, 0.18166551, 0.19136107],
       [0.18256962, 0.19360909, 0.19116244, 0.18924311, 0.20006947],
       [0.18884541, 0.2033678 , 0.19620436, 0.19709718, 0.20909494],
       [0.19533601, 0.21347328, 0.20140977, 0.20522076, 0.21843283],
       [0.20203862, 0.22391996, 0.20677942, 0.21361296, 0.22807972]],
      dtype=float32)

In [30]:
remaining_shares*100

array([[100.      , 100.      , 100.      , 100.      , 100.      ],
       [ 82.34875 ,  81.58076 ,  81.37013 ,  81.83344 ,  80.86389 ],
       [ 64.095146,  62.22479 ,  62.25592 ,  62.91235 ,  60.860687],
       [ 45.209526,  41.88778 ,  42.6326  ,  43.20042 ,  39.948895],
       [ 25.663488,  20.525743,  22.477911,  22.662226,  18.088137],
       [  0.      ,   0.      ,   0.      ,   0.      ,   0.      ]],
      dtype=float32)

In [29]:
# Get liquidation trajectories
episode_actions, remaining_shares = get_liquidation_actions(
    best_model, 
    env_eval,  # Use your evaluation env
    n_episodes= 1  # Get 5 episode samples
)

# Inspect first episode
print("=== Optimal Liquidation Strategy ===")
for step, (action, remaining) in enumerate(zip(episode_actions[0], remaining_shares[0][:-1])):
    print(f"Step {step+1}:")
    print(f"  Action (% to liquidate): {action}")
    print(f"  Remaining shares: {remaining}\n")

# Verify full liquidation at last step
final_remaining = remaining_shares[0][-1]
print(f"Final remaining shares: {final_remaining} (Should be all zeros)")

=== Optimal Liquidation Strategy ===
Step 1:
  Action (% to liquidate): 0.17651253938674927
  Remaining shares: 1.0

Step 2:
  Action (% to liquidate): 0.18419243395328522
  Remaining shares: 1.0

Step 3:
  Action (% to liquidate): 0.18629863858222961
  Remaining shares: 1.0

Step 4:
  Action (% to liquidate): 0.1816655993461609
  Remaining shares: 1.0

Final remaining shares: 1.0 (Should be all zeros)


In [244]:
# action = 0.5
# obs, _, done, _ = env.step(action)

obs['remaining'] = np.array([59], dtype=np.float32)
action, _ = model.predict(obs, deterministic=True)
action
            

array([0.4336197], dtype=float32)

In [235]:
obs

NameError: name 'obs' is not defined

In [207]:
action

array([0.6569459], dtype=float32)