In [None]:
import gym
from gym import spaces
import numpy as np

class LiquidationEnv(gym.Env):
    """Gym environment for liquidating a multi-asset position over a fixed horizon.

    At every step the agent chooses how many shares of each asset to sell.
    The sale is executed at the current prices, the proceeds are paid out as
    the step reward, and prices then move according to a multiplicative
    log-normal random walk.

    Observation (``spaces.Dict``, all float32):
        prices      -- current price per asset, shape ``(n_assets,)``
        remaining   -- shares still held per asset, shape ``(n_assets,)``
        acc_revenue -- revenue accumulated so far in the episode, shape ``(1,)``

    Action (``spaces.Box``, float32, shape ``(n_assets,)``):
        shares to sell per asset; values are clipped to ``[0, remaining]``.

    The episode ends when no shares remain or after ``max_steps`` steps.

    Args:
        n_assets: number of assets in the position.
        initial_shares: shares held of every asset at the start of an episode.
        price_std: standard deviation of the per-step log-price shock.
        max_steps: maximum number of steps per episode.
    """

    metadata = {'render.modes': ['human']}

    def __init__(self, n_assets=3, initial_shares=100, price_std=0.1, max_steps=100):
        super(LiquidationEnv, self).__init__()

        # Environment parameters
        self.n_assets = n_assets
        self.initial_shares = np.full(n_assets, initial_shares, dtype=np.float32)
        self.price_std = price_std
        self.max_steps = max_steps

        # Source of randomness. Defaults to NumPy's global generator so that
        # np.random.seed(...) keeps working; seed() swaps in a private one.
        self._rng = np.random

        # Define action and observation spaces
        self.action_space = spaces.Box(
            low=0,
            high=initial_shares,
            shape=(n_assets,),
            dtype=np.float32
        )

        self.observation_space = spaces.Dict({
            "prices": spaces.Box(low=0, high=np.inf, shape=(n_assets,), dtype=np.float32),
            "remaining": spaces.Box(low=0, high=initial_shares, shape=(n_assets,), dtype=np.float32),
            "acc_revenue": spaces.Box(low=-np.inf, high=np.inf, shape=(1,), dtype=np.float32)
        })

        # Initialize state
        self.state = None
        self.current_step = 0
        self.reset()

    def seed(self, seed=None):
        """Give this environment its own reproducible random stream.

        Args:
            seed: value accepted by ``np.random.RandomState``; ``None`` draws
                fresh entropy.

        Returns:
            A one-element list containing ``seed``.
        """
        self._rng = np.random.RandomState(seed)
        return [seed]

    def _get_obs(self):
        """Return a float32 snapshot of the state matching ``observation_space``."""
        return {
            # astype() always copies, so callers cannot mutate internal state.
            "prices": self.state['prices'].astype(np.float32),
            "remaining": self.state['remaining'].astype(np.float32),
            "acc_revenue": np.array([self.state['acc_revenue']], dtype=np.float32)
        }

    def _next_price(self):
        """Advance prices one step with a multiplicative log-normal shock.

        Simple random walk price model (customize this with your own price model).
        """
        shocks = self._rng.normal(0, self.price_std, self.n_assets)
        # The shocks are float64; cast back so prices stay float32 as the
        # observation space declares.
        return (self.state['prices'] * np.exp(shocks)).astype(np.float32)

    def reset(self):
        """Start a new episode and return the initial observation.

        Initial prices are |N(100, 10)| per asset (customize with your own
        price initialization); holdings are restored to ``initial_shares``.
        """
        self.state = {
            'prices': np.abs(self._rng.normal(100, 10, self.n_assets)).astype(np.float32),
            'remaining': self.initial_shares.copy(),
            'acc_revenue': 0.0
        }
        self.current_step = 0
        return self._get_obs()

    def step(self, action):
        """Sell ``action`` shares of each asset at the current prices.

        Args:
            action: array-like of shares to sell per asset. Values outside
                ``[0, remaining]`` are clipped.

        Returns:
            ``(observation, reward, done, info)`` where ``reward`` is the
            revenue of this step as a Python float, ``done`` is a Python bool
            and ``info`` is an empty dict.
        """
        # Clip actions to valid range: cannot sell a negative amount or more
        # than is still held.
        action = np.clip(np.asarray(action, dtype=np.float32), 0, self.state['remaining'])

        # Revenue from selling at current prices; accumulate in float64 and
        # return a plain Python float.
        step_revenue = float(np.sum(action * self.state['prices'], dtype=np.float64))

        # Update state
        self.state['remaining'] -= action
        self.state['acc_revenue'] += step_revenue

        # Update prices for next step
        self.state['prices'] = self._next_price()

        # Update step counter
        self.current_step += 1

        # Episode ends when everything is sold or the horizon is reached.
        done = bool(
            np.sum(self.state['remaining']) <= 0
            or self.current_step >= self.max_steps
        )

        # Reward is the immediate revenue gained
        reward = step_revenue

        return self._get_obs(), reward, done, {}

    def render(self, mode='human'):
        """Print the current step, prices, holdings and accumulated revenue."""
        print(f"Step: {self.current_step}")
        print(f"Prices: {self.state['prices']}")
        print(f"Remaining: {self.state['remaining']}")
        print(f"Accumulated Revenue: {self.state['acc_revenue']:.2f}\n")

    def close(self):
        """Nothing to release; present for Gym API completeness."""
        pass

In [None]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env

# Instantiate the liquidation environment: three assets, 100 shares of each.
env = LiquidationEnv(initial_shares=100, n_assets=3)

# Run Stable-Baselines3's environment checker before training on it.
check_env(env)

# Build a PPO agent with the multi-input policy (the observation space is a
# Dict) and train it for 10,000 timesteps.
model = PPO(policy="MultiInputPolicy", env=env, verbose=1)
model.learn(total_timesteps=10000)

# Roll the trained policy forward for at most 100 steps, stopping early as
# soon as the environment reports the episode as finished.
obs = env.reset()
for _ in range(100):
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    if done:
        break