# Multi-Agent Deep Deterministic Policy Gradient for Stock Market

## Stock Market Environment

- __Hyperparameters__
- __Observation Space__
  - `stock_price`: `ndarray` of shape $[N_{stock}, ]$
  - `correlated_stock`: `ndarray` of shape $[N_{correlated}, ]$
  - `uncorrelated_stock`: `ndarray` of shape $[N_{uncorrelated}, ]$
  - `budgets`: `ndarray` of shape $[N_{agents}, ]$
  - `shares_held`: `ndarray` of shape $[N_{agents}, ]$
  - `agent_views`: `ndarray` of shape $[N_{agents}, N_{stock}]$
  - `company_states`: `ndarray` of shape $[N_{company}, ]$
- __Action Space__
  - dimension_1: log buy/sell prices $\log p\in\left(-\infty, +\infty\right)$ => `gym.spaces.Box`
  - dimension_2: discrete shares $s\in\mathbb{N}$ => `gym.spaces.Discrete`


In [35]:
from typing import Dict, Optional, Sequence, Tuple, Union

import numpy as np
from gym.core import ActType, ObsType, Env
from gym.spaces import Box, MultiDiscrete, Tuple as TupleSpace


class StockMarketEnv(Env):
    
    def __init__(self,
                 num_agents: int,
                 budge_discount: float = 0.9,
                 num_company: int = 5,
                 num_correlated_stocks: int = 19,
                 num_uncorrelated_stocks: int = 10,
                 max_shares: int = 100000,
                 start_prices: Union[float, Sequence[float]] = 100.0,
                 min_budget: float = 100.0,
                 max_budget: float = 10000.0,
                 budget_discount: float = 0.9,
                 step_size: float = 1.0,
                 price_std: float = 100.0,
                 noise_std: float = 10.0,
                 seed: int = 0) -> None:
        super().__init__()

        # Agent Parameters
        self.num_agents = num_agents
        self.num_company = num_company
        self.min_budget = min_budget
        self.max_budget = max_budget
        self.budget_discount = budge_discount
        self.max_shares = max_shares

        # Stock Market Parameters
        self.dt = step_size
        self.start_prices = start_prices
        self.price_std = price_std
        self.noise_std = noise_std

        # Observation and Action spaces
        self.n_correlated_stocks = num_correlated_stocks
        self.n_uncorrelated_stocks = num_uncorrelated_stocks
        self.n_stocks = num_correlated_stocks + num_uncorrelated_stocks + 1
        self.observation_space = Box(low=0.0,
                                     high=float("inf"),
                                     shape=(self.num_agents, self.n_stocks))
        self.action_space = TupleSpace(
            (Box(low=-float("inf"),
                 high=float("inf"),
                 shape=(self.num_agents, self.n_stocks)),
             MultiDiscrete([[max_shares] * self.n_stocks] * self.num_agents))
        )
        self._seed = seed

    def reset(self,
              seed: Optional[int] = None,
              return_info: bool = True) -> Tuple[ObsType, Dict]:
        self.rng = np.random.default_rng(seed=seed or self._seed)        

        correlated_stocks = np.clip(
            np.random.normal(loc=self.start_prices,
                             scale=self.price_std,
                             size=(self.n_correlated_stocks, )),
            a_min=1, a_max=None
        )
        uncorrelated_stocks = np.clip(
            np.random.normal(loc=self.start_prices,
                             scale=self.price_std,
                             size=(self.n_uncorrelated_stocks,)),
            a_min=1, a_max=None
        )
        self.eta = np.clip(
            np.random.normal(loc=1.5, scale=1.5, size=(self.num_agents, )),
            a_min=0, a_max=10
        )
        self.valid_mask = np.zeros(shape=(self.num_agents, self.n_stocks),
                                   dtype="bool")
        self.valid_mask[:, 1:1+self.n_correlated_stocks] = True
        self.valid_mask[self.rng.integers(low=0, high=self.num_agents),
                        1 + self.n_correlated_stocks:] = True

        self.prices = np.asarray(self.start_prices)
        self.budgets = self.min_budget + self.rng.random(
            size=(self.num_agents), dtype="float32") * (
                self.max_budget - self.min_budget)
        self.shares = self.rng.integers(low=1,
                                        high=self.max_shares,
                                        size=(self.num_agents, self.n_stocks))

        return (self.prices,
                {
                    "correlated_stocks": correlated_stocks,
                    "uncorrelated_stocks": uncorrelated_stocks,
                    "budgets": self.budgets,
                    "shares": self.shares,
                    "valid_mask": self.valid_mask,
                    "company_states": None  # TODO: Company states
                })
    
    def is_terminated(self) -> bool:
        return True

    def step(self, action: Tuple[np.ndarray, np.ndarray]) -> Tuple:
        assert (len(action) == 2 and
                action[0].shape == (self.num_agents, self.n_stocks) and
                action[1].shape == (self.num_agents, self.n_stocks))
        # TODO
        proposed_prices = 1. + np.exp(action[0])
        proposed_shares = action[1]

        # Update budgets and shares
        potential_budgets = self.budgets + \
            (proposed_prices * (-proposed_shares)).sum(-1)
        potential_shares = self.shares + proposed_shares
        print("Current budgets: \n", potential_budgets,
              "\nCurrent shares: \n", potential_shares)
        rewards = np.where(
            np.logical_or(potential_budgets < 0.0,
                          np.any(potential_shares < 0.0, axis=-1)),
            -100, 0.0
        )
        print("Rewards", rewards)
        curr_prices = self.prices
        
        # TODO


        # TODO

        dones = self.is_terminated()
        if dones:
            next_s, _ = self.reset()

        return 

    @staticmethod
    def utility(c: float, eta: float) -> float:
        if eta!= 1.0:
            return (c ** (1.0 - eta) - 1.0) / (1.0 - eta)
        else:
            return np.log(c)

In [36]:
env = StockMarketEnv(10)
env.reset()
random_action = env.action_space.sample()

env.step(random_action)

Current budgets: 
 [-27991.42362094 -34178.41267729 -35891.77252471 -37679.80823529
 -28668.28449202 -30585.09359443 -38112.77494562 -39650.0941211
 -34963.18385458 -40977.61369228] 
Current shares: 
 [[1484 1084  874 1722  790  738  632  981 1233  964 1773 1594  170  739
  1737 1070  954  836 1193 1766  763  630 1780  100  614  867  905  656
   694  837]
 [ 901   65  285  760  834  718 1305 1028  896 1513 1300  732 1520 1772
  1731  940  780 1930 1643 1055  899 1620  485 1600 1024 1432 1497  851
   842 1002]
 [ 680  979 1288 1587 1220  662 1828  627 1331 1392  657  706 1110 1577
   886 1109  394  844 1159  640 1636  907  771 1453 1416   84  534  670
  1154 1186]
 [1769  664 1006  804 1715  909  749 1240  650 1330  673 1422 1099 1214
  1009 1681  948  973  696  943  891 1376  603 1633  437 1555 1536 1478
   885 1355]
 [1624 1135  920 1581 1328  710  748 1583  378  787  974  853  995 1090
   681  976 1628  836  930  425 1392 1659 1175  384 1946  692 1394   18
   644 1181]
 [1590 1305  8

(None, None)

---

## MADDPG Trainer

The `MADDPG Trainer` class is a generic version of the `DDPG` trainer initialized with
- A sequence of `DDPG Agent` class objects
- A shared observation buffer.

### DDPG Implementation

In [8]:
import gym
import numpy as np
import torch as th
from gym.core import Env
from gym.spaces import Discrete
from src.agent.ddpg_agent import DDPGAgent
from src.critic.ddpg_critic import DDPGCritic
from src.memory.base_buffer import Path
from src.memory.replay_buffer import ReplayBuffer
from src.policy.ddpg_policy import DDPGPolicy

In [2]:
env = gym.make("LunarLander-v2", new_step_api=True)
print(env.observation_space.shape, env.action_space.n)

(8,) 4


In [3]:
if isinstance(env.action_space, Discrete):
    action_size = 1
else:
    action_size = env.action_space.shape

In [4]:
agent = DDPGAgent(
    observation_size=env.observation_space.shape[0],
    action_size=action_size,
    policy_net="MLP",
    policy_net_kwargs={"hidden_size": 64, "num_layers": 2},
    critic_net="MLP",
    critic_net_kwargs={"hidden_size": 64, "num_layers": 2}
)

In [5]:
replay_buffer = ReplayBuffer(max_size=1000)
while len(replay_buffer) < 1000:
    obs, acs, next_obs, rews, dones = [], [], [], [], []
    ob = env.reset()
    done = False
    while not done:
        ac = env.action_space.sample()
        next_ob, rew, done, _, _ = env.step(ac)
        
        obs.append(ob)
        acs.append(ac)
        next_obs.append(next_ob)
        rews.append(rew)
        dones.append(done)
    
    path = Path(observation=np.asarray(obs, dtype="float32"),
                action=np.asarray(acs, dtype="float32"),
                next_observation=np.asarray(next_obs, dtype="float32"),
                reward=np.asarray(rews, dtype="float32"),
                done=np.asarray(dones, dtype="int64"))
    replay_buffer.add([path], noised=False)
        

In [6]:
# Main Training Loop
obs, acs, next_obs, rews, dones = replay_buffer.sample(64)
agent.train_one_step(obs, acs, next_obs, rews, dones)

torch.Size([244, 1]) torch.Size([244, 244])


  return F.mse_loss(input, target, reduction=self.reduction)


In [7]:
obs = th.from_numpy(obs)
acs = th.from_numpy(acs)
next_obs = th.from_numpy(next_obs)
rews = th.from_numpy(rews)
dones = th.from_numpy(dones)

next_acs = agent.policy.get_action(next_obs)

(244,)

### Validation

For validation of the reproducing results, we run the trainer on the [Multi-Agent Partical Environment](https://github.com/openai/multiagent-particle-envs)

In [24]:
from pettingzoo.mpe import simple_adversary_v2

In [25]:
env = simple_adversary_v2.env(N=2, max_cycles=25, continuous_actions=False)
env.reset(seed=42)
env.observation_space('adversary_0')

Box(-inf, inf, (8,), float32)

In [28]:
env.action_space('adversary_0').sample()

4

In [30]:
env.observe(env.agents[0])

array([ 0.69437176, -1.4609686 ,  0.023015  , -0.97559977,  0.51697916,
       -1.5288411 ,  1.0734879 , -0.19491644], dtype=float32)