# Multi-Agent Deep Deterministic Policy Gradient for Stock Market

## Stock Market Environment

- __Hyperparameters__
- __Observation Space__
  - `stock_price`: `ndarray` of shape $[N_{stock}, ]$
  - `correlated_stock`: `ndarray` of shape $[N_{correlated}, ]$
  - `uncorrelated_stock`: `ndarray` of shape $[N_{uncorrelated}, ]$
  - `budgets`: `ndarray` of shape $[N_{agents}, ]$
  - `shares_held`: `ndarray` of shape $[N_{agents}, ]$
  - `agent_views`: `ndarray` of shape $[N_{agents}, N_{stock}]$
  - `company_states`: `ndarray` of shape $[N_{company}, ]$
- __Action Space__
  - dimension_1: log buy/sell prices $\log p\in\left(-\infty, +\infty\right)$ => `gym.spaces.Box`
  - dimension_2: discrete shares $s\in\mathbb{N}$ => `gym.spaces.Discrete`


In [35]:
from typing import Dict, Optional, Sequence, Tuple, Union

import numpy as np
from gym.core import ActType, ObsType, Env
from gym.spaces import Box, MultiDiscrete, Tuple as TupleSpace


class StockMarketEnv(Env):
    
    def __init__(self,
                 num_agents: int,
                 budget_discount: float = 0.9,
                 num_company: int = 5,
                 num_correlated_stocks: int = 19,
                 num_uncorrelated_stocks: int = 10,
                 max_shares: int = 100000,
                 start_prices: Union[float, Sequence[float]] = 100.0,
                 min_budget: float = 100.0,
                 max_budget: float = 10000.0,
                 budget_discount: float = 0.9,
                 step_size: float = 1.0,
                 price_std: float = 100.0,
                 noise_std: float = 10.0,
                 seed: int = 0) -> None:
        super().__init__()

        # Agent Parameters
        self.num_agents = num_agents
        self.num_company = num_company
        self.min_budget = min_budget
        self.max_budget = max_budget
        self.budget_discount = budget_discount
        self.max_shares = max_shares

        # Stock Market Parameters
        self.dt = step_size
        self.start_prices = start_prices
        self.price_std = price_std
        self.noise_std = noise_std

        # Observation and Action spaces
        self.n_correlated_stocks = num_correlated_stocks
        self.n_uncorrelated_stocks = num_uncorrelated_stocks
        self.n_stocks = num_correlated_stocks + num_uncorrelated_stocks + 1
        self.observation_space = Box(low=0.0,
                                     high=float("inf"),
                                     shape=(self.num_agents, self.n_stocks))
        self.action_space = TupleSpace(
            (Box(low=-float("inf"),
                 high=float("inf"),
                 shape=(self.num_agents, self.n_stocks)),
             MultiDiscrete([[max_shares] * self.n_stocks] * self.num_agents))
        )
        self._seed = seed

    def reset(self,
              seed: Optional[int] = None,
              return_info: bool = True) -> Tuple[ObsType, Dict]:
        self.rng = np.random.default_rng(seed=seed or self._seed)        

        correlated_stocks = np.clip(
            np.random.normal(loc=self.start_prices,
                             scale=self.price_std,
                             size=(self.n_correlated_stocks, )),
            a_min=1, a_max=None
        )
        uncorrelated_stocks = np.clip(
            np.random.normal(loc=self.start_prices,
                             scale=self.price_std,
                             size=(self.n_uncorrelated_stocks,)),
            a_min=1, a_max=None
        )
        self.eta = np.clip(
            np.random.normal(loc=1.5, scale=1.5, size=(self.num_agents, )),
            a_min=0, a_max=10
        )
        self.valid_mask = np.zeros(shape=(self.num_agents, self.n_stocks),
                                   dtype="bool")
        self.valid_mask[:, 1:1+self.n_correlated_stocks] = True
        self.valid_mask[self.rng.integers(low=0, high=self.num_agents),
                        1 + self.n_correlated_stocks:] = True

        self.prices = np.asarray(self.start_prices)
        self.budgets = self.min_budget + self.rng.random(
            size=(self.num_agents), dtype="float32") * (
                self.max_budget - self.min_budget)
        self.shares = self.rng.integers(low=1,
                                        high=self.max_shares,
                                        size=(self.num_agents, self.n_stocks))

        return (self.prices,
                {
                    "correlated_stocks": correlated_stocks,
                    "uncorrelated_stocks": uncorrelated_stocks,
                    "budgets": self.budgets,
                    "shares": self.shares,
                    "valid_mask": self.valid_mask,
                    "company_states": None  # TODO: Company states
                })
    
    def is_terminated(self) -> bool:
        return True

    def step(self, action: Tuple[np.ndarray, np.ndarray]) -> Tuple:
        assert (len(action) == 2 and
                action[0].shape == (self.num_agents, self.n_stocks) and
                action[1].shape == (self.num_agents, self.n_stocks))
        # TODO
        proposed_prices = 1. + np.exp(action[0])
        proposed_shares = action[1]

        # Update budgets and shares
        potential_budgets = self.budgets + \
            (proposed_prices * (-proposed_shares)).sum(-1)
        potential_shares = self.shares + proposed_shares
        print("Current budgets: \n", potential_budgets,
              "\nCurrent shares: \n", potential_shares)
        rewards = np.where(
            np.logical_or(potential_budgets < 0.0,
                          np.any(potential_shares < 0.0, axis=-1)),
            -100, 0.0
        )
        print("Rewards", rewards)
        curr_prices = self.prices
        
        # TODO


        # TODO

        dones = self.is_terminated()
        if dones:
            next_s, _ = self.reset()

        return 

    @staticmethod
    def utility(c: float, eta: float) -> float:
        if eta!= 1.0:
            return (c ** (1.0 - eta) - 1.0) / (1.0 - eta)
        else:
            return np.log(c)

In [36]:
env = StockMarketEnv(10)
env.reset()
random_action = env.action_space.sample()

env.step(random_action)

Current budgets: 
 [-27991.42362094 -34178.41267729 -35891.77252471 -37679.80823529
 -28668.28449202 -30585.09359443 -38112.77494562 -39650.0941211
 -34963.18385458 -40977.61369228] 
Current shares: 
 [[1484 1084  874 1722  790  738  632  981 1233  964 1773 1594  170  739
  1737 1070  954  836 1193 1766  763  630 1780  100  614  867  905  656
   694  837]
 [ 901   65  285  760  834  718 1305 1028  896 1513 1300  732 1520 1772
  1731  940  780 1930 1643 1055  899 1620  485 1600 1024 1432 1497  851
   842 1002]
 [ 680  979 1288 1587 1220  662 1828  627 1331 1392  657  706 1110 1577
   886 1109  394  844 1159  640 1636  907  771 1453 1416   84  534  670
  1154 1186]
 [1769  664 1006  804 1715  909  749 1240  650 1330  673 1422 1099 1214
  1009 1681  948  973  696  943  891 1376  603 1633  437 1555 1536 1478
   885 1355]
 [1624 1135  920 1581 1328  710  748 1583  378  787  974  853  995 1090
   681  976 1628  836  930  425 1392 1659 1175  384 1946  692 1394   18
   644 1181]
 [1590 1305  8

(None, None)

---

## MADDPG Trainer

The `MADDPG Trainer` class is a generic version of the `DDPG` trainer initialized with
- A sequence of `DDPG Agent` class objects
- A shared observation buffer.

In [1]:
from __future__ import annotations

from copy import deepcopy
from typing import Any, Optional, Sequence, Tuple, Union

import numpy as np
import torch as th
from numpy import ndarray
from pettingzoo.mpe import simple_adversary_v2
from src.memory.multi_replay_buffer import MultiAgentReplayBuffer
from torch import Tensor, nn, optim
from torch.nn import functional as F

device = th.device('cuda:0')

In [2]:
# Model
class PolicyNet(nn.Module):

    def __init__(self,
                 in_features: int,
                 action_size: int,
                 num_hidden_1: int = 400,
                 num_hidden_2: int = 300,
                 negative_slope: float = 0.01) -> None:
        super().__init__()

        self.linear_1 = nn.Linear(in_features, num_hidden_1)
        self.linear_2 = nn.Linear(num_hidden_1, num_hidden_2)
        self.linear_3 = nn.Linear(num_hidden_2, action_size)
        self.neg_slope = negative_slope

        self.reset_parameters()
    
    def forward(self, obs: Tensor) -> Tensor:
        obs = obs.float()
        obs = F.leaky_relu(self.linear_1(obs), self.neg_slope)
        obs = F.leaky_relu(self.linear_2(obs), self.neg_slope)
        acs = th.tanh(self.linear_3(obs))
        
        return acs
    
    def reset_parameters(self) -> None:
        gain_lrelu = nn.init.calculate_gain('leaky_relu')
        gain_tanh = nn.init.calculate_gain('tanh')
        nn.init.xavier_uniform_(self.linear_1.weight, gain=gain_lrelu)
        nn.init.xavier_uniform_(self.linear_2.weight, gain=gain_lrelu)
        nn.init.xavier_uniform_(self.linear_3.weight, gain=gain_tanh)


class CriticNet(nn.Module):
    
    def __init__(self,
                 obs_in_features: int,
                 acs_in_features: int,
                 num_hidden_1: int = 400,
                 num_hidden_2: int = 300,
                 negative_slope: float = 0.01) -> None:
        super().__init__()

        self.linear_1 = nn.Linear(obs_in_features, num_hidden_1)
        self.linear_2 = nn.Linear(num_hidden_1 + acs_in_features, num_hidden_2)
        self.linear_3 = nn.Linear(num_hidden_2, 1)
        self.neg_slope = negative_slope

    def forward(self, obs: Tensor, acs: Tensor) -> Tensor:
        obs = obs.float()
        acs = acs.float()

        obs = F.leaky_relu(self.linear_1(obs), self.neg_slope)
        q_val = F.leaky_relu(self.linear_2(th.cat([obs, acs], -1)),
                             self.neg_slope)
        q_val = self.linear_3(q_val)

        return q_val
    
    def reset_parameters(self) -> None:
        gain = nn.init.calculate_gain('leaky_relu')
        nn.init.xavier_uniform_(self.linear_1.weight, gain)
        nn.init.xavier_uniform_(self.linear_2.weight, gain)
        nn.init.xavier_uniform_(self.linear_3.weight, gain)

In [3]:
env = simple_adversary_v2.parallel_env(max_cycles=25)
env.reset()
# Initialize agents
def hard_update(src: nn.Module,
                tar: nn.Module,
                non_blocking: bool = True) -> None:
    with th.no_grad():
        for param, tar_param in zip(src.parameters(), tar.parameters()):
            param.data.copy_(tar_param, non_blocking)

def soft_update(src: nn.Module,
                tar: nn.Module,
                tau: float = 0.001,
                non_blocking: bool = True) -> None:
    with th.no_grad():
        for param, tar_param in zip(src.parameters(), tar.parameters()):
            param.data.copy_(param * tau + tar_param * (1 - tau))

policy_nets, critic_nets = {}, {} 
policy_tar_nets, critic_tar_nets = {}, {}
policy_opts, critic_opts = {}, {}
global_obs_size, global_acs_size = 0, 0
for agent in env.agents:
    if len(env.observation_space(agent).shape) > 2:
        raise RuntimeError('Image inputs not supported')
    global_obs_size += env.observation_space(agent).shape[0]
    global_acs_size += env.action_space(agent).n

for agent in env.agents:
    policy_nets[agent] = PolicyNet(env.observation_space(agent).shape[0],
                                   env.action_space(agent).n).to(device)
    policy_tar_nets[agent] = PolicyNet(env.observation_space(agent).shape[0],
                                       env.action_space(agent).n).to(device)
    hard_update(policy_tar_nets[agent], policy_nets[agent])
    critic_nets[agent] = \
        CriticNet(global_obs_size, global_acs_size).to(device)
    critic_tar_nets[agent] = \
        CriticNet(global_obs_size, global_acs_size).to(device)
    hard_update(critic_tar_nets[agent], critic_nets[agent])
    policy_opts[agent] = optim.Adam(policy_nets[agent].parameters(), lr=1e-4)
    critic_opts[agent] = \
        optim.Adam(critic_nets[agent].parameters(), lr=1e-3, weight_decay=1e-2)

buffer = MultiAgentReplayBuffer(env.agents)

In [4]:
# Hyperparameters
batch_size: int = 64
discount: float = 0.99
max_episode_step: int = 500
num_episodes: int = 2000
num_warm_up: int = 400

In [13]:
# Train Loop
# =========================================
n_agents = len(env.agents)
agent_rews = np.empty(shape=(num_episodes, n_agents), dtype='float32')
episode_rews = np.empty(shape=(num_episodes, 1), dtype='float32')
env_step: int = 0

def to_one_hot(data: int, num_classes: int = -1) -> np.ndarray:
    if num_classes == -1:
        num_classes = int(max(data) + 1)

    if isinstance(data, int):
        output = np.zeros(shape=(num_classes,))
        output[data] = 1
    else:
        output = data

    return output

    
for episode in range(num_episodes):
    ob_n = env.reset()
    while env.agents:
        env_step += 1
        if env_step <= num_warm_up:
            actions = {_a: env.action_space(_a).sample() for _a in env.agents}
            ac_n = {_a: to_one_hot(ac, env.action_space(_a).n)
                    for _a, ac in actions.items()}
        else:
            actions, ac_n = {}, {}
            for agent, ob in ob_n.items():
                ob = th.from_numpy(ob).view(1, -1).float().to(device)
                ac = F.gumbel_softmax(policy_nets[agent].forward(ob))
                actions[agent] = ac.view(-1).argmax().item()
                ac_n[agent] = ac.detach().cpu().numpy()
            
        next_ob_n, rew_n, done_n, _, _ = env.step(actions)
        buffer.add_transition(ob_n, ac_n, next_ob_n, rew_n, done_n)

        # Learn
        if len(buffer) > batch_size:
            for _id in env.agents:
                obs_n, acs_n, next_obs_n, rew_n, dones_n = \
                    buffer.sample(batch_size, random=True, device=device) 

                # Centralized observation
                states = th.hstack(list(obs_n.values()))
                next_states = th.hstack(list(next_obs_n.values()))
                actions = th.hstack(list(acs_n.values()))
                next_actions = th.hstack(
                    [F.gumbel_softmax(
                        policy_tar_nets[_id].forward(next_obs_n[_id]),
                        hard=True
                    ).detach()
                            for _id in env.agents]
                )

            
                ob, ac, next_ob, rew, done = (
                    obs_n[_id],
                    acs_n[_id],
                    next_ob_n[_id],
                    rew_n[_id],
                    dones_n[_id]
                )                

                # Update critic network
                rew = rew.view(-1, 1)
                done = done.view(-1, 1).to(device)
                actions = th.hstack(list(acs_n.values())).to(device)
                q_val = critic_nets[_id].forward(states, actions)
                tar_q_val = rew + discount * (1 - done) * \
                    critic_tar_nets[_id](next_states, next_actions)
                critic_opts[_id].zero_grad()
                closs = F.mse_loss(q_val, tar_q_val.detach(), reduction='mean')
                closs.backward()
                critic_opts[_id].step()

                # Update policy network
                ob = ob.to(device)
                new_logits = policy_nets[_id].forward(ob)
                new_action = F.gumbel_softmax(new_logits, hard=True)
                acs_n[_id] = new_action
                policy_opts[_id].zero_grad()
                aloss = -critic_nets[_id].forward(
                    states, th.hstack(list(acs_n.values())))
                policy_opts[_id].step()


KeyboardInterrupt: 

In [6]:
logits = policy_nets['agent_0'].forward(th.from_numpy(ob).to(device))
F.gumbel_softmax(logits, hard=True)

NameError: name 'ob' is not defined

In [3]:
th.cuda.empty_cache()

---

# Stock Market Evaluator

In [1]:
from src.environment.stock_market import (
    LogarithmAndIntActionWrapper, StockMarketEnv)

In [2]:
env = LogarithmAndIntActionWrapper(
    StockMarketEnv(num_agents=5))
env.reset()

{'agent_0': array([  0.        ,  98.11713208,  46.73370522,   1.        ,
         82.78770243, 153.09555082, 211.81191222,  74.52338782,
          1.        , 166.1772597 ,  29.72760639,  31.73528957,
        168.5467816 , 124.80253407, 117.71429121, 187.30074633,
        131.34482528,  64.78192649, 174.03601525, 337.67939808,
        110.22890678, 155.31159852,  10.07443286, 171.23993985,
         90.62127185,   1.        , 150.7712587 , 169.39050963,
        263.788417  , 207.89874706]),
 'agent_1': array([  0.        ,  98.11713208,  46.73370522,   1.        ,
         82.78770243, 153.09555082, 211.81191222,  74.52338782,
          1.        , 166.1772597 ,  29.72760639,  31.73528957,
        168.5467816 , 124.80253407, 117.71429121, 187.30074633,
        131.34482528,  64.78192649, 174.03601525, 337.67939808,
        110.22890678, 155.31159852,  10.07443286, 171.23993985,
         90.62127185,   1.        , 150.7712587 , 169.39050963,
        263.788417  , 207.89874706]),
 'agen