#### Version History
* v1: Just testing a few iterations over the 500,000 sample dataset
* v2: Built custom feature extractor and 2,000,000 timesteps (Timed out)
* v3: Decreasing timesteps to 1,800,000 (Timed out)
* v4: Decreasing timesteps to 1,000,000 (Timed out)
* v5: Decreasing timesteps to   500,000 (Timed out)
* v6: Keeping timesteps at 500,000, now running on GPU

#### TODO:
1. [x] Build custom feature extractor
2. [ ] Build LR Scheduler
3. [ ] Try out different net_archs

#### Links
* [github](https://github.com/DLR-RM/stable-baselines3)
* [readthedocs.io](https://stable-baselines3.readthedocs.io/en/master/)
* [readthedocs.io (pdf)](https://stable-baselines3.readthedocs.io/_/downloads/en/master/pdf/)
* [openai docs](https://openai.com/blog/openai-baselines-ppo/)
* [detailed snake example](https://pythonprogramming.net/custom-environment-reinforcement-learning-stable-baselines-3-tutorial/)

In [None]:
!pip install -q stable_baselines3

In [None]:
import pandas as pd
import numpy as np

import math

import gym
from gym import spaces

from stable_baselines3 import DQN

import matplotlib.pyplot as plt

from collections import deque

from typing import Any, Dict, List, Optional, Type

import gym
import torch as th
from torch import nn

from stable_baselines3.common.policies import BasePolicy
from stable_baselines3.common.torch_layers import (
    BaseFeaturesExtractor,
    CombinedExtractor,
    FlattenExtractor,
    NatureCNN,
    create_mlp,
)

from stable_baselines3.common.type_aliases import Schedule

In [None]:
class CONFIG():
    """Hyper-parameters and trading constants shared by the notebook cells.

    Every value is a plain class attribute, read as ``CONFIG.NAME``.
    """
    # Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
    DEVICE = th.device("cuda") if th.cuda.is_available() else th.device("cpu")
    BS = 512             # batch size
    N_ACTIONS = 3        # size of the discrete action space
    EPOCHS = 20
    CASH = 25_000        # starting cash, in USD
    LR = 1e-3            # learning rate
    GAMMA = 0.99         # discount factor
    EPS = 1.
    EPS_DEC = .9999
    EPS_MIN = 1e-2
    MEM_SIZE = 1000
    REPLACE_CNT = 1000
    N_FEATS = 76         # width of one observation row
    N_SHARES = 100       # shares moved by a single buy or sell
    MIN_SHARES = 0       # selling is refused at or below this holding
    MAX_SHARES = 300     # buying is refused at or above this holding
    CASH_BUFFER = 500    # cash (USD) that must remain after a buy
    HIDDEN_SZ = 200
    NLAYERS = 2          # number of stacked GRU layers
    # This class used to assign SEQ_LEN twice (15, then 1); the later
    # assignment silently won, so 1 is the value that was always in effect.
    SEQ_LEN = 1

In [None]:
class MartketFeatures(BaseFeaturesExtractor):
    """
    Features extractor that embeds the categorical columns, concatenates them
    with the continuous columns and runs the result through a GRU followed by
    a Linear + ReLU layer.

    The first ``N_CAT_FEATS`` columns of an observation are treated as integer
    category codes; the remaining columns are treated as continuous values.

    :param observation_space: (gym.Space)
    :param features_dim: (int) Number of features extracted.
        This corresponds to the number of unit for the last layer.
    """
    # Number of leading observation columns that hold category codes.
    N_CAT_FEATS = 15

    def __init__(self, observation_space: gym.spaces.Box, features_dim: int = 256):
        super(MartketFeatures, self).__init__(observation_space, features_dim)
        # EMB_DIMS is a module-level list of (num_categories, embedding_width).
        emb_dim = sum(width for _, width in EMB_DIMS)
        # Each categorical column is replaced by its embedding vector.
        input_dim = observation_space.shape[1] + emb_dim - self.N_CAT_FEATS
        self.emb_layers = nn.ModuleList([nn.Embedding(i, j) for i, j in EMB_DIMS])

        self.gru = nn.GRU(input_dim, features_dim, CONFIG.NLAYERS, batch_first=True)
        # Hidden state carried from one forward() call to the next.
        # NOTE(review): forward() feeds the GRU a batch of size 1, so this
        # shape appears to rely on CONFIG.SEQ_LEN being 1 -- confirm before
        # changing SEQ_LEN.
        self.hn = th.zeros((CONFIG.NLAYERS, CONFIG.SEQ_LEN, features_dim), device=CONFIG.DEVICE)

        self.linear = nn.Sequential(nn.Linear(features_dim, features_dim), nn.ReLU())

    def forward(self, observations: th.Tensor) -> th.Tensor:
        """
        Turn a batch of observations into ``features_dim``-wide feature rows.

        :param observations: rank-3 tensor; assumed to be
            (batch, 1, n_features) -- TODO confirm against the environment.
        :return: output of the final Linear + ReLU layer, one row per
            observation in the batch.
        """
        n_cat = self.N_CAT_FEATS
        cats = observations[:, :, :n_cat].long()
        conts = observations[:, :, n_cat:].squeeze(1)

        # One embedding lookup per categorical column, in column order.
        emb = [layer(cats[:, :, i]).squeeze(1) for i, layer in enumerate(self.emb_layers)]
        emb = th.cat(emb, 1)

        # unsqueeze(0) with batch_first=True hands the whole batch to the GRU
        # as a single sequence.
        gru_inp = th.cat([emb, conts], 1).float().unsqueeze(0)
        # detach() keeps gradients from flowing into earlier calls.
        output, self.hn = self.gru(gru_inp, self.hn.detach())

        return self.linear(output.squeeze(0))

In [None]:
class MarketEnv(gym.Env):
    """
    Single-asset trading environment driven by a historical data frame.

    Each step consumes one row of ``df`` (the observation) and one entry of
    ``closing_prices`` (the price used for trades). Actions are
    ``0 = Buy``, ``1 = Sell`` and anything else = Hold.

    :param df: data frame whose rows are served as observations, in order.
    :param closing_prices: series of closing prices, iterated in step with
        ``df``.
    """
    metadata = {'render.modes': ['human']}

    def __init__(self, df, closing_prices):
        super(MarketEnv, self).__init__()
        self.action_space = spaces.Discrete(CONFIG.N_ACTIONS)  # {Buy: 0, Sell: 1, Hold: 2}
        self.observation_space = spaces.Box(low=0, high=1, shape=((1, CONFIG.N_FEATS)))
        # Use the constructor argument; reading a same-named notebook global
        # here would tie the environment to whatever happens to be defined.
        self.closing_series = closing_prices
        self.historicals = df

        self.orders = deque(maxlen=60)  # remembers the last x trades
        self.plot_init = False

        self._restart()

    def _restart(self):
        """Rewind both data iterators and restore the starting portfolio."""
        # Series.items() replaces Series.iteritems(), removed in pandas 2.0.
        self.closing_iter = self.closing_series.items()
        _, self.close = next(self.closing_iter)
        self.historicals_iter = self.historicals.itertuples(index=False)
        self.observation = th.tensor(next(self.historicals_iter)).unsqueeze(0)

        # Per-episode state: without clearing these, the trade history and
        # previous close of a finished episode leak into the next one.
        self.orders.clear()
        self.cash = CONFIG.CASH
        self.shares_owned = 0
        self.prev_close = self.close
        # Defined up front so the attribute exists before the first step().
        self.value = self.cash

    @staticmethod
    def _reward_calc(x):
        """Squash ``x`` into (0, 1) with a logistic curve scaled by 100."""
        return 1 / (1 + math.exp(-x / 100))

    def step(self, action):
        """
        Apply ``action`` at the current close, then advance one row.

        :param action: 0 to buy, 1 to sell, any other value to hold.
        :return: ``(observation, reward, done, info)``. ``done`` is True when
            a buy cannot be afforded or when the data is exhausted; in the
            latter case the last observation is returned again.
        """
        reward = 0
        done = False
        info = {}

        # Account value before this step's trade is applied.
        self.value = self.cash + self.shares_owned * self.close

        if action == 0:  # Buy
            if self.shares_owned < CONFIG.MAX_SHARES:
                cost = self.close * CONFIG.N_SHARES
                if cost + CONFIG.CASH_BUFFER < self.cash:
                    self.cash -= cost
                    self.shares_owned += CONFIG.N_SHARES
                    # Buys are logged as negative prices, sells as positive.
                    self.orders.append(-1 * self.close)
                    reward = self._reward_calc(sum(self.orders))
                else:
                    # Not enough cash: end the episode with a penalty.
                    done = True
                    reward = -1
            else:
                reward = -.2
        elif action == 1:  # Sell
            if self.shares_owned > CONFIG.MIN_SHARES:
                self.cash += CONFIG.N_SHARES * self.close
                self.shares_owned -= CONFIG.N_SHARES
                self.orders.append(self.close)
                reward = self._reward_calc(sum(self.orders))
            else:
                reward = -1
        else:  # Hold
            reward = -.1

        self.prev_close = self.close

        try:
            _, self.close = next(self.closing_iter)
            self.observation = th.tensor(next(self.historicals_iter)).unsqueeze(0)
        except StopIteration:
            # Out of data: keep the last close/observation and finish.
            done = True

        return self.observation, reward, done, info

    def reset(self):
        """
        Start a new episode from the first row.

        :return: the first observation.
        """
        self._restart()
        return self.observation

    def render(self, assets, baseline):
        """
        Plot account value against a buy-and-hold baseline.

        :param assets: sequence of account values, one per step.
        :param baseline: sequence of prices; each is multiplied by
            ``CONFIG.MAX_SHARES`` before plotting.
        """
        baseline = [x * CONFIG.MAX_SHARES for x in baseline]
        plt.figure(figsize=(35, 10))
        plt.plot(np.linspace(0, len(assets), len(assets)), assets, color='red', label='Test')
        plt.plot(np.linspace(0, len(baseline), len(baseline)), baseline, color='gray', label='Baseline')
        plt.title('Trained Models vs Baseline Buy and Hold')
        plt.xlabel('Minutes')
        plt.xlim(xmin=-3)
        plt.ylabel('Account Value')
        plt.legend(bbox_to_anchor=(1.0, 1.03), loc='upper left')
        plt.show()

In [None]:
from tqdm.notebook import tqdm
from stable_baselines3.common.callbacks import BaseCallback

class ProgbarCallback(BaseCallback):
    """Callback that shows training progress in a tqdm notebook bar."""

    def __init__(self):
        super(ProgbarCallback, self).__init__()
        # Created when training starts, dropped again when it ends.
        self.progress_bar = None

    def _on_training_start(self) -> None:
        """Open a bar sized to the run's ``total_timesteps``."""
        total = self.locals['total_timesteps']
        self.progress_bar = tqdm(total=total)

    def _on_step(self) -> bool:
        """Advance the bar by one and return True."""
        self.progress_bar.update(1)
        return True

    def _on_training_end(self) -> None:
        """Close the bar and release the reference to it."""
        self.progress_bar.close()
        self.progress_bar = None

In [None]:
def categorify(df, split=None):
    """
    Re-code every column of ``df`` as consecutive integer category codes.

    Each column is cast to ``int``; its distinct values are sorted and
    replaced by their rank (0 for the smallest value, 1 for the next, ...).
    The result is returned with ``category`` dtype. The caller's frame is
    not modified.

    :param df: data frame whose columns are all to be encoded.
    :param split: unused; accepted only so existing calls keep working.
    :return: a new data frame of the same shape holding the codes.
    """
    df = df.astype('int')

    for col in df.columns:
        # Build the complete value -> code table first and apply it in a
        # single pass. Replacing one value at a time can collide when a code
        # equals a not-yet-processed value (e.g. [-1, 0] would end up all 1).
        lookup = {val: idx for idx, val in enumerate(sorted(df[col].unique().tolist()))}
        df[col] = df[col].map(lookup)

    return df.astype('category')

In [None]:
# Number of leading columns that hold categorical features.
N_CAT_COLS = 15

# NOTE: read_pickle can execute arbitrary code while loading; only point this
# at a file you trust.
df = pd.read_pickle("../input/historicals/first_500000.pkl").iloc[:, :-4]
df.iloc[:, :N_CAT_COLS] = categorify(df.iloc[:, :N_CAT_COLS], N_CAT_COLS)

# One (num_categories, embedding_width) pair per categorical column; the
# width is half the cardinality (rounded up), capped at 50.
cat_nuniq = [df[col].nunique() for col in df.iloc[:, :N_CAT_COLS]]
EMB_DIMS = [(x, min(50, (x + 1) // 2)) for x in cat_nuniq]

# Prices are taken before normalisation so trades use the stored values.
closing_price = df.loc[:, 'close_spy']
cats = df.iloc[:, :N_CAT_COLS]
conts = df.iloc[:, N_CAT_COLS:]

# Z-score the continuous columns. A constant column has std == 0, which
# would turn the whole column into NaN; dividing by 1 leaves it at 0 instead.
mean = conts.mean()
std = conts.std().replace(0, 1)
normalized_df = (conts - mean) / std
combined_df = pd.concat((cats, normalized_df), axis=1)

env = MarketEnv(combined_df, closing_price)

tqdm_callback = ProgbarCallback()

policy_kwargs = dict(net_arch=[64, 64],
                     features_extractor_class=MartketFeatures,
                     features_extractor_kwargs=dict(features_dim=256),
                     normalize_images=False,
                    )

model = DQN("MlpPolicy",
            env,
            learning_rate=CONFIG.LR,
            learning_starts=2400,
            batch_size=CONFIG.BS,
            train_freq=480,
            target_update_interval=7200,
            exploration_fraction=0.2,
            exploration_initial_eps=1.,
            exploration_final_eps=.01,
            gamma=CONFIG.GAMMA,
            policy_kwargs=policy_kwargs,
            verbose=0,
            seed=42,
            device=CONFIG.DEVICE,
            _init_setup_model=True)

In [None]:
%%time
# Train for 500,000 timesteps; tqdm_callback is passed as the learn() callback.
model.learn(total_timesteps=500_000, callback=tqdm_callback)

In [None]:
# NOTE: the file name says "PPO" but the model saved here is the DQN above.
model.save("PPO_Market_Bot_GPU")
# model = DQN.load("PPO_Market_Bot")

# Replay the data with the trained policy, recording one entry per step.
obs = env.reset()
assets = []
closing = []
actions = []
# NOTE: `done` is not checked, so the environment keeps being stepped after
# an episode ends.
for _ in range(500_000):
    action, _states = model.predict(obs)
    obs, reward, done, info = env.step(action)
    assets.append(env.value)
    closing.append(env.close)
    actions.append(action)
# Number of recorded steps; kept in case a later cell reads it.
idx = len(assets)
env.render(assets, closing)

In [None]:
# Labels for the end-of-run portfolio summary.
string1 = "Shares Owned:  "
string2 = "Closing Price: $"
string3 = "Leftover Cash: $"
string4 = "Shares Final Value: $"
string5 = "Total Final Value of all Assets: $"

# One right-aligned row per label; the pieces join into a single string.
summary = (
    f"{string1:>35}{env.shares_owned:>11}\n"
    f"{string2:>35}{env.close:>11,.2f}\n"
    f"{string3:>35}{env.cash:>11,.2f}\n"
    f"{string4:>35}{env.shares_owned * env.close:>11,.2f}\n"
    f"{string5:>35}{env.value:>11,.2f}\n"
)
print(summary)

# Count how often each action was taken, in order of first appearance.
t = pd.Series(actions, dtype=int)
order_labels = {0: " Buy Orders", 1: "Sell Orders"}
for code in t.unique().tolist():
    label = order_labels.get(code, "       Pass")
    print(f"{label}: {int((t == code).sum()):>6,}")