In [6]:
import random
import gym
from gym import spaces
import pandas as pd
import numpy as np

import torch

import json
import datetime as dt
# from stable_baselines3.common.policies import MlpPolicy
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3 import PPO

In [7]:
N_DISCRETE_ACTIONS = 0

class CustomEnv(gym.Env):
    """Custom env in line with gym interface"""
    metadata = {'render.modes': ['human']}

    def __init__(self, arg1, arg2):
        super(CustomEnv, self).__init__()

        # Defining action and observation spaces (must be gym.spaces objects)
        self.action_space = spaces.Discrete(N_DISCRETE_ACTIONS) # discrete actions
        self.observation_space = spaces.Box(low=0, high=255, shape=(128,128,3), dtype=np.uint8) # image rgb

    def step(self, action):
        """Execute one time step within the environment"""
        pass

    def reset(self):
        """Reset the state of the environment to an initial state"""
        pass

    def render(self, mode="human", close=False):
        """Render environment to screen"""
        pass

    


In [8]:
"""
Obs Space:
(open price, high, low, close, and daily volume) for the last five days, 
as well a couple other data points like its account balance, current stock positions, and current profit.

Action Space:
buy, sell, nothing
+ amounts (0-100%)

Reward:
incentivise sustained profits
account bal * %ofCurrentTimeStep
"""

MAX_ACCOUNT_BALANCE = 1e7
INITIAL_ACCOUNT_BALANCE = 1e4
MAX_SHARE_PRICE = 1e4
MAX_NUM_SHARES = 2e8
MAX_STEPS = 1e6

class StockTradingEnv(gym.Env):
    """Stock Trading Env in line with gym"""
    metadata = {'render.modes': ['human']}

    def __init__(self, df: pd.DataFrame):
        """ Agent for trading
        Args:
            df (pd.DataFrame): the pandas dataframe containing the stock data to be learned from
        """
        super(StockTradingEnv, self).__init__()

        self.df = df
        self.reward_range = (0, MAX_ACCOUNT_BALANCE)

        # Action Space: Buy x%, Sell x%, Hold
        self.action_space = spaces.Box(low=np.array([0,0]), high=np.array([3,1]), dtype=np.float16) 

        # Observation Space: OHCL values for last five prices
        self.observation_space = spaces.Box(low=0, high=6, shape=(6,6), dtype=np.float16)

    def step(self, action):
        """Execute one time step within the environment"""
        # Execute one time step within the environment
        self._take_action(action)
        self.current_step += 1

        if self.current_step > len(self.df.loc[:, 'Open'].values) - 6:
            self.current_step = 0

        delay_modifier = (self.current_step / MAX_STEPS)

        reward = self.balance * delay_modifier
        done = self.net_worth <= 0
        obs = self._next_observation()

        return obs, reward, done, {}

    def reset(self):
        """Reset the state of the environment to an initial state"""
        self.balance = INITIAL_ACCOUNT_BALANCE
        self.net_worth = INITIAL_ACCOUNT_BALANCE
        self.max_net_worth = INITIAL_ACCOUNT_BALANCE
        self.shares_held = 0
        self.cost_basis = 0
        self.total_shares_sold = 0
        self.total_sales_value = 0

        # Set the current step to a random point within the data frame
        self.current_step = random.randint(0, len(self.df.loc[:, 'Open'].values) - 6)

        return self._next_observation()

    def _next_observation(self):
        # Get the data points for the last 5 days and scale to between 0-1
        frame = np.array([
            self.df.loc[self.current_step: self.current_step + 5, 'Open'].values / MAX_SHARE_PRICE,
            self.df.loc[self.current_step: self.current_step + 5, 'High'].values / MAX_SHARE_PRICE,
            self.df.loc[self.current_step: self.current_step + 5, 'Low'].values / MAX_SHARE_PRICE,
            self.df.loc[self.current_step: self.current_step + 5, 'Close'].values / MAX_SHARE_PRICE,
            self.df.loc[self.current_step: self.current_step + 5, 'Volume'].values / MAX_NUM_SHARES,
        ])
        # Append additional data and scale each value to between 0-1
        obs = np.append(frame, [[
            self.balance / MAX_ACCOUNT_BALANCE,
            self.max_net_worth / MAX_ACCOUNT_BALANCE,
            self.shares_held / MAX_NUM_SHARES,
            self.cost_basis / MAX_SHARE_PRICE,
            self.total_shares_sold / MAX_NUM_SHARES,
            self.total_sales_value / (MAX_NUM_SHARES * MAX_SHARE_PRICE),
        ]], axis=0)

        return obs
    
    def _take_action(self, action):
        # Set the current price to a random price within the time step
        current_price = random.uniform(self.df.loc[self.current_step, "Open"], self.df.loc[self.current_step, "Close"])
        action_type = action[0]
        amount = action[1]

        if action_type < 1:
            # Buy amount % of balance in shares
            total_possible = self.balance / current_price
            shares_bought = total_possible * amount
            prev_cost = self.cost_basis * self.shares_held
            additional_cost = shares_bought * current_price
            self.balance -= additional_cost
            self.cost_basis = (prev_cost + additional_cost) / (self.shares_held + shares_bought)
            self.shares_held += shares_bought

        elif action_type < 2:
            # Sell amount % of shares held
            shares_sold = self.shares_held * amount
            self.balance += shares_sold * current_price
            self.shares_held -= shares_sold
            self.total_shares_sold += shares_sold
            self.total_sales_value += shares_sold * current_price
            self.net_worth = self.balance + self.shares_held * current_price

        if self.net_worth > self.max_net_worth:
            self.max_net_worth = self.net_worth

        if self.shares_held == 0:
            self.cost_basis = 0

    def render(self, mode="human", close=False):
        """Render environment to screen"""
        profit = self.net_worth - INITIAL_ACCOUNT_BALANCE
        print(f'Step: {self.current_step}')
        print(f'Balance: {self.balance}')
        print(f'Shares held: {self.shares_held} (Total sold: {self.total_shares_sold})')
        print(f'Avg cost for held shares: {self.cost_basis} (Total sales value: {self.total_sales_value})')
        print(f'Net worth: {self.net_worth} (Max net worth: {self.max_net_worth})')
        print(f'Profit: {profit}')

In [11]:
df = pd.read_csv('AAPL.csv')
df = df.sort_values('Date')

# The algorithms require a vectorized environment to run
env = DummyVecEnv([lambda: StockTradingEnv(df)])

model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=20000)

obs = env.reset()

for i in range(2000):
  action, _states = model.predict(obs)
  obs, rewards, done, info = env.step(action)
  env.render()

  sample[upp_bounded] = (


Using cuda device


  self.cost_basis = (prev_cost + additional_cost) / (self.shares_held + shares_bought)


-----------------------------
| time/              |      |
|    fps             | 432  |
|    iterations      | 1    |
|    time_elapsed    | 4    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 362          |
|    iterations           | 2            |
|    time_elapsed         | 11           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0015124652 |
|    clip_fraction        | 0.00269      |
|    clip_range           | 0.2          |
|    entropy_loss         | -2.85        |
|    explained_variance   | -5.91e-05    |
|    learning_rate        | 0.0003       |
|    loss                 | 8.16e+03     |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.00302     |
|    std                  | 1.01         |
|    value_loss           | 2.34e+04     |
----------------