<a href="https://colab.research.google.com/github/aCStandke/ReinforcementLearning/blob/main/SecondStockEnivornment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
!pip install stable-baselines3[extra]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import random
import json
import gym
from gym import spaces
from gym.utils import seeding
import pandas as pd
import numpy as np
import json
import datetime as dt
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
import collections
import datetime

In [4]:
# Stock Environment Parameters
MAX_ACCOUNT_BALANCE = 2147483647
MAX_NUM_SHARES = 2147483647
MAX_SHARE_PRICE = 5000
MAX_OPEN_POSITIONS = 5
MAX_STEPS = 20000
TRADING_DAYS = 5
DEFAULT_COMMISSION_PERC = 0.1
INITIAL_ACCOUNT_BALANCE = 10000

In [5]:
# Stock/ETF Trading Enviornment
class StockTradingEnv(gym.Env):
    """A stock trading environment for OpenAI gym"""
    metadata = {'render.modes': ['human']}

    def __init__(self, data, random_ofs_on_reset=True):
        super(StockTradingEnv, self).__init__()

        self.data = data
        self.reward_range = (0, MAX_ACCOUNT_BALANCE)
        self.random_ofs_on_reset = random_ofs_on_reset
        self.bars_count = TRADING_DAYS

        # Actions of the format Buy x%, Sell x%, Hold, etc.
        self.action_space = spaces.Box(
            low=np.array([0, 0]), high=np.array([3, 1]))

        # Prices contains the OHCL values for the last five prices
        self.observation_space = spaces.Box(
            low=0, high=1, shape=(37,))
        
        self.random_ofs_on_reset = random_ofs_on_reset
        self.seed()

    def reset(self):
      bars = self.bars_count
      if self.random_ofs_on_reset:
        offset = self.np_random.choice(self.data.high.shape[0]-bars*10)+bars
      else:
        offset = bars
      self._reset(offset)
      return self._next_observation()  

    @property
    def shape(self):
      return (5*self.bars_count+6, )

    def _next_observation(self):
        # Get the stock data points for the last 5 days and scale to between 0-1
        res = np.ndarray(shape=self.shape, dtype=np.float32)
        shift = 0
        for bar_idx in range(-self.bars_count+1, 1):
          res[shift] = self.data.open[self._offset + bar_idx]
          shift += 1
          res[shift] = self.data.high[self._offset + bar_idx]
          shift += 1
          res[shift] = self.data.low[self._offset + bar_idx]
          shift += 1
          res[shift] = self.data.close[self._offset + bar_idx]
          shift += 1
          res[shift] = self.data.volume[self._offset + bar_idx]
          shift += 1

        # Append additional data and scale each value to between 0-1
        obs = np.append(res, [[
            self.balance / MAX_ACCOUNT_BALANCE,
            self.max_net_worth / MAX_ACCOUNT_BALANCE,
            self.shares_held / MAX_NUM_SHARES,
            self.cost_basis / MAX_SHARE_PRICE,
            self.total_shares_sold / MAX_NUM_SHARES,
            self.total_sales_value / (MAX_NUM_SHARES * MAX_SHARE_PRICE),
        ]])

        return obs

    def _take_action(self, action):
        # Set the current price 
        current_price = self._cur_close()
        action_type = action[0]
        amount = action[1]

        if action_type < 1:
            # Buy amount % of balance in shares
            total_possible = int(self.balance / current_price)
            shares_bought = int(total_possible * amount)
            prev_cost = self.cost_basis * self.shares_held
            additional_cost = shares_bought * current_price

            self.balance -= additional_cost
            self.cost_basis = (
                prev_cost + additional_cost) / (self.shares_held + shares_bought)
            self.shares_held += shares_bought

        elif action_type < 2:
            # Sell amount % of shares held
            shares_sold = int(self.shares_held * amount)
            self.balance += shares_sold * current_price
            self.shares_held -= shares_sold
            self.total_shares_sold += shares_sold
            self.total_sales_value += shares_sold * current_price

        self.net_worth = self.balance + self.shares_held * current_price

        if self.net_worth > self.max_net_worth:
            self.max_net_worth = self.net_worth

        if self.shares_held == 0:
            self.cost_basis = 0

    def _cur_close(self):
      """
      Calculate real close price for the current bar
      """
      open = self.data.open[self._offset]
      rel_close = self.data.close[self._offset]
      return open * (1.0 + rel_close)

    def step(self, action):
        # Execute one time step within the environment
        self._take_action(action)

        self.current_step += 1

        if self.current_step > len(self.data.open[:]) - 6:
            self.current_step = 0

        #delay_modifier = (self.current_step / MAX_STEPS)
        # reward = self.balance * delay_modifier
        reward = self.balance
        done = self.net_worth <= 0

        obs = self._next_observation()

        return obs, reward, done, {}

    def _reset(self, offset):
        # Reset the state of the environment to an initial state
        self.balance = INITIAL_ACCOUNT_BALANCE
        self.net_worth = INITIAL_ACCOUNT_BALANCE
        self.max_net_worth = INITIAL_ACCOUNT_BALANCE
        self.shares_held = 0
        self.cost_basis = 0
        self.total_shares_sold = 0
        self.total_sales_value = 0
        self._offset = offset

        # Set the current step to a random point within the data frame
        self.current_step = random.randint(0, len(self.data.open[:]) - 6)


    def render(self, mode='human', close=False):
      # Render the environment to the screen
      profit = self.net_worth - INITIAL_ACCOUNT_BALANCE
      print(f'Step: {self.current_step}')
      print(f'Balance: {self.balance}')
      print(f'Shares held: {self.shares_held} (Total sold: {self.total_shares_sold})')
      print(f'Avg cost for held shares: {self.cost_basis} (Total sales value: {self.total_sales_value})')
      print(f'Net worth: {self.net_worth} (Max net worth: {self.max_net_worth})')
      print(f'Profit: {profit}')


    def seed(self, seed=None):
      self.np_random, seed1 = seeding.np_random(seed)
      seed2 = seeding.hash_seed(seed1+1) % 2**33
      return [seed1, seed2]


In [6]:
df = pd.read_csv('/content/drive/MyDrive/Datasets/StockMarketData/archive/Data/ETFs/spy.us.txt')
df = df.sort_values('Date')
data=df[['Date', 'Open', 'High', 'Low', 'Close', 'Volume']]

# year data of year-month-day form
dt = data['Date'].array
# calculating relative prices 
rh = (data['High'].values-data['Open'].values)/data['Open'].values
rl = (data['Low'].values-data['Open'].values)/data['Open'].values
rc = (data['Close'].values-data['Open'].values)/data['Open'].values
ro = (data['Open'].values-data['Close'].values)/data['Close'].values
# volumne data
vol = data['Volume'].values

Data = collections.namedtuple('Data', field_names=['date','high', 'low', 'close', 'open', 'volume'])
data=Data(date=dt,high=rh, low=rl, close=rc, open=ro, volume=vol)

In [7]:
# The algorithms require a vectorized environment to run
env = StockTradingEnv(data)
model = PPO("MlpPolicy", env, verbose=1)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [8]:
model.learn(total_timesteps=20000)

-----------------------------
| time/              |      |
|    fps             | 1121 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 483          |
|    iterations           | 2            |
|    time_elapsed         | 8            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0012705124 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -2.84        |
|    explained_variance   | -1.19e-07    |
|    learning_rate        | 0.0003       |
|    loss                 | 5.67e+08     |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.00188     |
|    std                  | 1            |
|    value_loss           | 1.18e+09     |
----------------

<stable_baselines3.ppo.ppo.PPO at 0x7f1cf8ca2ad0>

In [9]:
obs = env.reset()
for i in range(2000):
  action, _states = model.predict(obs)
  obs, rewards, done, info = env.step(action)
  env.render()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Net worth: 10000.0 (Max net worth: 10000.000000000005)
Profit: 0.0
Step: 3180
Balance: 3577.189650964103
Shares held: -487301 (Total sold: -48027405)
Avg cost for held shares: -0.013180375884793788 (Total sales value: 633019.2506712246)
Net worth: 10000.0 (Max net worth: 10000.000000000005)
Profit: 0.0
Step: 3181
Balance: 3577.189650964103
Shares held: -487301 (Total sold: -48027405)
Avg cost for held shares: -0.013180375884793788 (Total sales value: 633019.2506712246)
Net worth: 10000.0 (Max net worth: 10000.000000000005)
Profit: 0.0
Step: 3182
Balance: 4844.706858677067
Shares held: -391134 (Total sold: -48123572)
Avg cost for held shares: -0.013180375884793788 (Total sales value: 634286.7678789375)
Net worth: 10000.0 (Max net worth: 10000.000000000005)
Profit: 0.0
Step: 3183
Balance: 4844.706858677067
Shares held: -391134 (Total sold: -48123572)
Avg cost for held shares: -0.013180375884793788 (Total sales value: 634286