In [69]:
import gym
import pandas as pd
import numpy as np
from gym import spaces
from sklearn import preprocessing
import json

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2

from datetime import datetime
import random

In [30]:
class BitcoinTradingEnv(gym.Env):
    """A Bitcoin trading environment for OpenAI gym"""
    metadata = {'render.modes': ['live', 'file', 'none']}
    scaler = preprocessing.MinMaxScaler()
    viewer = None

    def __init__(self, df, lookback_window_size=50, 
                             commission=0.00075,  
                             initial_balance=10000,
                             serial=False):
        super(BitcoinTradingEnv, self).__init__()

        self.df = df.dropna().reset_index()
        self.lookback_window_size = lookback_window_size
        self.initial_balance = initial_balance
        self.commission = commission
        self.serial = serial

        # Actions of the format Buy 1/10, Sell 3/10, Hold, etc.
        self.action_space = spaces.MultiDiscrete([3, 10])

        # Observes the OHCLV values, net worth, and trade history
        self.observation_space = spaces.Box(low=0, high=1, shape=(10, 
                        lookback_window_size + 1), dtype=np.float16)

    def reset(self):
        self.balance = self.initial_balance
        self.net_worth = self.initial_balance
        self.btc_held = 0

        self._reset_session()

        self.account_history = np.repeat([
        [self.net_worth],
        [0],
        [0],
        [0],
        [0]
              ], self.lookback_window_size + 1, axis=1)

        self.trades = []

        return self._next_observation()
    
    def _reset_session(self):
        self.current_step = 0

        if self.serial:
            self.steps_left = len(self.df) - self.lookback_window_size - 1
            self.frame_start = self.lookback_window_size
        else:
            self.steps_left = np.random.randint(1, MAX_TRADING_SESSION)
            self.frame_start = np.random.randint(
                 self.lookback_window_size, len(self.df) - self.steps_left)

        self.active_df = self.df[self.frame_start -   
           self.lookback_window_size:self.frame_start + self.steps_left]

    def _next_observation(self):
        end = self.current_step + self.lookback_window_size + 1

        obs = np.array([
        self.active_df['Open'].values[self.current_step:end],  
        self.active_df['High'].values[self.current_step:end],
        self.active_df['Low'].values[self.current_step:end],
        self.active_df['Close'].values[self.current_step:end],
        self.active_df['Volume_(BTC)'].values[self.current_step:end],
        ])

        scaled_history = self.scaler.fit_transform(self.account_history)

        obs = np.append(obs, scaled_history[:, -(self.lookback_window_size
                                                 + 1):], axis=0)

        return obs
    
    def step(self, action):
        current_price = self._get_current_price() + 0.01
        self._take_action(action, current_price)
        self.steps_left -= 1
        self.current_step += 1

        if self.steps_left == 0:
            self.balance += self.btc_held * current_price
            self.btc_held = 0
            self._reset_session()

        obs = self._next_observation()
        reward = self.net_worth
        done = self.net_worth <= 0

        return obs, reward, done, {}
    
    def _take_action(self, action, current_price):
        action_type = action[0]
        amount = action[1] / 10
        btc_bought = 0
        btc_sold = 0
        cost = 0
        sales = 0

        if action_type < 1:
            btc_bought = self.balance / current_price * amount
            cost = btc_bought * current_price * (1 + self.commission)
            self.btc_held += btc_bought
            self.balance -= cost

        elif action_type < 2:
            btc_sold = self.btc_held * amount
            sales = btc_sold * current_price  * (1 - self.commission)
            self.btc_held -= btc_sold
            self.balance += sales
            
        if btc_sold > 0 or btc_bought > 0:
            self.trades.append({
              'step': self.frame_start+self.current_step,
              'amount': btc_sold if btc_sold > 0 else btc_bought,
              'total': sales if btc_sold > 0 else cost,
              'type': "sell" if btc_sold > 0 else "buy"
            })

        self.net_worth = self.balance + self.btc_held * current_price
        self.account_history = np.append(self.account_history, [
            [self.net_worth],
            [btc_bought],
            [cost],
            [btc_sold],
            [sales]
            ], axis=1)

In [91]:
class StockTradingEnv(gym.Env):
    """A stock trading environment for OpenAI gym"""
    metadata = {'render.modes': ['human']}

    def __init__(self, df):
        super(StockTradingEnv, self).__init__()
        self.df = df
        self.reward_range = (0, MAX_ACCOUNT_BALANCE) 
        # Actions of the format Buy x%, Sell x%, Hold, etc.
        self.action_space = spaces.Box(
          low=np.array([0, 0]), high=np.array([3, 1]), dtype=np.float16)
        # Prices contains the OHCL values for the last five prices
        self.observation_space = spaces.Box(
          low=0, high=1, shape=(6, 6), dtype=np.float16)

    def reset(self):
        # Reset the state of the environment to an initial state
        self.balance = INITIAL_ACCOUNT_BALANCE
        self.net_worth = INITIAL_ACCOUNT_BALANCE
        self.max_net_worth = INITIAL_ACCOUNT_BALANCE
        self.shares_held = 0
        self.cost_basis = 0
        self.total_shares_sold = 0
        self.total_sales_value = 0

        # Set the current step to a random point within the data frame
        self.current_step = random.randint(0, len(self.df.loc[:, 'Open'].values) - 6)
        return self._next_observation()
    
    def _next_observation(self):
        # Get the data points for the last 5 days and scale to between 0-1
        frame = np.array([
            self.df.loc[self.current_step: self.current_step +
                        5, 'Open'].values / MAX_SHARE_PRICE,
            self.df.loc[self.current_step: self.current_step +
                        5, 'High'].values / MAX_SHARE_PRICE,
            self.df.loc[self.current_step: self.current_step +
                        5, 'Low'].values / MAX_SHARE_PRICE,
            self.df.loc[self.current_step: self.current_step +
                        5, 'Close'].values / MAX_SHARE_PRICE,
            self.df.loc[self.current_step: self.current_step +
                        5, 'Volume'].values / MAX_NUM_SHARES,
            ])
        
        # Append additional data and scale each value to between 0-1
        obs = np.append(frame, [[
            self.balance / MAX_ACCOUNT_BALANCE,
            self.max_net_worth / MAX_ACCOUNT_BALANCE,
            self.shares_held / MAX_NUM_SHARES,
            self.cost_basis / MAX_SHARE_PRICE,
            self.total_shares_sold / MAX_NUM_SHARES,
            self.total_sales_value / (MAX_NUM_SHARES * MAX_SHARE_PRICE),
            ]], axis=0)
        
        return obs
    
    def step(self, action):
        # Execute one time step within the environment
        self._take_action(action)
        
        self.current_step += 1
        
        if self.current_step > len(self.df.loc[:, 'Open'].values) - 6:
            self.current_step = 0
        
        delay_modifier = (self.current_step / MAX_STEPS)

        reward = self.balance * delay_modifier
        done = self.net_worth <= 0
        
        obs = self._next_observation()
        
        return obs, reward, done, {}

    def _take_action(self, action):
        # Set the current price to a random price within the time step
        current_price = random.uniform(self.df.loc[self.current_step, "Open"],
                                       self.df.loc[self.current_step, "Close"])

        action_type = action[0]
        amount = action[1]

        if action_type < 1:
            # Buy amount % of balance in shares
            total_possible = self.balance / current_price
            shares_bought = total_possible * amount
            prev_cost = self.cost_basis * self.shares_held
            additional_cost = shares_bought * current_price
            self.balance -= additional_cost
            self.cost_basis = (prev_cost + additional_cost) / (self.shares_held + shares_bought)
            self.shares_held += shares_bought
        elif action_type < 2:
            # Sell amount % of shares held
            shares_sold = self.shares_held * amount 
            self.balance += shares_sold * current_price
            self.shares_held -= shares_sold
            self.total_shares_sold += shares_sold
            self.total_sales_value += shares_sold * current_price

        self.netWorth = self.balance + self.shares_held * current_price

        if self.net_worth > self.max_net_worth:
            self.max_net_worth = self.net_worth

        if self.shares_held == 0:
            self.cost_basis = 0
    
    def render(self, mode='human', close=False):
        # Render the environment to the screen
        profit = self.net_worth - INITIAL_ACCOUNT_BALANCE
        print(f'Step: {self.current_step}')
        print(f'Balance: {self.balance}')
        print(f'Shares held: {self.shares_held} (Total sold: {self.total_shares_sold})')
        print(f'Avg cost for held shares: {self.cost_basis} (Total sales value: {self.total_sales_value})')
        print(f'Net worth: {self.net_worth} (Max net worth: {self.max_net_worth})')
        print(f'Profit: {profit}')

In [90]:
%debug
df = pd.read_csv('EURUSD60.csv', delimiter='\t',
                       names=['Date','Open','High','Low','Close','Volume'])

# df = pd.read_csv('AAPL.csv')
# df = df.sort_values('Date')

MAX_ACCOUNT_BALANCE = 2147483647
INITIAL_ACCOUNT_BALANCE = 10000
MAX_NUM_SHARES = 2147483647
MAX_SHARE_PRICE = 5000
MAX_OPEN_POSITIONS = 5
MAX_STEPS = 20000

# The algorithms require a vectorized environment to run
env = DummyVecEnv([lambda: StockTradingEnv(df)])
model = PPO2(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=20000)
obs = env.reset()

for i in range(2000):
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    env.render()

> [1;32mc:\programdata\anaconda3\envs\rl\lib\site-packages\tensorflow_core\python\client\session.py[0m(1443)[0;36m_call_tf_sessionrun[1;34m()[0m
[1;32m   1441 [1;33m    return tf_session.TF_SessionRun_wrapper(self._session, options, feed_dict,
[0m[1;32m   1442 [1;33m                                            [0mfetch_list[0m[1;33m,[0m [0mtarget_list[0m[1;33m,[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m-> 1443 [1;33m                                            run_metadata)
[0m[1;32m   1444 [1;33m[1;33m[0m[0m
[0m[1;32m   1445 [1;33m  [1;32mdef[0m [0m_call_tf_sessionprun[0m[1;33m([0m[0mself[0m[1;33m,[0m [0mhandle[0m[1;33m,[0m [0mfeed_dict[0m[1;33m,[0m [0mfetch_list[0m[1;33m)[0m[1;33m:[0m[1;33m[0m[1;33m[0m[0m
[0m
ipdb> n




---------------------------------------
| approxkl           | 1.31851765e-08 |
| clipfrac           | 0.0            |
| explained_variance | -1.19e-07      |
| fps                | 342            |
| n_updates          | 1              |
| policy_entropy     | 2.837864       |
| policy_loss        | -1.5216414e-05 |
| serial_timesteps   | 128            |
| time_elapsed       | 0              |
| total_timesteps    | 128            |
| value_loss         | 47914148.0     |
---------------------------------------
---------------------------------------
| approxkl           | 2.4189742e-08  |
| clipfrac           | 0.0            |
| explained_variance | 0              |
| fps                | 822            |
| n_updates          | 2              |
| policy_entropy     | 2.8378885      |
| policy_loss        | -1.8379651e-05 |
| serial_timesteps   | 256            |
| time_elapsed       | 0.375          |
| total_timesteps    | 256            |
| value_loss         | 46588320.0     |


--------------------------------------
| approxkl           | 5.3632408e-05 |
| clipfrac           | 0.0           |
| explained_variance | 0             |
| fps                | 781           |
| n_updates          | 17            |
| policy_entropy     | 2.8624165     |
| policy_loss        | -0.0025192956 |
| serial_timesteps   | 2176          |
| time_elapsed       | 2.78          |
| total_timesteps    | 2176          |
| value_loss         | 45239730.0    |
--------------------------------------
--------------------------------------
| approxkl           | 0.000241394   |
| clipfrac           | 0.0           |
| explained_variance | 0             |
| fps                | 792           |
| n_updates          | 18            |
| policy_entropy     | 2.863509      |
| policy_loss        | -0.0010682138 |
| serial_timesteps   | 2304          |
| time_elapsed       | 2.95          |
| total_timesteps    | 2304          |
| value_loss         | 37890948.0    |
-------------------------

--------------------------------------
| approxkl           | 0.0011416039  |
| clipfrac           | 0.001953125   |
| explained_variance | 0             |
| fps                | 798           |
| n_updates          | 34            |
| policy_entropy     | 2.8855917     |
| policy_loss        | -0.0048526246 |
| serial_timesteps   | 4352          |
| time_elapsed       | 5.54          |
| total_timesteps    | 4352          |
| value_loss         | 568909000.0   |
--------------------------------------
-------------------------------------
| approxkl           | 0.0033661106 |
| clipfrac           | 0.03515625   |
| explained_variance | -1.19e-07    |
| fps                | 820          |
| n_updates          | 35           |
| policy_entropy     | 2.8873675    |
| policy_loss        | -0.012520377 |
| serial_timesteps   | 4480         |
| time_elapsed       | 5.71         |
| total_timesteps    | 4480         |
| value_loss         | 452450430.0  |
-------------------------------------

KeyboardInterrupt: 