In [1]:
import json
import datetime as dt
import random 

import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

from pandas_datareader.data import DataReader


import gym
from custom_environment import *
from utils import *

from stable_baselines3 import A2C, SAC, PPO, TD3, DDPG

from tqdm.notebook import tqdm

In [2]:
def evaluate(model, steps=None, convert=False):

    profits = []
    sims = 10

    sim_infos = []

    for i in range(sims):
        infos = []
        obs = test_env.reset()
        if steps == None:
            while True:
                if convert:
                    action = model.predict(obs.to_numpy().reshape(-1))    
                else:
                    action, _states = model.predict(obs)
                obs, rewards, done, info = test_env.step(action)
                infos.append(info)
                if done: 
                    profits.append(info['profit'])
                    # test_env.render()
                    sim_infos.append(infos)
                    break

        else:
            for j in range(steps):
                action, _states = model.predict(obs)
                obs, rewards, done, info = test_env.step(action)
                infos.append(info)
                if done or j == steps-1: 
                    profits.append(info['profit'])
                    # test_env.render()
                    sim_infos.append(infos)
                    break

        print('finished sim %d/%d'%(i+1,sims))

    pos_count = len(list(filter(lambda x: (x >= 0), profits))) 
    print('made profit - ' + str(pos_count/len(profits)))

    return sim_infos


In [3]:
print('loading data')
data = DataReader('GOGL', 'yahoo', start='2000-01-01', end='2021-01-01')

print(data.describe())

loading data
              High          Low         Open        Close        Volume  \
count  5285.000000  5285.000000  5285.000000  5285.000000  5.285000e+03   
mean     71.977223    69.307737    70.737365    70.635272  9.312302e+04   
std      54.467648    52.628059    53.671556    53.556911  1.316300e+05   
min       2.770000     2.520000     2.650000     2.550000  5.000000e+02   
25%      13.700000    12.850000    13.350000    13.300000  2.362000e+04   
50%      70.900002    68.349998    69.650002    69.550003  4.330000e+04   
75%     114.949997   110.849998   113.099998   112.849998  1.042800e+05   
max     237.500000   231.500000   235.000000   234.750000  2.127300e+06   

         Adj Close  
count  5285.000000  
mean     31.200699  
std      22.912915  
min       2.328791  
25%      10.283531  
50%      24.550537  
75%      51.769726  
max      87.927231  


In [4]:
test_data = data.tail(365)
train_data = data.head(-365)

In [5]:
env = CustomStockEnv(
    stock_df = train_data,
    pred_df = train_data,
    window_size = 14,
    initial_balance = 5000,
    min_percent_loss = .5,
    with_pred=False
    )

test_env = CustomStockEnv(
    stock_df = test_data,
    pred_df = test_data,
    window_size = 14,
    initial_balance = 5000,
    min_percent_loss = .5,
    with_pred=False,
    test_env=True,
    train_df=train_data
    )

This is a testing environment, scaling based on training data.


In [6]:
model_results = {}

In [9]:
print("Training A2C")

env.reset()
modelA2C = A2C('MlpPolicy', env, verbose=0)
modelA2C.learn(total_timesteps=30000)

print('done')
modelA2C_info = evaluate(modelA2C)
model_results["A2C"] = modelA2C_info

Training A2C
done
finished sim 1/10
finished sim 2/10
finished sim 3/10
finished sim 4/10
finished sim 5/10
finished sim 6/10
finished sim 7/10
finished sim 8/10
finished sim 9/10
finished sim 10/10
made profit - 0.0


In [10]:
print("Training PPO")

env.reset()
modelPPO = PPO('MlpPolicy', env, verbose=0)
modelPPO.learn(total_timesteps=30000)

print('done')
modelPPO_info = evaluate(modelPPO)
model_results["PPO"] = modelPPO_info

Training PPO
done
finished sim 1/10
finished sim 2/10
finished sim 3/10
finished sim 4/10
finished sim 5/10
finished sim 6/10
finished sim 7/10
finished sim 8/10
finished sim 9/10
finished sim 10/10
made profit - 0.3


In [13]:
print("Training TD3")

env.reset()
modelTD3 = TD3('MlpPolicy', env, verbose=0)
modelTD3.learn(total_timesteps=30000)

print('done')
modelTD3_info = evaluate(modelTD3)
model_results["TD3"] = modelTD3_info

Training TD3


RuntimeError: "clamp_cpu" not implemented for 'Half'

In [16]:
for name, results in model_results.items():
    print(name)
    max_profit = max([info[['profit']] for info in results])
    # for result in results:
    print(max_profit)


A2C


TypeError: list indices must be integers or slices, not list