In [1]:
import json
import datetime as dt
import numpy as np
import pandas as pd
import datetime
import tensorflow as tf

from env.StockTradingEnv import StockTradingEnv
from networks.DQN import DQN
from utils.utils import convert_action

Adapted from this Medium article (as is `networks/DQN.py`): https://towardsdatascience.com/deep-reinforcement-learning-build-a-deep-q-network-dqn-to-play-cartpole-with-tensorflow-2-and-gym-8e105744b998

### Parameters

In [2]:
# Training configuration for this notebook.
N_games = 50000  # number of training games (episodes)
N_save = 500    # episodes between weight checkpoints in main(); original note: "must be over 200" (reason not visible here -- TODO confirm)
model_name = "DQN"  # prefix of the checkpoint path written by main(): save_models/<model_name>_<episode>

batch_size = 32  # batch size passed to both DQN networks constructed in main()

### Additional functions

In [3]:
def play_game(env, TrainNet, TargetNet, epsilon, copy_step, max_steps=100):
    """Play one training episode and return its cumulative reward.

    On every step an action is requested from ``TrainNet``, the environment
    is advanced, the resulting transition is handed to
    ``TrainNet.add_experience`` and ``TrainNet.train(TargetNet)`` is called
    once. Every ``copy_step`` steps the weights of ``TrainNet`` are copied
    into ``TargetNet``.

    The episode stops when the environment reports ``done`` or after
    ``max_steps`` steps, whichever comes first. Every executed step is both
    counted in the returned reward and stored as an experience (the previous
    implementation executed one extra step past the cap and discarded that
    transition while still counting its reward).

    Args:
        env: Environment exposing ``reset()`` and ``step(action)``, where
            ``step`` returns ``(observation, reward, done, info)``.
        TrainNet: Network being trained; must provide ``get_action``,
            ``add_experience`` and ``train``.
        TargetNet: Target network; must provide ``copy_weights``.
        epsilon: Exploration parameter forwarded unchanged to
            ``TrainNet.get_action``.
        copy_step: Number of steps between weight copies into ``TargetNet``.
            The counter restarts at zero for every call.
        max_steps: Maximum number of environment steps for this episode.

    Returns:
        The sum of the rewards returned by ``env.step`` during the episode.
    """
    rewards = 0
    steps = 0
    done = False
    observations = env.reset()
    while not done and steps < max_steps:
        # Original author's note: observations is actually a single "state",
        # i.e. the past 5 days.
        action = TrainNet.get_action(observations, epsilon)
        # NOTE(review): the *converted* action (not the value returned by
        # get_action) is what gets stored under 'a' below -- confirm that
        # TrainNet.train expects that form.
        action = convert_action(action)                # TODO: REMOVE THIS
        prev_observations = observations
        observations, reward, done, _ = env.step(action)
        rewards += reward    # sum of gain_net_worth

        exp = {'s': prev_observations, 'a': action, 'r': reward, 's2': observations, 'done': done}
        TrainNet.add_experience(exp)
        TrainNet.train(TargetNet)

        steps += 1
        if steps % copy_step == 0:
            TargetNet.copy_weights(TrainNet)

    # Leave the environment freshly reset exactly once, whether the episode
    # ended naturally or was cut off by the step cap.
    env.reset()
    return rewards


# def make_video(env, TrainNet):
#     env = wrappers.Monitor(env, os.path.join(os.getcwd(), "videos"), force=True)
#     rewards = 0
#     steps = 0
#     done = False
#     observation = env.reset()
#     while not done:
#         env.render()
#         action = TrainNet.get_action(observation, 0)
#         action = convert_action(action)                # TODO: REMOVE THIS
#         observation, reward, done, _ = env.step(action)
#         steps += 1
#         rewards += reward
#     print("Testing steps: {} rewards {}: ".format(steps, rewards))


def main():
    """Train a DQN agent on the AAPL price data and log progress.

    Steps performed:
      * load ``./data/AAPL.csv``, sort it by ``Date`` and wrap it in a
        ``StockTradingEnv``;
      * build a training network and a target network with identical
        hyper-parameters;
      * play ``N_games`` episodes with ``play_game``, multiplying epsilon by
        0.999 before each episode (starting from 0.9, floored at 0.1);
      * write the per-episode reward and its running average to TensorBoard
        under ``logs/dqn/<timestamp>`` and print a progress line every
        100 episodes;
      * save the training network's weights to
        ``save_models/<model_name>_<episode>`` every ``N_save`` episodes
        (episode 0 excluded).

    Reads the module-level settings ``N_games``, ``N_save``, ``model_name``
    and ``batch_size``. The environment is closed even if training is
    interrupted (e.g. by stopping the notebook cell).
    """
    df = pd.read_csv('./data/AAPL.csv')
    df = df.sort_values('Date')
    env = StockTradingEnv(df)
    gamma = 0.99
    copy_step = 25
    input_shape = env.observation_space.sample().shape
    num_actions = 3                        # TODO: CHANGE THIS TO CONTINUOUS VALUES
    hidden_units = [200, 200]
    max_experiences = 10000
    min_experiences = 100
    lr = 1e-2
    avg_window = 100  # number of most recent episodes in the running average
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = 'logs/dqn/' + current_time
    summary_writer = tf.summary.create_file_writer(log_dir)

    TrainNet = DQN(input_shape, num_actions, hidden_units, gamma, max_experiences, min_experiences, batch_size, lr)
    TargetNet = DQN(input_shape, num_actions, hidden_units, gamma, max_experiences, min_experiences, batch_size, lr)
    print("DeepQ Networks successfully created")
    # N_games comes from the parameters cell; it is deliberately not
    # re-assigned here so that editing the configuration takes effect.
    total_rewards = np.empty(N_games)
    epsilon = 0.9
    decay = 0.999
    min_epsilon = 0.1
    # Defined up front so the final print works even when N_games == 0.
    avg_rewards = float('nan')
    print("Starting training...")
    try:
        for n in range(N_games):
            epsilon = max(min_epsilon, epsilon * decay)
            total_reward = play_game(env, TrainNet, TargetNet, epsilon, copy_step)
            total_rewards[n] = total_reward
            # Window of exactly `avg_window` episodes ending at n (fewer while
            # warming up); only already-filled entries are ever read.
            avg_rewards = total_rewards[max(0, n - avg_window + 1):(n + 1)].mean()
            with summary_writer.as_default():
                tf.summary.scalar('episode reward', total_reward, step=n)
                tf.summary.scalar('running avg reward(100)', avg_rewards, step=n)
            if n % 100 == 0:
                print("episode:", n, "episode reward:", total_reward, "eps:", epsilon, "avg reward (last 100):", avg_rewards)

            # Save the model
            if n % N_save == 0 and n >= N_save:
                TrainNet.model.save_weights('save_models/{}_{}'.format(model_name, n), save_format='tf')

        print("avg reward for last 100 episodes:", avg_rewards)
    finally:
        # Release the environment even when the run is interrupted.
        env.close()

# Start the trading

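The episode reward is the sum of the per-step rewards collected in `play_game`; each step's reward is the gain in net worth for that step, so the total is the change in net worth over the episode.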
<br>
Initial account balance is 10,000
<br>
To see live results in TensorBoard, run `tensorboard --logdir logs/dqn` (each run writes to `logs/dqn/<timestamp>`).

In [None]:
# Run the full training loop; a progress line is printed every 100 episodes.
main()

DeepQ Networks successfully created
Starting training...
episode: 0 episode reward: 576.4314990782987 eps: 0.8991 avg reward (last 100): 576.4314990782987
episode: 100 episode reward: 2091.9110573285834 eps: 0.8134986194699355 avg reward (last 100): 437.81961902871564
episode: 200 episode reward: -692.3311444692681 eps: 0.7360471625842407 avg reward (last 100): 557.6891752292472
episode: 300 episode reward: 708.5265815975999 eps: 0.6659696926115485 avg reward (last 100): 310.81851682017873
episode: 400 episode reward: 639.9018487561952 eps: 0.6025641480906593 avg reward (last 100): 326.7230239058313
episode: 500 episode reward: -4.293368143782573 eps: 0.545195309324691 avg reward (last 100): 128.43589237196207
episode: 600 episode reward: -1625.946387162512 eps: 0.49328843452021 avg reward (last 100): 99.54117269678031
episode: 700 episode reward: 293.9183725018902 eps: 0.44632350181590114 avg reward (last 100): 172.0062252980823
episode: 800 episode reward: 274.6325225318542 eps: 0.40