In [1]:
import json
import datetime as dt
import numpy as np
import pandas as pd
import datetime
import tensorflow as tf
import pickle

from env.StockTradingEnv import StockTradingEnv
from networks.DQN import DQN
from utils.utils import convert_action

Adapted from this Medium article (as is networks/DQN.py): https://towardsdatascience.com/deep-reinforcement-learning-build-a-deep-q-network-dqn-to-play-cartpole-with-tensorflow-2-and-gym-8e105744b998

### Parameters

In [2]:
# Training data, ordered by the 'Date' column.
df = pd.read_csv('./data/IBM_train.csv')
df = df.sort_values('Date')

N_games = 101  # number of training games (episodes) played for each agent
N_save = 200    # interval between model saves (original note: must be over 200); not used by any active code in this notebook
model_name = "DQN"  # prefix of the path the best agent's weights are saved under

batch_size = 32  # forwarded to the DQN constructor

hidden_units = [64,128,256,256,128,64]  # forwarded to the DQN constructor; presumably hidden-layer widths -- confirm in networks/DQN.py
in_log = True  # forwarded to StockTradingEnv for the training environment; exact meaning is defined there

N_agent = 20 # number of agents trained independently; the one with the best validation score is kept

### Additional functions

In [3]:
def play_game(env, TrainNet, TargetNet, epsilon, copy_step, max_steps=100):
    """Play one training episode and return the sum of its step rewards.

    Every transition is stored in ``TrainNet``'s replay buffer and followed by
    one training update; ``TargetNet`` is synchronised with ``TrainNet`` every
    ``copy_step`` steps.

    Args:
        env: environment exposing ``reset()`` and ``step(action)``, where
            ``step`` returns ``(observation, reward, done, info)``.
        TrainNet: online network exposing ``get_action``, ``add_experience``
            and ``train``.
        TargetNet: target network exposing ``copy_weights``.
        epsilon: exploration rate forwarded to ``TrainNet.get_action``.
        copy_step: number of steps between target-network synchronisations.
        max_steps: hard cap on environment steps per episode.

    Returns:
        The sum of the rewards collected during the episode.

    Note:
        The cap is checked *before* stepping the environment. The previous
        version checked it after the step, so a capped episode ran
        ``max_steps + 1`` steps and silently dropped the last transition even
        though its reward had already been added to the total.
    """
    total_reward = 0
    steps = 0
    done = False
    observations = env.reset()

    while not done and steps < max_steps:
        # `observations` is a single "state" (the past 5 days, per the original author's note).
        action = TrainNet.get_action(observations, epsilon)
        action = convert_action(action, binary_action=True)

        prev_observations = observations
        observations, reward, done, _ = env.step(action)
        total_reward += reward    # sum of gain_net_worth

        exp = {'s': prev_observations, 'a': action, 'r': reward, 's2': observations, 'done': done}
        TrainNet.add_experience(exp)
        TrainNet.train(TargetNet)

        steps += 1
        if steps % copy_step == 0:
            TargetNet.copy_weights(TrainNet)

    # Leave the environment reset however the episode ended, exactly once.
    env.reset()
    return total_reward

def _train_agent(env, n_games):
    """Train a fresh DQN on `env` for `n_games` episodes and return the online network."""
    # Hyperparameters (unchanged from the original notebook).
    gamma = 0.9998
    copy_step = 25
    num_actions = 2                        # TODO: CHANGE THIS TO CONTINUOUS VALUES
    max_experiences = 1000
    min_experiences = 25
    lr = 1e-2
    epsilon = 0.9
    decay = 0.99
    min_epsilon = 0.1

    input_shape = env.observation_space.sample().shape

    # One timestamped TensorBoard run directory per agent.
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = 'logs/dqn/' + current_time
    summary_writer = tf.summary.create_file_writer(log_dir)

    TrainNet = DQN(input_shape, num_actions, hidden_units, gamma, max_experiences, min_experiences, batch_size, lr)
    TargetNet = DQN(input_shape, num_actions, hidden_units, gamma, max_experiences, min_experiences, batch_size, lr)
    print("\r DeepQ Networks successfully created",end="")
    total_rewards = np.empty(n_games)
    print("\r Starting training...",end="")
    try:
        for n in range(n_games):
            epsilon = max(min_epsilon, epsilon * decay)
            total_reward = play_game(env, TrainNet, TargetNet, epsilon, copy_step)
            total_rewards[n] = total_reward
            # Window of at most 100 episodes ending at n (the old slice held 101).
            avg_rewards = total_rewards[max(0, n - 99):(n + 1)].mean()
            with summary_writer.as_default():
                tf.summary.scalar('episode reward', total_reward, step=n)
                tf.summary.scalar('running avg reward(100)', avg_rewards, step=n)
            if n % 20 == 0:
                print("\r episode:", n, "eps:", epsilon, "avg reward (last 100):", avg_rewards,end="")
    finally:
        # Flush and release the event file even if training is interrupted.
        summary_writer.close()
    return TrainNet


def _validation_score(TrainNet):
    """Run the greedy policy of `TrainNet` over the validation data and return its mean edge over buy-and-hold."""
    df_val = pd.read_csv('./data/IBM_val.csv')
    # NOTE(review): the training frame is sorted by 'Date' but this one is not -- confirm the CSV is already ordered.

    # NOTE(review): the training env is built with in_log=in_log, this one with the
    # env's default -- confirm both produce observations on the same scale.
    env = StockTradingEnv(df_val)
    input_shape = env.observation_space.sample().shape
    observation = env.reset_to_day_one()

    # Float columns from the start, so the float assignments below do not upcast an int column.
    df_val['net_worth'] = 0.0
    df_val['Index'] = 0.0

    # 10000 is the initial account balance stated in the notebook text; the
    # benchmark "Index" is a buy-and-hold position of that size opened at row 5.
    initial_balance = 10000
    first_close = df_val.loc[5, "Close"]

    # NOTE(review): `done` from env.step is ignored; the loop bound assumes the
    # env can be stepped len(df_val) - 6 times -- confirm against StockTradingEnv.
    for i in range(len(df_val) - 6):
        # Each observation must be reshaped to a batch of one before prediction.
        batch = observation.reshape(1, input_shape[0], input_shape[1]).astype('float32')
        prediction = TrainNet.model.predict(batch)
        action = np.argmax(prediction[0])
        action = convert_action(action, binary_action=True)

        row = i + 5
        df_val.loc[row, "net_worth"] = env.net_worth
        df_val.loc[row, "Index"] = df_val.loc[row, "Close"] * initial_balance / first_close
        observation, reward, done, _ = env.step(action)

    env.close()

    # Computed once, after the whole run (it used to be recomputed on every
    # iteration and was undefined when the loop body never executed).
    diff = df_val["net_worth"].iloc[5:-1] - df_val["Index"].iloc[5:-1]
    return diff.mean()


def main():
    """Train `N_agent` independent DQN agents and keep the weights of the best one.

    Each agent is trained for `N_games` episodes on the training frame `df`,
    then scored on the validation CSV as the mean difference between the
    agent's net worth and a buy-and-hold benchmark. Whenever an agent beats
    the best score so far, its weights are written to
    ``save_models/<model_name>_best``.
    """
    # -inf rather than a finite sentinel, so the first finite score always gets saved.
    max_score = float("-inf")
    for k in range(N_agent):
        env = StockTradingEnv(df, in_log=in_log)
        try:
            TrainNet = _train_agent(env, N_games)
        finally:
            env.close()

        score = _validation_score(TrainNet)

        if score > max_score:
            max_score = score
            TrainNet.model.save_weights('save_models/{}_best'.format(model_name), save_format='tf')

        # Terminate the in-place "\r" progress line first; otherwise its tail
        # is left behind and gets glued onto the end of the summary line.
        print()
        print("\r agent {} of {}, score: {}, max_score: {}".format(k, N_agent, score, max_score))


# Start the trading

The reward of each step is the change in net worth between the beginning and the end of that step; the episode reward is the sum of these step rewards.
<br>
Initial account balance is 10,000
<br>
To see live results in TensorBoard, run `tensorboard --logdir logs/dqn` (each agent writes to its own timestamped `log_dir` under that folder).

In [4]:
main()

 agent 0 of 20, score: 272.9628052236537, max_score: 272.9628052236537302970322
 agent 1 of 20, score: 314.1417277596614, max_score: 314.14172775966147821790374
 agent 2 of 20, score: -32.88428496534297, max_score: 314.1417277596614613857545
 agent 3 of 20, score: 55.32870668865718, max_score: 314.1417277596614128712778
 agent 4 of 20, score: -84.01479353133927, max_score: 314.141727759661428712634
 agent 5 of 20, score: 191.57947851165426, max_score: 314.1417277596614792082676
 agent 6 of 20, score: -11.720845284342838, max_score: 314.14172775966141683142
 agent 7 of 20, score: -32.88428496534297, max_score: 314.1417277596614415841274
 agent 8 of 20, score: -92.5825205543429, max_score: 314.1417277596614745544512
 agent 9 of 20, score: -17.34973190634184, max_score: 314.141727759661481188187
 agent 10 of 20, score: -32.88428496534297, max_score: 314.14172775966145544167
 agent 11 of 20, score: -30.64948131834238, max_score: 314.1417277596614841598
 agent 12 of 20, score: -615.30843297