 This part is trying to build an A2C model for trading.

In [1]:
import matplotlib.pyplot as plt
import os
import gym
import numpy as np
import tensorflow as tf
import tensorlayer as tl

In [2]:
import gym

import preprocess_env3_8features
import preprocessing_adding_technical_indicator

import pandas as pd
import numpy as np
import random
import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.layers as kl

In [3]:
# actor nn
class Actor(object):

    def __init__(self, state_dim =8, action_dim =3, lr=0.001):
        

        input_layer = tl.layers.Input([None, state_dim])
        layer = tl.layers.Dense(n_units=16, act=tf.nn.relu6)(input_layer)
        layer = tl.layers.Dense(n_units=action_dim)(layer)
#         input_layer =kl.Input(shape=state_dim)
#         layer1 = kl.Dense(12,activation ='elu')(input_layer)
#         layer2 = kl.Dense(3)(layer1)

        self.model = tl.models.Model(inputs=input_layer, outputs=layer)  # define the model in and out
#         self.model = keras.Model(inputs = [input_layer],outputs =[layer2])
        self.model.train()
        self.optimizer = tf.optimizers.Adam(lr)

    def learn(self, state, action, td_error):  # update weights according to cross_entropy
        with tf.GradientTape() as tape:
            _logits = self.model(np.array([state]))
            _exp_v = tl.rein.cross_entropy_reward_loss(
                logits=_logits, actions=[action], rewards=td_error)
        grad = tape.gradient(_exp_v, self.model.trainable_weights)
        self.optimizer.apply_gradients(zip(grad, self.model.trainable_weights))

    def get_action(self, state):  # option : greedy para to get the max prob
        _logits = self.model(np.array([state]))
        _prob = tf.nn.softmax(_logits).numpy()
        return tl.rein.choice_action_by_probs(_prob.ravel())


In [4]:
# critic nn
class Critic(object):

    def __init__(self, state_dim =8, lr=0.01):
        input_layer = tl.layers.Input([None, state_dim])
        layer = tl.layers.Dense(n_units=16, act=tf.nn.relu)(input_layer) # 12  is the twice of the input dimension
        layer = tl.layers.Dense(n_units=1, act=None)(layer)  # output one

        self.model = tl.models.Model(inputs=input_layer, outputs=layer)
        self.model.train()
        self.optimizer = tf.optimizers.Adam(lr)

    def learn(self, state, reward, state_, done):
        d = 0 if done else 1

        with tf.GradientTape() as tape:
            v = self.model(np.array([state]))
            v_ = self.model(np.array([state_]))  # new state  value
            td_error = reward + d * gamma * v_ - v
            loss = tf.square(td_error)  # MSE
        grads = tape.gradient(loss, self.model.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_weights))
        return td_error


In [5]:
data = preprocessing_adding_technical_indicator.add_technical_indicator()
env = preprocess_env3_8features.StockEnv(data)
LR_A = 0.001  # learning rate for actor
LR_C = 0.01  # learning rate for critic
decay = 0.005 # learning rate decay
gamma = 0.95 # discount factor

[ 1.00000e+03  0.00000e+00  0.00000e+00  2.62000e+02  0.00000e+00
 -6.66667e+01  4.69697e+01  3.16346e+05]


In [6]:
actor = Actor(env.observation_space.shape[0], env.action_space.n, lr=LR_A)
critic = Critic(env.observation_space.shape[0], lr=LR_C)

[TL] Input  _inputlayer_1: [None, 8]
[TL] Dense  dense_1: 16 relu6
[TL] Dense  dense_2: 3 No Activation
[TL] Input  _inputlayer_2: [None, 8]
[TL] Dense  dense_3: 16 relu
[TL] Dense  dense_4: 1 No Activation


In [7]:
# def trade_time_unit(env, obs):  #  using greedy policy
#     action = actor.get_action(obs) # get the action
#     print(action)
#     next_state, reward, done, info = env.step(action)
#     return next_state, reward, done, info

In [8]:
def train_one_episode():
    env.reset()
    state = env.reset().astype(np.float32)
    reward_total = 0
    for step in range(2100):
        action = actor.get_action(state)
        next_state, reward, done, info = env.step(action)
        reward_total = reward_total + reward
        next_state = next_state.astype(np.float32)
        td_error = critic.learn(state,reward,next_state,done)
        actor.learn(state,action,td_error)
        state = next_state
        if done:
            print(state)
            break
    print("For this episode,reward total:{}".format(reward_total))

In [9]:
train_one_episode()

[0.00000e+00 1.68000e+02 6.43900e+00 1.77000e+02 5.20000e-02 8.72270e+00
 5.68858e+01 3.12915e+05]
For this episode,reward total:126.82500000000005


In [10]:
for i in range(50):
    train_one_episode()

[0.00000e+00 1.80000e+02 4.54000e+00 1.77000e+02 5.20000e-02 8.72270e+00
 5.68858e+01 3.12915e+05]
For this episode,reward total:-205.4999999999999
[0.00000e+00 1.76000e+02 5.48400e+00 1.77000e+02 5.20000e-02 8.72270e+00
 5.68858e+01 3.12915e+05]
For this episode,reward total:-40.299999999999955
[0.00000e+00 1.70000e+02 5.37100e+00 1.77000e+02 5.20000e-02 8.72270e+00
 5.68858e+01 3.12915e+05]
For this episode,reward total:-60.075000000000045
[0.00000e+00 1.84000e+02 5.32100e+00 1.77000e+02 5.20000e-02 8.72270e+00
 5.68858e+01 3.12915e+05]
For this episode,reward total:-68.82499999999993
[9.10215e+02 0.00000e+00 0.00000e+00 1.77000e+02 5.20000e-02 8.72270e+00
 5.68858e+01 3.12915e+05]
For this episode,reward total:-89.78499999999997
[0.00000e+00 1.79000e+02 4.64000e+00 1.77000e+02 5.20000e-02 8.72270e+00
 5.68858e+01 3.12915e+05]
For this episode,reward total:-188.01600000000008
[0.00000e+00 1.70000e+02 3.40400e+00 1.77000e+02 5.20000e-02 8.72270e+00
 5.68858e+01 3.12915e+05]
For this e

In [16]:
for i in range(20):
    train_one_episode()

KeyboardInterrupt: 

In [17]:
env.action_list

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


In [29]:
sum(env.reward_memory)
        

-358.7786

In [24]:
count

71