 This notebook builds an A2C (advantage actor-critic) model for stock trading.

In [1]:
import matplotlib.pyplot as plt
import os
import gym
import numpy as np
import tensorflow as tf
import tensorlayer as tl

In [2]:
import gym

import preprocessing_env_custom_with_vol 
import preprocessing_adding_technical_indicator

import pandas as pd
import numpy as np
import random
import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.layers as kl

In [3]:
# actor nn
class Actor(object):
    """Policy network: maps a state vector to one logit per discrete action.

    The network is a single hidden Dense layer (12 units, relu6) followed by
    a linear Dense layer with `action_dim` outputs.
    """

    def __init__(self, state_dim=6, action_dim=3, lr=0.001):
        """Build the TensorLayer model and its Adam optimizer.

        Args:
            state_dim: length of the state vector fed to the network.
            action_dim: number of discrete actions (size of the logit vector).
            lr: learning rate passed to the Adam optimizer.
        """
        inputs = tl.layers.Input([None, state_dim])
        hidden = tl.layers.Dense(n_units=12, act=tf.nn.relu6)(inputs)
        logits = tl.layers.Dense(n_units=action_dim)(hidden)

        # Wire the input and output tensors into a trainable model.
        self.model = tl.models.Model(inputs=inputs, outputs=logits)
        self.model.train()
        self.optimizer = tf.optimizers.Adam(lr)

    def learn(self, state, action, td_error):
        """Take one gradient step on the policy for a single transition.

        Args:
            state: state vector in which `action` was taken.
            action: index of the action that was taken.
            td_error: TD error from the critic, used as the reward weight in
                the cross-entropy reward loss.
        """
        batch = np.array([state])  # add a leading batch axis of size 1
        with tf.GradientTape() as tape:
            loss = tl.rein.cross_entropy_reward_loss(
                logits=self.model(batch), actions=[action], rewards=td_error)
        weights = self.model.trainable_weights
        gradients = tape.gradient(loss, weights)
        self.optimizer.apply_gradients(zip(gradients, weights))

    def get_action(self, state):
        """Return an action index chosen from the softmax of the logits.

        The choice is delegated to `tl.rein.choice_action_by_probs`, which
        receives the flattened probability vector.
        """
        logits = self.model(np.array([state]))
        probabilities = tf.nn.softmax(logits).numpy().ravel()
        return tl.rein.choice_action_by_probs(probabilities)


In [4]:
# critic nn
class Critic(object):
    """State-value network V(s) trained on the one-step TD error.

    The network is a single hidden Dense layer (12 units, relu) followed by a
    linear Dense layer with one output.
    """

    def __init__(self, state_dim=6, lr=0.01, gamma=None):
        """Build the TensorLayer model and its Adam optimizer.

        Args:
            state_dim: length of the state vector fed to the network.
            lr: learning rate passed to the Adam optimizer.
            gamma: discount factor used in `learn`. When left as None, the
                module-level `gamma` is looked up at call time, which keeps
                the original behaviour for existing callers.
        """
        input_layer = tl.layers.Input([None, state_dim])
        layer = tl.layers.Dense(n_units=12, act=tf.nn.relu)(input_layer)  # 12 = twice the default input dimension
        layer = tl.layers.Dense(n_units=1, act=None)(layer)  # single scalar value output

        self.model = tl.models.Model(inputs=input_layer, outputs=layer)
        self.model.train()
        self.optimizer = tf.optimizers.Adam(lr)
        self.gamma = gamma

    def learn(self, state, reward, state_, done):
        """Take one TD(0) gradient step and return the TD error.

        Args:
            state: state vector before the transition.
            reward: reward received for the transition.
            state_: state vector after the transition.
            done: truthy when the transition ended the episode; the bootstrap
                term is dropped in that case.

        Returns:
            The TD error tensor `reward + d * gamma * V(state_) - V(state)`
            computed before the weight update.
        """
        # Inside this method `gamma` refers to the module-level name; it is only
        # consulted when no discount was given to the constructor.
        discount = gamma if self.gamma is None else self.gamma
        d = 0 if done else 1

        # Evaluate the bootstrap target OUTSIDE the tape so it is treated as a
        # constant: only V(state) should receive gradients (semi-gradient TD).
        v_ = self.model(np.array([state_]))
        target = reward + d * discount * v_

        with tf.GradientTape() as tape:
            v = self.model(np.array([state]))
            td_error = target - v
            loss = tf.square(td_error)  # squared TD error
        grads = tape.gradient(loss, self.model.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_weights))
        return td_error


In [5]:
# Hyperparameters
LR_A = 0.001   # learning rate for the actor's optimizer
LR_C = 0.01    # learning rate for the critic's optimizer
decay = 0.005  # learning-rate decay (not referenced by the cells shown in this notebook)
gamma = 0.95   # discount factor, read by Critic.learn

# Load the data with technical indicators and wrap it in the trading environment.
data = preprocessing_adding_technical_indicator.add_technical_indicator()
env = preprocessing_env_custom_with_vol.StockEnv(data)

[ 1.00000e+03  0.00000e+00  0.00000e+00  2.62000e+02  0.00000e+00
 -6.66667e+01  4.69697e+01  3.16346e+05]


In [6]:
# Size both networks from the environment's observation and action spaces.
actor = Actor(
    state_dim=env.observation_space.shape[0],
    action_dim=env.action_space.n,
    lr=LR_A,
)
critic = Critic(state_dim=env.observation_space.shape[0], lr=LR_C)

[TL] Input  _inputlayer_1: [None, 6]
[TL] Dense  dense_1: 12 relu6
[TL] Dense  dense_2: 3 No Activation
[TL] Input  _inputlayer_2: [None, 6]
[TL] Dense  dense_3: 12 relu
[TL] Dense  dense_4: 1 No Activation


In [7]:
# def trade_time_unit(env, obs):  #  using greedy policy
#     action = actor.get_action(obs) # get the action
#     print(action)
#     next_state, reward, done, info = env.step(action)
#     return next_state, reward, done, info

In [8]:
def train_one_episode(max_steps=2100):
    """Run one episode, updating the critic and the actor after every step.

    Uses the module-level `env`, `actor` and `critic`.

    Args:
        max_steps: upper bound on the number of environment steps; the
            episode also stops as soon as the environment reports `done`.

    Returns:
        The sum of the rewards collected during the episode.
    """
    # Reset exactly once; the previous version reset the environment twice in a row.
    state = env.reset().astype(np.float32)
    reward_total = 0
    for _ in range(max_steps):
        action = actor.get_action(state)
        next_state, reward, done, info = env.step(action)
        reward_total = reward_total + reward
        next_state = next_state.astype(np.float32)
        # The critic's TD error doubles as the advantage signal for the actor.
        td_error = critic.learn(state, reward, next_state, done)
        actor.learn(state, action, td_error)
        state = next_state
        if done:
            print(state)
            break
    print("For this episode,reward total:{}".format(reward_total))
    return reward_total

In [30]:
# Display the environment's `money_memory` list
# (presumably the account value recorded at each step — confirm in StockEnv).
env.money_memory

[1000,
 1003.8168,
 1000.0,
 977.0992,
 961.8321,
 1000.0,
 980.916,
 996.1832,
 1000.0,
 980.916,
 965.6489,
 961.8321,
 965.6489,
 984.7328,
 954.1985,
 961.8321,
 946.5649,
 912.2137,
 912.2137,
 923.6641,
 927.4809,
 923.6641,
 935.1145,
 923.6641,
 923.6641,
 919.8473,
 927.4809,
 885.4962,
 904.5802,
 942.7481,
 927.4809,
 931.2977,
 893.1298,
 923.6641,
 938.9313,
 942.7481,
 931.2977,
 923.6641,
 904.5802,
 950.3817,
 935.1145,
 942.7481,
 919.8473,
 950.3817,
 923.6641,
 935.1145,
 935.1145,
 931.2977,
 927.4809,
 923.6641,
 919.8473,
 927.4809,
 919.8473,
 916.0305,
 935.1145,
 935.1145,
 942.7481,
 938.9313,
 954.1985,
 965.6489,
 969.4656,
 919.8473,
 931.2977,
 942.7481,
 942.7481,
 931.2977,
 908.3969,
 900.7634,
 896.9466,
 950.3817,
 946.5649,
 931.2977,
 946.5649,
 912.2137,
 923.6641,
 923.6641,
 912.2137,
 935.1145,
 904.5802,
 900.7634,
 912.2137,
 900.7634,
 896.9466,
 900.7634,
 923.6641,
 931.2977,
 916.0305,
 908.3969,
 931.2977,
 946.5649,
 965.6489,
 942.7481,

In [81]:
# Scratch array holding the values 1, 2, 3.
sta = np.array([1, 2, 3])

In [83]:
# Index 2 is the third element of the scratch array.
sta[2]

3

In [32]:
# Reset the environment and keep the initial observation as float32,
# the same cast train_one_episode applies.
state = env.reset().astype(np.float32)

In [33]:
# Ask the actor for an action index for the current state.
action = actor.get_action(state)

In [64]:
# Manual single step: apply the previously chosen action, make the resulting
# observation the current state, then pick and display the next action.
next_state, reward, done, info = env.step(action)
next_state = next_state.astype(np.float32)
# NOTE(review): `state` is overwritten here before any learn() call, so a later
# critic.learn(state, reward, next_state, done) in another cell receives
# state == next_state unless `state` is saved first.
state =next_state
action = actor.get_action(state)
action

1

In [72]:
# Display the most recent observation stored in `next_state`.
next_state

array([ 1.000000e+00,  2.430000e+02, -1.042200e+00, -1.044731e+02,
        2.580940e+01,  3.176570e+05], dtype=float32)

In [None]:
# One manual actor-critic update, the same four statements used inside the
# loop of train_one_episode: cast, critic step, actor step, advance the state.
next_state = next_state.astype(np.float32)
td_error = critic.learn(state,reward,next_state,done)
actor.learn(state,action,td_error)
state = next_state

In [73]:
# Cast the observation to float32 (values are unchanged if it already is float32).
next_state = next_state.astype(np.float32)

In [75]:
# Run one critic update on the current transition and display the TD error it returns.
td_error = critic.learn(state,reward,next_state,done)
td_error

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[3.990109]], dtype=float32)>

In [76]:
# Update the actor using the TD error produced by the critic.
actor.learn(state,action,td_error)

In [77]:
# Make the latest observation the current state and ask the actor for an action on it.
state =next_state
actor.get_action(next_state)

1

In [69]:
# `a` was not defined by any cell in this notebook, so this cell failed on a
# fresh kernel. It is assumed to have held the actor's logits for the current
# state (the displayed output has one row of three probabilities) — confirm.
a = actor.model(np.array([state]))
_prob = tf.nn.softmax(a).numpy()
_prob

array([[5.7747934e-10, 1.0000000e+00, 6.7969017e-09]], dtype=float32)

In [70]:
# Choose an action index from the flattened probability vector,
# the same call Actor.get_action makes.
tl.rein.choice_action_by_probs(_prob.ravel())

1

In [25]:
# Train for 100 episodes; every call prints that episode's total reward.
for episode in range(100):
    train_one_episode()

[1.00000e+00 1.77000e+02 5.20000e-02 8.72270e+00 5.68858e+01 3.12915e+05]
For this episode,reward total:-540.5825999999997
[1.00000e+00 1.77000e+02 5.20000e-02 8.72270e+00 5.68858e+01 3.12915e+05]
For this episode,reward total:-292.7492
[1.00000e+00 1.77000e+02 5.20000e-02 8.72270e+00 5.68858e+01 3.12915e+05]
For this episode,reward total:-394.6581000000001
[1.00000e+00 1.77000e+02 5.20000e-02 8.72270e+00 5.68858e+01 3.12915e+05]
For this episode,reward total:-323.8093
[1.00000e+00 1.77000e+02 5.20000e-02 8.72270e+00 5.68858e+01 3.12915e+05]
For this episode,reward total:-280.5167999999999
[1.00000e+00 1.77000e+02 5.20000e-02 8.72270e+00 5.68858e+01 3.12915e+05]
For this episode,reward total:-441.1003000000003
[1.00000e+00 1.77000e+02 5.20000e-02 8.72270e+00 5.68858e+01 3.12915e+05]
For this episode,reward total:-503.5845000000001
[1.00000e+00 1.77000e+02 5.20000e-02 8.72270e+00 5.68858e+01 3.12915e+05]
For this episode,reward total:-132.94479999999987
[1.00000e+00 1.77000e+02 5.20000e

KeyboardInterrupt: 

In [27]:
# Display the environment's `action_list`
# (presumably the actions taken so far in the current episode — confirm in StockEnv).
env.action_list

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


In [29]:
# Total of the values stored in the environment's `reward_memory`.
sum(env.reward_memory)
        

-358.7786

In [24]:
# NOTE(review): `count` is not assigned in any cell shown in this notebook; this
# cell relies on leftover kernel state and will fail after Restart & Run All.
count

71