In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import numpy as np
import pandas as pd
import os
import sys

sys.path.append('Enviroment')
from tf_trade_enviroment import MyTradeEnv

from tf_agents.environments import utils
from tf_agents.environments import tf_py_environment

from tf_agents.agents.dqn import dqn_agent
from tf_agents.networks import q_rnn_network
from tf_agents.utils import common

from learningClass import learningHelper



# Switch on TF 2.x behaviour first: the TensorFlow docs ask for this call to be
# made at the start of the program, before any tensors or ops are created
# (the tf.print call below already creates some).
tf.compat.v1.enable_v2_behavior()

# Fix the global TensorFlow seed so random ops are repeatable between runs.
tf.random.set_seed(12)

# Report the GPUs that TensorFlow can see.
tf.print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]


## Read OHLCV data from the file

In [2]:
# Read the price history and drop the 'Date' and 'Adj Close' columns in one chain.
df = pd.read_csv('test.csv').drop(columns=['Date', 'Adj Close'])

# Positional split: rows before `train_index` go to df_train, the rest to df_test.
train_index = 200
df_train, df_test = df.iloc[:train_index], df.iloc[train_index:]

# Preview the first two training rows.
df_train.head(2)

Unnamed: 0,High,Low,Open,Close,Volume
0,111.440002,107.349998,111.389999,109.330002,53204600.0
1,108.650002,105.410004,108.290001,106.25,64285500.0


## Create and validate environment

In [3]:
# Build a Python environment on df_test and run the TF-Agents validation
# helper on it for two episodes.
environment = MyTradeEnv(df_test)
utils.validate_py_environment(environment, episodes=2)

# Print the action spec, then each component of the time-step spec.
print('action_spec:', environment.action_spec())
for field in ('observation', 'step_type', 'discount', 'reward'):
    print(f'time_step_spec.{field}:', getattr(environment.time_step_spec(), field))



action_spec: BoundedArraySpec(shape=(), dtype=dtype('int32'), name='action', minimum=0, maximum=3)
time_step_spec.observation: {'price': BoundedArraySpec(shape=(20, 5), dtype=dtype('float32'), name='obs_price', minimum=0.0, maximum=3.4028234663852886e+38), 'pos': BoundedArraySpec(shape=(2,), dtype=dtype('int32'), name='obs_pos', minimum=0, maximum=1)}
time_step_spec.step_type: ArraySpec(shape=(), dtype=dtype('int32'), name='step_type')
time_step_spec.discount: BoundedArraySpec(shape=(), dtype=dtype('float32'), name='discount', minimum=0.0, maximum=1.0)
time_step_spec.reward: ArraySpec(shape=(), dtype=dtype('float32'), name='reward')


## Create training and validation environments

In [4]:
# One Python trading environment per data split, each wrapped as a TF environment.
train_py_env = MyTradeEnv(df_train)
train_env = tf_py_environment.TFPyEnvironment(train_py_env)

eval_py_env = MyTradeEnv(df_test)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

## Define training parameters

In [5]:
# Step size passed to the Adam optimizer when the agent is created.
learning_rate = 0.001

## Create q_network

In [6]:
# Layer sizes handed to QRnnNetwork: the input fully connected stack, the LSTM,
# and the output fully connected stack.
input_fc_layer_params = (40,)
lstm_size = (20,)
output_fc_layer_params = (20,)

# The environment's observation is a dictionary, so every entry gets its own
# preprocessing layer; the combiner concatenates the results along the last axis.
preprocessing_layers = dict(
    price=tf.keras.layers.Flatten(),
    pos=tf.keras.layers.Dense(2),
)
preprocessing_combiner = tf.keras.layers.Concatenate(axis=-1)

# Build the recurrent Q-network from the training environment's specs.
q_net_options = dict(
    preprocessing_layers=preprocessing_layers,
    preprocessing_combiner=preprocessing_combiner,
    input_fc_layer_params=input_fc_layer_params,
    lstm_size=lstm_size,
    output_fc_layer_params=output_fc_layer_params,
)
q_net = q_rnn_network.QRnnNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    **q_net_options)

## Create the DQN-agent

In [7]:
# Adam optimizer driven by the notebook-level learning rate.
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

# The global step is used as the agent's train-step counter.
global_step = tf.compat.v1.train.get_or_create_global_step()

# DQN agent built on the recurrent Q-network, with element-wise squared TD-error loss.
agent = dqn_agent.DqnAgent(
    time_step_spec=train_env.time_step_spec(),
    action_spec=train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=global_step,
)
agent.initialize()

# (Optional) Wrap the train step with common.function so it runs as a TF function.
agent.train = common.function(agent.train)

In [8]:
# Hand the environments, the agent and the step counter to the training helper.
magent = learningHelper(
    train_env=train_env,
    test_env=eval_env,
    agent=agent,
    global_step=global_step,
    collect_episodes=10,
    eval_interval=5,
    verbose=0,
    batch_size=64,
    train_sequence_length=4,
    chkpdir='./rnn_chkp/',
)

# Presumably reloads any state previously saved under `chkpdir` -- confirm in learningHelper.
magent.restore_check_point()

selected agent collect_policy


In [16]:
%%time
# Train through the helper and time the whole cell.
# NOTE(review): 600 is presumably the number of training iterations -- confirm against learningHelper.train_agent.
magent.train_agent(600)
# Disabled alternative entry point, kept for reference:
#magent.train_agent_with_avg_ret_condition(50, 10000, 100)

Traning Loss 158700.34375
Global steps 249: Traning Loss 118368.703125
Global steps 250: Traning Loss 120660.703125
Global steps 251: Traning Loss 83246.71875
Global steps 252: Traning Loss 87649.125
Global steps 253: Traning Loss 77563.2890625
Global steps 254: Traning Loss 130366.2734375
Global steps 255: Traning Loss 116801.796875
Global steps 256: Traning Loss 160684.6875
Global steps 257: Traning Loss 182066.15625
Global steps 258: Traning Loss 206565.28125
Global steps 259: Traning Loss 104983.953125
Global steps 260: Traning Loss 150390.71875
Global steps 261: Traning Loss 75692.34375
Global steps 262: Traning Loss 138266.625
Global steps 263: Traning Loss 89768.1171875
Global steps 264: Traning Loss 220452.375
Global steps 265: Traning Loss 121568.1015625
Global steps 266: Traning Loss 98701.234375
Global steps 267: Traning Loss 136979.46875
Global steps 268: Traning Loss 112515.84375
Global steps 269: Traning Loss 143790.484375
Global steps 270: Traning Loss 105747.4375
Global

In [9]:
# Run ten back-to-back evaluations (argument 80 each) so the spread of the
# reported average return is visible.
for _eval_round in range(10):
    magent.evaluate_agent(80)


eval episodes = 80: Average Return = 74.49990844726562
eval episodes = 80: Average Return = 83.89998626708984
eval episodes = 80: Average Return = 545.6998901367188
eval episodes = 80: Average Return = -88.49996185302734
eval episodes = 80: Average Return = 261.39990234375
eval episodes = 80: Average Return = -61.699981689453125
eval episodes = 80: Average Return = 349.20001220703125
eval episodes = 80: Average Return = -49.59981918334961
eval episodes = 80: Average Return = 188.2997589111328
eval episodes = 80: Average Return = 274.60015869140625


In [37]:
# Save a checkpoint through the helper.
# NOTE(review): presumably written under the `chkpdir` given at construction -- confirm in learningHelper.
magent.store_check_point()

In [16]:
# Reload state through the helper's checkpoint restore.
# NOTE(review): presumably this discards training progress made since the last store_check_point() -- confirm in learningHelper.
magent.restore_check_point()