Predict Long, Short or Stable position based on some indicators computed from historical price values.

- State : Features + Closing price
- Action : Pick Short, Long or Stable position
- Reward : (p1-p2)*position

Shortfalls:
- No temporal modeling
- No portfolio management
- Very basic reward function
- Avg. training reward fluctuates because the reward function uses instantaneous rate-of-return differences.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import numpy as np
# np.random.seed(1335)  # for reproducibility
np.set_printoptions(precision=5, suppress=True, linewidth=150)

import pandas as pd
from matplotlib import pyplot as plt

import reinforcement_learning.backtest as twp
from reinforcement_learning.utils import *

# --- Configuration ---
DATA_PATH = "../data/bitcoin-historical-data/coinbaseUSD_1-min_data_2014-12-01_to_2017-10-20.csv.csv"
TIME_GRAN = 60  # time granularity in minutes
# The two offsets below are used as `.iloc` slice bounds, which must be integers.
# Floor division keeps them integral on Python 3, where `/` would yield floats.
TRAIN_START = 0 // TIME_GRAN  # first row used for training; earlier rows are dropped
PRED_DAYS = 50  # num prediction days for the trained models
TEST_START = ((24 * 60) // TIME_GRAN) * PRED_DAYS  # number of trailing rows held out for testing
NUM_ACTIONS = 3  # number of discrete actions the agent chooses from

# --- Load the data and split it chronologically ---
df = read_data(DATA_PATH, TIME_GRAN)
split_row = df.shape[0] - TEST_START  # boundary between the train rows and the test rows
df_train = df.iloc[TRAIN_START:split_row]
df_test = df.iloc[split_row:]
print(df_train.shape)
print(df_test.shape)

Using TensorFlow backend.


Couldn't import dot_parser, loading of dot files will not be possible.
 > There are 24318 rows
(23118, 8)
(1200, 8)


In [4]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation 
from keras.layers.recurrent import LSTM
from keras.layers.normalization import BatchNormalization
from keras.optimizers import RMSprop, Adam, SGD

num_features = 7  # width of a single state vector fed to the network


# Q-network: two hidden blocks (Dense -> BatchNorm -> ReLU) followed by a linear
# head that emits one value per action.
# `kernel_initializer` is the Keras 2 name of the former `init` argument; the
# old keyword only works through the deprecated legacy interface.
model = Sequential()
model.add(Dense(32, kernel_initializer='lecun_uniform', input_shape=(num_features,)))
model.add(BatchNormalization())
model.add(Activation('relu'))

model.add(Dense(32, kernel_initializer='lecun_uniform'))
model.add(BatchNormalization())
model.add(Activation('relu'))

model.add(Dense(NUM_ACTIONS, kernel_initializer='uniform'))
model.add(Activation('linear'))

rms = RMSprop()  # alternative optimizer, not used by the compile call below
adam = Adam()
# sgd = SGD(lr=0.01, momentum=0.5)
# model.compile(loss='mse', optimizer=adam)
model.compile(loss=huber_loss, optimizer=adam)

  # This is added back by InteractiveShellApp.init_path()
  from ipykernel import kernelapp as app


In [5]:
import timeit

start_time = timeit.default_timer()

# set train and test data
train_data = df_train
test_data = df_test
start_timestep = 14  # index of the first state used in every epoch

# training arguments
epochs = 100
batch_size = 128
num_sim = 924
gamma = 0  # discount factor handed to optimize_DQN
epsilon = 0.99  # probability of taking a random action; decays as learning progresses
epsilon_min = 0.1  # exploration floor: epsilon is never decayed below this value

# per-time-step position record; read and returned by take_action
signal = pd.Series(index=np.arange(len(train_data)), data=[None]*len(train_data))

# replay memory holding (S, A, S', R) experiences
replay_memory = ReplayMemory(10000)

# let the agent learn
learning_progress = []  # average training reward of each epoch
sim_count = 0
for i in range(epochs):
    avg_sim_reward = AverageMeter()
    terminal_state = 0
    states, price_data = create_states(train_data)
    time_step = start_timestep
    state = states[time_step][None, :]  # add a leading batch axis for model.predict

    # walk through the training data until take_action reports a terminal state
    while terminal_state == 0:

        # We are in state S
        # 1- Pick the next action (epsilon-greedy over NUM_ACTIONS actions).
        #    Both draws use numpy's RNG so that a single np.random.seed call
        #    makes the exploration reproducible.
        if np.random.random() < epsilon:  # explore: random action
            action = np.random.randint(0, NUM_ACTIONS)
        else:  # exploit: best action from Q(s,a) values
            qval = model.predict(state, batch_size=1)
            action = np.argmax(qval)

        # 2- Take action, observe new state S'
        new_state, new_time_step, signal, terminal_state = take_action(states, action, signal, time_step)
        assert new_time_step - time_step == 1

        # 3- Compute the reward
        reward = get_pos_reward(new_state, new_time_step, action, price_data, signal, eval=False)

        # Update average simulation reward
        avg_sim_reward.update(reward)

        # Add new experience to replay memory
        replay_memory.push(state, action, new_state, reward)

        # Fit the network on experiences drawn from the replay memory
        optimize_DQN(model, replay_memory, batch_size, gamma)

        # One step ahead
        state = new_state
        time_step = new_time_step
        # Element-wise check: comparing a sum to zero could let offsetting
        # differences cancel out and hide a mismatch.
        assert not np.any(states[new_time_step] - new_state)

    # eval model on test data
    test_reward, action_count = evaluate_Q(test_data, model, get_pos_reward, )
    print(action_count)
    learning_progress.append(avg_sim_reward.avg)
    print("Epoch #: %s SimReward: %f Epsilon: %f TestReward: %f" % (i, avg_sim_reward.avg, epsilon, test_reward))

    # decay epsilon linearly, clamped so float drift cannot push it below the floor
    epsilon = max(epsilon_min, epsilon - 1.0 / epochs)

elapsed = np.round(timeit.default_timer() - start_time, decimals=2)
print("Completed in %f" % (elapsed,))

[[   -1.  1188.]
 [    0.     4.]
 [    1.     8.]]
Epoch #: 0 SimReward: -0.000085 Epsilon: 0.990000 TestReward: -1038.580078
[[   -1.     5.]
 [    0.     1.]
 [    1.  1194.]]
Epoch #: 1 SimReward: 0.000057 Epsilon: 0.980000 TestReward: 1102.100098
[[   -1.     4.]
 [    0.  1196.]]
Epoch #: 2 SimReward: 0.000032 Epsilon: 0.970000 TestReward: -24.850098
[[   -1.     5.]
 [    0.     8.]
 [    1.  1187.]]
Epoch #: 3 SimReward: 0.000012 Epsilon: 0.960000 TestReward: 1274.000488
[[   -1.     5.]
 [    0.     1.]
 [    1.  1194.]]
Epoch #: 4 SimReward: 0.000050 Epsilon: 0.950000 TestReward: 1102.100098
[[   -1.  1199.]
 [    0.     1.]]
Epoch #: 5 SimReward: 0.000015 Epsilon: 0.940000 TestReward: -1131.199707
[[   -1.     5.]
 [    0.  1195.]]
Epoch #: 6 SimReward: -0.000048 Epsilon: 0.930000 TestReward: -14.549805
[[   -1.     8.]
 [    0.     1.]
 [    1.  1191.]]
Epoch #: 7 SimReward: 0.000026 Epsilon: 0.920000 TestReward: 1064.399902
[[   -1.     3.]
 [    0.     2.]
 [    1.  1195.

[[    0.  1200.]]
Epoch #: 73 SimReward: -0.000006 Epsilon: 0.260000 TestReward: 0.000000
[[   -1.  1199.]
 [    0.     1.]]
Epoch #: 74 SimReward: -0.000018 Epsilon: 0.250000 TestReward: -1131.199707
[[   -1.  1199.]
 [    0.     1.]]
Epoch #: 75 SimReward: 0.000050 Epsilon: 0.240000 TestReward: -1131.199707
[[   -1.  1199.]
 [    0.     1.]]
Epoch #: 76 SimReward: 0.000045 Epsilon: 0.230000 TestReward: -1131.199707
[[    0.     1.]
 [    1.  1199.]]
Epoch #: 77 SimReward: 0.000014 Epsilon: 0.220000 TestReward: 1131.199707
[[    0.     1.]
 [    1.  1199.]]
Epoch #: 78 SimReward: 0.000097 Epsilon: 0.210000 TestReward: 1131.199707
[[    0.     1.]
 [    1.  1199.]]
Epoch #: 79 SimReward: -0.000020 Epsilon: 0.200000 TestReward: 1131.199707
[[    0.  1200.]]
Epoch #: 80 SimReward: -0.000021 Epsilon: 0.190000 TestReward: 0.000000
[[    0.     1.]
 [    1.  1199.]]
Epoch #: 81 SimReward: -0.000023 Epsilon: 0.180000 TestReward: 1131.199707
[[   -1.  1199.]
 [    0.     1.]]
Epoch #: 82 SimR

In [6]:
# Score the trained model on the held-out data, then show the total reward
# followed by the table returned alongside it.
reward, action_count = evaluate_Q(test_data, model, get_pos_reward)
for result in (reward, action_count):
    print(result)

1131.19970703
[[    0.     1.]
 [    1.  1199.]]


In [None]:
# Build the state array and the matching price data for the held-out test set.
# NOTE(review): the effect of `test=True` is defined inside the imported
# `create_states` helper and is not visible here; confirm before relying on it.
states, price_data = create_states(test_data, test=True)

In [None]:
# Greedy roll-out of the trained network over the test set.
# The helper calls follow the same signatures as the training cell:
# create_states returns (states, price_data) and take_action takes
# (states, action, signal, time_step).
signal = pd.Series(index=np.arange(len(test_data)), dtype=float)  # NaN until a position is recorded
states, price_data = create_states(test_data, test=True)

time_step = 1  # NOTE(review): the training cell starts at start_timestep; confirm which is intended here
state = states[time_step][None, :]  # add a leading batch axis for model.predict
terminal_state = 0
avg_test_reward = AverageMeter()
while terminal_state == 0:
    # We are in state S
    # 1- Compute the next action based on state S with the network
    qval = model.predict(state, batch_size=1)
    print(qval)
    action = np.argmax(qval)
    # 2- Take action, observe new state S'
    new_state, time_step, signal, terminal_state = take_action(states, action, signal, time_step)

    # 3- Per-step reward tracking is currently disabled
#     reward = get_reward(new_state, time_step, action, close_prices, signal, eval=False)
#     avg_test_reward.update(reward)
    state = new_state
#     print(" # Average test reward: {}".format(avg_test_reward.avg))

final_reward = get_pos_reward(new_state, time_step, action, price_data, signal, eval=True)
print(" # Final test reward: {}".format(final_reward))

In [None]:
# Run the backtest on the recorded signal and tabulate how often each
# position value occurs.
price_data = pd.Series(price_data)
bt = twp.Backtest(price_data, signal, signalType='shares')
# Step-to-step change of the 'shares' column; the first row has no predecessor, so it becomes 0.
bt.data['delta'] = bt.data['shares'].diff().fillna(0)

# Drop NaN entries (NaN != NaN). A list is built explicitly because on
# Python 3 `filter` returns a lazy iterator, which np.unique cannot consume.
recorded_positions = [v for v in signal.values if v == v]
unique, counts = np.unique(recorded_positions, return_counts=True)
print(np.asarray((unique, counts)).T)

In [None]:
# Summary figure: three panels stacked vertically on one tall canvas.
n_rows, n_cols = 3, 1
plt.figure(figsize=[20, 40])

# Top panel: trades drawn by the backtest object.
plt.subplot(n_rows, n_cols, 1)
bt.plotTrades()

# Middle panel: the backtest's pnl series.
plt.subplot(n_rows, n_cols, 2)
bt.pnl.plot(style='x-')

# Bottom panel: average training reward recorded for each epoch.
plt.subplot(n_rows, n_cols, 3)
plt.plot(learning_progress)

summary_png = 'reinforcement_learning/plt/summary' + '.png'
plt.savefig(summary_png, bbox_inches='tight', pad_inches=1, dpi=72)
# plt.show()