# **Import library**

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import deque
import random
import seaborn as sns
sns.set()

  import pandas.util.testing as tm


# **Read stock dataset**

In [2]:
# Using dataset from txt file
data = pd.read_csv('googl.us.txt')
data

Unnamed: 0,Date,Open,High,Low,Close,Volume,OpenInt
0,2004-08-19,50.000,52.03,47.980,50.170,44703800,0
1,2004-08-20,50.505,54.54,50.250,54.155,22857200,0
2,2004-08-23,55.375,56.74,54.525,54.700,18274400,0
3,2004-08-24,55.620,55.80,51.785,52.435,15262600,0
4,2004-08-25,52.480,54.00,51.940,53.000,9197800,0
...,...,...,...,...,...,...,...
3328,2017-11-06,1049.100,1052.59,1042.000,1042.680,913954,0
3329,2017-11-07,1049.650,1053.41,1043.000,1052.390,1303832,0
3330,2017-11-08,1050.050,1062.69,1047.050,1058.290,1214469,0
3331,2017-11-09,1048.000,1050.88,1035.850,1047.720,1793994,0


To benchmark with the paper, we will use the data stock price of Google from 01-Jan-2015 to 10-November-2017. Since 01-Jan-2015 was public holiday and no trading on that day, therefore, price at 02-Jan-2015 is used as starting point for training. The **training set** is from date 02-January-2015 until 31-December-2016 and total **504 sample points**. Period of time from 01-January-2017 until 10-November-2017 as **testing set** which has **218 samples**.

In [3]:
def get_index(date):
  for no, i in enumerate(data.Date):
    if i == date:
      index = no
  return index

In [4]:
# get index from training and testing period
train_start_index = get_index('2015-01-02')
train_end_index = get_index('2016-12-30')
test_start_index = get_index('2017-01-03')
test_end_index = get_index('2017-11-10')

train_set = data[train_start_index:train_end_index + 1]
test_set = data[test_start_index:test_end_index + 1]

In [5]:
train_set

Unnamed: 0,Date,Open,High,Low,Close,Volume,OpenInt
2611,2015-01-02,532.60,535.800,527.880,529.55,1327665,0
2612,2015-01-05,527.15,527.990,517.750,519.46,2057089,0
2613,2015-01-06,520.50,521.210,505.550,506.64,2731728,0
2614,2015-01-07,510.95,511.490,503.650,505.15,2345823,0
2615,2015-01-08,501.16,507.500,495.020,506.91,3662169,0
...,...,...,...,...,...,...,...
3110,2016-12-23,808.01,810.970,805.110,807.80,750685,0
3111,2016-12-27,808.68,816.000,805.800,809.93,946336,0
3112,2016-12-28,813.33,813.330,802.440,804.57,1159794,0
3113,2016-12-29,802.33,805.750,798.144,802.83,998255,0


In [6]:
test_set

Unnamed: 0,Date,Open,High,Low,Close,Volume,OpenInt
3115,2017-01-03,800.62,811.435,796.89,808.01,1932677,0
3116,2017-01-04,809.89,813.430,804.11,807.77,1486687,0
3117,2017-01-05,807.50,813.740,805.92,813.02,1305062,0
3118,2017-01-06,814.99,828.960,811.50,825.21,1975843,0
3119,2017-01-09,826.37,830.430,821.62,827.18,1365344,0
...,...,...,...,...,...,...,...
3328,2017-11-06,1049.10,1052.590,1042.00,1042.68,913954,0
3329,2017-11-07,1049.65,1053.410,1043.00,1052.39,1303832,0
3330,2017-11-08,1050.05,1062.690,1047.05,1058.29,1214469,0
3331,2017-11-09,1048.00,1050.880,1035.85,1047.72,1793994,0


In [7]:
dataset = tf.data.Dataset.from_tensor_slices(train_set.Close.values.tolist())
dataset

<TensorSliceDataset shapes: (), types: tf.float32>

In [8]:
for elem in dataset:
  print(elem.numpy())

529.55
519.46
506.64
505.15
506.91
500.72
497.06
501.8
505.93
504.01
510.45
509.94
520.39
537.3
541.95
536.72
521.19
512.43
513.23
537.55
532.2
533.3
526.1
529.83
533.88
529.28
540.16
538.0
546.01
551.16
545.01
542.65
546.45
541.8
535.0
538.65
547.33
559.29
562.63
575.02
578.79
578.33
581.43
572.9
574.1
559.85
555.69
561.36
553.0
561.64
557.61
566.16
563.67
564.95
565.37
577.54
567.0
563.64
557.55
561.14
554.7
549.49
541.31
543.95
544.86
548.84
548.02
548.54
548.64
539.78
541.04
543.52
532.74
544.53
542.92
549.18
557.46
573.66
566.12
564.37
561.39
548.77
551.16
552.84
543.04
535.08
542.04
548.95
545.78
538.73
539.49
549.2
546.49
546.67
549.28
552.51
556.81
554.52
547.19
554.25
554.18
545.32
549.21
553.95
555.29
551.69
549.53
543.48
542.16
552.6
550.04
547.47
543.0
544.87
546.6
556.18
557.52
559.68
563.39
558.57
557.95
553.06
541.25
540.04
543.3
547.34
545.62
550.03
541.7
544.65
556.11
571.73
584.18
583.96
601.78
699.62
692.84
695.35
695.1
674.73
654.77
658.27
659.66
661.43
664.56
657.5

In [None]:
class DQNAgent:
    def __init__(self, state_size, window_size, trend, skip, batch_size):
        self.state_size = state_size
        self.window_size = window_size
        self.half_window = window_size // 2
        self.trend = trend
        self.skip = skip
        self.action_size = 3
        self.batch_size = batch_size
        self.memory = deque(maxlen = 1000)
        self.inventory = []

        self.gamma = 0.95
        self.epsilon = 0.5
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.999


        feed = tf.keras.dense(self.X, 256, activation = tf.nn.relu)
        self.logits = tf.layers.dense(feed, self.action_size)
        self.cost = tf.reduce_mean(tf.square(self.Y - self.logits))
        self.optimizer = tf.train.GradientDescentOptimizer(1e-5).minimize(
            self.cost
        )
        self.sess.run(tf.global_variables_initializer())

    def act(self, state):
        if random.random() <= self.epsilon:
            return random.randrange(self.action_size)
        return np.argmax(
            self.sess.run(self.logits, feed_dict = {self.X: state})[0]
        )
    
    def get_state(self, t):
        window_size = self.window_size + 1
        d = t - window_size + 1
        block = self.trend[d : t + 1] if d >= 0 else -d * [self.trend[0]] + self.trend[0 : t + 1]
        res = []
        for i in range(window_size - 1):
            res.append(block[i + 1] - block[i])
        return np.array([res])

    def replay(self, batch_size):
        mini_batch = []
        l = len(self.memory)
        for i in range(l - batch_size, l):
            mini_batch.append(self.memory[i])
        replay_size = len(mini_batch)
        X = np.empty((replay_size, self.state_size))
        Y = np.empty((replay_size, self.action_size))
        states = np.array([a[0][0] for a in mini_batch])
        new_states = np.array([a[3][0] for a in mini_batch])
        Q = self.sess.run(self.logits, feed_dict = {self.X: states})
        Q_new = self.sess.run(self.logits, feed_dict = {self.X: new_states})
        for i in range(len(mini_batch)):
            state, action, reward, next_state, done = mini_batch[i]
            target = Q[i]
            target[action] = reward
            if not done:
                target[action] += self.gamma * np.amax(Q_new[i])
            X[i] = state
            Y[i] = target
        cost, _ = self.sess.run(
            [self.cost, self.optimizer], feed_dict = {self.X: X, self.Y: Y}
        )
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        return cost
    
    def buy(self, initial_money):
        starting_money = initial_money
        states_sell = []
        states_buy = []
        inventory = []
        state = self.get_state(0)
        for t in range(0, len(self.trend) - 1, self.skip):
            action = self.act(state)
            next_state = self.get_state(t + 1)
            
            if action == 1 and initial_money >= self.trend[t] and t < (len(self.trend) - self.half_window):
                inventory.append(self.trend[t])
                initial_money -= self.trend[t]
                states_buy.append(t)
                print('day %d: buy 1 unit at price %f, total balance %f'% (t, self.trend[t], initial_money))
                
                
            elif action == 2 and len(inventory):
                bought_price = inventory.pop(0)
                initial_money += self.trend[t]
                states_sell.append(t)
                try:
                    invest = ((close[t] - bought_price) / bought_price) * 100
                except:
                    invest = 0
                print(
                    'day %d, sell 1 unit at price %f, investment %f %%, total balance %f,'
                    % (t, close[t], invest, initial_money)
                )
            
            state = next_state
        invest = ((initial_money - starting_money) / starting_money) * 100
        total_gains = initial_money - starting_money
        return states_buy, states_sell, total_gains, invest
        
    def train(self, iterations, checkpoint, initial_money):
        for i in range(iterations):
            total_profit = 0
            inventory = []
            state = self.get_state(0)
            starting_money = initial_money
            for t in range(0, len(self.trend) - 1, self.skip):
                action = self.act(state)
                next_state = self.get_state(t + 1)
                
                if action == 1 and starting_money >= self.trend[t] and t < (len(self.trend) - self.half_window):
                    inventory.append(self.trend[t])
                    starting_money -= self.trend[t]
                
                elif action == 2 and len(inventory) > 0:
                    bought_price = inventory.pop(0)
                    total_profit += self.trend[t] - bought_price
                    starting_money += self.trend[t]
                    
                invest = ((starting_money - initial_money) / initial_money)
                self.memory.append((state, action, invest, 
                                    next_state, starting_money < initial_money))
                state = next_state
                batch_size = min(self.batch_size, len(self.memory))
                cost = self.replay(batch_size)
            if (i+1) % checkpoint == 0:
                print('epoch: %d, total rewards: %f.3, cost: %f, total money: %f'%(i + 1, total_profit, cost,
                                                                                  starting_money))

In [None]:
close = data.Close.values.tolist()
initial_money = 10000
window_size = 30
skip = 1
batch_size = 32
agent = DQNAgent(state_size = window_size, 
              window_size = window_size, 
              trend = close, 
              skip = skip, 
              batch_size = batch_size)
agent.train(iterations = 200, checkpoint = 10, initial_money = initial_money)

Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.
epoch: 10, total rewards: 182.010000.3, cost: 0.594407, total money: 10182.010000
epoch: 20, total rewards: 22.143000.3, cost: 0.575513, total money: 10022.143000
epoch: 30, total rewards: 49.528000.3, cost: 0.278911, total money: 10049.528000
epoch: 40, total rewards: 171.943000.3, cost: 0.260734, total money: 10171.943000
epoch: 50, total rewards: 332.648000.3, cost: 0.421622, total money: 10332.648000
epoch: 60, total rewards: 261.710000.3, cost: 0.175892, total money: 10261.710000


KeyboardInterrupt: ignored