# Algorithmic Trading Using Deep Q-Learning

In [None]:
import time
import copy
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
from plotly import tools
from plotly.graph_objs import *
from plotly.offline import init_notebook_mode, iplot, iplot_mpl

In [None]:
# Colab-only step: mount Google Drive at /content/drive so that files stored
# on Drive can be read through the local filesystem by later cells.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Loading Dataset

In [None]:
# Read the price CSV, convert the 'Date' column to datetimes and use it as
# the index, all in a single chain so `data` is only bound once.
data = (
    pd.read_csv('drive/My Drive/MSFT.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
)

# Show the covered date range, then preview the first rows.
print(data.index.min(), data.index.max())
data.head()

2010-01-04 00:00:00 2019-12-31 00:00:00


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-04,30.620001,31.1,30.59,30.950001,24.294369,38409100
2010-01-05,30.85,31.1,30.639999,30.959999,24.302216,49749600
2010-01-06,30.879999,31.08,30.52,30.77,24.15307,58182400
2010-01-07,30.629999,30.700001,30.190001,30.450001,23.901886,50559700
2010-01-08,30.280001,30.879999,30.24,30.66,24.066734,51197400


In [None]:
# Chronological train/test split.
# Label-based slicing on a DatetimeIndex (data[:d] / data[d:]) is inclusive on
# both ends, so a row dated exactly `date_split` would end up in BOTH sets.
# Boolean masks make the two partitions disjoint: everything strictly before
# the split date is training data, the split date and later is test data.
date_split = '2016-01-01'
split_ts = pd.Timestamp(date_split)
train = data.loc[data.index < split_ts]
test = data.loc[data.index >= split_ts]
len(train), len(test)

(1510, 1006)

**MDP Environment**

In [None]:
class Environment1:
    """Single-asset trading environment driven by the 'Close' column of `data`.

    Observation: ``[position_value] + history`` where ``history`` holds the
    last ``history_t`` day-over-day differences of the closing price and
    ``position_value`` is the unrealised profit of all open positions.

    Actions: 0 = stay, 1 = buy one unit at the current close,
    2 = sell every open position at the current close.

    Parameters
    ----------
    data : pandas.DataFrame
        Price table with a 'Close' column, accessed positionally.
    history_t : int
        Number of past price differences kept in the observation.
    cash_in_hand : number
        Starting cash balance; restored on every ``reset()``.
    """

    def __init__(self, data, history_t=90, cash_in_hand=1000):
        self.data = data
        self.history_t = history_t
        # Remember the starting balance so that reset() can restore it;
        # otherwise cash would leak from one episode into the next.
        self.initial_cash = cash_in_hand
        self.cash_in_hand = cash_in_hand
        self.reset()

    def reset(self):
        """Start a new episode and return the initial observation."""
        self.t = 0
        self.done = False
        self.profits = 0
        self.positions = []
        self.position_value = 0
        self.cash_in_hand = self.initial_cash
        self.history = [0 for _ in range(self.history_t)]
        return [self.position_value] + self.history  # obs

    def _close(self, t):
        """Closing price at positional row `t`."""
        return self.data['Close'].iloc[t]

    def step(self, act):
        """Apply `act` at the current time step and advance by one row.

        Returns
        -------
        (obs, reward, cash_in_hand, done)
            `reward` is clipped to {-1, 0, 1}.

        Raises
        ------
        IndexError
            If called when there is no next row (episode already finished).
        """
        # Fail before touching any state instead of half-applying the action
        # and then crashing on the out-of-range row lookup.
        if self.t >= len(self.data) - 1:
            raise IndexError(
                "step() called past the last row; call reset() to start a new episode"
            )

        # Every step starts from a -1 baseline; after clipping, only a sell
        # whose profit exceeds 1 yields a positive reward.
        reward = -1
        price = self._close(self.t)

        # act = 0: stay, 1: buy, 2: sell
        if act == 1:
            if self.cash_in_hand < price:
                # Cannot afford the purchase: penalise and do NOT open a position.
                reward = -100
            else:
                self.positions.append(price)
                self.cash_in_hand -= price

        elif act == 2:
            if len(self.positions) == 0:
                # Nothing to sell.
                reward = -100
            else:
                profits = sum(price - bought_at for bought_at in self.positions)
                self.cash_in_hand += price * len(self.positions)
                reward += profits
                self.profits += profits
                self.positions = []

        # set next time
        self.t += 1
        next_price = self._close(self.t)

        # Unrealised profit of the remaining open positions at the new price.
        self.position_value = sum(next_price - bought_at for bought_at in self.positions)

        # Slide the price-difference window forward by one day.
        self.history.pop(0)
        self.history.append(next_price - price)

        if self.t == len(self.data) - 1:
            self.done = True

        # clipping reward
        if reward > 0:
            reward = 1
        elif reward < 0:
            reward = -1

        return [self.position_value] + self.history, reward, self.cash_in_hand, self.done  # obs, reward, cash, done
    

In [None]:
# Training environment built on the training partition (default window and cash).
env = Environment1(data=train)

**<h2>Deep Q-Network</h2>**<br>
The model is a 3-layer fully connected neural network with `hidden_size = 100` (a hyperparameter).

In [None]:
#def train_dqn(env):
class Q_Network(nn.Module):
    """Fully connected network mapping an observation to one Q-value per action.

    Parameters
    ----------
    obs_len : int
        Length of the flattened observation vector.
    hidden_size : int
        Width of the two hidden layers.
    actions_n : int
        Number of discrete actions (size of the output).
    """

    def __init__(self, obs_len, hidden_size, actions_n):
        super(Q_Network, self).__init__()

        # The output layer is deliberately linear: Q-values are unbounded
        # return estimates that are regressed onto r + gamma * max Q'.
        # A softmax here would force them onto a probability simplex
        # (all in [0, 1], summing to 1), making those targets unreachable.
        self.fc_val = nn.Sequential(
            nn.Linear(obs_len, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, actions_n),
        )

    def forward(self, x):
        """Return Q-values of shape (batch, actions_n) for input (batch, obs_len)."""
        return self.fc_val(x)

**Setting up parameters**

In [None]:
# Runtime / optimiser settings.
USE_CUDA = False
LR = 0.001

# Network dimensions: the observation is the position value followed by the
# env's price-difference history; there is one output per action.
hidden_size = 100
output_size = 3
input_size = 1 + env.history_t

In [None]:
# Online network that is trained and used for action selection.
Q = Q_Network(obs_len=input_size, hidden_size=hidden_size, actions_n=output_size)

In [None]:
# Target network: an independent deep copy of Q, used to evaluate next-state
# values in the training loop and re-copied from Q there periodically.
Q_ast = copy.deepcopy(Q)

In [None]:
# Move BOTH networks to the GPU when requested. Q_ast was deep-copied from Q
# before this point, so moving only Q would leave the target network on the CPU.
if USE_CUDA:
    Q = Q.cuda()
    Q_ast = Q_ast.cuda()

# Mean-squared error between predicted Q-values and their Bellman targets.
loss_function = nn.MSELoss()

# Define the optimizer (created after any device move so it tracks the final parameters).
optimizer = optim.Adam(Q.parameters(), lr=LR)

 **<h2>Deep Q-learning</h2>**<br>

---
 
1.   Initialize replay memory capacity
2.   Initialize the network with random weights
3.   For each time step: 

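>*   Select an action (via exploration or exploitation)
>*   Execute the selected action in the emulator
>*   Observe the reward and the next state
>*   Store the experience in replay memory
>*   Sample random mini-batches from replay memory
>*   Compute the target $r + \gamma \max_{a'} Q_{\text{target}}(s', a')$ and minimise the MSE loss against $Q(s, a)$
>*   Periodically copy the online network's weights into the target network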

















In [None]:
# Training schedule.
epoch_num = 50                    # passes over the training environment
step_max = len(env.data) - 1      # upper bound on steps per epoch

# Experience replay.
memory_size = 200                 # capacity of the replay buffer
batch_size = 50                   # transitions per gradient step

# Discount factor applied to next-state values.
gamma = 0.97

In [None]:
# Mutable training state.
memory = []          # replay memory
total_step = 0       # steps counted across all epochs
total_rewards = []
total_losses = []

# Epsilon-greedy exploration schedule.
epsilon = 1.0                 # exploration rate
epsilon_min = 0.1
epsilon_decrease = 0.001
start_reduce_epsilon = 200    # total_step threshold after which epsilon decays

# Update cadence (measured in total_step) and logging cadence (in epochs).
train_freq = 10
update_q_freq = 20
show_log_freq = 5

gamma = 0.97         # discount rate

In [None]:
# Deep Q-learning training loop: epsilon-greedy acting, experience replay and
# a periodically synchronised target network (Q_ast).

# Run everything on whichever device Q lives on; make sure the target
# network is there too (no-op when it already is).
device = next(Q.parameters()).device
Q_ast.to(device)

start = time.time()
for epoch in range(epoch_num):

    pobs = env.reset()
    step = 0
    done = False
    total_reward = 0
    total_loss = 0

    while not done and step < step_max:

        # select act using exploration
        pact = int(np.random.randint(3))

        # select act using exploitation
        if np.random.rand() > epsilon:
            state = torch.from_numpy(np.array(pobs, dtype=np.float32).reshape(1, -1)).to(device)
            with torch.no_grad():  # acting never needs gradients
                pact = int(Q(state).argmax(dim=1).item())

        # act
        obs, reward, cash_in_hand, done = env.step(pact)

        # add memory (oldest transition is evicted once the buffer is full)
        memory.append((pobs, pact, reward, obs, cash_in_hand, done))
        if len(memory) > memory_size:
            memory.pop(0)

        # train or update q -- only once the replay buffer is full
        if len(memory) == memory_size:
            if total_step % train_freq == 0:
                # Shuffle indices rather than the transitions themselves: the
                # tuples are ragged (lists next to scalars) and cannot be
                # packed into a regular NumPy array.
                order = np.random.permutation(len(memory))
                for i in range(0, len(order), batch_size):
                    batch = [memory[k] for k in order[i:i + batch_size]]
                    n_batch = len(batch)

                    b_pobs = torch.from_numpy(
                        np.array([m[0] for m in batch], dtype=np.float32).reshape(n_batch, -1)
                    ).to(device)
                    b_pact = torch.from_numpy(
                        np.array([m[1] for m in batch], dtype=np.int64)
                    ).to(device)
                    b_reward = torch.from_numpy(
                        np.array([m[2] for m in batch], dtype=np.float32)
                    ).to(device)
                    b_obs = torch.from_numpy(
                        np.array([m[3] for m in batch], dtype=np.float32).reshape(n_batch, -1)
                    ).to(device)
                    b_done = torch.from_numpy(
                        np.array([m[5] for m in batch], dtype=np.float32)
                    ).to(device)

                    q = Q(b_pobs)
                    with torch.no_grad():
                        # Greedy next-state value from the frozen target network.
                        maxq = Q_ast(b_obs).max(dim=1).values
                        # Only the taken action's entry differs from the
                        # prediction, so the loss gradient flows through it alone.
                        target = q.detach().clone()
                        rows = torch.arange(n_batch, device=device)
                        target[rows, b_pact] = b_reward + gamma * maxq * (1.0 - b_done)  # Bellman equation

                    optimizer.zero_grad()            # clear the previous gradients
                    loss = loss_function(q, target)  # compute loss
                    total_loss += loss.item()
                    loss.backward()                  # compute gradients
                    optimizer.step()                 # adjust weights

            if total_step % update_q_freq == 0:
                Q_ast = copy.deepcopy(Q)

        # update epsilon (every step, independent of the buffer state)
        if epsilon > epsilon_min and total_step > start_reduce_epsilon:
            epsilon -= epsilon_decrease

        # next step -- must run on every step so the observation and the
        # counters advance even while the replay buffer is still filling
        total_reward += reward
        pobs = obs
        step += 1
        total_step += 1

    # One entry per epoch.
    total_rewards.append(total_reward)
    total_losses.append(total_loss)

    if (epoch+1) % show_log_freq == 0:
        # Average over the most recent `show_log_freq` epochs.
        log_reward = sum(total_rewards[-show_log_freq:])/show_log_freq
        log_loss = sum(total_losses[-show_log_freq:])/show_log_freq
        elapsed_time = time.time()-start
        print('\t'.join(map(str, [epoch+1, epsilon, total_step, log_reward, log_loss, elapsed_time])))
        start = time.time()
            
#return Q, total_losses, total_rewards

**Testing**

In [None]:
# Greedy (no exploration) rollout of the trained network on the test period.
test_env = Environment1(test)
pobs = test_env.reset()
test_acts = []
test_rewards = []
current_cash_in_hand = []

# Feed observations on whichever device the network lives on.
device = next(Q.parameters()).device

for _ in range(len(test_env.data)-1):

    state = torch.from_numpy(np.array(pobs, dtype=np.float32).reshape(1, -1)).to(device)
    with torch.no_grad():  # pure inference: no autograd graph needed
        pact = int(Q(state).argmax(dim=1).item())
    test_acts.append(pact)

    obs, reward, cash_in_hand, done = test_env.step(pact)
    test_rewards.append(reward)
    current_cash_in_hand.append(cash_in_hand)

    pobs = obs

# Total realised profit accumulated by the environment over the rollout.
test_profits = test_env.profits

In [None]:
# Display the realised profit the environment accumulated during the test rollout.
test_profits

801.5499329999996