<a href="https://colab.research.google.com/github/Amanj5486/Finance_ML/blob/main/Reinforcement_Learning_of_Stocks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%matplotlib inline

import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
import tensorflow as tf




In [None]:
# True when matplotlib reports an "inline" backend; only then is IPython's
# display helper imported (it is used to clear cell output between redraws).
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display


In [None]:
class DQN(nn.Module):
  """Fully connected Q-network: input -> 64 -> 32 -> 8 -> one Q-value per action.

  Args:
    input_size: number of features per sample after flattening every
      dimension except the batch dimension.
    num_actions: number of output Q-values. Defaults to 3, the value that
      was previously hard-coded.
  """
  def __init__(self,input_size,num_actions=3):
    super().__init__()
    self.fc1 = nn.Linear(in_features=input_size,out_features=64)
    self.fc2 = nn.Linear(in_features=64,out_features=32)
    self.fc3 = nn.Linear(in_features=32,out_features=8)
    self.out = nn.Linear(in_features=8,out_features=num_actions)
  def forward(self,t):
    """Return Q-values of shape (batch, num_actions) for a batch of inputs."""
    # Collapse everything but the batch dimension so fc1 sees (batch, input_size).
    t = t.flatten(start_dim=1)
    t = F.relu(self.fc1(t))
    t = F.relu(self.fc2(t))
    t = F.relu(self.fc3(t))
    # No activation on the output layer: Q-values are unbounded.
    return self.out(t)


In [None]:
# One replay-memory transition; field order is state, action, next_state, reward.
Experience = namedtuple('Experience', ['state', 'action', 'next_state', 'reward'])

In [None]:
class ReplayMemory():
  """Fixed-capacity store of experiences that overwrites the oldest slot when full."""
  def __init__(self,capacity):
    self.capacity = capacity
    self.memory = []
    # Counts overwrites only; it is not advanced while the buffer is still filling.
    self.push_count = 0
  def push(self,experience):
    """Append while there is room, otherwise overwrite slots in round-robin order."""
    if len(self.memory) >= self.capacity:
      slot = self.push_count % self.capacity
      self.memory[slot] = experience
      self.push_count += 1
      return
    self.memory.append(experience)
  def sample(self,batch_size):
    """Return `batch_size` distinct stored experiences chosen at random."""
    return random.sample(self.memory, batch_size)
  def can_provide_sample(self,batch_size):
    """Return True once at least `batch_size` experiences are stored."""
    return not len(self.memory) < batch_size

In [None]:
class EpsilonGreedyStrategy():
  """Exploration rate that decays exponentially from `start` towards `end`."""
  def __init__(self,start,end,decay):
    self.start = start
    self.end = end
    self.decay = decay
  def get_exploration_rate(self,current_step):
    """Return end + (start - end) * exp(-current_step * decay)."""
    span = self.start - self.end
    decay_factor = math.exp(-1. * current_step * self.decay)
    return self.end + span * decay_factor


In [None]:
class Agent():
  """Epsilon-greedy action selector.

  Args:
    strategy: object exposing `get_exploration_rate(step)`.
    num_actions: size of the discrete action space.
    device: torch device on which the returned action tensor is placed.
  """
  def __init__(self,strategy,num_actions,device):
    self.current_step = 0
    self.strategy = strategy
    self.num_actions = num_actions
    self.device = device
  def select_action(self,state,policy_net):
    """Return a 1-D tensor holding the chosen action index.

    With probability equal to the current exploration rate a uniformly random
    action is returned; otherwise the argmax of `policy_net(state)` along
    dim 1 is returned. `current_step` is advanced on every call.
    """
    # Use the strategy/device given to this agent rather than whatever
    # module-level globals of the same name happen to exist.
    rate = self.strategy.get_exploration_rate(self.current_step)
    self.current_step += 1
    if rate > random.random():
      action = random.randrange(self.num_actions)  # explore
      return torch.tensor([action]).to(self.device)
    with torch.no_grad():
      # argmax already returns a tensor; re-wrapping it in torch.tensor()
      # only makes a copy and triggers a UserWarning.
      return policy_net(state).argmax(dim=1).to(self.device)  # exploit




In [None]:
def formatPrice(n):
    """Format `n` as a rupee amount with two decimals, e.g. "Rs.12.50" or "-Rs.3.00"."""
    prefix = "Rs."
    if n < 0:
        prefix = "-Rs."
    return prefix + "{0:.2f}".format(abs(n))
def getStockDataVec(path="/content/GOOG.csv", max_rows=2768, column=4):
    """Read one numeric column from a comma-separated price file.

    The first line is treated as a header and skipped. The defaults reproduce
    the previous hard-coded behaviour: column index 4 of up to 2768 data rows
    of /content/GOOG.csv.

    Args:
        path: location of the CSV file.
        max_rows: maximum number of data rows to read; None reads all of them.
        column: zero-based index of the column to parse as float.

    Returns:
        list[float]: the parsed values in file order.
    """
    # A context manager closes the handle even if parsing fails.
    with open(path, "r") as handle:
        lines = handle.read().splitlines()
    stop = None if max_rows is None else 1 + max_rows
    # Blank lines (e.g. a trailing newline at end of file) are skipped
    # instead of raising IndexError.
    return [float(line.split(",")[column]) for line in lines[1:stop] if line.strip()]
def sigmoid(x):
    """Return the logistic function 1 / (1 + e^-x) without overflowing.

    The direct formula raises OverflowError once -x exceeds roughly 709,
    because math.exp cannot represent the result. For negative x the
    algebraically equivalent e^x / (1 + e^x) is used, whose exponential can
    only underflow harmlessly to 0.0.
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    e = math.exp(x)
    return e / (1 + e)

def getState(data, t, n):
    """Build the state at time `t`: sigmoids of the n-1 successive differences
    over the n values ending at index `t`.

    When fewer than n values precede `t`, the window is left-padded with
    copies of data[0]. The result is a NumPy array of shape (1, n - 1).
    """
    start = t - n + 1
    if start >= 0:
        window = data[start:t + 1]
    else:
        # Not enough history yet: pad on the left with the first value.
        window = -start * [data[0]] + data[0:t + 1]
    deltas = [sigmoid(window[i + 1] - window[i]) for i in range(n - 1)]
    return np.array([deltas])


In [None]:
def plot(values, moving_avg_period):
    """Redraw figure 2 with `values` and their moving average over
    `moving_avg_period` points, then clear the cell output when running inline."""
    figure = plt.figure(2)
    figure.clf()
    axes = plt.gca()
    axes.set_title('Training...')
    axes.set_xlabel('Episode')
    axes.set_ylabel('Duration')
    axes.plot(values)
    axes.plot(get_moving_average(moving_avg_period, values))
    # A short pause gives the GUI event loop a chance to draw the update.
    plt.pause(0.001)
    if is_ipython:
        display.clear_output(wait=True)


In [None]:
def get_moving_average(period, values):
    """Return the trailing moving average of `values` as a float32 NumPy array.

    The output has the same length as `values`. The first period-1 entries are
    zero (no full window yet); if there are fewer than `period` values the
    whole result is zeros.
    """
    series = torch.tensor(values, dtype=torch.float)
    if len(series) < period:
        return torch.zeros(len(series)).numpy()
    # Each row of `windows` is one sliding window of length `period`.
    windows = series.unfold(dimension=0, size=period, step=1)
    window_means = windows.mean(dim=1).flatten(start_dim=0)
    padded = torch.cat((torch.zeros(period - 1), window_means))
    return padded.numpy()


In [None]:
def extract_tensors(experiences):
    """Turn a list of Experience tuples into four concatenated tensors.

    Returns (states, actions, rewards, next_states). Note that this order
    differs from the Experience field order, where next_state precedes reward.
    """
    # Transpose: a batch of Experiences becomes one Experience of per-field tuples.
    columns = Experience._make(zip(*experiences))

    states = torch.cat(columns.state)
    actions = torch.cat(columns.action)
    rewards = torch.cat(columns.reward)
    next_states = torch.cat(columns.next_state)

    return (states, actions, rewards, next_states)


In [None]:
class QValues():
    """Stateless helpers that compute the Q-values used in the DQN update.

    Both helpers are static methods, so they can be called on the class
    (`QValues.get_current(...)`) or on an instance.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    @staticmethod
    def get_current(policy_net, states, actions):
        """Return Q(s, a) for each (state, action) pair, shape (batch, 1).

        `actions` is a 1-D integer tensor of action indices.
        """
        return policy_net(states).gather(dim=1, index=actions.unsqueeze(-1))

    @staticmethod
    def get_next(target_net, next_states):
        """Return max_a Q_target(s', a) per next state, shape (batch,).

        A next state whose largest flattened feature equals 0 is treated as
        terminal and keeps a value of 0.
        """
        # NOTE(review): states built by getState are sigmoid outputs, so this
        # terminal mask is presumably never true in this notebook - confirm
        # whether terminal-state handling is intended.
        final_state_locations = next_states.flatten(start_dim=1).max(dim=1)[0].eq(0).type(torch.bool)
        non_final_state_locations = ~final_state_locations
        non_final_states = next_states[non_final_state_locations]
        batch_size = next_states.shape[0]
        # Allocate on the same device as the inputs so the masked assignment
        # below also works when the batch and network live on a GPU.
        values = torch.zeros(batch_size, device=next_states.device)
        values[non_final_state_locations] = target_net(non_final_states).max(dim=1)[0].detach()
        return values

In [None]:
# --- Replay / optimisation ---
batch_size = 256        # experiences sampled per gradient step
memory_size = 100_000   # replay-memory capacity
lr = 0.01               # Adam learning rate
gamma = 0.9999          # discount factor applied to next-state Q-values

# --- Epsilon-greedy exploration schedule ---
eps_start = 1
eps_end = 0.01
eps_decay = 0.001

# --- Training schedule ---
num_episodes = 50
target_update = 10      # copy policy weights into the target net every N episodes





In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
strategy = EpsilonGreedyStrategy(start=eps_start, end=eps_end, decay=eps_decay)
agent = Agent(strategy=strategy, num_actions=3, device=device)
memory = ReplayMemory(capacity=memory_size)
vec = getStockDataVec()
# The training loop steps through the first k prices of `vec`.
k = 2267

In [None]:

# Number of features per state; the networks' input width must match it.
window_size = 64
policy_net = DQN(window_size).to(device)
target_net = DQN(window_size).to(device)
# The target network starts as an exact copy of the policy network and is
# kept in eval mode.
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()
optimizer = optim.Adam(params=policy_net.parameters(), lr=lr)

In [None]:

# Train the policy network on the first k prices of `vec`.
# Actions: 0 = buy, 1 = sell the oldest open lot, 2 = hold.
episode_durations = []  # not filled in this loop; kept for the plotting helper
for episode in range(num_episodes):
    print("Episode " + str(episode) + "/" + str(num_episodes))
    # States are created as float32 on `device` so they match the networks.
    state = torch.tensor(getState(vec, 0, window_size + 1), dtype=torch.float32, device=device)
    total_profit = 0
    invent = []           # FIFO queue of open lots, each [shares, buy_price]
    max_transaction = 50  # maximum number of buys between two sells
    total_money = 10000   # cash available at the start of the episode
    c_s_h = 0             # shares currently held
    c_t_c = 0             # buys made since the last sell
    for t in range(k - 1):
        action = agent.select_action(state, policy_net)
        next_state = torch.tensor(getState(vec, t + 1, window_size + 1), dtype=torch.float32, device=device)
        # Always a float so every stored reward tensor has the same dtype.
        reward = 0.0
        if action == 0 and c_t_c < max_transaction and total_money > 0:
            # Spend an equal share of the remaining cash per remaining buy slot.
            x = total_money / (max_transaction - c_t_c)
            total_money = total_money - x
            c_t_c += 1

            x = x / vec[t]  # cash spent -> number of shares bought
            c_s_h += x
            invent.append([x, vec[t]])
        elif action == 1 and len(invent) > 0:
            # Sell the oldest lot; the reward is its realised profit.
            b_p = invent.pop(0)
            reward = vec[t] * b_p[0] - b_p[0] * b_p[1]
            total_money += vec[t] * b_p[0]
            total_profit += reward
            c_s_h = c_s_h - b_p[0]
            c_t_c = 0
        elif action == 2 and len(invent) > 0:
            # Holding: the reward is the negated unrealised profit of the oldest lot.
            b_p = invent[0]
            reward = -vec[t] * b_p[0] + b_p[0] * b_p[1]

        # Previously this tensor was int64 for a literal 0 and float64 otherwise,
        # which made the TD target float64 against float32 Q-values; that can
        # trigger dtype-mismatch errors in some PyTorch versions.
        reward = torch.tensor([reward], dtype=torch.float32, device=device)
        memory.push(Experience(state, action, next_state, reward))
        state = next_state
        if memory.can_provide_sample(batch_size):
            experiences = memory.sample(batch_size)
            states, actions, rewards, next_states = extract_tensors(experiences)
            # .float() is a no-op for float32 data; it guards against older
            # float64 entries that may already be in the replay memory.
            current_q_values = QValues.get_current(policy_net, states.float(), actions)
            next_q_values = QValues.get_next(target_net, next_states.float())
            target_q_values = (next_q_values * gamma) + rewards.float()
            loss = F.mse_loss(current_q_values, target_q_values.unsqueeze(1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # No explicit "done" check: range(k - 1) already ends at t == k - 2.
    print(total_profit)
    if episode % target_update == 0:
        target_net.load_state_dict(policy_net.state_dict())
 


Episode 0/50


  app.launch_new_instance()


3909.9197654184177
Episode 1/50
22226.36941567988
Episode 2/50
1743.3526002522772
Episode 3/50
3165.3886265471306
Episode 4/50
2553.354734347198
Episode 5/50
279.8087408007957
Episode 6/50
250.2837543276607
Episode 7/50
347.732669708431
Episode 8/50
32.97980378172548
Episode 9/50
174.31105442896344
Episode 10/50
207.67136207441848
Episode 11/50
355.81776959694776
Episode 12/50
122.55784065034703
Episode 13/50
11.038777347460467
Episode 14/50
-13.972749441482478
Episode 15/50
68.41520544695476
Episode 16/50
143.87351059486423
Episode 17/50
-12.967523090273119
Episode 18/50
21.215939377419403
Episode 19/50
-7.363817630017024
Episode 20/50
19.130799774604213
Episode 21/50
149.3529138527621
Episode 22/50
91.54301138668092
Episode 23/50


KeyboardInterrupt: ignored

In [None]:
# total_profit is reset at the start of every training episode, so this shows
# only the most recent episode's realised profit.
print(total_profit)

809.8879531428898


In [None]:
# Evaluation data: the prices after the training prefix (rows 2267 onward).
# Reloading from the file instead of re-slicing `vec` in place keeps this cell
# idempotent; slicing the already-sliced list on a second run would leave it empty.
vec = getStockDataVec()[2267:2769]
print(len(vec))
k = len(vec)
total_profit = 0
invent = []
print(k)

500
500


In [None]:
# Run the trained policy once over the evaluation prices in `vec`.
# Actions: 0 = buy, 1 = sell the oldest open lot, anything else = hold.
max_transaction = 50
total_money = 10000
c_s_h = 0  # shares currently held
c_t_c = 0  # buys made since the last sell
print(k)
# Start from the first window of the evaluation series. Previously the loop
# reused whatever `state` the training cell left behind, so the first action
# was chosen from a window of the training data.
state = torch.tensor(getState(vec, 0, window_size + 1), dtype=torch.float32, device=device)
# NOTE(review): select_action still explores at the strategy's current rate;
# confirm whether evaluation should be fully greedy.
for t in range(k - 1):
    action = agent.select_action(state, policy_net)
    if action == 0 and c_t_c < max_transaction and total_money > 0:
        # Spend an equal share of the remaining cash per remaining buy slot.
        x = total_money / (max_transaction - c_t_c)
        total_money = total_money - x
        c_t_c += 1

        x = x / vec[t]  # cash spent -> number of shares bought
        c_s_h += x
        invent.append([x, vec[t]])
        print("Buy: " + formatPrice(vec[t]))
    elif action == 1 and len(invent) > 0:
        # Sell the oldest lot and book its realised profit.
        b_p = invent.pop(0)
        reward = vec[t] * b_p[0] - b_p[0] * b_p[1]
        total_money += vec[t] * b_p[0]
        total_profit += reward
        print('profit :', total_profit)
        c_s_h = c_s_h - b_p[0]
        c_t_c = 0
    # Holding needs no bookkeeping here. Evaluation transitions are
    # deliberately not pushed into the replay memory, so evaluation data
    # cannot leak into any later training run.
    state = torch.tensor(getState(vec, t + 1, window_size + 1), dtype=torch.float32, device=device)
print(total_profit)
print(total_money)

500
Buy: Rs.1106.94
profit : 4.524191073524548
Buy: Rs.1089.52
profit : 5.493869039792116
Buy: Rs.1095.06
profit : 8.158191588494788
Buy: Rs.1021.57
profit : 10.160672011338619
Buy: Rs.1006.47
profit : 13.87444680002568
Buy: Rs.1019.97
profit : 16.33676195977219
Buy: Rs.1048.21
profit : 17.594301567678315
Buy: Rs.1082.76
profit : 20.334704410597908
Buy: Rs.1081.77
profit : 19.745573544166632
Buy: Rs.1079.24
Buy: Rs.1075.66
profit : 27.22111301131227
profit : 38.626362143023556
Buy: Rs.1169.84
Buy: Rs.1103.98
Buy: Rs.1115.65
profit : 31.35293931963622
profit : 40.238737113062484
profit : 52.44542504918181
Buy: Rs.1198.80
Buy: Rs.1186.96
profit : 63.3296875830423
profit : 77.11224909901622
Buy: Rs.1223.71
profit : 80.16080561071638
Buy: Rs.1061.49
Buy: Rs.1044.41
profit : 85.25269924087505
profit : 94.90809948954526
Buy: Rs.1106.43
profit : 84.7605275281154
Buy: Rs.1016.06
profit : 95.60893787353032
Buy: Rs.1095.01
profit : 99.95140201541994
Buy: Rs.1193.20
profit : 99.97171619105279
Buy

  app.launch_new_instance()


Buy: Rs.1174.71
Buy: Rs.1167.26
profit : 109.37969657290733
profit : 109.48012863415389
Buy: Rs.1234.03
Buy: Rs.1218.76
profit : 111.52655072849532
profit : 115.28081873829638
Buy: Rs.1298.80
profit : 115.15620032132017
Buy: Rs.1320.70
profit : 114.35354537614523
Buy: Rs.1345.02
profit : 115.14312999679979
Buy: Rs.1348.84
profit : 114.35123449600954
114.35123449600954
10114.351234496013
