In [47]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [48]:
from collections import namedtuple, deque
import math
import random
import wandb

## ENV

In [49]:
!pip install -q --upgrade --force-reinstall --no-deps kaggle

In [50]:
# from google.colab import files

In [51]:
# files.upload()

In [52]:
!mkdir ~/.kaggle

mkdir: cannot create directory ‘/home/anirudh/.kaggle’: File exists


In [53]:
!mv ./kaggle.json ~/.kaggle/.

mv: cannot stat './kaggle.json': No such file or directory


In [54]:
!chmod 600 ~/.kaggle/kaggle.json

In [57]:
!kaggle datasets list --user gmshroff

ref                                title                     size  lastUpdated          downloadCount  voteCount  usabilityRating  
---------------------------------  -----------------------  -----  -------------------  -------------  ---------  ---------------  
datasets/gmshroff/market-data-cl   Market Data CL           404MB  2022-02-11 12:24:49             13          1  0.23529412       
datasets/gmshroff/market-data      Market Data               39MB  2022-02-07 11:11:00             59          1  0.125            
datasets/gmshroff/few-shot-arc     Few-shot version of ARC  609KB  2022-02-14 11:03:57             82          0  0.25             
datasets/gmshroff/options-dataset  Options Dataset          151MB  2022-02-07 17:01:18              5          0  0.0              
datasets/gmshroff/market-data-rl   Market Data RL           148MB  2022-02-09 12:25:02             47          1  0.29411766       
datasets/gmshroff/few-shot-nar     Few-shot version of NAR  370MB  2022-02-0

In [None]:
!kaggle datasets download -d gmshroff/market-data-rl

Downloading market-data-rl.zip to /home/anirudh/test
100%|███████████████████████████████████████▉| 148M/148M [00:45<00:00, 5.05MB/s]
100%|████████████████████████████████████████| 148M/148M [00:45<00:00, 3.44MB/s]


In [None]:
!unzip market-data-rl

Archive:  market-data-rl.zip
  inflating: dfrl.csv                


In [56]:
!ls

__pycache__  fqn.pt			   meta-envs  train.py
close.png    log.txt			   models     train_contrast.py
dfrl.csv     market-data-rl.zip		   og.png     visualise.py
far.png      marketEnvForRLprojects.ipynb  psi.pt     wandb


In [58]:
import pandas as pd
import numpy as np

In [59]:
class marketEnv():
    def __init__(self,df,symbol,day,target,stoploss,txcost=.001):
        self.df=df.loc[(df['day']==day)&(df['sym']==symbol)]
        self.symbol=symbol
        self.day=day
        self.t=target
        self.l=stoploss
        self.sl=stoploss
        self.time=0
        self.end=self.df.shape[0]
        self.r=self.df['Close'].values
        self.done=False
        self.mv=self.df.iloc[0]['Open']
        self.txcost=txcost
    def thresh(self,x,pos):
        #x=current_price,pos=position_taken(+-)
        if pos>0:
            if x>pos+self.t*pos or x<pos-self.l*pos: return 1
            else: return 0
        elif pos<0:
            if x<abs(pos)-self.t*abs(pos) or x>abs(pos)+self.l*abs(pos): return 1
            else: return 0
    def partial_thresh(self,x,pos):
        #x=current_price,pos=position_taken(+-)
        if pos>0:
            if x>pos+self.t*pos/2: return 1
            else: return 0
        elif pos<0:
            if x<abs(pos)-self.t*abs(pos)/2: return 1
            else: return 0
    def get_state(self):
        return self.df.iloc[0:self.time+1]
    def step(self,action):
        if action==0: 
            ret=0.0
            if self.time+1==self.end: self.done=True
            else: self.time+=1
            return self.df.iloc[0:self.time+1],ret,self.done
        else: 
            r=self.r
            pos=action*r[self.time]
            while True:
                thresh_met=self.thresh(self.r[self.time],pos)
                if thresh_met: break
                if self.time+1==self.end:
                    self.done=True
                    break
                else: self.time+=1
                if self.partial_thresh(self.r[self.time],pos): self.l=-self.t/2
            if pos>0: ret=r[self.time]-pos-r[self.time]*self.txcost
            elif pos<0: ret=abs(pos)-r[self.time]-r[self.time]*self.txcost
            ret=ret*100/self.mv
            self.l=self.sl
            return self.df.iloc[0:self.time+1],ret,self.done

In [60]:
def run_episode(env,policy):
    env.time=0
    state=env.get_state()
    rewards=[]
    tot=0.0
    done=False
    while done==False:
        action=policy(state)
        state,rew,done=env.step(action)
        tot+=rew
        rewards+=[rew]
        #print(state.shape[0],rew,tot)
    return tot,rewards

In [61]:
def always_buy(df):
    return 1
def always_sell(df):
    return -1

In [62]:
dfrl=pd.read_csv('./dfrl.csv')

In [63]:
# The symbols in each group are those of securities in related industries
groups={0: [189, 167],
 1: [269, 236, 251],
 2: [116, 280, 90],
 3: [137, 31, 231],
 4: [133, 185, 23]}

In [64]:
# symbols=groups[0]
symbols = dfrl['sym'].unique()

In [65]:
days=dfrl['day'].unique()

In [66]:
env=marketEnv(df=dfrl,symbol=symbols[0],day=days[10],target=0.005,stoploss=0.05)

In [67]:
ret,_=run_episode(env,always_buy)

In [68]:
ret

-4.157583752307264

In [69]:
tot_ret=0
print(symbols[0])
for day in days:
    env=marketEnv(df=dfrl,symbol=symbols[0],day=day,target=0.005,stoploss=0.05)
    ret,_=run_episode(env,always_sell)
    tot_ret+=ret
    print(ret,tot_ret)

0
-3.028761447647555 -3.028761447647555
-0.6341861778405496 -3.6629476254881044
1.0396762596056774 -2.623271365882427
0.9943672842462492 -1.6289040816361777
-1.4857681986284432 -3.114672280264621
0.07059797568200832 -3.0440743045826126
-0.8808627216958724 -3.924937026278485
0.15384615196145823 -3.771090874317027
0.5314172895636454 -3.2396735847533815
5.327332397290714 2.0876588125373328
2.6175343138307 4.705193126368033
-4.319080115822993 0.3861130105450403
3.2405141355946205 3.626627146139661
-4.680242481021747 -1.0536153348820858
-5.313698188299165 -6.3673135231812505
2.9801916426971062 -3.3871218804841443
0.891284678668181 -2.4958372018159634
-0.7405691762558804 -3.2364063780718437
-0.4317344981682295 -3.668140876240073
-0.4860384316279848 -4.154179307868058


In [70]:
env.get_state().shape

(77, 29)

## Models

In [113]:
env=marketEnv(df=dfrl,symbol=symbols[0],day=days[10],target=0.005,stoploss=0.05)

In [114]:
env.get_state()

Unnamed: 0,Open,High,Low,Close,Volume,Dividends,Open_prev,High_prev,Low_prev,Close_prev,...,BBL_5_2.0,BBM_5_2.0,BBU_5_2.0,BBB_5_2.0,BBP_5_2.0,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,day,sym
831,910.0,910.0,907.0,907.0,0.0,0.0,989.5,1000.049988,909.950012,922.049988,...,905.556341,915.809998,926.063654,2.239254,0.070397,-5.370082,-0.351575,-5.018507,10,0


In [115]:
env.time

0

In [110]:
env.get_state()

Unnamed: 0,Open,High,Low,Close,Volume,Dividends,Open_prev,High_prev,Low_prev,Close_prev,...,BBL_5_2.0,BBM_5_2.0,BBU_5_2.0,BBB_5_2.0,BBP_5_2.0,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,day,sym
831,910.0,910.0,907.0,907.0,0.0,0.0,989.5,1000.049988,909.950012,922.049988,...,905.556341,915.809998,926.063654,2.239254,0.070397,-5.370082,-0.351575,-5.018507,10,0
832,907.0,907.0,907.0,907.0,19.0,0.0,989.5,1000.049988,909.950012,922.049988,...,901.889237,914.409998,926.930758,2.738544,0.204092,-5.839757,-0.657,-5.182757,10,0
833,907.099976,907.099976,907.099976,907.099976,0.0,0.0,989.5,1000.049988,909.950012,922.049988,...,899.380629,912.029993,924.679356,2.773892,0.305128,-6.13321,-0.760363,-5.372848,10,0
834,907.099976,907.099976,907.099976,907.099976,20.0,0.0,989.5,1000.049988,909.950012,922.049988,...,898.049655,910.049988,922.050321,2.637291,0.377086,-6.29323,-0.736306,-5.556924,10,0
835,909.799988,909.799988,909.799988,909.799988,3.0,0.0,989.5,1000.049988,909.950012,922.049988,...,905.398171,907.599988,909.801804,0.485195,0.999587,-6.131498,-0.459659,-5.671839,10,0
836,908.299988,908.299988,908.299988,908.299988,1.0,0.0,989.5,1000.049988,909.950012,922.049988,...,905.696282,907.859985,910.023689,0.47666,0.601678,-6.054569,-0.306184,-5.748385,10,0


In [104]:
env.get_state()

Unnamed: 0,Open,High,Low,Close,Volume,Dividends,Open_prev,High_prev,Low_prev,Close_prev,...,BBL_5_2.0,BBM_5_2.0,BBU_5_2.0,BBB_5_2.0,BBP_5_2.0,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,day,sym
831,910.0,910.0,907.0,907.0,0.0,0.0,989.5,1000.049988,909.950012,922.049988,...,905.556341,915.809998,926.063654,2.239254,0.070397,-5.370082,-0.351575,-5.018507,10,0
832,907.0,907.0,907.0,907.0,19.0,0.0,989.5,1000.049988,909.950012,922.049988,...,901.889237,914.409998,926.930758,2.738544,0.204092,-5.839757,-0.657,-5.182757,10,0
833,907.099976,907.099976,907.099976,907.099976,0.0,0.0,989.5,1000.049988,909.950012,922.049988,...,899.380629,912.029993,924.679356,2.773892,0.305128,-6.13321,-0.760363,-5.372848,10,0
834,907.099976,907.099976,907.099976,907.099976,20.0,0.0,989.5,1000.049988,909.950012,922.049988,...,898.049655,910.049988,922.050321,2.637291,0.377086,-6.29323,-0.736306,-5.556924,10,0
835,909.799988,909.799988,909.799988,909.799988,3.0,0.0,989.5,1000.049988,909.950012,922.049988,...,905.398171,907.599988,909.801804,0.485195,0.999587,-6.131498,-0.459659,-5.671839,10,0
836,908.299988,908.299988,908.299988,908.299988,1.0,0.0,989.5,1000.049988,909.950012,922.049988,...,905.696282,907.859985,910.023689,0.47666,0.601678,-6.054569,-0.306184,-5.748385,10,0


In [72]:
EMBEDDING_DIMS = 16
NUM_ACTIONS = 3
INPUT_SIZE = 29

dims = [INPUT_SIZE, 64, 32, EMBEDDING_DIMS]

NUM_EPOCHS = 3000
SUCCESSOR_EPOCHS = 100
GATHERING_WALKS = 300
MAX_WALK_LENGTH = 100
T = 50

LEARNING_RATE = 0.005
GAMMA = 0.8
BATCH_SIZE = 32
MAX_CAPACITY = 1000
MAX_WALKS = 100
MAX_WALK_LENGTH = 100
TASK_EPISODES = 100

TRAINING_SYMBOLS = int(len(symbols) * 0.2)
TESTING_SYMBOLS =  len(symbols) - TRAINING_SYMBOLS


MAX_EPSILON = 1
MIN_EPSILON = 0.01
EPSILON_DECAY = 0.99

In [73]:
wandb.init('Market RL')

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.




### Model

In [74]:
class PSI(nn.Module):
    def __init__(self, dims):
        super().__init__()
        layers = []
        prev = dims[0]
        for next in dims[1:-1]:
            layers.append(nn.Linear(prev, next))
            layers.append(nn.ReLU())
            prev = next
        layers.append(nn.Linear(prev, dims[-1]))

        self.layers = nn.Sequential(*layers)
    
    def forward(self, state):
        return self.layers(state)

In [75]:
class FQN(nn.Module):
    def __init__(self, embedding_dim, num_actions):
        super().__init__()
        self.linear = nn.Linear(embedding_dim, num_actions, bias=False)
    
    def forward(self, successor):
        return self.linear(successor)

### Replay

In [76]:
from collections import namedtuple, deque
import random
import numpy as np
import torch

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward', 'done'))

def get_device():
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")

class FQNReplayBuffer:

    def __init__(self, max_size):
        self.memory = deque([], max_size)
        self.max_size = max_size
        self.device = get_device()

    def push(self, state, action, next_state, reward, done):
        transition = Transition(state, action, next_state, reward, done)
        self.memory.append(transition)
    
    def sample(self, batch_size):
        sample_size = min(batch_size, len(self))
        sample = random.sample(self.memory, sample_size)
        return self.convert_to_batch(sample)

    def sample_to_torch(self, batch_size):
        sample = self.sample(batch_size)

        state = sample.state
        action = sample.action
        next_state = sample.next_state
        reward = sample.reward
        done = sample.done

        state = torch.from_numpy(state).float().to(self.device)
        action = torch.tensor(action, device=self.device).long()
        next_state = torch.from_numpy(next_state).float().to(self.device)
        reward = torch.tensor(reward, device=self.device).float()
        done = torch.tensor(done, device=self.device).float()

        return Transition(state, action, next_state, reward, done)
    
    def __len__(self):
        return len(self.memory)
    
    def convert_to_batch(self, transitions):
        batch = Transition(*zip(*transitions))

        # if isinstance(batch.state, tuple):
            # print(len(batch.state))
        np_state = np.stack(batch.state)
        np_action = np.array(batch.action)
        np_next_state = np.stack(batch.next_state)
        np_reward = np.array(batch.reward)
        np_done = np.array(batch.done)

        np_batch = Transition(np_state, np_action, np_next_state, np_reward, np_done)

        return np_batch
    
    def is_full(self):
        return len(self.memory) == self.max_size

In [77]:
class PSIReplayBuffer:
    def __init__(self, max_walks):
        self.memory = deque([], max_walks)
        self.max_walks = max_walks
        self.device = get_device()
    
    def push(self, states):
        states = np.stack(states)
        actions = np.stack(actions)
        self.memory.append(states)
    
    def __len__(self):
        return len(self.memory)
    
    def sample(self, batch_size):
        samples = random.sample(self.memory, batch_size)
        return samples
    
    def sample_to_torch(self, batch_size):
        samples = self.sample(batch_size)
        samples = [torch.from_numpy(states).float().to(self.device) for states in samples]

        return samples

### Agent

In [78]:
class Agent:
    def __init__(self, env, psi: nn.Module, fqn: nn.Module, device=None, lr=2e-3, gamma=0.8, batch_size=64,
                max_capacity=1000, max_walks=300, T=50):
        self.env = env
        self.psi = psi
        self.fqn = fqn
        self.device = device if device is not None else self.get_device()
        self.lr = lr
        self.gamma = gamma
        self.batch_size = batch_size
        self.T = T
        self.psi_memory = PSIReplayBuffer(max_walks)
        self.fqn_memory = FQNReplayBuffer(max_capacity)
        self.psi_optimizer = torch.optim.Adam(self.psi.parameters(), lr=lr)
        self.fqn_optimizer = torch.optim.Adam(self.fqn.parameters(), lr=lr)
    
    def get_device(self):
        return torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    def train_psi(self, num_walks):
        walks = self.psi_memory.sample(num_walks)
        objectives = []
        for states in walks:
            objective = self.calculate_psi_objective(states)
            objectives.append(objective)
        
        loss = -1 * torch.stack(objectives).mean() # Multiplied by -1 as we need to maximize the objective

        self.psi_optimizer.zero_grad()
        loss.backward()
        self.psi_optimizer.step()

        return loss
    
    def calculate_psi_objective(self, states) -> torch.Tensor:
        num_states = len(log_successor_matrix)

        successors = self.psi(states)
        successor_matrix = torch.matmul(successors, successors.t())
        log_successor_matrix = F.logsigmoid(successor_matrix)
        selector = torch.triu(torch.ones(num_states, num_states), 1)
        selector = torch.tril(selector, self.T)
        log_successor_matrix = log_successor_matrix.mul(selector)

        log_discount_matrix = torch.arange(num_states, device=self.device).repeat(num_states, 1)
        log_discount_matrix = log_discount_matrix - torch.arange(1, num_states + 1, device=self.device).unsqueeze(-1)
        log_discount_matrix = torch.triu(log_discount_matrix, 1)
        log_discount_matrix = torch.tril(log_discount_matrix, self.T)
        log_discount_matrix = log_discount_matrix * math.log(self.gamma)
        
        log_discounted_successor_matrix = log_successor_matrix + log_discount_matrix

        objective = log_discounted_successor_matrix.sum()
        return objective
    
    def train_fqn(self):
        state, action, reward, next_state, done = self.fqn_replay.sample_to_torch(self.batch_size)

        with torch.no_grad():
            successor_feature = self.psi(state)
            next_successor_feature = self.psi(next_state)

        with torch.no_grad():
            next_q_values = self.fqn(next_successor_feature)
        optimal_next_q_value, _ = next_q_values.max(1)
        target_q_values = reward + (self.gamma * optimal_next_q_value * (1 - done))

        input_q_values = self.get_q_value(successor_feature)
        q_values = input_q_values.gather(1, action.unsqueeze(1)).squeeze()

        loss = F.smooth_l1_loss(q_values, target_q_values)

        self.fqn_optimizer.zero_grad(set_to_none=True)
        loss.backward()
        self.fqn_optimizer.step()

        return loss
    
    def act(self, state, epsilon=0):
        if random.random() < epsilon:
            return np.random.randint(0, NUM_ACTIONS)
        
        with torch.no_grad():
            state = torch.from_numpy(state).float().to(self.device)
            successor_feature = self.psi(state)
            q_values = self.fqn(successor_feature)
            action = q_values.argmax(1).item()
            return action
    
    def reset_fqn(self):
        self.fqn = FQN(EMBEDDING_DIMS, NUM_ACTIONS)


### Training

In [79]:
fqn_model = FQN(EMBEDDING_DIMS, NUM_ACTIONS)
psi_model = PSI(dims)
agent = Agent(env, psi_model, fqn_model, lr=LEARNING_RATE, gamma=GAMMA, batch_size=BATCH_SIZE, max_capacity=MAX_CAPACITY, max_walks=MAX_WALKS, T=T)

In [80]:
# Initial data gathering

NUM_DAYS = 10
NUM_SYMBOLS = MAX_WALKS // NUM_DAYS
SAMPLED_SYMBOLS = random.sample(list(symbols[:TRAINING_SYMBOLS]), NUM_SYMBOLS)


for symbol in SAMPLED_SYMBOLS:
    for day in days[:NUM_DAYS]:

        # for episode in range(TASK_EPISODES):
        env=marketEnv(df=dfrl,symbol=symbol,day=day,target=0.005,stoploss=0.05)

        states = []

        done = False
        state = env.get_state().to_numpy()
        while not done:
            action = agent.act(state, 1) # Complete random action
            next_state, reward, done = env.step(action)
            next_state = next_state.to_numpy()
            agent.fqn_memory.push(state, action, next_state, reward, done)
            states.append(next_state)
            state = next_state
        agent.psi_memory.push(states)

ValueError: all input arrays must have the same shape

In [86]:
states[0].shape

(2, 29)

In [None]:
for epoch in range(NUM_EPOCHS):

    # Train FQN

    cummalative_reward = 0
    
    for i in range(TRAINING_SYMBOLS): # 4 symbols for training and 1 symbol for testing
        for day in days:

            average_episode_reward = 0

            for episode in range(TASK_EPISODES):
                env = marketEnv(df=dfrl,symbol=symbol,day=day,target=0.005,stoploss=0.05)
                current_epsilon = MAX_EPSILON - MIN_EPSILON

                state = env.get_state().to_numpy()
                states = []
                done = False
                total_reward = 0

                episode_fqn_loss = []

                while not done:
                    action = agent.act(state, current_epsilon + MIN_EPSILON)
                    next_state, reward, done = env.step(action)
                    total_reward += reward
                    next_state = next_state.to_numpy()
                    agent.fqn_memory.push(state, action, next_state, reward, done)
                    states.append(next_state)
                    state = next_state
                    
                    # Optimization
                    iteration_fqn_loss = agent.train_fqn()
                    episode_fqn_loss.append(iteration_fqn_loss)

                agent.psi_memory.push(states)
                episode_fqn_loss = torch.stack(episode_fqn_loss).mean()
                average_episode_reward += total_reward
                # Log episode statistics
                wandb.log({
                    'Episode FQN Loss': episode_fqn_loss.item(),
                    'Episode Total Reward': total_reward,
                    'Episode': episode+1
                })
            
            average_episode_reward /= TASK_EPISODES
            cummalative_reward += average_episode_reward

    NUM_TASKS = TRAINING_SYMBOLS * len(days)
    cummalative_reward /= NUM_TASKS


    # Gathering walks

    NUM_DAYS = 10
    NUM_SYMBOLS = MAX_WALKS // NUM_DAYS
    SAMPLED_SYMBOLS = random.sample(list(symbols[:TRAINING_SYMBOLS]), NUM_SYMBOLS)


    for symbol in SAMPLED_SYMBOLS:
        for day in days[:NUM_DAYS]:

            # for episode in range(TASK_EPISODES):
            env=marketEnv(df=dfrl,symbol=symbol,day=day,target=0.005,stoploss=0.05)

            states = []

            done = False
            state = env.get_state().to_numpy()
            while not done:
                action = agent.act(state, 0) # Complete exploitation
                next_state, reward, done = env.step(action)
                next_state = next_state.to_numpy()
                # agent.fqn_memory.push(state, action, next_state, reward, done)
                states.append(next_state)
                state = next_state
            agent.psi_memory.push(states)



    # Train PSI
    for _ in range(SUCCESSOR_EPOCHS):
        psi_loss = agent.train_psi(MAX_WALKS)    

    # Logging epoch statistics
    wandb.log({
        'Epoch': epoch+1,
        'Cummalative Reward': cummalative_reward,
        'PSI Loss': psi_loss,
        'FQN Loss': episode_fqn_loss
    })