In [13]:
!pip install numpy --user
!pip install pandas --user



In [3]:
cd /Users/alexisrajab/Desktop/3A/SM10/RL/Cryptocurrency-Trading-bot-using-RL

/Users/alexisrajab/Desktop/3A/SM10/RL/Cryptocurrency-Trading-bot-using-RL


In [4]:
import numpy as np
import pandas as pd
#from sklearn.tree import DecisionTreeRegressor
from RLGlue.rl_glue import RLGlue

from src.environment import Environment
from src.agents import Agent

In [11]:
from RLGlue.environment import BaseEnvironment

import numpy as np

class EnvironmentREINFORCE(BaseEnvironment):
    """Implements the environment for an RLGlue environment
    Note:
        env_init, env_start, env_step, env_cleanup, and env_message are required
        methods.
    """

    actions = [0]

    def __init__(self):
        reward = None
        observation = None
        termination = None
        self.reward_obs_term = (reward, observation, termination)
        self.count = 0
        self.data = None
        self.seed = None
        self.time = 0
        self.max_time = None

    def get_full_obs(self):
        infos = ['weighted_positive_score','weighted_neutral_score','weighted_negative_score',
        'total', 'Bitcoin','BTC','BNB','ETH']

        return [self.data[key][self.time] for key in infos]
    
    def get_obs(self):
        infos = ['weighted_positive_score','weighted_neutral_score','weighted_negative_score',
        'total']

        return [self.data[key][self.time] for key in infos]
    
    def env_init(self, env_info={}):
        """Setup for the environment called when the experiment first starts.
        Note:
            Initialize a tuple with the reward, first state observation, boolean
            indicating if it's terminal.
        """
        self.max_time= env_info['max']
        self.data = env_info['data']
        local_observation = []
        local_observation = self.get_full_obs()
        self.time +=1

        self.reward_obs_term = (0.0, local_observation,False)



    def env_start(self):
        """The first method called when the experiment starts, called before the
        agent starts.
        Returns:
            The first state observation from the environment. Gives 5 points to agent
        """
        return self.reward_obs_term
    
    def NUPL(self,portfolio,current):
        """Method to get NUPLS fro each crypto according to agent portfolio
        Args:
            portfolio : agent portfolio
            current : the current market value for each crypto
        Returns:
            NUPLs
        """
        NUPLs = []
        for i,crypto in enumerate(['Bitcoin','BTC','BNB','ETH']):

            total = sum([nb for nb , value in portfolio[crypto]])
            if portfolio[crypto] != []:
                NUPLs.append(sum([(current[i] - value)*nb for nb , value in portfolio[crypto]]) / total*current[i])
            else:
                NUPLs.append(0)
        return NUPLs
    
    def update_agent_portfolio(self,market_values,action,portfolio,cash):
        """Method update virtually the agent portfolio according to the action he chose
        Args:
            portfolio : agent portfolio
            market_values : the current market value for each crypto
            action: the actions taken by the agent 
            cash : agent cash for teh buy action
        Returns:
            portfolio 
        """
        for i,crypto in enumerate(['Bitcoin','BTC','BNB','ETH']):
            if action[i] ==1:
                portfolio[crypto].append((cash/market_values[i] ,market_values[i]))
            if action[i] == 2:
                portfolio[crypto] = []
        return portfolio
    
    def env_step(self, action):
        """A step taken by the environment.
        Args:
            action: The action taken by the agent
        Returns:
            (float, state, Boolean): a tuple of the reward, state observation,
                and boolean indicating if it's terminal.
        """
        current = self.get_full_obs()[-4:][1] # get the value the agent tried to predict # BTC : index 1
        #portfolio , action_per_crypto , cash = action
        #portfolio = self.update_agent_portfolio(current,action_per_crypto,portfolio,cash) # update by the agent
        
        #reward = sum(self.NUPL(portfolio,current))
        self.time += 1

        obs = self.get_obs()
        
        if self.time != self.max_time:
            #self.reward_obs_term = (reward,(obs, current), False)
            self.reward_obs_term = (0,current, False) # the reward is computed afterwards
            
        else:
            #self.reward_obs_term = (reward, (obs, current), True)
            self.reward_obs_term = (0,current,True)
            
        return self.reward_obs_term

    def env_cleanup(self):
        """Cleanup done after the environment ends"""
        pass

    def env_message(self, message):
        """A message asking the environment for information
        Args:
            message (string): the message passed to the environment
        Returns:
            string: the response (or answer) to the message
        """
        if message == "what is the current reward?":
            return "{}".format(self.reward_obs_term[0])

        # else
        return "I don't know how to respond to your message"

## Predictions

In [6]:
pred = pd.read_csv("data/predictions_with_sentiment_BTC.csv")

In [12]:
pred["Predicted_price_without_sentiment"][0]

51132.4609375

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

s_size = 3 # [current,predicted_future,bitcoin_value]
a_size = 3 # [buy,hold,sell]

cpu


In [28]:
### hold = 0, sell = 1, buy = 2

class Policy(nn.Module):
    
    ### RL_glue agent functions
    
    def agent_init(self, agent_info={}):
        """Setup for the agent called when the experiment first starts."""
        self.cash = agent_info['cash'] # total cash of the agent 
        #self.model = agent_info["model"] # model for prediction
        self.crypto_name = agent_info['crypto'] #name of the considered crypto
        self.crypto_value = 0
        self.crypto_number = 0
        self.portfolio = {self.crypto_name : [self.crypto_number,self.crypto_value]}
        
        self.log_prob = []
        
        self.crypto_predicted = 0
        self.mode = "train" # change it to "test" when testing the agent
        
    def agent_start(self):
        return 0,self.cash # start by holding
    
    def agent_step(self,reward,observation):
        
        state = [observation,self.crypto_predicted,self.crypto_value]
        
        if self.mode == "train":
            action,log_prob = act(state)
            self.log_prob.append(log_prob)
            return action
        
        if self.mode == "test":
            return greedy_act(state)
    
    def agent_end(self,reward):
        #####
        return None
    
    def update_portfolio(self,action,current):
        
        if action == 1:
            self.cash +=  self.portfolio[self.crypto_name][0]*current
            self.portfolio[self.crypto_name][0] = 0
            self.portfolio[self.crypto_name][1] = 0
        if action == 2:
            if self.cash > 0:
                self.portfolio[self.crypto_name][0] += self.cash/current
                self.portfolio[self.crypto_name][1] = current
        
    def __init__(self, s_size, a_size, h_size):
        super(Policy, self).__init__()

        # define the network layers.
        # here we have a single hidden layer with h_size neurons
        self.fc1 = nn.Linear(s_size, h_size)
        self.fc2 = nn.Linear(h_size, a_size)

    def forward(self, x):
        # define the forward pass
        # here we use ReLU activation for the hidden layer
        # and softmax activation for the output layer
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.softmax(x, dim=1)
    
    def act(self, state):
        # sample an action from the policy network
        # and return the log probability of that action
        # this will be used in the loss function
        state = torch.tensor(np.array([state])).to(device)
        print(state)
        probs = self.forward(state).cpu()
        m = Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action)

    def greedy_act(self,state):
        # takes the observation and returns the greedy action
        # this will be used to evaluate the policy
        state = torch.tensor(np.array([state]),dtype=torch.float).to(device)
        probs = self.forward(state).cpu()
        return torch.argmax(probs).item()

In [29]:
### hold = 0, sell = 1, buy = 2

def reinforce(policy, optimizer, n_training_episodes, max_t, gamma, print_every):
    # Help us to calculate the score during the training
    scores_deque = deque(maxlen=50)
    scores = []
    
    for i_episode in range(1, n_training_episodes+1):
        
        i = 1
        
        env_info = {'max' : num_obs , 'data':data}
        agent_info = {'cash':100000, 'crypto': ['BTC']}
        
        env = EnvironmentREINFORCE
        agent = policy
        
        rl_glue = RLGlue(env, agent)  # Creates a new RLGlue experiment with the env and agent we chose 
        rl_glue.rl_init(agent_info, env_info) # Pass RLGlue what it needs to initialize the agent and environment
        rl_glue.rl_start() 
                      
        # generate an episode
        saved_log_probs = []
        rewards = []
        policy = saved_log_probs
        
        
        for t in range(max_t):
                      
            i=+1
            policy.crypto_predicted = pred["Predicted_price_without_sentiment"][i]
            
            reward, obs, action, done = rl_glue.rl_step() # env_step() and agent_step() are called
            saved_log_probs = policy.log_probs # updated during agent_step
                      
            # obs = current bitcoin value
                      
            policy.update_portfolio(action,obs)
            
            if policy.crypto_number == 0:      
                reward = 0
                      
            else:
                reward = (obs-policy.crypto_value)/obs # update the nupl value
            
            rewards.append(reward)
            if done:
                break 
        
        # save the score

        scores_deque.append(sum(rewards))
        scores.append(sum(rewards))
        
        
        returns = deque(maxlen=max_t) 
        n_steps = len(rewards) 

        # calculate the discounted returns for each step
        for t in range(n_steps)[::-1]:
            disc_return_t = (returns[0] if len(returns)>0 else 0)
            returns.appendleft( gamma*disc_return_t + rewards[t]   )    
            
        ## standardization of the returns is employed to make training more stable
        eps = np.finfo(np.float32).eps.item()
        ## eps is the smallest representable float, which is 
        # added to the standard deviation of the returns to avoid numerical instabilities        
        returns = torch.tensor(returns)
        returns = (returns - returns.mean()) / (returns.std() + eps)
        
        # compute the loss function to be minimized
        policy_loss = []
        for log_prob, disc_return in zip(saved_log_probs, returns):
            policy_loss.append(-log_prob * disc_return)
        policy_loss = torch.cat(policy_loss).sum()
        
        # update the policy network with the backward pass
        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

        # print the average score every 100 episodes
        if i_episode % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
        
    return scores

In [30]:
agent_hyperparameters = {
    "h_size": 16,
    "n_training_episodes": 1000,
    "n_evaluation_episodes": 10,
    "max_t": 100,
    "gamma": 0.99,
    "lr": 1e-2,
    "env_id": "trader",
    "state_space": s_size,
    "action_space": a_size,
}

In [31]:
# Create policy and place it to the device
from collections import deque
agent_policy = Policy(agent_hyperparameters["state_space"], agent_hyperparameters["action_space"], agent_hyperparameters["h_size"]).to(device)
agent_optimizer = optim.Adam(agent_policy.parameters(), lr=agent_hyperparameters["lr"])

data = pd.read_csv("./data/all_data.csv").to_dict() # full data with price and sentiment
num_obs = max([k for k in data['Date'].keys()])

# Train the policy
scores = reinforce(agent_policy,
                   agent_optimizer,
                   agent_hyperparameters["n_training_episodes"], 
                   agent_hyperparameters["max_t"],
                   agent_hyperparameters["gamma"], 
                   20)

TypeError: forward() missing 1 required positional argument: 'x'

In [None]:
data = pd.read_csv("./data/all_data.csv").to_dict() # full data with price and sentiment
num_obs = max([k for k in data['Date'].keys()])

env = Environment
agent = Policy #Agent

env_info = {'max' : num_obs , 'data':data}
agent_info = {'model' : DecisionTreeRegressor(), 'cash':10000, 'crypto': ['BTC']}


rl_glue = RLGlue(env, agent)  # Creates a new RLGlue experiment with the env and agent we chose 
rl_glue.rl_init(agent_info, env_info) # Pass RLGlue what it needs to initialize the agent and environment
rl_glue.rl_start() 


num_steps = 11

total_nupl = []
actions = np.zeros(4)
for i in range(num_steps):
        reward, obs, action, done = rl_glue.rl_step()
        print("iteration:",i)
        print("Total NUPL:", reward)
        print("New observation:", obs[0], obs[1])
        print("Agent Portfolio:", action[0])
        print("Action taken:", action[1])

        total_nupl.append(reward)
        actions = np.vstack((actions,action[1]))
        if done:
                break