In [15]:
import math
import pickle

import gym
import numpy as np
import pandas as pd
from gym import spaces, logger
from gym.envs.registration import register
from gym.utils import seeding

class TradeEnv():

    """
    Trading environment driven by a CSV file with one column per asset.

    The trading agent calls :meth:`step` with an action (a vector of
    portfolio weights) at time t. The environment answers with the state at
    time t+1, the reward earned over the elapsed period and a ``done`` flag.

    Parameters
    ----------
    path : str
        CSV file, read with ``pd.read_csv(path, index_col=0)``: one row per
        time step, one column per asset.
    window : int
        Number of past rows returned in every observation.
    train_perc : float
        Fraction of the ``len(data) - window`` steps that belongs to the
        training span (train -> | time T | -> test). The episode is flagged
        ``done`` once the clock reaches that boundary.

    Notes
    -----
    The state is the tuple ``(observation, weights)`` where ``observation``
    is the ``window`` rows preceding the current time and ``weights`` is the
    allocation currently held.
    """

    def __init__(self, path = './data/stocks.csv', window=5, train_perc=0.8):
        self.path = path
        self.data = pd.read_csv(self.path, index_col=0)

        self.window = window
        self.train_perc = train_perc
        self.nb_asset = self.data.shape[1]
        # Clock value at which the training span ends (see step()).
        self.train_end = int(self.train_perc * (self.data.shape[0] - self.window))

        # Episode bookkeeping. reset() must be called before step().
        self.time = self.window - 1
        self.end = False
        self.done = False
        self.state = None
        # Weights held going into the next step; set by reset() and step().
        self.action = None

    def get_observation(self, t):
        """Return the ``window`` rows that precede row ``t`` (rows t-window .. t-1).

        ``t`` is expected to be at least ``window``; smaller values make the
        slice start negative and do not yield a full window.
        """
        start = t - self.window
        return self.data.iloc[start:t, :]

    def get_ret(self, t):
        """Return the per-asset log change between row ``t - 1`` and row ``t``.

        Computed as ``log(data[t] / data[t - 1])``, so ``exp`` of the result
        is the "update vector" used by :meth:`step`.
        NOTE(review): this assumes the CSV holds strictly positive prices
        (not returns) -- confirm against the data source.
        """
        current = self.data.iloc[t].to_numpy(dtype=float)
        previous = self.data.iloc[t - 1].to_numpy(dtype=float)
        return np.log(current / previous)

    def reset(self, w_init, t=0):
        """Start an episode ``t`` steps after the first full window.

        Parameters
        ----------
        w_init : array-like
            Initial portfolio weights, one entry per asset.
        t : int
            Offset added to ``window`` to obtain the starting clock value.

        Returns
        -------
        (state, done) with ``done`` always ``False``.
        """
        self.time = self.window + t
        # Build the observation at the actual starting time so that the
        # ``t`` offset is honoured by the first state as well.
        self.state = (self.get_observation(self.time), w_init)
        self.action = w_init
        self.done = False

        return self.state, self.done

    def step(self, action):
        """
        Advance the environment by one time step.

        The weights held at the beginning of the period (the previous action,
        or ``w_init`` right after :meth:`reset`) earn the reward
        ``log(w_prev . exp(ret))`` where ``ret = get_ret(time)``. The new
        ``action`` then becomes the held allocation and is part of the
        returned state.

        Parameters
        ----------
        action : array-like
            New portfolio weights, one entry per asset.

        Returns
        -------
        state : tuple
            ``(get_observation(time + 1), action)``.
        reward : float
            Log growth of the portfolio over the elapsed period.
        done : bool
            ``True`` once the clock reaches ``train_end`` or the end of the
            data; stays ``True`` until the next :meth:`reset`.

        Raises
        ------
        RuntimeError
            If called before :meth:`reset`.
        """
        if self.action is None:
            raise RuntimeError("reset() must be called before step()")

        time = self.time

        # Beginning of the period: the portfolio still holds the old weights.
        w_prev = np.asarray(self.action, dtype=float).ravel()

        # Update vector: value at ``time`` divided by value at ``time - 1``.
        update_vector = np.exp(self.get_ret(time))

        # Instantaneous reward: log of the portfolio growth factor. A dot
        # product is required here; an element-wise product would broadcast
        # to a matrix instead of giving the portfolio return.
        reward = float(np.log(np.dot(w_prev, update_vector)))

        # Advance the clock and build the next state with the new allocation.
        time = time + 1
        state = (self.get_observation(time), action)

        # Stop at the end of the training span, and never run past the data.
        done = bool(self.done
                    or time >= self.train_end
                    or time >= self.data.shape[0])

        self.state = state
        self.action = action
        self.time = time
        self.done = done

        return state, reward, done
        

In [21]:
# Build the environment from the default CSV and start an episode with an
# equally weighted portfolio. The number of weights is taken from the data
# (one per column) instead of being hard-coded.
tradenv = TradeEnv()
w_init = np.ones((tradenv.nb_asset, 1)) / tradenv.nb_asset
tradenv.reset(w_init)

((                                 ^GSPC        AAPL        MSFT       GOOGL  \
  2021-05-06 09:30:00-04:00  4153.779785  127.830002  245.429993  115.211754   
  2021-05-06 10:30:00-04:00  4170.970215  128.470093  247.149994  115.757492   
  2021-05-06 11:30:00-04:00  4186.209961  129.139999  248.109894  116.277855   
  2021-05-06 12:30:00-04:00  4177.790039  128.740005  247.300003  116.119995   
  2021-05-06 13:30:00-04:00  4170.049805  128.528000  247.210007  115.662003   
  
                                   AMZN       BRK-B        NVDA        META  \
  2021-05-06 09:30:00-04:00  162.928009  284.700012  142.477295  314.369995   
  2021-05-06 10:30:00-04:00  164.271500  286.470001  144.088394  316.820007   
  2021-05-06 11:30:00-04:00  165.490005  287.540009  145.558319  318.700012   
  2021-05-06 12:30:00-04:00  164.750000  286.600006  144.146255  318.720001   
  2021-05-06 13:30:00-04:00  164.341507  287.100006  142.985031  316.776398   
  
                                    JNJ 

In [22]:
# With the default window of 5 this slices data.iloc[0:5], i.e. the first five rows.
tradenv.get_observation(5)

Unnamed: 0,^GSPC,AAPL,MSFT,GOOGL,AMZN,BRK-B,NVDA,META,JNJ,TSLA,...,DEO,RIO,BKNG,GILD,BLK,ADI,CVS,AMT,SCHW,NOW
2021-05-06 09:30:00-04:00,4153.779785,127.830002,245.429993,115.211754,162.928009,284.700012,142.477295,314.369995,166.904999,219.678604,...,180.259995,90.309998,2260.0,65.790001,851.097107,154.020004,82.379997,244.587494,70.169998,476.079987
2021-05-06 10:30:00-04:00,4170.970215,128.470093,247.149994,115.757492,164.2715,286.470001,144.088394,316.820007,167.050003,219.623291,...,180.610001,91.029999,2264.129883,65.790001,858.934998,155.884995,82.779999,245.399994,70.675003,479.662811
2021-05-06 11:30:00-04:00,4186.209961,129.139999,248.109894,116.277855,165.490005,287.540009,145.558319,318.700012,166.889999,223.110001,...,181.199997,91.300003,2295.495117,65.875,862.125,156.009995,82.93,245.354996,70.904999,481.464996
2021-05-06 12:30:00-04:00,4177.790039,128.740005,247.300003,116.119995,164.75,286.600006,144.146255,318.720001,166.835007,221.330002,...,181.100098,91.410004,2286.120117,65.860001,861.72998,155.104996,83.150002,244.850006,70.709999,479.040009
2021-05-06 13:30:00-04:00,4170.049805,128.528,247.210007,115.662003,164.341507,287.100006,142.985031,316.776398,167.080002,218.677902,...,181.053802,90.980003,2274.88501,65.980003,865.039978,154.029999,83.299896,243.899994,70.639999,475.016998


In [None]:
class Policy:
    """Tabular epsilon-greedy Q-learning policy over ``N_ACTIONS`` discrete actions.

    Q-values are kept in ``self.q_val``, a dict mapping ``str(state)`` to a
    NumPy vector holding one value per action. :meth:`exploit` turns action
    ``a`` into the two weights ``[a / 10, 1 - a / 10]``.

    Parameters
    ----------
    window : int
        Window length forwarded to ``get_state`` and used to slice
        ``next_state`` in :meth:`update_value`.
    exploration : float
        Probability of drawing a uniformly random action in
        :meth:`chooseAction`.
    lr : float
        Accepted for backward compatibility but never read;
        :meth:`update_value` takes its own ``alpha`` step size.

    Notes
    -----
    :meth:`update_value`, :meth:`train` and :meth:`exploit` rely on the
    module-level helpers ``reward`` and ``get_state`` that are defined
    outside this class.
    """

    # Size of the discrete action grid (actions 0 .. N_ACTIONS - 1).
    N_ACTIONS = 11

    def __init__(self, window=2, exploration=0.1, lr=0.2):
        self.expl = exploration
        self.q_val = {}
        self.window = window

    def _q_row(self, state):
        """Return the Q-vector of ``state``, creating a zero vector on first use."""
        return self.q_val.setdefault(str(state), np.zeros(self.N_ACTIONS))

    def chooseAction(self, state):
        """Pick an action for ``state`` with an epsilon-greedy rule.

        With probability ``self.expl`` a uniformly random action is returned;
        otherwise the action with the largest Q-value. Ties are resolved in
        favour of the highest action index, so a never-seen state (all zeros)
        yields ``N_ACTIONS - 1``. The state is registered in ``self.q_val``
        in both cases.
        """
        if np.random.uniform() < self.expl:
            self._q_row(state)
            return np.random.randint(low=0, high=self.N_ACTIONS)

        q_row = self._q_row(state)
        best_action = 0
        best_value = -np.inf
        for action in range(self.N_ACTIONS):
            # ``>=`` keeps the last maximiser.
            if q_row[action] >= best_value:
                best_action = action
                best_value = q_row[action]
        return best_action

    def update_value(self, state, action, next_state, alpha=0.1):
        """Apply one Q-learning update to ``Q(state, action)``.

        The target is ``reward(action, next_ret)[0, 0] + max Q(next_state)``
        (no discount factor is applied) and ``alpha`` is the step size.
        ``next_ret`` is every ``window``-th element of ``next_state`` starting
        at index ``window - 1``, reshaped to a column vector. Both states are
        registered with zero Q-values if they have not been seen yet.
        """
        next_ret = np.array([next_state[self.window - 1::self.window]]).reshape(-1, 1)
        q_row = self._q_row(state)
        q_next = self._q_row(next_state)
        target = reward(action, next_ret)[0, 0] + np.max(q_next)
        q_row[action] += alpha * (target - q_row[action])

    def train(self, returns, train_maxtime=np.inf):
        """Run one pass of Q-learning over the columns of ``returns``.

        Time runs along axis 1 of ``returns``. Steps ``window`` to
        ``train_maxtime - 2`` are visited, with ``train_maxtime`` capped at
        ``returns.shape[1]``.
        """
        train_maxtime = min(train_maxtime, returns.shape[1])
        for t in range(self.window, train_maxtime - 1):
            current_state = get_state(returns, t, self.window)
            current_action = self.chooseAction(current_state)
            next_state = get_state(returns, t + 1, self.window)
            self.update_value(state=current_state, action=current_action, next_state=next_state)

    def exploit(self, returns):
        """Return the greedy portfolio weights for every time step.

        The result has shape ``(returns.shape[0], T)`` with
        ``T = returns.shape[1] - window - 1``. Each column is
        ``[a / 10, 1 - a / 10]`` for the greedy action ``a``, so ``returns``
        must have exactly two rows. States that were never visited during
        training fall back to action 0.
        """
        time = returns.shape[1] - self.window - 1
        port_weight = np.zeros((returns.shape[0], time))
        scale = float(self.N_ACTIONS - 1)
        for t in range(time):
            current_state = get_state(returns, self.window + t, self.window)
            q_row = self.q_val.get(str(current_state))
            # Do not hand ``None`` to np.argmax for unseen states.
            act = 0 if q_row is None else np.argmax(q_row)
            port_weight[:, t] = np.array([act / scale, 1 - act / scale])

        return port_weight

    def savePolicy(self, name_file='policy'):
        """Pickle the Q-table (``self.q_val``) to ``name_file``."""
        with open(name_file, 'wb') as fw:
            pickle.dump(self.q_val, fw)

    def loadPolicy(self, name_file):
        """Replace the Q-table with the one pickled in ``name_file``.

        WARNING: ``pickle.load`` can execute arbitrary code; only load files
        that come from a trusted source.
        """
        with open(name_file, 'rb') as fw:
            self.q_val = pickle.load(fw)
