In [None]:
# !pip install tensorflow==2.6


In [None]:
# !pip install stable_baselines

In [None]:
import numpy as np
import pandas as pd
from gym.utils import seeding
import gym
from gym import spaces
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import pickle
import tensorflow
import time


In [None]:
# from stable_baselines import A2C

from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy
from stable_baselines.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise, AdaptiveParamNoiseSpec
from stable_baselines.common.vec_env import DummyVecEnv


In [None]:

# Trade-size scale: each action component in [-1, 1] is multiplied by this
# number of shares before being executed.
HMAX_NORMALIZE = 100
# Cash held in the account at the start of every episode.
INITIAL_ACCOUNT_BALANCE = 10 ** 8
# Number of stocks tracked by the environment (length of each state segment).
STOCK_DIM = 236
# Proportional fee charged on the traded value of every buy and sell
# (0.001 == 0.1 % of the traded value).
TRANSACTION_FEE_PERCENT = 1e-3
# Multiplier applied to the raw per-step change in portfolio value before it
# is handed back as the reward.
REWARD_SCALING = 1e-4

class StockEnvTrain(gym.Env):
    """Gym environment that simulates trading a fixed basket of stocks.

    The observation is a flat list of ``1 + 3 * STOCK_DIM`` values laid out as
    ``[cash balance] + [close price per stock] + [shares held per stock] +
    [percent_good_subfund per stock]``.

    An action is a vector of ``STOCK_DIM`` values in ``[-1, 1]``.  Each entry is
    multiplied by ``HMAX_NORMALIZE`` to get the number of shares to sell
    (negative entries) or buy (positive entries) for the matching stock.

    Parameters
    ----------
    df : pandas.DataFrame
        Market data indexed by an integer day number, with ``close`` and
        ``percent_good_subfund`` columns.  Every day is expected to contribute
        ``STOCK_DIM`` rows in a consistent stock order, because the state is
        addressed purely by position.
    day : int, optional
        Day loaded on construction.  ``reset`` always rewinds to day 0.
    verbose : bool, optional
        When true, print per-step and end-of-episode diagnostics.  Off by
        default because three prints per step overwhelm notebook output
        during training.
    """
    metadata = {'render.modes': ['human']}

    def __init__(self, df, day = 0, verbose = False):
        self.day = day
        self.df = df
        self.verbose = verbose

        # One action component per stock, normalised to [-1, 1].
        self.action_space = spaces.Box(low = -1, high = 1, shape = (STOCK_DIM,))
        # [balance] + [prices] + [owned shares] + [feature]; derived from
        # STOCK_DIM so the two can never drift apart.
        self.observation_space = spaces.Box(low=0, high=np.inf,
                                            shape = (1 + 3 * STOCK_DIM,))
        # Rows of the dataframe that belong to the current day.
        self.data = self.df.loc[self.day,:]
        self.terminal = False
        self.state = self._initial_state()

        self.reward = 0
        # Accumulated transaction fees for the current episode.
        self.cost = 0
        # Portfolio value after every step, starting with the initial cash.
        self.asset_memory = [INITIAL_ACCOUNT_BALANCE]
        # Unscaled per-step rewards of the current episode.
        self.rewards_memory = []
        self.trades = 0
        self._seed()

    def _initial_state(self):
        """Return the start-of-episode state for ``self.data``: full cash, no shares."""
        return [INITIAL_ACCOUNT_BALANCE] + \
               self.data.close.values.tolist() + \
               [0]*STOCK_DIM + \
               self.data.percent_good_subfund.values.tolist()

    def _total_asset(self):
        """Return cash plus the market value of all held shares for ``self.state``."""
        prices = np.array(self.state[1:(STOCK_DIM+1)])
        shares = np.array(self.state[(STOCK_DIM+1):(STOCK_DIM*2+1)])
        return self.state[0] + np.dot(prices, shares)

    def _sell_stock(self, index, action):
        """Sell up to ``abs(action)`` shares of stock ``index``, limited by the holding.

        Does nothing when no shares of that stock are held.
        """
        held = self.state[index+STOCK_DIM+1]
        if held <= 0:
            return

        # Fix the quantity once, *before* the holding is reduced, so that the
        # proceeds and the recorded fee refer to the same number of shares.
        quantity = min(abs(action), held)
        traded_value = self.state[index+1] * quantity

        self.state[0] += traded_value * (1 - TRANSACTION_FEE_PERCENT)
        self.state[index+STOCK_DIM+1] -= quantity
        self.cost += traded_value * TRANSACTION_FEE_PERCENT
        self.trades += 1

    def _buy_stock(self, index, action):
        """Buy up to ``action`` shares of stock ``index``, limited by available cash.

        The affordable quantity accounts for the transaction fee so the cash
        balance cannot become negative.  Nothing happens when the price is not
        positive or when no share can be afforded.
        """
        price = self.state[index+1]
        # `not price > 0` also rejects NaN prices.
        if not price > 0:
            return

        affordable = self.state[0] // (price * (1 + TRANSACTION_FEE_PERCENT))
        quantity = min(affordable, action)
        if quantity <= 0:
            return

        traded_value = price * quantity
        self.state[0] -= traded_value * (1 + TRANSACTION_FEE_PERCENT)
        self.state[index+STOCK_DIM+1] += quantity
        self.cost += traded_value * TRANSACTION_FEE_PERCENT
        self.trades += 1

    def _print_episode_summary(self):
        """Print end-of-episode diagnostics (only used when ``verbose`` is set)."""
        returns = pd.Series(self.asset_memory).pct_change(1)
        volatility = returns.std()
        # NOTE(review): the sqrt(252) factor annualises *daily* returns; confirm
        # it matches the spacing of the rows in `df`.
        if volatility > 0:
            sharpe = (252**0.5) * returns.mean() / volatility
        else:
            sharpe = float('nan')
        print("end_total_asset:{}".format(self._total_asset()))
        print("total_cost: ", self.cost)
        print("total_trades: ", self.trades)
        print("Sharpe: ", sharpe)
        print("=================================")

    def step(self, actions):
        """Execute one trading step and advance to the next day.

        Parameters
        ----------
        actions : array-like of length ``STOCK_DIM``
            Normalised trade sizes; negative sells, positive buys.

        Returns
        -------
        tuple
            ``(state, reward, terminal, info)``.  ``reward`` is the change in
            total portfolio value over the step multiplied by
            ``REWARD_SCALING``.  Once the last day has been reached no trade is
            executed: the state and the previously stored reward are returned
            unchanged with ``terminal`` set to true.
        """
        self.terminal = self.day >= len(self.df.index.unique())-1

        if self.terminal:
            if self.verbose:
                self._print_episode_summary()
            return self.state, self.reward, self.terminal, {}

        # np.asarray guards against a plain list, for which `* 100` would
        # repeat the list instead of scaling it.
        actions = np.asarray(actions, dtype=float) * HMAX_NORMALIZE

        begin_total_asset = self._total_asset()

        # Sells run first (largest sell first) to free up cash, then buys
        # (largest buy first).
        argsort_actions = np.argsort(actions)
        sell_index = argsort_actions[:np.where(actions < 0)[0].shape[0]]
        buy_index = argsort_actions[::-1][:np.where(actions > 0)[0].shape[0]]

        for index in sell_index:
            self._sell_stock(index, actions[index])

        for index in buy_index:
            self._buy_stock(index, actions[index])

        self.day += 1
        self.data = self.df.loc[self.day,:]

        # Carry cash and holdings over; refresh prices and the feature.
        holdings = list(self.state[(STOCK_DIM+1):(STOCK_DIM*2+1)])
        if self.verbose:
            print("stock_shares:{}".format(holdings))
        self.state = [self.state[0]] + \
                     self.data.close.values.tolist() + \
                     holdings + \
                     self.data.percent_good_subfund.values.tolist()

        end_total_asset = self._total_asset()
        self.asset_memory.append(end_total_asset)

        raw_reward = end_total_asset - begin_total_asset
        self.rewards_memory.append(raw_reward)
        self.reward = raw_reward * REWARD_SCALING

        if self.verbose:
            print("end_total_asset:{}".format(end_total_asset))
            print("step_reward:{}".format(raw_reward))

        return self.state, self.reward, self.terminal, {}

    def reset(self):
        """Rewind to day 0 with the initial cash balance and return the first state."""
        self.asset_memory = [INITIAL_ACCOUNT_BALANCE]
        self.day = 0
        self.data = self.df.loc[self.day,:]
        self.cost = 0
        self.trades = 0
        self.terminal = False
        self.rewards_memory = []
        # Clear the stored reward so a stale value from the previous episode
        # cannot be returned by an immediately-terminal step.
        self.reward = 0
        self.state = self._initial_state()
        return self.state

    def render(self, mode='human'):
        """Return the current state list; nothing is drawn."""
        return self.state

    def _seed(self, seed=None):
        """Create the environment's random generator and return ``[seed]``."""
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

In [None]:

def train_A2C(env_train, timesteps=10000):
    """Train a Stable Baselines A2C agent with an MLP policy and return it.

    Parameters
    ----------
    env_train : gym.Env or VecEnv
        Environment to train on.
    timesteps : int, optional
        Total number of environment steps passed to ``model.learn``.

    Returns
    -------
    The trained A2C model.  The elapsed wall-clock time is printed in minutes.
    """
    # Imported locally: the notebook-level `from stable_baselines import A2C`
    # is commented out, which would leave `A2C` undefined here (NameError).
    from stable_baselines import A2C

    start = time.time()
    model = A2C('MlpPolicy', env_train, verbose=1)
    model.learn(total_timesteps=timesteps)
    end = time.time()
    print('Training time (A2C): ', (end-start)/60,' minutes')
    return model

In [None]:
# Read the price/feature table and index it by the integer "nums" column, so
# that `.loc[k, :]` selects every row sharing label k (presumably one date per
# label - verify against the CSV).
pool_prices = (
    pd.read_csv('/content/test_data_date_index.csv')
    .set_index("nums")
)

In [None]:
# `merged_pools` is a second name for the same DataFrame object (no copy is made).
merged_pools = pool_prices
# Show the dtype of every column as the cell output.
merged_pools.dtypes

stock_id                 object
percent_good_subfund    float64
date                     object
close                   float64
dtype: object

In [None]:
# Display the frame as the cell output to inspect its index and columns.
merged_pools

Unnamed: 0_level_0,stock_id,percent_good_subfund,date,close
nums,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,001055102,2.978489,2013-09-30,30.995
0,00206R102,7.539118,2013-09-30,33.820
0,002824100,4.139970,2013-09-30,33.190
0,00287Y109,3.548546,2013-09-30,44.730
0,00507V109,0.426743,2013-09-30,16.680
...,...,...,...,...
18,98978V103,3.548546,2020-03-31,117.690
18,G1151C101,7.552870,2020-03-31,163.260
18,G29183103,2.140992,2020-03-31,77.690
18,N6596X109,1.435045,2020-03-31,82.930


In [None]:
# Build the trading environment on the loaded data, starting at the default day 0.
env = StockEnvTrain(merged_pools)


In [None]:
import random
from IPython.display import clear_output
from time import sleep


def qlearning(env, train_episodes=2000, max_steps=100, n_action_levels=11,
              alpha=0.7, discount_factor=0.618, verbose=False):
    """Train a tabular, epsilon-greedy Q-learning agent on ``env``.

    A Q table needs integer row and column indices, but ``env`` exposes a
    continuous observation vector and a continuous action vector; indexing the
    table with either raises ``IndexError``.  Both are therefore discretised:

    * **state**  - the step number within the episode (``0 .. max_steps``).
      This is a deliberate coarsening: it ignores the cash balance and the
      holdings contained in the observation.
    * **action** - one of ``n_action_levels`` evenly spaced levels between the
      action space's ``low`` and ``high`` bounds; the chosen level is applied
      to every component of the action vector.

    Parameters
    ----------
    env : gym.Env
        Environment with a Box-like ``action_space`` (``low`` / ``high``
        arrays) and the classic ``reset()`` / ``step(action)`` API that
        returns ``(state, reward, done, info)``.
    train_episodes : int, optional
        Number of training episodes.
    max_steps : int, optional
        Upper bound on the number of steps per episode.
    n_action_levels : int, optional
        Number of discrete action levels (at least 2).  An odd count keeps the
        midpoint of the range available as an action.
    alpha : float, optional
        Learning rate.
    discount_factor : float, optional
        Discount applied to the best next-state value.
    verbose : bool, optional
        When true, print every transition.

    Returns
    -------
    numpy.ndarray
        The learned Q table of shape ``(max_steps + 1, n_action_levels)``.

    Raises
    ------
    ValueError
        If ``train_episodes`` or ``max_steps`` is below 1, or
        ``n_action_levels`` is below 2.
    """
    if train_episodes < 1 or max_steps < 1:
        raise ValueError("train_episodes and max_steps must be at least 1")
    if n_action_levels < 2:
        raise ValueError("n_action_levels must be at least 2")

    print("Action Space {}".format(env.action_space))
    print("State Space {}".format(env.observation_space))

    # Exploration schedule: start fully random and decay towards min_epsilon.
    epsilon = 1
    max_epsilon = 1
    min_epsilon = 0.01
    decay = 0.01

    # Row k of `action_table` is the full action vector for discrete action k.
    low = np.asarray(env.action_space.low, dtype=float)
    high = np.asarray(env.action_space.high, dtype=float)
    fractions = np.linspace(0.0, 1.0, n_action_levels)
    action_table = low + fractions[:, None] * (high - low)

    # One extra row so the state after the final permitted step can be read.
    Q = np.zeros((max_steps + 1, n_action_levels))
    training_rewards = []

    for episode in range(train_episodes):
        env.reset()
        total_training_rewards = 0

        for state in range(max_steps):
            # Exploit the current estimate with probability 1 - epsilon,
            # otherwise explore with a uniformly random discrete action.
            if random.uniform(0, 1) > epsilon:
                action = int(np.argmax(Q[state, :]))
            else:
                action = random.randrange(n_action_levels)

            _, reward, done, info = env.step(action_table[action])
            new_state = state + 1

            # No bootstrapping from beyond the end of the episode.
            future_value = 0.0 if done else np.max(Q[new_state, :])
            Q[state, action] = Q[state, action] + alpha * (
                reward + discount_factor * future_value - Q[state, action])

            if verbose:
                print("step {} action {} reward {}".format(state, action, reward))

            total_training_rewards += reward

            if done:
                break

        # Cut down on exploration as training progresses.
        epsilon = min_epsilon+(max_epsilon-min_epsilon)*np.exp(-decay*episode)
        training_rewards.append(total_training_rewards)

    print ("Training score over time: " + str(sum(training_rewards)/train_episodes))
    return Q


In [None]:
# Run the Q-learning training loop on the environment created earlier.
qlearning(env)

Action Space Box(-1.0, 1.0, (236,), float32)
State Space Box(0.0, inf, (709,), float32)
stock_shares:[63.6244, 13.8, 30.19, 35.2733, 100.02, 28.5733, 37.5, 225.435, 270.62, 117.5, 11.74, 46.28, 31.75, 45.23, 23.13, 73.88, 25.74, 56.75, 37.32, 68.74, 52.15, 32.64, 83.4, 15.99, 23.97, 121.5, 23.430999999999997, 48.51, 81.72, 37.88, 41.06, 59.3, 22.5575, 39.31, 69.51, 57.4, 15.895, 14.59, 115.17, 132.87, 52.4507, 41.6353, 56.9, 81.39, 23.59, 42.64, 53.1, 64.49, 43.9793, 56.46, 62.48, 60.1562, 66.78, 98.76, 34.815, 25.55, 64.7, 86.04, 50.23, 25.135, 114.11, 18.05, 25.2625, 110.16, 16.87, 20.26, 41.63, 23.8127, 35.97, 80.89, 62.87, 25.54, 158.21, 42.75, 47.14, 51.66, 92.5, 75.85, 81.9501, 21.06, 65.25, 76.27, 38.15, 22.921, 185.18, 44.4297, 17.18, 66.31, 125.42299999999999, 51.69, 86.69, 19.86, 109.36, 58.73, 11.4, 90.2375, 35.57, 20.17, 61.1, 31.87, 99.14, 51.2, 66.42, 69.9, 50.33, 127.55, 47.61, 20.44, 32.16, 43.55, 42.06, 18.998, 67.278, 29.82, 64.7, 96.21, 47.608999999999995, 41.9433, 3

IndexError: ignored