In [1]:
import gym 
import itertools 
import matplotlib 
import matplotlib.style 
import numpy as np 
import math
import pandas as pd 
import sys 
import os
  
from collections import defaultdict 
#import plotting 
  
matplotlib.style.use('ggplot') 

In [2]:
gym.make('bs_env-v0')

UnregisteredEnv: No registered env with id: bs_env-v0

In [5]:
class BSEnv:
    '''A simple discrete time BS environment'''
    
    def __init__(self, mu, sigma, r, T, delta_t, V_0=100): #S_0=100
        #self.S_0     = S_0      # initial stock price
        self.mu      = mu       # expected stock return
        self.sigma   = sigma    # stock standard deviation
        self.r       = r        # risk-free interest rate
        self.T       = T        # investment horizon
        self.delta_t = delta_t  # time-step size
        self.V_0     = V_0
        self.action_space = np.array([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])   #investment in stock
        self.n_actions = len(self.action_space)
        
        self.t      = 0         # time
        self.V_t    = V_0       # initial wealth
        self.state  = (0, V_0)  # initial state
        self.reward = 0         # initial reward
        
        self.wealth_bins = None
        
        
    #def sample_trajectory(self):
    #    returns      = np.random.normal(loc=self.delta_t * self.mu, scale=math.sqrt(self.delta_t) * self.sigma, size=math.ceil(self.T / self.delta_t))
    #    returns      = np.insert(returns, 0, 0)
    #    stock_prices = self.S_0 * np.cumprod(1+returns)
    #    return(stock_prices, returns)
    
    
    def rewards_from_prices(self, stock_prices, utility = "log"):
        rewards = np.zeros_like(stock_prices)        # all rewards until last time-step are zero
        if utility == "log":
            rewards[-1] = np.log(stock_prices[-1])   # last reward R_T = U(S_T)
        else:
            raise ValueError\
            ('utility function must be one of the following: log')
        return(rewards)
    
    
    def reset(self):
        '''Resets the environment to the state (t=0, V_0)'''
        
        self.t = 0
        self.state = (0, self.V_0)
        self.reward = 0
        return(self.state)
    
    
    def step(self, action):
        '''Computes one step in BS Environment.'''
        
        self.t    += self.delta_t                   # Update time
        self.V_t  *= action * (np.random.normal(loc=self.delta_t * self.mu, scale=math.sqrt(self.delta_t) * self.sigma) - self.r) + (1 + self.r)  # Update Wealth (see notes)
        #self.state = (self.t, self.V_t)             # Update state
        self.state = self.get_state(50, 150, 5)
        
        reward = np.log(self.V_t)*(self.t==self.T)  # Get reward according to log utility at terminal time step
        done   = self.t==self.T                    # End of investment period
        return(self.state, reward, done, self.t, self.V_t)
    
    
    def get_state(self, lower, upper, delta_bin):
        '''Computes the discrete state (t, V_t), for continuous t'''
        
        if self.wealth_bins is None:
            # Create bins (0, lower] < (lower, lower + delta_bin] < (lower + delta_bin, lower + 2*delta_bin] < ... < (upper, inf]
            self.wealth_bins = [0] + np.arange(lower, upper, delta_bin).tolist() + [float('Inf')] 
            
        return((self.t, pd.cut(x=[self.V_t], bins=self.wealth_bins, labels=False)))

In [6]:
def createEpsilonGreedyPolicy(Q, epsilon, num_actions): 
    """ 
    Creates an epsilon-greedy policy based 
    on a given Q-function and epsilon. 
       
    Returns a function that takes the state 
    as an input and returns the probabilities 
    for each action in the form of a numpy array  
    of length of the action space(set of possible actions). 
    """
    def policyFunction(state): 
   
        Action_probabilities = np.ones(num_actions, 
                dtype = float) * epsilon / num_actions 
                  
        best_action = np.argmax(Q[state]) 
        Action_probabilities[best_action] += (1.0 - epsilon) 
        return Action_probabilities 
   
    return policyFunction

In [7]:
def qLearning(env, num_episodes, discount_factor = 1.0, 
                            alpha = 0.6, epsilon = 0.1): 
    """ 
    Q-Learning algorithm: Off-policy TD control. 
    Finds the optimal greedy policy while improving 
    following an epsilon-greedy policy"""
       
    # Action value function 
    # A nested dictionary that maps 
    # state -> (action -> action-value). 
    Q = defaultdict(lambda: np.zeros(env.n_actions)) 
   
    # Keeps track of useful statistics 
    #stats = plotting.EpisodeStats( 
    #    episode_lengths = np.zeros(num_episodes), 
    #    episode_rewards = np.zeros(num_episodes))     
      
    # Create an epsilon greedy policy function 
    # appropriately for environment action space 
    policy = createEpsilonGreedyPolicy(Q, epsilon, env.n_actions) 
       
    # For every episode 
    for ith_episode in range(num_episodes): 
           
        # Reset the environment and pick the first action 
        state = env.reset() 
           
        for t in itertools.count(): 
               
            # get probabilities of all actions from current state 
            action_probabilities = policy(state) 
   
            # choose action according to  
            # the probability distribution 
            action = np.random.choice(env.action_space, 
                                      p = action_probabilities) 
   
            # take action and get reward, transit to next state 
            next_state, reward, done, _, _ = env.step(action) 
   
            # Update statistics 
            #stats.episode_rewards[ith_episode] += reward 
            #stats.episode_lengths[ith_episode] = t 
               
            # TD Update 
            best_next_action = np.argmax(Q[next_state])     
            td_target = reward + discount_factor * Q[next_state][best_next_action] 
            td_delta = td_target - Q[state][action] 
            Q[state][action] += alpha * td_delta 
   
            # done is True if episode terminated    
            if done: 
                break
                   
            state = next_state 
       
    return Q, stats

In [8]:
env = BSEnv(mu=0.05, sigma=0.2, r=0.01, T=5, delta_t=0.5, V_0=100)

Q, stats = qLearning(env, 10) 

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [3]:
#import gym
from gym import spaces

class BSEnv(gym.Env):
    '''Custom discrete-time Black-Scholes environment'''
    metadata = {'render.modes': ['human']}
    
    def __init__(self, mu, sigma):
        super().__init__()
        self.mu    = mu
        self.sigma = sigma
        
        # Actions
        