In [17]:
import numpy as np
import pandas as pd
import math 

In [44]:
class BSEnv:
    '''A simple discrete time BS environment'''
    
    def __init__(self, mu, sigma, r, T, delta_t, V_0=100): #S_0=100
        #self.S_0     = S_0      # initial stock price
        self.mu      = mu       # expected stock return
        self.sigma   = sigma    # stock standard deviation
        self.r       = r        # risk-free interest rate
        self.T       = T        # investment horizon
        self.delta_t = delta_t  # time-step size
        self.action_space = np.array([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])   #investment in stock
        self.n_actions = len(self.action_space)
        
        self.t      = 0         # time
        self.V_t    = V_0       # initial wealth
        self.state  = (0, V_0)  # initial state
        self.reward = 0         # initial reward
        
        self.wealth_bins = None
        
        
    #def sample_trajectory(self):
    #    returns      = np.random.normal(loc=self.delta_t * self.mu, scale=math.sqrt(self.delta_t) * self.sigma, size=math.ceil(self.T / self.delta_t))
    #    returns      = np.insert(returns, 0, 0)
    #    stock_prices = self.S_0 * np.cumprod(1+returns)
    #    return(stock_prices, returns)
    
    
    def rewards_from_prices(self, stock_prices, utility = "log"):
        rewards = np.zeros_like(stock_prices)        # all rewards until last time-step are zero
        if utility == "log":
            rewards[-1] = np.log(stock_prices[-1])   # last reward R_T = U(S_T)
        else:
            raise ValueError\
            ('utility function must be one of the following: log')
        return(rewards)
    
    
    def reset(self):
        '''Resets the environment to the state (t=0, V_0)'''
        
        self.t = 0
        self.state = (0, V_0)
        self.reward = 0
        return(self.state)
    
    
    def step(self, action):
        '''Computes one step in BS Environment.'''
        
        self.t    += self.delta_t                   # Update time
        self.V_t  *= action * (np.random.normal(loc=self.delta_t * self.mu, scale=math.sqrt(self.delta_t) * self.sigma) - self.r) + (1 + self.r)  # Update Wealth (see notes)
        #self.state = (self.t, self.V_t)             # Update state
        self.state = self.get_state(50, 150, 5)
        
        reward = np.log(self.V_t)*(self.t==self.T)  # Get reward according to log utility at terminal time step
        done   = self.t==self.T                    # End of investment period
        return(self.state, reward, done, self.t, self.V_t)
    
    
    def get_state(self, lower, upper, delta_bin):
        '''Computes the discrete state (t, V_t), for continuous t'''
        
        if self.wealth_bins is None:
            # Create bins (0, lower] < (lower, lower + delta_bin] < (lower + delta_bin, lower + 2*delta_bin] < ... < (upper, inf]
            self.wealth_bins = [0] + np.arange(lower, upper, delta_bin).tolist() + [float('Inf')] 
            
        return((self.t, pd.cut(x=[self.V_t], bins=self.wealth_bins, retbins=False)))
        

In [45]:
model = BSEnv(mu=0.05, sigma=0.2, r=0.01, T=5, delta_t=0.5, V_0=100)

#stock_prices, returns = model.sample_trajectory()
#rewards = model.rewards_from_prices(stock_prices, utility = "log")

#print(returns)
#print(stock_prices)
#print(rewards)
#print(len(returns), len(stock_prices), len(rewards))
print(model.action_space)
new_state, reward, done, next_t, next_Wealth = model.step(0.1)
print(new_state, reward, done, next_t, next_Wealth)
new_state, reward, done, next_t, next_Wealth = model.step(0.1)
print(new_state, reward, done, next_t, next_Wealth)
new_state, reward, done, next_t, next_Wealth = model.step(0.1)
print(new_state, reward, done, next_t, next_Wealth)
new_state, reward, done, next_t, next_Wealth = model.step(0.1)
print(new_state, reward, done, next_t, next_Wealth)
new_state, reward, done, next_t, next_Wealth = model.step(0.1)
print(new_state, reward, done, next_t, next_Wealth)
new_state, reward, done, next_t, next_Wealth = model.step(0.1)
print(new_state, reward, done, next_t, next_Wealth)
new_state, reward, done, next_t, next_Wealth = model.step(0.1)
print(new_state, reward, done, next_t, next_Wealth)
new_state, reward, done, next_t, next_Wealth = model.step(0.1)
print(new_state, reward, done, next_t, next_Wealth)
new_state, reward, done, next_t, next_Wealth = model.step(0.1)
print(new_state, reward, done, next_t, next_Wealth)
new_state, reward, done, next_t, next_Wealth = model.step(0.1)
print(new_state, reward, done, next_t, next_Wealth)
model.get_state(50, 150, 20)

[0.  0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]
(0.5, [(100.0, 105.0]]
Categories (21, interval[float64]): [(0.0, 50.0] < (50.0, 55.0] < (55.0, 60.0] < (60.0, 65.0] ... (130.0, 135.0] < (135.0, 140.0] < (140.0, 145.0] < (145.0, inf]]) 0.0 False 0.5 103.53372785878672
(1.0, [(100.0, 105.0]]
Categories (21, interval[float64]): [(0.0, 50.0] < (50.0, 55.0] < (55.0, 60.0] < (60.0, 65.0] ... (130.0, 135.0] < (135.0, 140.0] < (140.0, 145.0] < (145.0, inf]]) 0.0 False 1.0 103.34773306267283
(1.5, [(105.0, 110.0]]
Categories (21, interval[float64]): [(0.0, 50.0] < (50.0, 55.0] < (55.0, 60.0] < (60.0, 65.0] ... (130.0, 135.0] < (135.0, 140.0] < (140.0, 145.0] < (145.0, inf]]) 0.0 False 1.5 105.65285294497387
(2.0, [(105.0, 110.0]]
Categories (21, interval[float64]): [(0.0, 50.0] < (50.0, 55.0] < (55.0, 60.0] < (60.0, 65.0] ... (130.0, 135.0] < (135.0, 140.0] < (140.0, 145.0] < (145.0, inf]]) 0.0 False 2.0 105.00599658844611
(2.5, [(105.0, 110.0]]
Categories (21, interval[float64]): [(0.0, 50.0] < 

(5.0,
 [(110.0, 115.0]]
 Categories (21, interval[float64]): [(0.0, 50.0] < (50.0, 55.0] < (55.0, 60.0] < (60.0, 65.0] ... (130.0, 135.0] < (135.0, 140.0] < (140.0, 145.0] < (145.0, inf]])

In [51]:
#pd.cut(x = df['height'],
#                        bins = [0,25,50,100,200], 
#                        labels = [0, 1, 2,3])
x = pd.cut(x=[15], bins=[0] + np.arange(11, 17, 2).tolist() + [float('Inf')], retbins=False, labels=False)
x.mean()

2.0