In [115]:
import gym
import numpy as np
from gym import spaces
import pandas as pd
import math
from collections import defaultdict
import itertools

In [79]:
# Action grid: the candidate values 0.0, 0.1, ..., 0.9 an agent can pick from
# (np.arange excludes the stop value, so 1.0 is not part of the grid).
actions = np.arange(start=0, stop=1, step=0.1)

def encode_action(action, actions):
    """Return the position of ``action`` inside the ``actions`` grid.

    An exact match is preferred. If there is none, the lookup falls back to a
    tolerance-based comparison (``np.isclose`` defaults), because grids built
    with ``np.arange`` carry rounding error: ``np.arange(0, 1, 0.1)[3]`` is
    ``0.30000000000000004``, so a plain ``==`` test would reject ``0.3``.

    Parameters
    ----------
    action : scalar
        The action value to look up.
    actions : array-like
        One-dimensional collection of admissible action values.

    Returns
    -------
    int
        Index of the first matching entry.

    Raises
    ------
    IndexError
        If no entry of ``actions`` equals (or is numerically close to) ``action``.
    """
    grid = np.asarray(actions)
    matches = np.where(grid == action)[0]
    if matches.size == 0:
        # No bit-exact hit: retry with a floating-point tolerance.
        matches = np.where(np.isclose(grid, action))[0]
    if matches.size == 0:
        raise IndexError("action {!r} is not contained in the action grid".format(action))
    return int(matches[0])

def decode_action(action, actions):
    """Map an action index back to its value.

    ``action`` is used directly as the subscript into ``actions``; whatever
    ``actions[action]`` evaluates to is returned unchanged.
    """
    selected = actions[action]
    return selected

# Wealth discretisation: bin edges start at 0, then run from `lower` to `upper`
# in steps of `delta_bin`, and finish with an infinite edge so the last bin is
# open-ended.
lower, upper, delta_bin = 50, 150, 10
wealth_bins = [
    0,
    *np.arange(lower, upper + 1, delta_bin).tolist(),  # upper + 1: np.arange excludes its stop value
    float('Inf'),
]

def encode_wealth(wealth, wealth_bins):
    """Return the index of the wealth bin that contains ``wealth``.

    The scalar is wrapped in a one-element list and binned with ``pd.cut``.
    Bins are left-closed / right-open (``right=False``) and the integer bin
    code is returned instead of an interval label (``labels=False``).
    Per the pandas documentation, a value outside all bins comes back as NaN.
    """
    bin_codes = pd.cut(x=[wealth], bins=wealth_bins, labels=False, right=False)
    return bin_codes[0]

#def decode_wealth(wealth):
    #To-Do

In [99]:
class BSEnv(gym.Env):
    '''Custom discrete-time Black-Scholes environment with one risky asset and a bank account.

    An episode runs for ``num_timesteps = T / dt`` steps. In every step the
    agent picks an index into ``actions``; the decoded value is the fraction of
    wealth invested in the risky asset. The observation is the tuple
    ``(time_state, wealth_bin)`` and the only non-zero reward is ``log(V_T)``,
    paid on the final step.

    Randomness is drawn from the global NumPy generator (``np.random.normal``),
    so seed it with ``np.random.seed`` for reproducible runs.
    '''
    metadata = {'render.modes': ['human']}

    def __init__(self, mu, sigma, r, T, dt, V_0, actions, wealth_bins):
        '''Store the model parameters, build the spaces and reset the state.

        Raises:
            ValueError: if ``dt`` is not positive or ``T`` is not a positive
                whole multiple of ``dt``.
        '''
        if dt <= 0:
            raise ValueError("dt must be positive, got {!r}".format(dt))
        # Float-safe divisibility test: divmod(1, 0.1)[1] is 0.0999..., not 0,
        # so compare the rounded step count against T with a tolerance instead.
        num_timesteps = int(round(T / dt))
        if num_timesteps < 1 or not math.isclose(num_timesteps * dt, T):
            raise ValueError(
                "T must be a positive whole multiple of dt, got T={!r}, dt={!r}".format(T, dt))

        super().__init__()
        self.mu    = mu                        # risky asset return
        self.sigma = sigma                     # risky asset volatility
        self.r = r                             # risk-free rate (bank account return, riskless)
        self.T = T                             # termination time
        self.dt = dt                           # time-step size
        self.num_timesteps = num_timesteps     # number of steps per episode (always an int)
        self.V_0 = V_0                         # initial wealth
        self.actions = actions                 # possible actions, fraction of wealth invested in risky asset
        self.num_actions = len(self.actions)   # number of possible actions
        self.wealth_bins = wealth_bins         # bin edges used to discretise wealth

        # Action space: one discrete action per entry of `actions`
        self.action_space = spaces.Discrete(self.num_actions)

        # Observation space:
        #  - time_state takes the values 0..num_timesteps (the terminal
        #    observation included), i.e. num_timesteps + 1 values;
        #  - k bin edges delimit k - 1 wealth bins.
        self.observation_space = spaces.Tuple((
            spaces.Discrete(self.num_timesteps + 1),
            spaces.Discrete(len(self.wealth_bins) - 1)))

        self.reset()

    def step(self, action):
        '''Execute one time step within the environment.

        Returns the usual gym 4-tuple ``(observation, reward, done, info)``.
        '''
        assert self.action_space.contains(action)

        pi_t = self.decode_action(action, self.actions)
        # Wealth update (see notes).
        # NOTE(review): mu and sigma are scaled to the step size dt, but r is
        # applied unscaled per step. Confirm against the notes that r is meant
        # per step rather than per unit time (which would call for r * dt).
        risky_return = np.random.normal(loc=self.dt * self.mu,
                                        scale=math.sqrt(self.dt) * self.sigma)
        self.V_t *= pi_t * (risky_return - self.r) + (1 + self.r)
        self.time_state += 1                              # advance the clock

        done = self.time_state == self.num_timesteps      # episode ends at termination time

        reward = 0                                        # reward is zero for each time step t < T
        if done:                                          # terminal reward R_T = U(V_T) = log(V_T)
            # np.log gives nan / -inf for non-positive wealth, which the
            # Gaussian return above does not strictly rule out.
            reward = np.log(self.V_t)

        return self._get_obs(), reward, done, {}          # {} empty info

    def _get_obs(self):
        '''Current observation: (time index, index of the wealth bin).'''
        return (self.time_state, self.encode_wealth(self.V_t, self.wealth_bins))

    def reset(self):
        '''Reset the state of the environment to an initial state.'''
        self.time_state   = 0                             # setting time to zero
        self.V_t          = self.V_0                      # setting wealth to V_0
        return self._get_obs()

    @staticmethod
    def encode_wealth(wealth, wealth_bins):
        '''Index of the left-closed bin of ``wealth_bins`` that contains ``wealth``.'''
        return pd.cut(x=[wealth], bins=wealth_bins, right=False, labels=False)[0]

    @staticmethod
    def encode_action(action, actions):
        '''Index of ``action`` in ``actions``; exact match first, then within float tolerance.

        Raises:
            IndexError: if ``action`` is not part of ``actions``.
        '''
        grid = np.asarray(actions)
        matches = np.where(grid == action)[0]
        if matches.size == 0:
            # np.arange grids carry rounding error (3 * 0.1 != 0.3)
            matches = np.where(np.isclose(grid, action))[0]
        if matches.size == 0:
            raise IndexError("action {!r} is not contained in the action grid".format(action))
        return int(matches[0])

    @staticmethod
    def decode_action(action, actions):
        '''Value stored at index ``action`` of ``actions``.'''
        return actions[action]
    
    
    #def render(self, mode='human', close=False):
    # Render the environment to the screen
    #...

In [110]:
# BSEnv parameters, spelled out by keyword (same order as the constructor:
# mu, sigma, r, T, dt, V_0, actions, wealth_bins)
env = BSEnv(mu=0.05, sigma=0.2, r=0.02, T=3, dt=0.5, V_0=100,
            actions=actions, wealth_bins=wealth_bins)
#env.reset()
#for i in range(6):
#    obs, reward, done, _ = env.step(1)
#    print("Obervation: {}, Reward: {}, done: {}".format(obs, reward, done))

In [105]:
def createEpsilonGreedyPolicy(Q, epsilon, num_actions):
    """
    Build an epsilon-greedy policy on top of an action-value table.

    Q maps a state to an array of action values, epsilon is the total
    exploration probability and num_actions the size of the action set.

    The returned callable takes a state and gives back a float array of
    length num_actions: every action receives epsilon / num_actions and
    the action with the largest Q-value (first one on ties) receives the
    remaining 1 - epsilon on top. Q is read at call time, so later
    changes to the table are reflected by the policy.
    """
    def policyFunction(state):
        greedy_action = np.argmax(Q[state])

        probabilities = np.ones(num_actions, dtype = float)
        probabilities = probabilities * epsilon / num_actions   # uniform exploration mass
        probabilities[greedy_action] += (1.0 - epsilon)         # exploitation mass
        return probabilities

    return policyFunction

In [117]:
def qLearning(env, num_episodes, discount_factor = 1.0,
                            alpha = 0.6, epsilon = 0.1, log_every = 200):
    """
    Q-Learning algorithm: off-policy TD control.
    Learns the greedy policy while acting epsilon-greedily.

    Parameters:
        env: environment offering reset() -> state,
            step(action) -> (next_state, reward, done, info) and
            action_space.n; states must be hashable.
        num_episodes: number of episodes to run.
        discount_factor: discount applied to the bootstrapped value.
        alpha: learning rate of the TD update.
        epsilon: exploration probability of the behaviour policy.
        log_every: print the last reward and step index of every
            log_every-th episode; 0 or None disables the printing.

    Returns:
        defaultdict mapping state -> array of action values. States that
        were never visited (and terminal states) read as all zeros.
    """
    num_actions = env.action_space.n

    # Action value function: state -> array with one value per action
    Q = defaultdict(lambda: np.zeros(num_actions))

    # Behaviour policy; it reads Q lazily, so it always sees the latest values
    policy = createEpsilonGreedyPolicy(Q, epsilon, num_actions)

    for ith_episode in range(num_episodes):

        state = env.reset()

        for t in itertools.count():

            # sample an action from the epsilon-greedy distribution
            action_probabilities = policy(state)
            action = np.random.choice(len(action_probabilities),
                                      p = action_probabilities)

            # take action and get reward, transit to next state
            next_state, reward, done, _ = env.step(action)

            # TD update. A terminal state has value zero by definition, so
            # there is nothing to bootstrap from once the episode is done;
            # this also keeps terminal states out of the Q table.
            if done:
                td_target = reward
            else:
                td_target = reward + discount_factor * np.max(Q[next_state])
            Q[state][action] += alpha * (td_target - Q[state][action])

            if done:
                break

            state = next_state

        if log_every and ith_episode % log_every == 0:
            print("Reward: {}, t: {}".format(reward, t))

    return Q

In [120]:
# Train the tabular agent on the Black-Scholes environment (long-running cell)
Q = qLearning(env, num_episodes=100000)

Reward: 4.7239859497651695, t: 5
Reward: 4.795516789290654, t: 5
Reward: 4.702699988237905, t: 5
Reward: 4.611031167523755, t: 5
Reward: 4.757220200773751, t: 5
Reward: 4.699582053241323, t: 5
Reward: 4.738902176267547, t: 5
Reward: 4.7812903223930165, t: 5
Reward: 4.715845632053241, t: 5
Reward: 4.556965044419086, t: 5
Reward: 4.767112362230764, t: 5
Reward: 4.723871972850001, t: 5
Reward: 4.683251433601296, t: 5
Reward: 4.573640693960389, t: 5
Reward: 4.7892672046683, t: 5
Reward: 4.687038128355931, t: 5
Reward: 4.716887369666401, t: 5
Reward: 4.622459675882178, t: 5
Reward: 4.514317648294419, t: 5
Reward: 4.611867981516854, t: 5
Reward: 4.756981372645218, t: 5
Reward: 4.724573105646938, t: 5
Reward: 4.71490486311909, t: 5
Reward: 4.781699465095075, t: 5
Reward: 4.802540455679874, t: 5
Reward: 4.557906472036233, t: 5
Reward: 4.685927398884709, t: 5
Reward: 4.712947752675249, t: 5
Reward: 4.800415663223337, t: 5
Reward: 4.84645672172559, t: 5
Reward: 5.007094908730904, t: 5
Reward: 4.

Reward: 4.753165406045028, t: 5
Reward: 4.675287910024676, t: 5
Reward: 4.657958165488885, t: 5
Reward: 4.5440726583109585, t: 5
Reward: 4.780968813832541, t: 5
Reward: 4.817064040486878, t: 5
Reward: 4.746206693146908, t: 5
Reward: 4.704180882332294, t: 5
Reward: 4.907973265776219, t: 5
Reward: 4.695818136954492, t: 5
Reward: 4.569973731429625, t: 5
Reward: 4.527201910425855, t: 5
Reward: 4.783383778744569, t: 5
Reward: 4.636701238883855, t: 5
Reward: 4.733612304230479, t: 5
Reward: 4.7296386088535805, t: 5
Reward: 4.854347626306562, t: 5
Reward: 4.841464478999371, t: 5
Reward: 4.666545611552916, t: 5
Reward: 4.719076603469391, t: 5
Reward: 4.777672869957437, t: 5
Reward: 4.483830230552921, t: 5
Reward: 4.771210343289764, t: 5
Reward: 4.7491019563697465, t: 5
Reward: 4.721814848208827, t: 5
Reward: 4.752568109988717, t: 5
Reward: 4.743983477155992, t: 5
Reward: 4.765512609701129, t: 5
Reward: 4.830458713655379, t: 5
Reward: 4.782863785909363, t: 5
Reward: 4.673241016312691, t: 5
Rewar