In [1]:
# Import libraries
import gym
from envs.binomial_tree2 import BinomialTree, decode_action    # custom BinomialTree dynamics
from envs import plotting
import numpy as np
import pandas as pd
import math
from collections import defaultdict
import itertools
import matplotlib.pyplot as plt

In [2]:
# Wealth discretisation: one open-ended bin [0, lower) at the bottom, one
# open-ended bin [upper, +Inf) at the top, and bins of width delta_bin between.
lower = 90
upper = 110
delta_bin = 20
# upper + 1 because np.arange excludes its stop value.
wealth_bins = [0, *np.arange(lower, upper + 1, delta_bin).tolist(), float("inf")]

# Discrete action grid: fraction of wealth invested in the risky asset,
# in 10% steps (the 1.01 stop keeps 1.0 inside the half-open arange interval).
actions = np.arange(0, 1.01, step=0.1)
#actions = np.array([0, 13/19, 1])
print("Actions (Investment in risky asset):", actions)

Actions (Investment in risky asset): [0.  0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]


In [None]:
# Simulation of the BinomialTree dynamics: reset the environment, then apply a
# fixed sequence of action indices, printing the result of every call together
# with the environment's V_t attribute.
# NOTE(review): `env` is not created in this cell or in any cell above it in the
# visible notebook; it is only constructed in a later cell, so this cell fails
# on a fresh kernel unless that cell has been run first.
# NOTE(review): four steps are taken here - confirm that the environment
# supports stepping this many times for the configured horizon.
print(env.reset())
print(env.V_t)
for action_idx in (0, 10, 10, 10):
    print(env.step(action_idx))
    print(env.V_t)

**Epsilon-Greedy Policy**\
Source: https://www.geeksforgeeks.org/q-learning-in-python/

In [3]:
def createEpsilonGreedyPolicy(Q, epsilon, num_actions):
    """
    Build an epsilon-greedy policy on top of an action-value table.

    Q is indexed as Q[state] and must yield the action values of that state.
    The returned function maps a state to a numpy array of length
    num_actions holding the selection probability of every action:
    each action receives epsilon / num_actions, and the greedy action
    (first maximiser of Q[state]) receives an additional 1 - epsilon.

    Q is looked up on every call, so later changes to Q are reflected
    by the policy; epsilon and num_actions are fixed at creation time.
    """
    def policyFunction(state):
        # Exploration mass, spread uniformly over all actions.
        probabilities = np.full(num_actions, epsilon, dtype=float) / num_actions
        # Remaining mass goes to the currently greedy action.
        greedy_action = np.argmax(Q[state])
        probabilities[greedy_action] += 1.0 - epsilon
        return probabilities

    return policyFunction

**Q-Learning Algorithm** \
Source: https://www.geeksforgeeks.org/q-learning-in-python/

In [4]:
def qLearning(env, num_episodes, discount_factor = 1, alpha = 0.1, epsilon = 1, constant_step_size = False):
    """
    Q-Learning algorithm: off-policy TD control.

    Learns action values for the greedy policy while behaving according
    to an epsilon-greedy policy derived from the current Q-table.

    Parameters
    ----------
    env : environment object
        Must provide `action_space.n`, `reset()` returning the initial
        state, `step(action)` returning `(next_state, reward, done, info)`
        and an attribute `V_t` that is read when an episode ends.
        States are used as dictionary keys and must be hashable.
    num_episodes : int
        Number of episodes to run.
    discount_factor : float
        Discount applied to the bootstrapped value of the next state.
    alpha : float
        Constant step size. Only used when `constant_step_size` is True.
    epsilon : float
        Exploration rate of the behaviour policy (fixed for the whole run).
    constant_step_size : bool
        False (default): the step size of each update is 1/k, where k is
        the number of times the (state, action) pair has been selected so
        far. True: every update uses the constant step size `alpha`.

    Returns
    -------
    Q : defaultdict
        state -> numpy array of action values.
    stats : plotting.EpisodeStats
        Per-episode reward sums (`episode_rewards`) and the index of the
        last time step of each episode (`episode_lengths`).
    A : defaultdict
        state -> numpy array counting how often each action was selected.
    """
    num_actions = env.action_space.n

    # Action values and visit counts; unseen states start as all-zero rows.
    Q = defaultdict(lambda: np.zeros(num_actions))
    A = defaultdict(lambda: np.zeros(num_actions))

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths = np.zeros(num_episodes),
        episode_rewards = np.zeros(num_episodes))

    # Behaviour policy. It reads Q on every call, so it always reflects the
    # latest estimates; epsilon is not decayed during the run.
    policy = createEpsilonGreedyPolicy(Q, epsilon, num_actions)
    action_indices = np.arange(num_actions)

    # Step size shown in the progress line: what the update actually uses.
    step_size_label = alpha if constant_step_size else "1/k"

    # Returns and terminal wealths of the episodes since the last progress
    # line (plain lists: appending does not copy the collected values).
    window_returns = []
    window_wealths = []

    for ith_episode in range(num_episodes):

        state = env.reset()
        episode_return = 0

        for t in itertools.count():

            # Sample an action from the epsilon-greedy distribution.
            action = np.random.choice(action_indices, p = policy(state))
            A[state][action] += 1

            # take action and get reward, transit to next state
            next_state, reward, done, _ = env.step(action)

            # Update statistics
            stats.episode_rewards[ith_episode] += reward
            stats.episode_lengths[ith_episode] = t

            # TD update. Looking up Q[next_state] also creates an all-zero
            # row for states that are only ever reached as the end of an
            # episode; no action is taken from them, so they stay zero.
            td_target = reward + discount_factor * np.max(Q[next_state])
            td_delta = td_target - Q[state][action]
            if constant_step_size:
                step_size = alpha
            else:
                # Dynamic step size (1/k) as in Sutton/Barto p.53
                step_size = 1 / A[state][action]
            Q[state][action] += step_size * td_delta

            episode_return += reward

            # done is True if episode terminated
            if done:
                window_returns.append(episode_return)
                window_wealths.append(env.V_t)
                break

            state = next_state

        # Progress report every 10000 episodes, averaged over the episodes
        # since the previous report.
        if (ith_episode % 10000 == 0):
            print("Episode: {}, Mean Return: {}, Mean Wealth (V_T): {}, Epsilon: {}, Alpha: {}".format(ith_episode, round(np.mean(window_returns), 3), round(np.mean(window_wealths), 3), epsilon, step_size_label))
            window_returns = []
            window_wealths = []

    return Q, stats, A

In [7]:
# Square-root utility: build the binomial-tree environment and train on it.
# Constructor arguments, in order:
# up_prob, up_ret, down_ret, r, T, dt, V_0, actions, wealth_bins, utility
env = BinomialTree(
    up_prob=4/9,
    up_ret=1,
    down_ret=-1/2,
    r=0,
    T=2,
    dt=1,
    V_0=100,
    actions=actions,
    wealth_bins=wealth_bins,
    utility="sqrt",
)
Q, stats, A = qLearning(env, num_episodes=2000000)

Episode: 0, Mean Return: 15.492, Mean Wealth (V_T): 240.0, Epsilon: 1, Alpha: 0.1
Episode: 10000, Mean Return: 10.423, Mean Wealth (V_T): 118.126, Epsilon: 1, Alpha: 0.1
Episode: 20000, Mean Return: 10.437, Mean Wealth (V_T): 118.288, Epsilon: 1, Alpha: 0.1
Episode: 30000, Mean Return: 10.394, Mean Wealth (V_T): 117.22, Epsilon: 1, Alpha: 0.1
Episode: 40000, Mean Return: 10.424, Mean Wealth (V_T): 118.108, Epsilon: 1, Alpha: 0.1
Episode: 50000, Mean Return: 10.364, Mean Wealth (V_T): 116.581, Epsilon: 1, Alpha: 0.1
Episode: 60000, Mean Return: 10.398, Mean Wealth (V_T): 117.509, Epsilon: 1, Alpha: 0.1
Episode: 70000, Mean Return: 10.374, Mean Wealth (V_T): 117.028, Epsilon: 1, Alpha: 0.1
Episode: 80000, Mean Return: 10.391, Mean Wealth (V_T): 117.276, Epsilon: 1, Alpha: 0.1
Episode: 90000, Mean Return: 10.404, Mean Wealth (V_T): 117.514, Epsilon: 1, Alpha: 0.1
Episode: 100000, Mean Return: 10.366, Mean Wealth (V_T): 116.696, Epsilon: 1, Alpha: 0.1
Episode: 110000, Mean Return: 10.431, 

Episode: 930000, Mean Return: 10.411, Mean Wealth (V_T): 117.643, Epsilon: 1, Alpha: 0.1
Episode: 940000, Mean Return: 10.376, Mean Wealth (V_T): 116.95, Epsilon: 1, Alpha: 0.1
Episode: 950000, Mean Return: 10.417, Mean Wealth (V_T): 117.836, Epsilon: 1, Alpha: 0.1
Episode: 960000, Mean Return: 10.355, Mean Wealth (V_T): 116.686, Epsilon: 1, Alpha: 0.1
Episode: 970000, Mean Return: 10.406, Mean Wealth (V_T): 117.613, Epsilon: 1, Alpha: 0.1
Episode: 980000, Mean Return: 10.382, Mean Wealth (V_T): 116.977, Epsilon: 1, Alpha: 0.1
Episode: 990000, Mean Return: 10.407, Mean Wealth (V_T): 117.809, Epsilon: 1, Alpha: 0.1
Episode: 1000000, Mean Return: 10.385, Mean Wealth (V_T): 117.081, Epsilon: 1, Alpha: 0.1
Episode: 1010000, Mean Return: 10.459, Mean Wealth (V_T): 118.935, Epsilon: 1, Alpha: 0.1
Episode: 1020000, Mean Return: 10.414, Mean Wealth (V_T): 117.73, Epsilon: 1, Alpha: 0.1
Episode: 1030000, Mean Return: 10.36, Mean Wealth (V_T): 116.488, Epsilon: 1, Alpha: 0.1
Episode: 1040000, Me

Episode: 1850000, Mean Return: 10.375, Mean Wealth (V_T): 116.989, Epsilon: 1, Alpha: 0.1
Episode: 1860000, Mean Return: 10.346, Mean Wealth (V_T): 116.289, Epsilon: 1, Alpha: 0.1
Episode: 1870000, Mean Return: 10.393, Mean Wealth (V_T): 117.074, Epsilon: 1, Alpha: 0.1
Episode: 1880000, Mean Return: 10.419, Mean Wealth (V_T): 118.054, Epsilon: 1, Alpha: 0.1
Episode: 1890000, Mean Return: 10.382, Mean Wealth (V_T): 117.038, Epsilon: 1, Alpha: 0.1
Episode: 1900000, Mean Return: 10.396, Mean Wealth (V_T): 117.327, Epsilon: 1, Alpha: 0.1
Episode: 1910000, Mean Return: 10.356, Mean Wealth (V_T): 116.577, Epsilon: 1, Alpha: 0.1
Episode: 1920000, Mean Return: 10.44, Mean Wealth (V_T): 118.352, Epsilon: 1, Alpha: 0.1
Episode: 1930000, Mean Return: 10.363, Mean Wealth (V_T): 116.72, Epsilon: 1, Alpha: 0.1
Episode: 1940000, Mean Return: 10.368, Mean Wealth (V_T): 116.703, Epsilon: 1, Alpha: 0.1
Episode: 1950000, Mean Return: 10.447, Mean Wealth (V_T): 118.556, Epsilon: 1, Alpha: 0.1
Episode: 196

In [None]:
# Visualise the per-episode statistics returned by qLearning, using the
# plotting helper imported from envs.
plotting.plot_episode_stats(stats) 

In [8]:
# Print the learned action values of every state in the Q-table together with
# the greedy action, decoded back into an investment fraction.
for key, action_values in Q.items():
    print("Key:", key)
    print("State-Action Values:", action_values, sep="\n")
    print("Best Action (Investment in risky asset):", decode_action(np.argmax(action_values), actions))

Key: (0, 1)
State-Action Values:
[10.28801086 10.40540089 10.46763096 10.48338078 10.51960108 10.55002203
 10.56278248 10.56787654 10.55888306 10.54400638 10.52184846]
Best Action (Investment in risky asset): 0.7000000000000001
Key: (1, 2)
State-Action Values:
[10.         10.09607912 10.17139263 10.23699466 10.29150737 10.31086663
 10.31390193 10.35880872 10.32129773 10.32270294 10.25538626]
Best Action (Investment in risky asset): 0.7000000000000001
Key: (2, 2)
State-Action Values:
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Best Action (Investment in risky asset): 0.0
Key: (1, 0)
State-Action Values:
[10.         10.06233617 10.11227755 10.15905146 10.18777369 10.20913961
 10.22473103 10.22286828 10.21062927 10.17626262 10.17510878]
Best Action (Investment in risky asset): 0.6000000000000001
Key: (2, 0)
State-Action Values:
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Best Action (Investment in risky asset): 0.0
Key: (1, 1)
State-Action Values:
[10.         10.07295485 10.13506459 10.18389907 10.21496

In [None]:
# How often each action was taken in each state
for key, visit_counts in A.items():
    print("State: {}, Actions: {}".format(key, visit_counts))

In [None]:
def true_avf(x, up_prob=4/9, up_ret=1, down_ret=-1/2, V_0=100):
    """
    Expected square-root utility of wealth after one binomial period.

    A fraction x of the initial wealth V_0 is invested in the risky asset,
    which earns the simple return up_ret with probability up_prob and
    down_ret otherwise; the remaining fraction 1 - x keeps its value.

    Parameters
    ----------
    x : float
        Fraction of wealth invested in the risky asset.
    up_prob : float
        Probability of the up move.
    up_ret, down_ret : float
        Simple returns of the risky asset in the up / down move.
    V_0 : float
        Wealth at the start of the period.

    Returns
    -------
    float
        up_prob * sqrt(wealth after up move)
        + (1 - up_prob) * sqrt(wealth after down move).

    The defaults reproduce the previously hard-coded setting
    (up_prob 4/9, +100% / -50% returns, initial wealth 100).
    """
    uninvested = (1 - x) * V_0
    wealth_up = (1 + up_ret) * x * V_0 + uninvested
    wealth_down = (1 + down_ret) * x * V_0 + uninvested
    return up_prob * math.sqrt(wealth_up) + (1 - up_prob) * math.sqrt(wealth_down)

# Reference values: true_avf evaluated at every investment fraction of the
# action grid, in grid order.
true_Q = np.array(list(map(true_avf, actions)))
print(true_Q)

In [None]:
# Step-size sweep: for every (number of episodes, alpha) pair, train from
# scratch and record the root-mean-square error between the learned action
# values of state (0, 1) and the reference values in true_Q.
# NOTE(review): the sweep only varies the step size if qLearning actually
# applies `alpha` in its TD update - confirm this; if the update uses a
# visit-count based step instead, the columns differ only by sampling noise.
# NOTE(review): confirm that true_Q (a one-period expectation) is the right
# reference for state (0, 1) under the horizon `env` was built with.
learning_rates = np.array([0.0001, 0.001, 0.01, 0.1, 0.2, 0.5])
num_episodes   = np.array([100, 1000, 2000, 5000, 10000, 50000, 100000])

# Rows follow num_episodes, columns follow learning_rates.
Q_values = np.zeros((len(num_episodes), len(learning_rates)))
for i, episodes in enumerate(num_episodes):
    for j, alpha in enumerate(learning_rates):
        print(episodes, alpha)
        # Use a separate name so the Q-table of the main training run is
        # not overwritten by the sweep.
        Q_run, _, _ = qLearning(env, num_episodes = episodes, discount_factor = 1, alpha = alpha, epsilon = 1)
        # Root of the MEAN squared error, so the value is an RMS error.
        Q_values[i, j] = np.sqrt(np.mean((Q_run[(0,1)] - true_Q) ** 2))

In [None]:
# One error curve per step size: the columns of Q_values follow
# learning_rates, so iterate over the transposed matrix instead of
# hard-coding the column indices (works for any number of step sizes).
for errors, rate in zip(Q_values.transpose(), learning_rates):
    plt.plot(num_episodes, errors, label="Alpha="+str(rate))
plt.xlabel("Episodes")
plt.ylabel("RMS error to true value function")
plt.legend()
plt.title("Convergence of Q-values for different constant step sizes")
# Save before show(): the figure is written to the working directory.
plt.savefig("testplot.jpg")
plt.show()