In [1]:
%pylab inline
import numpy as np
import matplotlib.pyplot as plt
import gym
from IPython import display
import random

Populating the interactive namespace from numpy and matplotlib


In [2]:
class simpleMDP:
    """Small episodic MDP with states A, B, C, D and a Gaussian reward.

    From A, `right` leads to C and `left` leads to B.  Every one of the
    `max_actions` actions in B leads to D.  Entering D pays a reward drawn
    from Normal(mu, std); every other transition pays 0.  An episode is
    flagged as done once C or D has been reached.
    """

    def __init__(self, mu=-0.1, std=1, max_actions=10):
        # Reward distribution parameters and the size of B's action set.
        self.mu, self.std = mu, std
        self.max_actions = max_actions

        # Integer codes of the two named actions.
        self.right = 0
        self.left = 1

        # Actions that may be taken in each state.
        self.state_actions = {
            'A': [self.right, self.left],
            'B': list(range(max_actions)),
            'C': [self.right],
            'D': [self.left],
        }

        # Deterministic successor for every (state, action) pair.  C and D
        # lead to 'Done', which has no entry in `state_actions`.
        self.state_transitions = {
            'A': {self.right: 'C', self.left: 'B'},
            'B': dict.fromkeys(range(max_actions), 'D'),
            'C': {self.right: 'Done'},
            'D': {self.left: 'Done'},
        }

        # Episodes always begin in A.
        self.state = 'A'

    def step(self, action):
        """Apply `action` in the current state.

        Returns a 4-tuple `(new_state, reward, done, None)`.
        """
        successor = self.state_transitions[self.state][action]
        self.state = successor

        # Only arriving in D (i.e. the B -> D transition) is rewarded.
        if successor == 'D':
            reward = np.random.normal(self.mu, self.std)
        else:
            reward = 0

        finished = successor in ('D', 'C')
        return successor, reward, finished, None

    def available_actions(self, state=None):
        """Return the action list of `state`, or of the current state if None."""
        lookup = self.state if state is None else state
        return self.state_actions[lookup]

    def sample_actions(self):
        """Draw one action uniformly at random from the current state's actions."""
        candidates = self.available_actions()
        return np.random.choice(candidates)

    def reset(self):
        """Put the environment back into state A and return that state."""
        self.state = 'A'
        return self.state

In [3]:
class uniformMDP:
    """Small episodic MDP with states A, B, C, D and a uniform reward.

    From A, `right` leads to C and `left` leads to B.  Every one of the
    `max_actions` actions in B leads to D.  Entering D pays a reward drawn
    from Uniform(low, high); every other transition pays 0.  An episode is
    flagged as done once C or D has been reached.
    """

    def __init__(self, low=-1, high=1, max_actions=10):
        # Bounds of the reward distribution and the size of B's action set.
        self.low, self.high = low, high
        self.max_actions = max_actions

        # Integer codes of the two named actions.
        self.right = 0
        self.left = 1

        # Actions that may be taken in each state.
        self.state_actions = {
            'A': [self.right, self.left],
            'B': list(range(max_actions)),
            'C': [self.right],
            'D': [self.left],
        }

        # Deterministic successor for every (state, action) pair.  C and D
        # lead to 'Done', which has no entry in `state_actions`.
        self.state_transitions = {
            'A': {self.right: 'C', self.left: 'B'},
            'B': dict.fromkeys(range(max_actions), 'D'),
            'C': {self.right: 'Done'},
            'D': {self.left: 'Done'},
        }

        # Episodes always begin in A.
        self.state = 'A'

    def step(self, action):
        """Apply `action` in the current state.

        Returns a 4-tuple `(new_state, reward, done, None)`.
        """
        successor = self.state_transitions[self.state][action]
        self.state = successor

        # Only arriving in D (i.e. the B -> D transition) is rewarded.
        if successor == 'D':
            reward = np.random.uniform(self.low, self.high)
        else:
            reward = 0

        finished = successor in ('D', 'C')
        return successor, reward, finished, None

    def available_actions(self, state=None):
        """Return the action list of `state`, or of the current state if None."""
        lookup = self.state if state is None else state
        return self.state_actions[lookup]

    def sample_actions(self):
        """Draw one action uniformly at random from the current state's actions."""
        candidates = self.available_actions()
        return np.random.choice(candidates)

    def reset(self):
        """Put the environment back into state A and return that state."""
        self.state = 'A'
        return self.state

In [4]:
!pip3 install --upgrade plotly

Requirement already up-to-date: plotly in /usr/local/lib/python3.6/dist-packages (4.13.0)


In [5]:
# Parameters: action-set sizes are the powers of two 2**1 .. 2**num_actions_array
num_actions_array = 4  #### 10
rep = 3  ### 100
actions_array = np.power(2, np.arange(1, num_actions_array + 1))

# Single Q-Learning: these values become the default arguments of the
# learning functions defined below (defaults are bound when `def` runs).
max_tests = 1000
n_eps = 100  #### adjustable
eps = 0.1
lr = 0.1
env = simpleMDP(max_actions=actions_array[3])

def Qlearn_func(env=env, max_tests=max_tests, n_eps=n_eps, eps=eps, lr=lr):
    """Run tabular epsilon-greedy Q-learning on `env` and collect statistics.

    The experiment consists of `max_tests` independent tests.  Every test
    starts from an all-zero Q-table and runs `n_eps` episodes.  The update
    adds the bootstrap term undiscounted (discount factor 1).

    Parameters
    ----------
    env : environment exposing `reset()`, `step(action)`, `sample_actions()`,
        `available_actions(state=None)`, `state_actions`, `max_actions` and
        `left` (e.g. `simpleMDP` / `uniformMDP`).
    max_tests : int
        Number of independent tests.
    n_eps : int
        Number of episodes per test.
    eps : float
        Probability of taking a random (exploratory) action.
    lr : float
        Learning rate of the Q update.

    Returns
    -------
    left_count_q : ndarray, shape (n_eps,)
        Per episode, the number of tests in which action 1 (left) was taken
        in state A.
    q_estimate : ndarray, shape (n_eps,)
        Per episode, the mean over tests of Q['A'][env.left] at episode end.
    greedy_count_q : dict {0: ndarray, 1: ndarray}
        Per episode, how many greedy choices in state A picked a non-left
        action (key 0) or left (key 1).
    qB : ndarray, shape (n_eps,)
        max_a Q['B'][a] at the end of each episode.  NOTE: every test
        overwrites these entries, so the returned values come from the last
        test only.
    """
    left_count_q = np.zeros(n_eps)
    q_estimate = np.zeros(n_eps)
    greedy_count_q = {0: np.zeros(n_eps), 1: np.zeros(n_eps)}
    qB = np.zeros(n_eps)

    for t in range(max_tests):
        # Each test learns from scratch.
        Q = {state: np.zeros(env.max_actions) for state in env.state_actions}
        for ep in range(n_eps):
            s_0 = env.reset()
            while True:
                # Select eps-greedy action
                if np.random.uniform() < eps:
                    action = env.sample_actions()
                    greedy = False
                else:
                    # Break ties among the maximal values uniformly at random.
                    # The winners are mapped back through the action list so
                    # that an action id (not a position in the subset) is used.
                    actions = np.asarray(env.available_actions(s_0))
                    q_vals = Q[s_0][actions]
                    action = np.random.choice(actions[q_vals == np.max(q_vals)])
                    greedy = True

                # Counting (only the first decision, taken in state A)
                if s_0 == 'A':
                    went_left = action == env.left
                    if went_left:
                        left_count_q[ep] += 1
                    if greedy:
                        greedy_count_q[1 if went_left else 0][ep] += 1

                s_1, reward, done, _ = env.step(action)

                # Update Q-table with the max over the successor's actions.
                next_actions = env.available_actions(s_1)
                td_target = reward + np.max(Q[s_1][next_actions])
                Q[s_0][action] += lr * (td_target - Q[s_0][action])
                s_0 = s_1
                if done:
                    # Incremental mean over the t + 1 tests seen so far.
                    q_estimate[ep] += (Q['A'][env.left] - q_estimate[ep]) / (t + 1)
                    qB[ep] = np.max(Q['B'][env.available_actions(state='B')])
                    break

    return left_count_q, q_estimate, greedy_count_q, qB



In [6]:
# Double Q-Learning

def dQlearn_func(env=env, max_tests=max_tests, n_eps=n_eps, eps=eps, lr=lr):
    """Run tabular epsilon-greedy Double Q-learning on `env` and collect statistics.

    The experiment consists of `max_tests` independent tests.  Every test
    starts from two all-zero Q-tables (Q1, Q2) and runs `n_eps` episodes.
    Actions are chosen greedily with respect to Q1 + Q2; the first episode of
    every test always acts randomly.  On each step one table, chosen with
    probability 0.5, is updated using the other table's value of the updated
    table's argmax action.  The bootstrap term is added undiscounted.

    Parameters
    ----------
    env : environment exposing `reset()`, `step(action)`, `sample_actions()`,
        `available_actions(state=None)`, `state_actions`, `max_actions` and
        `left` (e.g. `simpleMDP` / `uniformMDP`).
    max_tests : int
        Number of independent tests.
    n_eps : int
        Number of episodes per test.
    eps : float
        Probability of taking a random (exploratory) action.
    lr : float
        Learning rate of the Q update.

    Returns
    -------
    left_count_dq : ndarray, shape (n_eps,)
        Per episode, the number of tests in which action 1 (left) was taken
        in state A.
    dq_estimate : ndarray, shape (n_eps,)
        Per episode, the average of the two tables' mean (over tests)
        estimate of Q['A'][env.left] at episode end.
    greedy_count_dq : dict {0: ndarray, 1: ndarray}
        Per episode, how many greedy choices in state A picked a non-left
        action (key 0) or left (key 1).
    dqB : ndarray, shape (n_eps,)
        Q2['B'][argmax_a Q1['B'][a]] at the end of each episode.  NOTE: every
        test overwrites these entries, so the returned values come from the
        last test only.
    """
    left_count_dq = np.zeros(n_eps)
    q1_estimate = np.zeros(n_eps)
    q2_estimate = np.zeros(n_eps)
    greedy_count_dq = {0: np.zeros(n_eps), 1: np.zeros(n_eps)}
    dqB = np.zeros(n_eps)

    for t in range(max_tests):
        # Each test learns from scratch with two independent tables.
        Q1 = {state: np.zeros(env.max_actions) for state in env.state_actions}
        Q2 = {state: np.zeros(env.max_actions) for state in env.state_actions}
        for ep in range(n_eps):
            s_0 = env.reset()
            while True:
                # Select eps-greedy action (always random in the first episode)
                if np.random.uniform() < eps or ep == 0:
                    action = env.sample_actions()
                    greedy = False
                else:
                    # Break ties among the maximal summed values at random and
                    # map the winners back to action ids.
                    actions = np.asarray(env.available_actions(s_0))
                    q_sum = Q1[s_0][actions] + Q2[s_0][actions]
                    action = np.random.choice(actions[q_sum == np.max(q_sum)])
                    greedy = True

                # Counting (only the first decision, taken in state A)
                if s_0 == 'A':
                    went_left = action == env.left
                    if went_left:
                        left_count_dq[ep] += 1
                    if greedy:
                        greedy_count_dq[1 if went_left else 0][ep] += 1

                s_1, reward, done, _ = env.step(action)

                # Update Q-tables: pick which table learns; the other evaluates.
                if np.random.uniform() < 0.5:
                    learner, evaluator = Q1, Q2
                else:
                    learner, evaluator = Q2, Q1
                next_actions = env.state_actions[s_1]
                # argmax is a position within `next_actions`; translate it to
                # the real action id before indexing the evaluating table.
                a_star = next_actions[np.argmax(learner[s_1][next_actions])]
                learner[s_0][action] += lr * (reward + evaluator[s_1][a_star]
                                              - learner[s_0][action])
                s_0 = s_1
                if done:
                    # Incremental means over the t + 1 tests seen so far.
                    q1_estimate[ep] += (Q1['A'][env.left] - q1_estimate[ep]) / (t + 1)
                    q2_estimate[ep] += (Q2['A'][env.left] - q2_estimate[ep]) / (t + 1)
                    b_actions = env.state_actions['B']
                    dqB[ep] = Q2['B'][b_actions[np.argmax(Q1['B'][b_actions])]]
                    break

    dq_estimate = 0.5 * (q1_estimate + q2_estimate)

    return left_count_dq, dq_estimate, greedy_count_dq, dqB

In [7]:
# normal distribution: estimate the value of state B for several action-set
# sizes with Q-learning and Double Q-learning.

# Parameter
actions_list = [2, 16, 64, 256]  # [2,4,8,16,32,64,128,256,512,1024]
actions_range = len(actions_list)
max_tests = 1000
n_eps = 40  #### adjustable
eps = 0.1
lr = 0.1
rep = 20  #### repetition

# One row per action-set size, one column per episode.
qB_mean = np.zeros((actions_range, n_eps))
dqB_mean = np.zeros((actions_range, n_eps))
qB_std = np.zeros((actions_range, n_eps))
dqB_std = np.zeros((actions_range, n_eps))

for actions_idx, n_actions in enumerate(actions_list):
    # normal distribution; the MDP must use the action count of THIS sweep
    # entry (previously a fixed `actions_array[3]` was used for every entry).
    env = simpleMDP(max_actions=n_actions)

    qB_pred = np.zeros((rep, n_eps))
    dqB_pred = np.zeros((rep, n_eps))
    for r in range(rep):
        # Element [3] of each result is the per-episode estimate for state B.
        qB_pred[r, :] = Qlearn_func(env=env, max_tests=max_tests, n_eps=n_eps, eps=eps, lr=lr)[3]
        dqB_pred[r, :] = dQlearn_func(env=env, max_tests=max_tests, n_eps=n_eps, eps=eps, lr=lr)[3]

    # Aggregate over the repetitions.
    qB_mean[actions_idx, :] = np.mean(qB_pred, axis=0)
    dqB_mean[actions_idx, :] = np.mean(dqB_pred, axis=0)
    qB_std[actions_idx, :] = np.std(qB_pred, axis=0)
    dqB_std[actions_idx, :] = np.std(dqB_pred, axis=0)
    


In [8]:
import plotly.graph_objects as go

# Bar labels, one per swept action-set size.
actions_list_str = ['m = ' + str(x) for x in actions_list]
# Baseline subtracted from the estimates: the default mean reward `mu` of
# simpleMDP, which is what every action in state B pays in expectation.
V = -0.1

# Grid/line styling shared by both axes.
axis_style = dict(showgrid=True, gridwidth=1.5, gridcolor='#DFDFDF',
                  showline=True, linecolor='#AFAFAF', linewidth=2.5, nticks=7)

fig = go.Figure()
fig.add_trace(go.Bar(
    #     name=r'$\max_a Q(s_B,a) -V_*(s_B)$',
    name='Q-learning',
    marker_color='indianred',
    x=actions_list_str, y=qB_mean[:, -1] - V,
    error_y=dict(type='data', array=qB_std[:, -1])  # use the last col of array
))
fig.add_trace(go.Bar(
    #     name=r'$Q(s_B,\arg \max_a Q(s_B,a) -V_*(s_B))$',
    name='double Q-learning',
    marker_color='lightblue',
    x=actions_list_str, y=dqB_mean[:, -1] - V,
    error_y=dict(type='data', array=dqB_std[:, -1])
))

fig.update_layout(
    barmode='group',
    yaxis_title='Error',
    title='Bias of QL & dQL on i.i.d. normal distribution var',
    hovermode="x",
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    font=dict(size=16, color='black'),
    width=700,
    height=400
)
fig.update_xaxes(title='number of actions', **axis_style)
fig.update_yaxes(**axis_style)
fig.show()

In [11]:
# uniform distribution: estimate the value of state B for several action-set
# sizes with Q-learning and Double Q-learning.

# Parameter
actions_list = [2, 16, 64, 256]
actions_range = len(actions_list)
max_tests = 1000
n_eps = 40  #### adjustable
eps = 0.1
lr = 0.1
rep = 5  #### repetition

# One row per action-set size, one column per episode.
qB_mean = np.zeros((actions_range, n_eps))
dqB_mean = np.zeros((actions_range, n_eps))
qB_std = np.zeros((actions_range, n_eps))
dqB_std = np.zeros((actions_range, n_eps))

for actions_idx, n_actions in enumerate(actions_list):
    # uniform distribution; the MDP must use the action count of THIS sweep
    # entry (previously a fixed `actions_array[3]` was used for every entry).
    env = uniformMDP(max_actions=n_actions)

    qB_pred = np.zeros((rep, n_eps))
    dqB_pred = np.zeros((rep, n_eps))
    for r in range(rep):
        # Element [3] of each result is the per-episode estimate for state B.
        qB_pred[r, :] = Qlearn_func(env=env, max_tests=max_tests, n_eps=n_eps, eps=eps, lr=lr)[3]
        dqB_pred[r, :] = dQlearn_func(env=env, max_tests=max_tests, n_eps=n_eps, eps=eps, lr=lr)[3]

    # Aggregate over the repetitions.
    qB_mean[actions_idx, :] = np.mean(qB_pred, axis=0)
    dqB_mean[actions_idx, :] = np.mean(dqB_pred, axis=0)
    qB_std[actions_idx, :] = np.std(qB_pred, axis=0)
    dqB_std[actions_idx, :] = np.std(dqB_pred, axis=0)

In [12]:
import plotly.graph_objects as go

# Rebuild the bar labels from the current `actions_list` instead of relying
# on a value left over from an earlier plotting cell.
actions_list_str = ['m = ' + str(x) for x in actions_list]
# Baseline subtracted from the estimates: the mean of uniformMDP's default
# reward range [-1, 1].
V = 0

# Grid/line styling shared by both axes.
axis_style = dict(showgrid=True, gridwidth=1.5, gridcolor='#DFDFDF',
                  showline=True, linecolor='#AFAFAF', linewidth=2.5, nticks=7)

fig = go.Figure()
fig.add_trace(go.Bar(
    #     name=r'$\max_a Q(s_B,a) -V_*(s_B)$',
    name='Q-learning',
    marker_color='indianred',
    x=actions_list_str, y=qB_mean[:, -1] - V,
    error_y=dict(type='data', array=qB_std[:, -1])  # use the last col of array
))
fig.add_trace(go.Bar(
    #     name=r'$Q(s_B,\arg \max_a Q(s_B,a) -V_*(s_B))$',
    name='double Q-learning',
    marker_color='lightblue',
    x=actions_list_str, y=dqB_mean[:, -1] - V,
    error_y=dict(type='data', array=dqB_std[:, -1])
))

fig.update_layout(
    barmode='group',
    yaxis_title='Error',
    title='Bias of QL & dQL on i.i.d. uniform distribution var [-1,1]',
    hovermode="x",
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    font=dict(size=16, color='black'),
    width=700,
    height=400
)
fig.update_xaxes(title='number of actions', **axis_style)
fig.update_yaxes(**axis_style)
fig.show()