# Testing the Quantized Decentralized Multi-Agent Q-Learning Algorithm

In [1]:
import itertools
import numpy as np
from tqdm.auto import tqdm, trange

import sys, os
sys.path.append(os.path.dirname(os.getcwd())) 

from quantization import DiscreteSpace, ContinuousInterval, UniformQuantizer, NullQuantizer, QuantizedPolicy, TransitionKernel
from cts_multi_agent_learning import quantized_q_learning_alg

In [2]:
# Reward scale used by the reward functions defined in the example cells of this notebook.
POS_REWARD_CONST = 100

## Trivial Single-Agent 'Finite' State and Action
A simple MDP with a single state and two actions: one good and one bad.

In [3]:
# Problem definition: one state, two actions, one agent.
state_space = DiscreteSpace([0]) # single state
action_space = DiscreteSpace([0, 1]) # two actions

# NOTE(review): NullQuantizer is presumably a pass-through for spaces that are already finite — confirm in `quantization`.
state_quantizer = NullQuantizer(state_space)
action_quantizer = NullQuantizer(action_space)

# The policy is handed to the algorithm inside a list (a single entry here, for the single agent).
q_policy = QuantizedPolicy(state_quantizer, action_quantizer, index_policy='random_init', exploration_prob=1)
quantized_agent_policies = [q_policy]

def transition_func(x, us):
    """Return the next state, which is always 0 regardless of `x` and `us`."""
    next_state = 0
    return next_state

# Bundle the state space, the list of per-agent action spaces and the transition function.
transition_kernel = TransitionKernel(state_space, [action_space], transition_func)

def reward_func(x, us):
    """Reward depends only on the first action in `us`.

    Action 1 earns POS_REWARD_CONST, action 0 earns 0; any other action
    raises ValueError. The state `x` is not used.
    """
    action = us[0]
    if action == 1:
        return POS_REWARD_CONST
    if action == 0:
        return 0
    raise ValueError('received action outside expected range')

# Reward functions are passed as a list (one entry here).
reward_funcs = [reward_func]

# NOTE(review): presumably the per-agent discount factor(s) — confirm against quantized_q_learning_alg.
betas = [0.5]

# NOTE(review): presumably the number of steps per exploration phase (the inner progress bar below shows max=100000) — confirm.
T = int(1e5)

# Maps n to 1/(n + 1); presumably the learning-rate schedule — confirm.
alpha_func = lambda n: 1/(n + 1)

n_exploration_phases = 1

# NOTE(review): meaning of `deltas` and `inertias` is defined in cts_multi_agent_learning, not visible here.
deltas = [1e-4]

inertias = [0]

# Always returns state 0.
get_initial_state = lambda: 0


In [4]:
# Run the algorithm with early stopping disabled and unpack its three return values.
Qs, quantized_agent_policies, history = quantized_q_learning_alg(quantized_agent_policies, transition_kernel, get_initial_state,
                reward_funcs, betas, T, alpha_func, n_exploration_phases, deltas, inertias, early_stopping=False)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=100000.0), HTML(value='')))




In [5]:
Qs

[array([[ 98.99311272, 198.99176731]])]

In [6]:
quantized_agent_policies[0].get_policy_map()

{0: 1}

## Slightly Less Trivial Single-Agent 'Finite' State and Action Spaces

An MDP with a 'good' state and a 'bad' state, and two actions each of which causes a deterministic transition to one of the states.

In [7]:
# Problem definition: two states, two actions, one agent.
state_space = DiscreteSpace(['good', 'bad']) # two states
action_space = DiscreteSpace(['a', 'b']) # two actions

# NOTE(review): NullQuantizer is presumably a pass-through for spaces that are already finite — confirm in `quantization`.
state_quantizer = NullQuantizer(state_space)
action_quantizer = NullQuantizer(action_space)

# The policy is handed to the algorithm inside a list (a single entry here, for the single agent).
q_policy = QuantizedPolicy(state_quantizer, action_quantizer, index_policy='random_init', exploration_prob=1)
quantized_agent_policies = [q_policy]

def transition_func(x, us):
    """Deterministic transition driven by the first action in `us`.

    Action 'a' leads to state 'good' and action 'b' to state 'bad',
    independent of the current state `x`. Any other action raises ValueError.
    """
    chosen = us[0]
    for action, next_state in (('a', 'good'), ('b', 'bad')):
        if chosen == action:
            return next_state
    raise ValueError('received unexpected action')

# Bundle the state space, the list of per-agent action spaces and the transition function.
transition_kernel = TransitionKernel(state_space, [action_space], transition_func)

def reward_func(x, us):
    """Reward depends only on the state `x`.

    State 'good' earns POS_REWARD_CONST and state 'bad' earns 0; any other
    state raises ValueError. The actions `us` are not used.
    """
    if x not in ('good', 'bad'):
        raise ValueError('received state/action outside expected range')
    return POS_REWARD_CONST if x == 'good' else 0

# Reward functions are passed as a list (one entry here).
reward_funcs = [reward_func]

# NOTE(review): presumably the per-agent discount factor(s) — confirm against quantized_q_learning_alg.
betas = [0.25]

# NOTE(review): presumably the number of steps per exploration phase (the inner progress bar below shows max=100000) — confirm.
T = int(1e5)

# Maps n to 1/(n + 1); presumably the learning-rate schedule — confirm.
alpha_func = lambda n: 1/(n + 1)

n_exploration_phases = 1

# NOTE(review): meaning of `deltas` and `inertias` is defined in cts_multi_agent_learning, not visible here.
deltas = [1e-4]

inertias = [0]

# Always returns the state 'good'.
get_initial_state = lambda: 'good'

In [8]:
# Run the algorithm with early stopping disabled and unpack its three return values.
Qs, quantized_agent_policies, history = quantized_q_learning_alg(quantized_agent_policies, transition_kernel, get_initial_state,
                reward_funcs, betas, T, alpha_func, n_exploration_phases, deltas, inertias, early_stopping=False)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=100000.0), HTML(value='')))




In [9]:
Qs

[array([[133.26780462, 108.27329238],
        [ 33.26850317,   8.27629886]])]

In [10]:
quantized_agent_policies[0].get_policy_map()

{'good': 'a', 'bad': 'a'}

## 2-agent Team (coordinate actions), 'Finite' State and action space

A 2-agent generalization of the above where transition to the good state occurs if the agents coordinate (play the same action) and transition to the bad state occurs otherwise.

In [11]:
# Problem definition: two states shared by two agents, each with its own two-action space.
state_space = DiscreteSpace(['good', 'bad']) # two states
action_space1 = DiscreteSpace(['a', 'b']) # two actions
action_space2 = DiscreteSpace(['a', 'b']) # two actions

# Each agent gets its own quantizer instance over the shared state space.
state_quantizer1 = NullQuantizer(state_space)
state_quantizer2 = NullQuantizer(state_space)

action_quantizer1 = NullQuantizer(action_space1)
action_quantizer2 = NullQuantizer(action_space2)

# One policy per agent, both constructed with the same settings.
q_policy1 = QuantizedPolicy(state_quantizer1, action_quantizer1, index_policy='random_init', exploration_prob=0.25)
q_policy2 = QuantizedPolicy(state_quantizer2, action_quantizer2, index_policy='random_init', exploration_prob=0.25)

quantized_agent_policies = [q_policy1, q_policy2]

def transition_func(x, us):
    """Team transition: 'good' when both agents play the same action, else 'bad'.

    Parameters
    ----------
    x : current state; unused, the next state depends only on the actions.
    us : pair ``(u1, u2)`` of the two agents' actions, each 'a' or 'b'.

    Returns
    -------
    'good' if ``u1 == u2``, otherwise 'bad'.

    Raises
    ------
    ValueError
        If either action is not 'a' or 'b' (or if `us` does not unpack
        into exactly two actions).
    """
    u1, u2 = us

    # Validate explicitly. Previously the ValueError sat in an `else` after
    # `u1 == u2` / `u1 != u2`, which can never run, so unexpected actions
    # were silently mapped to a state.
    for u in (u1, u2):
        if u not in ('a', 'b'):
            raise ValueError('received unexpected action')

    return 'good' if u1 == u2 else 'bad'

# Bundle the state space, both agents' action spaces and the transition function.
transition_kernel = TransitionKernel(state_space, [action_space1, action_space2], transition_func)

def reward_func(x, us):
    """State-based reward used by both agents.

    Returns POS_REWARD_CONST in state 'good' and 0 in state 'bad'; raises
    ValueError for any other state. The actions `us` are ignored.
    """
    if x == 'bad':
        return 0
    if x != 'good':
        raise ValueError('received state/action outside expected range')
    return POS_REWARD_CONST

# Both agents use the same reward function.
reward_funcs = [reward_func, reward_func]

# NOTE(review): presumably one discount factor per agent — confirm against quantized_q_learning_alg.
betas = [0.25]*2

# NOTE(review): presumably the number of steps per exploration phase (the inner progress bars below show max=50000) — confirm.
T = int(5e4)

# Maps n to 1/(n + 1); presumably the learning-rate schedule — confirm.
alpha_func = lambda n: 1/(n + 1)

n_exploration_phases = 5

# NOTE(review): per-agent lists; meaning of `deltas` and `inertias` is defined in cts_multi_agent_learning, not visible here.
deltas = [1e-4]*2

inertias = [0.1]*2

# Always returns the state 'good'.
get_initial_state = lambda: 'good'

In [12]:
# Run the algorithm with early stopping disabled and unpack its three return values.
Qs, quantized_agent_policies, history = quantized_q_learning_alg(quantized_agent_policies, transition_kernel, get_initial_state,
                reward_funcs, betas, T, alpha_func, n_exploration_phases, deltas, inertias, early_stopping=False)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=50000.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=50000.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=50000.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=50000.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=50000.0), HTML(value='')))




In [13]:
Qs

[array([[118.2894217 , 110.94677187],
        [  7.75999469,  29.12091001]]),
 array([[129.12874693, 110.25558867],
        [ 29.04347357,  10.44216313]])]

In [14]:
quantized_agent_policies[0].get_policy_map(), quantized_agent_policies[1].get_policy_map()

({'good': 'a', 'bad': 'b'}, {'good': 'a', 'bad': 'a'})

## Continuous State-Space Single-Agent
A continuous space MDP over $\mathbb{X} = [0,1]$ and $\mathbb{U} = \{-1, 1\}$ where actions either transition the state forward or backward. A larger state value corresponds to higher reward.

In [15]:
# Problem definition: continuous state, two actions, one agent.
state_space = ContinuousInterval(0, 1) # continuous state interval [0, 1]
action_space = DiscreteSpace([-1, 1]) # two actions

# The state space goes through a 10-bin uniform quantizer; the action space is wrapped in a NullQuantizer.
state_quantizer = UniformQuantizer(state_space, n_bins=10)
action_quantizer = NullQuantizer(action_space)

# The policy is handed to the algorithm inside a list (a single entry here, for the single agent).
q_policy = QuantizedPolicy(state_quantizer, action_quantizer, index_policy='random_init', exploration_prob=0.5)
quantized_agent_policies = [q_policy]

def transition_func(x, us):
    """Shift the state by 0.1 times the first action in `us`, clipped to [0, 1]."""
    step = 0.1*us[0]
    return np.clip(x + step, 0, 1)

# Bundle the state space, the list of per-agent action spaces and the transition function.
transition_kernel = TransitionKernel(state_space, [action_space], transition_func)

def reward_func(x, us):
    """Reward high state values: the state `x` scaled by POS_REWARD_CONST (`us` is unused)."""
    scaled_state = x*POS_REWARD_CONST
    return scaled_state

# Reward functions are passed as a list (one entry here).
reward_funcs = [reward_func]

# NOTE(review): presumably the per-agent discount factor(s) — confirm against quantized_q_learning_alg.
betas = [0.25]

# NOTE(review): presumably the number of steps per exploration phase (the inner progress bars below show max=100000) — confirm.
T = int(1e5)

# Maps n to 1/(n + 1); presumably the learning-rate schedule — confirm.
alpha_func = lambda n: 1/(n + 1)

n_exploration_phases = 5

# NOTE(review): meaning of `deltas` and `inertias` is defined in cts_multi_agent_learning, not visible here.
deltas = [1e-4]

inertias = [0]

# Always returns state 0, the lower end of the interval.
get_initial_state = lambda: 0

In [16]:
# Run the algorithm with early stopping disabled and unpack its three return values.
Qs, quantized_agent_policies, history = quantized_q_learning_alg(quantized_agent_policies, transition_kernel, get_initial_state,
                reward_funcs, betas, T, alpha_func, n_exploration_phases, deltas, inertias, early_stopping=False)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=100000.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=100000.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=100000.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=100000.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=100000.0), HTML(value='')))




In [17]:
Qs

[array([[  0.        ,   0.41666667],
        [  5.        ,   9.50751016],
        [ 14.37934028,  22.82129757],
        [ 27.36261215,  39.37083271],
        [ 44.06385965,  58.55879826],
        [ 75.70454055,  80.91144362],
        [ 89.8528176 ,  97.54923891],
        [104.18466483, 110.75914418],
        [117.59525473, 123.285889  ],
        [130.77709813, 133.28939707]])]

In [18]:
quantized_agent_policies[0].get_policy_map()

{0.0: 1,
 0.1111111111111111: 1,
 0.2222222222222222: 1,
 0.3333333333333333: 1,
 0.4444444444444444: 1,
 0.5555555555555556: 1,
 0.6666666666666666: 1,
 0.7777777777777777: 1,
 0.8888888888888888: 1,
 1.0: 1}

## Continuous State-Space Single-Agent (Distance-Based Reward)

Similar to the above, except the reward is the negative distance from $x = 0.5$, so states closer to $0.5$ are better.

In [19]:
# Problem definition: continuous state, two actions, one agent.
state_space = ContinuousInterval(0, 1) # continuous state interval [0, 1]
action_space = DiscreteSpace([-1, 1]) # two actions

# The state space goes through a 10-bin uniform quantizer; the action space is wrapped in a NullQuantizer.
state_quantizer = UniformQuantizer(state_space, n_bins=10)
action_quantizer = NullQuantizer(action_space)

# The policy is handed to the algorithm inside a list (a single entry here, for the single agent).
q_policy = QuantizedPolicy(state_quantizer, action_quantizer, index_policy='random_init', exploration_prob=1)
quantized_agent_policies = [q_policy]

def transition_func(x, us):
    """Deterministic step of size 0.1 in the direction of the first action, kept inside [0, 1]."""
    lower, upper = 0, 1
    direction = us[0]
    return np.clip(x + 0.1*direction, lower, upper)

# Bundle the state space, the list of per-agent action spaces and the transition function.
transition_kernel = TransitionKernel(state_space, [action_space], transition_func)

def reward_func(x, us):
    """Reward state values near 0.5: minus POS_REWARD_CONST times |x - 0.5| (`us` is unused)."""
    distance = np.abs(x - 0.5)
    return -POS_REWARD_CONST * distance

# Reward functions are passed as a list (one entry here).
reward_funcs = [reward_func]

# NOTE(review): presumably the per-agent discount factor(s) — confirm against quantized_q_learning_alg.
betas = [0.25]

# NOTE(review): presumably the number of steps per exploration phase (the inner progress bar below shows max=100000) — confirm.
T = int(1e5)

# Maps n to 1/(n + 1); presumably the learning-rate schedule — confirm.
alpha_func = lambda n: 1/(n + 1)

n_exploration_phases = 1

# NOTE(review): meaning of `deltas` and `inertias` is defined in cts_multi_agent_learning, not visible here.
deltas = [1e-4]

inertias = [0]

# Always returns state 0, the lower end of the interval.
get_initial_state = lambda: 0

In [20]:
# Run the algorithm with early stopping disabled and unpack its three return values.
Qs, quantized_agent_policies, history = quantized_q_learning_alg(quantized_agent_policies, transition_kernel, get_initial_state,
                reward_funcs, betas, T, alpha_func, n_exploration_phases, deltas, inertias, early_stopping=False)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=100000.0), HTML(value='')))




In [21]:
Qs

[array([[-65.46418463, -62.15435759],
        [-55.47090179, -48.83929498],
        [-42.15814533, -35.51487853],
        [-28.8443483 , -22.16282302],
        [-11.06777203,  -8.82974077],
        [ -8.8943845 , -11.14661909],
        [-22.22302084, -28.83774459],
        [-35.5277977 , -42.14743418],
        [-48.83763128, -55.46163846],
        [-62.14984392, -65.4495664 ]])]

In [22]:
quantized_agent_policies[0].get_policy_map()

{0.0: 1,
 0.1111111111111111: 1,
 0.2222222222222222: 1,
 0.3333333333333333: 1,
 0.4444444444444444: 1,
 0.5555555555555556: -1,
 0.6666666666666666: -1,
 0.7777777777777777: -1,
 0.8888888888888888: -1,
 1.0: -1}

## Continuous State-Space Single-Agent (Noisy Transitions)

Similar to the above, except the transitions are no longer deterministic: zero-mean Gaussian noise (standard deviation $0.05$) is added to each step.

In [23]:
# Problem definition: continuous state, two actions, one agent.
state_space = ContinuousInterval(0, 1) # continuous state interval [0, 1]
action_space = DiscreteSpace([-1, 1]) # two actions

# The state space goes through a 10-bin uniform quantizer; the action space is wrapped in a NullQuantizer.
state_quantizer = UniformQuantizer(state_space, n_bins=10)
action_quantizer = NullQuantizer(action_space)

# The policy is handed to the algorithm inside a list (a single entry here, for the single agent).
q_policy = QuantizedPolicy(state_quantizer, action_quantizer, index_policy='random_init', exploration_prob=1)
quantized_agent_policies = [q_policy]

def transition_func(x, us):
    """Noisy step: move by 0.1 times the first action plus N(0, 0.05) noise, clipped to [0, 1].

    Draws one sample from NumPy's global random state per call.
    """
    drift = 0.1*us[0]
    noise = np.random.normal(0, 0.05)
    return np.clip(x + drift + noise, 0, 1)

# Bundle the state space, the list of per-agent action spaces and the transition function.
transition_kernel = TransitionKernel(state_space, [action_space], transition_func)

def reward_func(x, us):
    """Reward state values near 0.5 by penalising the absolute offset from 0.5.

    Returns -POS_REWARD_CONST * |x - 0.5|; the actions `us` are unused.
    """
    offset = x - 0.5
    penalty_scale = -POS_REWARD_CONST
    return penalty_scale * np.abs(offset)

# Reward functions are passed as a list (one entry here).
reward_funcs = [reward_func]

# NOTE(review): presumably the per-agent discount factor(s) — confirm against quantized_q_learning_alg.
betas = [0.25]

# NOTE(review): presumably the number of steps per exploration phase (the inner progress bar below shows max=100000) — confirm.
T = int(1e5)

# Maps n to 1/(n + 1); presumably the learning-rate schedule — confirm.
alpha_func = lambda n: 1/(n + 1)

n_exploration_phases = 1

# NOTE(review): meaning of `deltas` and `inertias` is defined in cts_multi_agent_learning, not visible here.
deltas = [1e-4]

inertias = [0]

# Always returns state 0, the lower end of the interval.
get_initial_state = lambda: 0

In [24]:
# Run the algorithm with early stopping disabled and unpack its three return values.
Qs, quantized_agent_policies, history = quantized_q_learning_alg(quantized_agent_policies, transition_kernel, get_initial_state,
                reward_funcs, betas, T, alpha_func, n_exploration_phases, deltas, inertias, early_stopping=False)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=100000.0), HTML(value='')))




In [25]:
Qs

[array([[-64.25987612, -60.85763855],
        [-52.99587546, -47.15328278],
        [-39.14474641, -33.13096662],
        [-24.5770633 , -19.22437353],
        [-10.16424514,  -7.94833485],
        [ -7.86499704, -10.18670733],
        [-19.26458286, -24.69953497],
        [-32.93313893, -39.13755279],
        [-47.23366227, -52.98356422],
        [-60.87010505, -64.21527874]])]

In [26]:
quantized_agent_policies[0].get_policy_map()

{0.0: 1,
 0.1111111111111111: 1,
 0.2222222222222222: 1,
 0.3333333333333333: 1,
 0.4444444444444444: 1,
 0.5555555555555556: -1,
 0.6666666666666666: -1,
 0.7777777777777777: -1,
 0.8888888888888888: -1,
 1.0: -1}

## Continuous State-Space Two-Agent

A 2-agent problem with $\mathbb{X} = [0,1]$ and $\mathbb{U}^1 = \mathbb{U}^2 = \{-1, 1\}$. The state moves forward by $0.1$ if the agents coordinate (play the same action), and backwards by $0.1$ if they don't coordinate. The reward function rewards states closer to $1$.

In [27]:
# Problem definition: continuous state shared by two agents, each with its own two-action space.
state_space = ContinuousInterval(0, 1) # continuous state interval [0, 1]
action_space1 = DiscreteSpace([-1, 1]) # two actions
action_space2 = DiscreteSpace([-1, 1]) # two actions

# Each agent gets its own 5-bin uniform quantizer over the shared state space.
state_quantizer1 = UniformQuantizer(state_space, n_bins=5)
state_quantizer2 = UniformQuantizer(state_space, n_bins=5)

action_quantizer1 = NullQuantizer(action_space1)
action_quantizer2 = NullQuantizer(action_space2)

# One policy per agent, both constructed with the same settings.
q_policy1 = QuantizedPolicy(state_quantizer1, action_quantizer1, index_policy='random_init', exploration_prob=0.2)
q_policy2 = QuantizedPolicy(state_quantizer2, action_quantizer2, index_policy='random_init', exploration_prob=0.2)

quantized_agent_policies = [q_policy1, q_policy2]

def transition_func(x, us):
    """Two-agent step: move by 0.1 times the product of the two actions, clipped to [0, 1].

    With actions in {-1, 1} the product is +1 when the agents coordinate
    (forward) and -1 otherwise (backwards).
    """
    first, second = us
    direction = first * second
    return np.clip(x + 0.1*direction, 0, 1)

# Bundle the state space, both agents' action spaces and the transition function.
transition_kernel = TransitionKernel(state_space, [action_space1, action_space2], transition_func)

def reward_func(x, us):
    """Reward high state values: POS_REWARD_CONST times the state `x` (`us` is unused)."""
    gain = POS_REWARD_CONST
    return gain * x

# Both agents use the same reward function.
reward_funcs = [reward_func, reward_func]

# NOTE(review): presumably one discount factor per agent — confirm against quantized_q_learning_alg.
betas = [0.75]*2

# NOTE(review): presumably the number of steps per exploration phase (the inner progress bars below show max=100000) — confirm.
T = int(1e5)

# Maps n to 1/(n + 1); presumably the learning-rate schedule — confirm.
alpha_func = lambda n: 1/(n + 1)

n_exploration_phases = 10

# NOTE(review): per-agent lists; meaning of `deltas` and `inertias` is defined in cts_multi_agent_learning, not visible here.
deltas = [1e-4]*2

# The two agents are given different inertia values.
inertias = [0.25, 0.75]

# Always returns state 0, the lower end of the interval.
get_initial_state = lambda: 0

In [28]:
# Run the algorithm with early stopping and verbose output disabled, and unpack its three return values.
Qs, quantized_agent_policies, history = quantized_q_learning_alg(quantized_agent_policies, transition_kernel, get_initial_state,
                reward_funcs, betas, T, alpha_func, n_exploration_phases, deltas, inertias, early_stopping=False, verbose=False)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=100000.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=100000.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=100000.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=100000.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=100000.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=100000.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=100000.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=100000.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=100000.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=100000.0), HTML(value='')))




In [29]:
Qs

[array([[  4.37711946,   3.82312053],
        [ 18.75      ,  21.015625  ],
        [119.16926372, 268.92358012],
        [247.49739128, 335.08580762],
        [289.93759143, 374.87854579]]),
 array([[  4.48580665,   4.4800425 ],
        [  0.        ,  24.375     ],
        [259.76402871, 289.87736404],
        [318.07480712, 341.7692545 ],
        [364.62319351, 367.57987777]])]

In [30]:
quantized_agent_policies[0].get_policy_map(), quantized_agent_policies[1].get_policy_map()

({0.0: -1, 0.25: 1, 0.5: 1, 0.75: 1, 1.0: 1},
 {0.0: -1, 0.25: 1, 0.5: 1, 0.75: 1, 1.0: 1})