In [14]:
ReloadProject('deep_learning')

notebook_init.py imported and reloaded
forwarded symbol: Activation
forwarded symbol: Dense
forwarded symbol: Dict
forwarded symbol: InputLayer
forwarded symbol: List
forwarded symbol: Model
forwarded symbol: Sequential
forwarded symbol: Tuple
reloaded: gym
forwarded symbol: gym
reloaded: keras
forwarded symbol: keras
reloaded: model_optimization
forwarded symbol: model_optimization
reloaded: openai_wrapper
forwarded symbol: openai_wrapper
reloaded: policy_impl
forwarded symbol: policy_impl
reloaded: q_function_memoization
forwarded symbol: q_function_memoization
reloaded: q_learning
forwarded symbol: q_learning
reloaded: q_learning_impl
forwarded symbol: q_learning_impl
reloaded: q_learning_impl_v2
forwarded symbol: q_learning_impl_v2
reloaded: q_learning_v2
forwarded symbol: q_learning_v2


## Environment Setup
Let's assume a world with 11 states: 0-10. Each time the agent and move +1 or -1, with 0-1 -> 10 and 10+1 -> 0. All actions that gets the agent closer to state "5" gets reward +1, otherwise gets reward -1.

In [18]:
STATE_ZERO_ARRAY = np.zeros(1, dtype=int)
TARGET_STATE = 5


class CircularWorld(q_learning_v2.Environment):
    
    def __init__(self):
        super().__init__(state_array_size=1, action_space_size=3)

        self.debug_verbosity = 0
        
        # action encoding
        self._action_minus = 0
        self._action_stay = 1
        self._action_plus = 2

        
    #@ Override
    def TakeAction(self, action: q_learning_v2.Action) -> q_learning_v2.Reward:
        current_state = self.GetState()
        new_state = current_state
        reward = 0
        if action == self._action_plus:
            if current_state < TARGET_STATE:
                reward = 1.0
            else:
                reward = -1.0
            new_state = current_state + 1
            if new_state == 11:
                new_state = STATE_ZERO_ARRAY
        elif action == self._action_minus:
            if current_state > TARGET_STATE:
                reward = 1.0
            else:
                reward = -1.0
            new_state = current_state - 1
            if new_state == -1:
                new_state = STATE_ZERO_ARRAY + 10
        else:
            if current_state != TARGET_STATE:
                reward = -1.0
            else:
                reward = 1.0

        self._protected_SetState(new_state)
        if self.debug_verbosity >= 1:
            print('Action %s: (%s) -> (%s), reward: %s' % (
                action, current_state, new_state, reward))
        return reward

Let's try out the environment.

In [3]:
env = CircularWorld()
env.debug_verbosity = 10
for _ in range(20):
    env.TakeAction(np.random.choice(env.GetActionSpace()))

Action 0: ([0.]) -> ([10]), reward: -1.0
Action 0: ([10]) -> ([9]), reward: 1.0
Action 0: ([9]) -> ([8]), reward: 1.0
Action 2: ([8]) -> ([9]), reward: -1.0
Action 1: ([9]) -> ([9]), reward: -1.0
Action 1: ([9]) -> ([9]), reward: -1.0
Action 2: ([9]) -> ([10]), reward: -1.0
Action 1: ([10]) -> ([10]), reward: -1.0
Action 2: ([10]) -> ([0]), reward: -1.0
Action 2: ([0]) -> ([1]), reward: 1.0
Action 2: ([1]) -> ([2]), reward: 1.0
Action 1: ([2]) -> ([2]), reward: -1.0
Action 2: ([2]) -> ([3]), reward: 1.0
Action 2: ([3]) -> ([4]), reward: 1.0
Action 2: ([4]) -> ([5]), reward: 1.0
Action 1: ([5]) -> ([5]), reward: 1.0
Action 1: ([5]) -> ([5]), reward: 1.0
Action 2: ([5]) -> ([6]), reward: -1.0
Action 2: ([6]) -> ([7]), reward: -1.0
Action 1: ([7]) -> ([7]), reward: -1.0


## Learning

### Single model approach

In [37]:
%%time

env = CircularWorld()
qfunc = q_learning_impl_v2.KerasModelQFunction(
    env, (20, 20, 20), learning_rate=0.9, discount_factor=0.9)
policy = q_learning_impl_v2.MaxValueWithRandomnessPolicy(certainty = 0.95)

# Training.
for _ in range(4000):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)
    
# Testing.
env.debug_verbosity = 5
qfunc.SetDebugVerbosity(5)
policy.debug_verbosity = 5
for _ in range(20):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)


GET: ([3], 0) -> [[-0.20680887]]
GET: ([3], 1) -> [[-0.20680887]]
GET: ([3], 2) -> [[0.8173575]]
<use random choice>
Action 0: ([3]) -> ([2]), reward: -1.0
GET: ([2], 0) -> [[-0.20680887]]
GET: ([2], 1) -> [[0.10594125]]
GET: ([2], 2) -> [[1.6265708]]
GET: ([3], 0) -> [[-0.20680887]]
SET: ([3], 0) <- [[0.39684144]]
GET: ([2], 0) -> [[-0.18448368]]
GET: ([2], 1) -> [[0.13334492]]
GET: ([2], 2) -> [[1.6477598]]
<use random choice>
Action 1: ([2]) -> ([2]), reward: -1.0
GET: ([2], 0) -> [[-0.18448368]]
GET: ([2], 1) -> [[0.13334492]]
GET: ([2], 2) -> [[1.6477598]]
GET: ([2], 1) -> [[0.13334492]]
SET: ([2], 1) <- [[0.44801992]]
GET: ([2], 0) -> [[-0.16990179]]
GET: ([2], 1) -> [[0.3333642]]
GET: ([2], 2) -> [[1.7631308]]
Action 2: ([2]) -> ([3]), reward: 1.0
GET: ([3], 0) -> [[-0.16990179]]
GET: ([3], 1) -> [[-0.16990179]]
GET: ([3], 2) -> [[0.97960705]]
GET: ([2], 2) -> [[1.7631308]]
SET: ([2], 2) <- [[1.8697947]]
GET: ([3], 0) -> [[-0.16578543]]
GET: ([3], 1) -> [[-0.16578543]]
GET: ([3]

#### Conclusion

This approach fails for model with structure (6, 6, 6) and for model with structure (20, 20, 20), probably because changing the value for one action also changes the values for other actions, making it either long to train the model or other tricks might be needed.

### Multi-head model approach
Next let's try to use a multi-head model. In this case each action has its own model and we only update the weights in the model the policy picked to avoid changing weight for the models for other actions. The models share some common layers to possibly support common features.

In [35]:
%%time

env = CircularWorld()
qfunc = q_learning_impl_v2.MultiModelQFunction(
    env, (), (20, 20, 20), learning_rate=0.9, discount_factor=0.9)
policy = q_learning_impl_v2.MaxValueWithRandomnessPolicy(certainty = 0.95)

# Train.
for _ in range(4000):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)

# Test.
env.debug_verbosity = 5
qfunc.debug_verbosity = 5
policy.debug_verbosity = 5
for _ in range(20):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)


GET: ([3], 0) -> [[0.05665702]]
GET: ([3], 1) -> [[1.788313]]
GET: ([3], 2) -> [[1.7116532]]
Action 1: ([3]) -> ([3]), reward: -1.0
GET: ([3], 0) -> [[0.05665702]]
GET: ([3], 1) -> [[1.788313]]
GET: ([3], 2) -> [[1.7116532]]
GET: ([3], 1) -> [[1.788313]]
SET: ([3], 1) <- [[0.7273648]]
GET: ([3], 0) -> [[0.05665702]]
GET: ([3], 1) -> [[1.767094]]
GET: ([3], 2) -> [[1.7116532]]
Action 1: ([3]) -> ([3]), reward: -1.0
GET: ([3], 0) -> [[0.05665702]]
GET: ([3], 1) -> [[1.767094]]
GET: ([3], 2) -> [[1.7116532]]
GET: ([3], 1) -> [[1.767094]]
SET: ([3], 1) <- [[0.70805556]]
GET: ([3], 0) -> [[0.05665702]]
GET: ([3], 1) -> [[1.7459133]]
GET: ([3], 2) -> [[1.7116532]]
Action 1: ([3]) -> ([3]), reward: -1.0
GET: ([3], 0) -> [[0.05665702]]
GET: ([3], 1) -> [[1.7459133]]
GET: ([3], 2) -> [[1.7116532]]
GET: ([3], 1) -> [[1.7459133]]
SET: ([3], 1) <- [[0.688781]]
GET: ([3], 0) -> [[0.05665702]]
GET: ([3], 1) -> [[1.7247707]]
GET: ([3], 2) -> [[1.7116532]]
Action 1: ([3]) -> ([3]), reward: -1.0
GET: (

Note: this approach does not work when the towers have the "wrong" shape, like (3, 3, 3), which is not a good model to fit a sophisticated function. See "keras_function_fit_discountinous" notebook.

In [39]:
%%time

env = CircularWorld()
qfunc = q_learning_impl_v2.MultiModelQFunction(
    env, (200,), (20, 20, 20), learning_rate=0.9, discount_factor=0.9)
policy = q_learning_impl_v2.MaxValueWithRandomnessPolicy(certainty = 0.95)

# Train.
for _ in range(4000):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)

# Test.
env.debug_verbosity = 5
qfunc.debug_verbosity = 5
policy.debug_verbosity = 5
for _ in range(20):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)


GET: ([5], 0) -> [[-0.27712864]]
GET: ([5], 1) -> [[-0.49954152]]
GET: ([5], 2) -> [[-1.7212956]]
Action 0: ([5]) -> ([4]), reward: -1.0
GET: ([4], 0) -> [[-0.27712864]]
GET: ([4], 1) -> [[-0.4995423]]
GET: ([4], 2) -> [[-0.10966682]]
GET: ([5], 0) -> [[-0.27712864]]
SET: ([5], 0) <- [[-1.016543]]
GET: ([4], 0) -> [[-0.2965436]]
GET: ([4], 1) -> [[-0.4995454]]
GET: ([4], 2) -> [[-0.10965633]]
Action 2: ([4]) -> ([5]), reward: 1.0
GET: ([5], 0) -> [[-0.2965436]]
GET: ([5], 1) -> [[-0.49954462]]
GET: ([5], 2) -> [[-1.7212946]]
GET: ([4], 2) -> [[-0.10965633]]
SET: ([4], 2) <- [[0.64883405]]
GET: ([5], 0) -> [[-0.2965575]]
GET: ([5], 1) -> [[-0.4997524]]
GET: ([5], 2) -> [[-0.35255474]]
Action 0: ([5]) -> ([4]), reward: -1.0
GET: ([4], 0) -> [[-0.2965575]]
GET: ([4], 1) -> [[-0.49973]]
GET: ([4], 2) -> [[0.76266855]]
GET: ([5], 0) -> [[-0.2965575]]
SET: ([5], 0) <- [[-0.31189424]]
GET: ([4], 0) -> [[-0.29696116]]
GET: ([4], 1) -> [[-0.49973005]]
GET: ([4], 2) -> [[0.76266855]]
Action 2: (

In [40]:
%%time

env = CircularWorld()
qfunc = q_learning_impl_v2.MultiModelQFunction(
    env, (), (20, 20, 20), learning_rate=0.9, discount_factor=0.9)
policy = q_learning_impl_v2.MaxValuePolicy()

# Train.
for _ in range(4000):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)

# Test.
env.debug_verbosity = 5
qfunc.debug_verbosity = 5
policy.debug_verbosity = 5
for _ in range(20):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)


GET: ([4], 0) -> [[-1.1329795]]
GET: ([4], 1) -> [[-2.8374398]]
GET: ([4], 2) -> [[-0.87958753]]
Action 2: ([4]) -> ([5]), reward: 1.0
GET: ([5], 0) -> [[-1.1795522]]
GET: ([5], 1) -> [[-3.3028107]]
GET: ([5], 2) -> [[-0.87958753]]
GET: ([4], 2) -> [[-0.87958753]]
SET: ([4], 2) <- [[0.09957534]]
GET: ([5], 0) -> [[-1.1795522]]
GET: ([5], 1) -> [[-3.3028107]]
GET: ([5], 2) -> [[-0.86000425]]
Action 2: ([5]) -> ([6]), reward: -1.0
GET: ([6], 0) -> [[-1.226125]]
GET: ([6], 1) -> [[-3.768182]]
GET: ([6], 2) -> [[-0.86000425]]
GET: ([5], 2) -> [[-0.86000425]]
SET: ([5], 2) <- [[-1.6826037]]
GET: ([6], 0) -> [[-1.226125]]
GET: ([6], 1) -> [[-3.768182]]
GET: ([6], 2) -> [[-0.87645626]]
Action 2: ([6]) -> ([7]), reward: -1.0
GET: ([7], 0) -> [[-1.2726977]]
GET: ([7], 1) -> [[-4.2335534]]
GET: ([7], 2) -> [[-0.87645626]]
GET: ([6], 2) -> [[-0.87645626]]
SET: ([6], 2) <- [[-1.6975752]]
GET: ([7], 0) -> [[-1.2726977]]
GET: ([7], 1) -> [[-4.2335534]]
GET: ([7], 2) -> [[-0.89287865]]
Action 2: ([7]

In [44]:
%%time

env = CircularWorld()
qfunc = q_learning_impl_v2.MultiModelQFunction(
    env, (), (40, 40, 40), learning_rate=0.9, discount_factor=0.9)
policy = q_learning_impl_v2.RandomActionPolicy()

# Train.
for _ in range(10000):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)

# Test.
env.debug_verbosity = 5
qfunc.debug_verbosity = 5
policy = q_learning_impl_v2.MaxValuePolicy()
policy.debug_verbosity = 5
for _ in range(20):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)


GET: ([10], 0) -> [[5.6680374]]
GET: ([10], 1) -> [[3.3310022]]
GET: ([10], 2) -> [[3.7245948]]
Action 0: ([10]) -> ([9]), reward: 1.0
GET: ([9], 0) -> [[5.1856256]]
GET: ([9], 1) -> [[3.3310022]]
GET: ([9], 2) -> [[3.7245948]]
GET: ([10], 0) -> [[5.6680374]]
SET: ([10], 0) <- [[5.6671605]]
GET: ([9], 0) -> [[5.183048]]
GET: ([9], 1) -> [[3.3310022]]
GET: ([9], 2) -> [[3.7245948]]
Action 0: ([9]) -> ([8]), reward: 1.0
GET: ([8], 0) -> [[4.6989355]]
GET: ([8], 1) -> [[3.3310022]]
GET: ([8], 2) -> [[3.7245948]]
GET: ([9], 0) -> [[5.183048]]
SET: ([9], 0) <- [[5.2244425]]
GET: ([8], 0) -> [[4.795692]]
GET: ([8], 1) -> [[3.3310022]]
GET: ([8], 2) -> [[3.7245948]]
Action 0: ([8]) -> ([7]), reward: 1.0
GET: ([7], 0) -> [[4.2973275]]
GET: ([7], 1) -> [[3.3310022]]
GET: ([7], 2) -> [[3.7245948]]
GET: ([8], 0) -> [[4.795692]]
SET: ([8], 0) <- [[4.860404]]
GET: ([7], 0) -> [[4.420882]]
GET: ([7], 1) -> [[3.3310022]]
GET: ([7], 2) -> [[3.7245948]]
Action 0: ([7]) -> ([6]), reward: 1.0
GET: ([6], 

In [45]:
%%time

env = CircularWorld()
qfunc = q_learning_impl_v2.MultiModelQFunction(
    env, (), (20, 20, 20), learning_rate=0.9, discount_factor=0.9)
policy = q_learning_impl_v2.MaxValueWithRandomnessPolicy(certainty = 0.95)

# Train.
for _ in range(4000):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)

# Test.
env.debug_verbosity = 5
qfunc.debug_verbosity = 5
policy.debug_verbosity = 5
for _ in range(20):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)


GET: ([6], 0) -> [[-0.27988243]]
GET: ([6], 1) -> [[-1.3076097]]
GET: ([6], 2) -> [[-1.069]]
Action 0: ([6]) -> ([5]), reward: 1.0
GET: ([5], 0) -> [[-1.4874269]]
GET: ([5], 1) -> [[-1.2247055]]
GET: ([5], 2) -> [[-1.069]]
GET: ([6], 0) -> [[-0.27988243]]
SET: ([6], 0) <- [[0.00612178]]
GET: ([5], 0) -> [[-1.049413]]
GET: ([5], 1) -> [[-1.2247055]]
GET: ([5], 2) -> [[-1.069]]
Action 0: ([5]) -> ([4]), reward: -1.0
GET: ([4], 0) -> [[-1.4944894]]
GET: ([4], 1) -> [[-1.1398618]]
GET: ([4], 2) -> [[-1.069]]
GET: ([5], 0) -> [[-1.049413]]
SET: ([5], 0) <- [[-1.8708313]]
GET: ([4], 0) -> [[-1.5116377]]
GET: ([4], 1) -> [[-1.1398618]]
GET: ([4], 2) -> [[-1.069]]
Action 2: ([4]) -> ([5]), reward: 1.0
GET: ([5], 0) -> [[-1.5229793]]
GET: ([5], 1) -> [[-1.2247055]]
GET: ([5], 2) -> [[-1.069]]
GET: ([4], 2) -> [[-1.069]]
SET: ([4], 2) <- [[-0.07278997]]
GET: ([5], 0) -> [[-1.5229793]]
GET: ([5], 1) -> [[-1.2247055]]
GET: ([5], 2) -> [[-1.0490758]]
<use random choice>
Action 0: ([5]) -> ([4]), re

In [46]:
%%time

env = CircularWorld()
qfunc = q_learning_impl_v2.MultiModelQFunction(
    env, (), (20, 20, 20), learning_rate=0.9, discount_factor=0.9)
policy = q_learning_impl_v2.MaxValueWithRandomnessPolicy(certainty = 0.95)

# Train.
for _ in range(200000):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)

# Test.
env.debug_verbosity = 5
qfunc.debug_verbosity = 5
policy.debug_verbosity = 5
for _ in range(20):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)


GET: ([5], 0) -> [[1.7226231]]
GET: ([5], 1) -> [[3.7842224]]
GET: ([5], 2) -> [[1.5472215]]
Action 1: ([5]) -> ([5]), reward: 1.0
GET: ([5], 0) -> [[1.7226231]]
GET: ([5], 1) -> [[3.7842224]]
GET: ([5], 2) -> [[1.5472215]]
GET: ([5], 1) -> [[3.7842224]]
SET: ([5], 1) <- [[4.3436418]]
GET: ([5], 0) -> [[1.7226231]]
GET: ([5], 1) -> [[3.7954106]]
GET: ([5], 2) -> [[1.5472215]]
Action 1: ([5]) -> ([5]), reward: 1.0
GET: ([5], 0) -> [[1.7226231]]
GET: ([5], 1) -> [[3.7954106]]
GET: ([5], 2) -> [[1.5472215]]
GET: ([5], 1) -> [[3.7954106]]
SET: ([5], 1) <- [[4.3538237]]
GET: ([5], 0) -> [[1.7226231]]
GET: ([5], 1) -> [[3.8065789]]
GET: ([5], 2) -> [[1.5472215]]
Action 1: ([5]) -> ([5]), reward: 1.0
GET: ([5], 0) -> [[1.7226231]]
GET: ([5], 1) -> [[3.8065789]]
GET: ([5], 2) -> [[1.5472215]]
GET: ([5], 1) -> [[3.8065789]]
SET: ([5], 1) <- [[4.3639865]]
GET: ([5], 0) -> [[1.7226231]]
GET: ([5], 1) -> [[3.817727]]
GET: ([5], 2) -> [[1.5472215]]
Action 1: ([5]) -> ([5]), reward: 1.0
GET: ([5], 0

In [47]:
%%time

env = CircularWorld()
qfunc = q_learning_impl_v2.MultiModelQFunction(
    env, (), (20, 20, 20), learning_rate=0.9, discount_factor=0.9)
policy = q_learning_impl_v2.RandomActionPolicy()

# Train.
for _ in range(400000):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)

# Test.
env.debug_verbosity = 5
qfunc.debug_verbosity = 5
policy = q_learning_impl_v2.MaxValuePolicy()
policy.debug_verbosity = 5
for _ in range(20):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)


GET: ([6], 0) -> [[nan]]
GET: ([6], 1) -> [[nan]]
GET: ([6], 2) -> [[nan]]
Action None: ([6]) -> ([6]), reward: -1.0
GET: ([6], 0) -> [[nan]]
GET: ([6], 1) -> [[nan]]
GET: ([6], 2) -> [[nan]]


TypeError: tuple indices must be integers or slices, not NoneType

In [50]:
%%time

env = CircularWorld()
qfunc = q_learning_impl_v2.MultiModelQFunction(
    env, (), (20, 20, 20), learning_rate=0.9, discount_factor=0.95)
policy = q_learning_impl_v2.MaxValueWithRandomnessPolicy(certainty = 0.95)

# Train.
for _ in range(10000):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)

# Test.
env.debug_verbosity = 5
qfunc.debug_verbosity = 5
policy.debug_verbosity = 5
for _ in range(20):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)

GET: ([5], 0) -> [[1.1370602]]
GET: ([5], 1) -> [[1.7409459]]
GET: ([5], 2) -> [[-0.00569913]]
Action 1: ([5]) -> ([5]), reward: 1.0
GET: ([5], 0) -> [[1.1370602]]
GET: ([5], 1) -> [[1.7409459]]
GET: ([5], 2) -> [[-0.00569913]]
GET: ([5], 1) -> [[1.7409459]]
SET: ([5], 1) <- [[2.5626035]]
GET: ([5], 0) -> [[1.1370602]]
GET: ([5], 1) -> [[1.757379]]
GET: ([5], 2) -> [[-0.00569913]]
Action 1: ([5]) -> ([5]), reward: 1.0
GET: ([5], 0) -> [[1.1370602]]
GET: ([5], 1) -> [[1.757379]]
GET: ([5], 2) -> [[-0.00569913]]
GET: ([5], 1) -> [[1.757379]]
SET: ([5], 1) <- [[2.578297]]
GET: ([5], 0) -> [[1.1370602]]
GET: ([5], 1) -> [[1.7737974]]
GET: ([5], 2) -> [[-0.00569913]]
Action 1: ([5]) -> ([5]), reward: 1.0
GET: ([5], 0) -> [[1.1370602]]
GET: ([5], 1) -> [[1.7737974]]
GET: ([5], 2) -> [[-0.00569913]]
GET: ([5], 1) -> [[1.7737974]]
SET: ([5], 1) <- [[2.5939765]]
GET: ([5], 0) -> [[1.1370602]]
GET: ([5], 1) -> [[1.790201]]
GET: ([5], 2) -> [[-0.00569913]]
Action 1: ([5]) -> ([5]), reward: 1.0
GE

In [51]:
%%time

env = CircularWorld()
qfunc = q_learning_impl_v2.MultiModelQFunction(
    env, (), (20, 20, 20), learning_rate=0.9, discount_factor=0.95)
policy = q_learning_impl_v2.MaxValueWithRandomnessPolicy(certainty = 0.8)

# Train.
for _ in range(10000):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)

# Test.
env.debug_verbosity = 5
qfunc.debug_verbosity = 5
policy.debug_verbosity = 5
for _ in range(20):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)

GET: ([2], 0) -> [[-1.8807478]]
GET: ([2], 1) -> [[-2.562942]]
GET: ([2], 2) -> [[-1.8433678]]
Action 2: ([2]) -> ([3]), reward: 1.0
GET: ([3], 0) -> [[-1.8807478]]
GET: ([3], 1) -> [[-2.465203]]
GET: ([3], 2) -> [[-1.8433678]]
GET: ([2], 2) -> [[-1.8433678]]
SET: ([2], 2) <- [[-0.8604162]]
GET: ([3], 0) -> [[-1.8807478]]
GET: ([3], 1) -> [[-2.465203]]
GET: ([3], 2) -> [[-1.8237088]]
Action 2: ([3]) -> ([4]), reward: 1.0
GET: ([4], 0) -> [[-1.8807478]]
GET: ([4], 1) -> [[-2.4477901]]
GET: ([4], 2) -> [[-1.8237088]]
GET: ([3], 2) -> [[-1.8237088]]
SET: ([3], 2) <- [[-0.8416419]]
GET: ([4], 0) -> [[-1.8807478]]
GET: ([4], 1) -> [[-2.4477901]]
GET: ([4], 2) -> [[-1.8040675]]
Action 2: ([4]) -> ([5]), reward: 1.0
GET: ([5], 0) -> [[-1.8807478]]
GET: ([5], 1) -> [[-2.4818978]]
GET: ([5], 2) -> [[-1.8040675]]
GET: ([4], 2) -> [[-1.8040675]]
SET: ([4], 2) <- [[-0.82288444]]
GET: ([5], 0) -> [[-1.8807478]]
GET: ([5], 1) -> [[-2.4818978]]
GET: ([5], 2) -> [[-1.7844439]]
Action 2: ([5]) -> ([6])

In [59]:
%%time

env = CircularWorld()
qfunc = q_learning_impl_v2.MultiModelQFunction(
    env, (), (20, 40, 20), learning_rate=0.9, discount_factor=0.95)
policy = q_learning_impl_v2.MaxValueWithRandomnessPolicy(certainty = 0.95)

# Train.
q_learning_v2.Run(env=env, qfunc=qfunc, policy=policy, num_of_runs=100000, debug_verbosity=0)

# Test.
q_learning_v2.Run(env=env, qfunc=qfunc, policy=policy, num_of_runs=30, debug_verbosity=10)

GET: ([1], 0) -> [[-1.8553413]]
GET: ([1], 1) -> [[-2.5736475]]
GET: ([1], 2) -> [[-2.1607256]]
Action 0: ([1]) -> ([0]), reward: -1.0
GET: ([0], 0) -> [[-1.8553413]]
GET: ([0], 1) -> [[-2.5736475]]
GET: ([0], 2) -> [[-2.1607256]]
GET: ([1], 0) -> [[-1.8553413]]
SET: ([1], 0) <- [[-2.671851]]
GET: ([0], 0) -> [[-1.8716716]]
GET: ([0], 1) -> [[-2.5736475]]
GET: ([0], 2) -> [[-2.1607256]]
Action 0: ([0]) -> ([10]), reward: -1.0
GET: ([10], 0) -> [[-1.8716716]]
GET: ([10], 1) -> [[-2.5736475]]
GET: ([10], 2) -> [[-2.1607256]]
GET: ([0], 0) -> [[-1.8716716]]
SET: ([0], 0) <- [[-2.6874464]]
GET: ([10], 0) -> [[-1.887987]]
GET: ([10], 1) -> [[-2.5736475]]
GET: ([10], 2) -> [[-2.1607256]]
Action 0: ([10]) -> ([9]), reward: 1.0
GET: ([9], 0) -> [[-1.887987]]
GET: ([9], 1) -> [[-2.5736475]]
GET: ([9], 2) -> [[-2.1607256]]
GET: ([10], 0) -> [[-1.887987]]
SET: ([10], 0) <- [[-0.9030276]]
GET: ([9], 0) -> [[-1.8682878]]
GET: ([9], 1) -> [[-2.5736475]]
GET: ([9], 2) -> [[-2.1607256]]
Action 0: ([9]

#### Conclusion

This model only works when using model of structure (20, 20, 20) with MaxValueWithRandomnessPolicy, with about 20% chance. It does not work when:
* A shared layer of size 200 is added.
* When using MaxValuePolicy.

### Memoization Q-Function

Before giving up (or looking up others' answers), try to solve it using memoization which should work.

In [69]:
%%time

env = CircularWorld()
qfunc = q_learning_impl_v2.MemoizationQFunction(env, learning_rate=0.9, discount_factor=0.95)
policy = q_learning_impl_v2.RandomActionPolicy()

# Train.
q_learning_v2.Run(env=env, qfunc=qfunc, policy=policy, num_of_runs=2000, debug_verbosity=0)

# Test.
policy = q_learning_impl_v2.MaxValuePolicy()
q_learning_v2.Run(env=env, qfunc=qfunc, policy=policy, num_of_runs=20, debug_verbosity=10)

GET: ([10], 0) -> 18.74918778124353
GET: ([10], 1) -> 16.805799929635178
GET: ([10], 2) -> 16.767478513474323
Action 0: ([10]) -> ([9]), reward: 1.0
GET: ([9], 0) -> 18.690654121968542
GET: ([9], 1) -> 16.755417279948894
GET: ([9], 2) -> 16.80580008462163
GET: ([10], 0) -> 18.74918778124353
SET: ([10], 0) <- 18.755428052407456
GET: ([9], 0) -> 18.690654121968542
GET: ([9], 1) -> 16.755417279948894
GET: ([9], 2) -> 16.80580008462163
Action 0: ([9]) -> ([8]), reward: 1.0
GET: ([8], 0) -> 18.63017051205492
GET: ([8], 1) -> 16.698653968445157
GET: ([8], 2) -> 16.74926599912424
GET: ([9], 0) -> 18.690654121968542
SET: ([9], 0) <- 18.69786120000381
GET: ([8], 0) -> 18.63017051205492
GET: ([8], 1) -> 16.698653968445157
GET: ([8], 2) -> 16.74926599912424
Action 0: ([8]) -> ([7]), reward: 1.0
GET: ([7], 0) -> 18.558214595193352
GET: ([7], 1) -> 16.630249377862487
GET: ([7], 2) -> 16.6983664865845
GET: ([8], 0) -> 18.63017051205492
SET: ([8], 0) <- 18.63029053009581
GET: ([7], 0) -> 18.558214595

In [70]:
qfunc._values

{('[0.]', 0): -0.9,
 ('[0.]', 1): -0.9,
 ('[0]', 0): 16.81113448413953,
 ('[0]', 1): 16.76932944287297,
 ('[0]', 2): 18.704793430219837,
 ('[10]', 0): 18.755428052407456,
 ('[10]', 1): 16.805799929635178,
 ('[10]', 2): 16.767478513474323,
 ('[1]', 0): 16.76949753944251,
 ('[1]', 1): 16.70479344987841,
 ('[1]', 2): 18.63664742791818,
 ('[2]', 0): 16.661546960032883,
 ('[2]', 1): 16.638919241763105,
 ('[2]', 2): 18.571498684648986,
 ('[3]', 0): 16.641759338883254,
 ('[3]', 1): 16.540572009478222,
 ('[3]', 2): 18.497868776567387,
 ('[4]', 0): 16.571616357761524,
 ('[4]', 1): 16.497835468411928,
 ('[4]', 2): 18.489420183917414,
 ('[5]', 0): 16.552774274303278,
 ('[5]', 1): 19.20335285048735,
 ('[5]', 2): 16.55272820401801,
 ('[6]', 0): 18.48941881143937,
 ('[6]', 1): 16.49657950709374,
 ('[6]', 2): 16.629201592636615,
 ('[7]', 0): 18.558788088657707,
 ('[7]', 1): 16.630249377862487,
 ('[7]', 2): 16.6983664865845,
 ('[8]', 0): 18.63029053009581,
 ('[8]', 1): 16.698653968445157,
 ('[8]', 2):

This shows that we can learn a policy using a random policy (not necessarily with the MaxValueWithRandomness policy).

In [71]:
%%time

env = CircularWorld()
qfunc = q_learning_impl_v2.MemoizationQFunction(env, learning_rate=0.9, discount_factor=0.95)

# Train.
policy = q_learning_impl_v2.MaxValueWithRandomnessPolicy(certainty=0.9)
q_learning_v2.Run(env=env, qfunc=qfunc, policy=policy, num_of_runs=2000, debug_verbosity=0)

# Test.
policy = q_learning_impl_v2.MaxValuePolicy()
q_learning_v2.Run(env=env, qfunc=qfunc, policy=policy, num_of_runs=20, debug_verbosity=10)

GET: ([5], 0) -> 17.999999999999883
GET: ([5], 1) -> 19.99999999999988
GET: ([5], 2) -> 17.999999999999883
Action 1: ([5]) -> ([5]), reward: 1.0
GET: ([5], 0) -> 17.999999999999883
GET: ([5], 1) -> 19.99999999999988
GET: ([5], 2) -> 17.999999999999883
GET: ([5], 1) -> 19.99999999999988
SET: ([5], 1) <- 19.99999999999988
GET: ([5], 0) -> 17.999999999999883
GET: ([5], 1) -> 19.99999999999988
GET: ([5], 2) -> 17.999999999999883
Action 1: ([5]) -> ([5]), reward: 1.0
GET: ([5], 0) -> 17.999999999999883
GET: ([5], 1) -> 19.99999999999988
GET: ([5], 2) -> 17.999999999999883
GET: ([5], 1) -> 19.99999999999988
SET: ([5], 1) <- 19.99999999999988
GET: ([5], 0) -> 17.999999999999883
GET: ([5], 1) -> 19.99999999999988
GET: ([5], 2) -> 17.999999999999883
Action 1: ([5]) -> ([5]), reward: 1.0
GET: ([5], 0) -> 17.999999999999883
GET: ([5], 1) -> 19.99999999999988
GET: ([5], 2) -> 17.999999999999883
GET: ([5], 1) -> 19.99999999999988
SET: ([5], 1) <- 19.99999999999988
GET: ([5], 0) -> 17.99999999999988

In [72]:
qfunc._values

{('[0.]', 0): -0.9,
 ('[0]', 0): -0.9,
 ('[0]', 1): -0.9,
 ('[0]', 2): 0.9,
 ('[10]', 0): 0.9,
 ('[10]', 2): -0.9,
 ('[1]', 0): -0.9,
 ('[1]', 1): -0.99,
 ('[1]', 2): 0.9,
 ('[2]', 0): -0.9,
 ('[2]', 1): -0.9,
 ('[2]', 2): 0.9,
 ('[3]', 0): -0.9,
 ('[3]', 1): -0.9,
 ('[3]', 2): 19.99998089999988,
 ('[4]', 0): 17.999001269999887,
 ('[4]', 1): -0.9,
 ('[4]', 2): 19.99999999999988,
 ('[5]', 0): 17.999999999999883,
 ('[5]', 1): 19.99999999999988,
 ('[5]', 2): 17.999999999999883,
 ('[6]', 0): 19.99999999999988,
 ('[6]', 1): 17.819999999999887,
 ('[6]', 2): 17.99165473040157,
 ('[7]', 0): 19.99980899929089,
 ('[8]', 0): 0.9,
 ('[9]', 0): 0.9}

MaxValueWithRandomnessPolicy of course works as well.

### Some more random trials

Try that if increasing input space helps the problem. We do NOT want value at (for example) (4, 1) to change when we update (5, 1), which is one problem from previous tries.

In [15]:
STATE_ZERO_ARRAY = np.zeros(1, dtype=int)
TARGET_STATE_10 = 50


class CircularWorld10(q_learning_v2.Environment):
    
    def __init__(self):
        super().__init__(state_array_size=1, action_space_size=3)
                
        self.debug_verbosity = 0
        
        # action encoding
        self._action_minus = 0
        self._action_stay = 1
        self._action_plus = 2

        
    #@ Override
    def TakeAction(self, action: q_learning_v2.Action) -> q_learning_v2.Reward:
        current_state = self.GetState()
        new_state = current_state
        reward = 0
        if action == self._action_plus:
            if current_state < TARGET_STATE_10:
                reward = 1.0
            else:
                reward = -1.0
            new_state = current_state + 10
            if new_state == 110:
                new_state = STATE_ZERO_ARRAY
        elif action == self._action_minus:
            if current_state > TARGET_STATE_10:
                reward = 1.0
            else:
                reward = -1.0
            new_state = current_state - 10
            if new_state == -10:
                new_state = STATE_ZERO_ARRAY + 100
        else:
            if current_state != TARGET_STATE_10:
                reward = -1.0
            else:
                reward = 1.0

        self._protected_SetState(new_state)
        if self.debug_verbosity >= 1:
            print('Action %s: (%s) -> (%s), reward: %s' % (
                action, current_state, new_state, reward))
        return reward

In [18]:
%%time

env = CircularWorld10()
qfunc = q_learning_impl_v2.MultiModelQFunction(
    env, (), (20, 20, 20), activation='tanh', learning_rate=0.9, discount_factor=0.95)
policy = q_learning_impl_v2.MaxValueWithRandomnessPolicy(certainty = 0.95)

# Train.
q_learning_v2.Run(env=env, qfunc=qfunc, policy=policy, num_of_runs=20000, debug_verbosity=0)

# Test.
q_learning_v2.Run(env=env, qfunc=qfunc, policy=policy, num_of_runs=20, debug_verbosity=10)

GET: ([40], 0) -> [[-0.30476415]]
GET: ([40], 1) -> [[-0.96148694]]
GET: ([40], 2) -> [[-0.15803918]]
Action 2: ([40]) -> ([50]), reward: 1.0
GET: ([50], 0) -> [[-0.30476415]]
GET: ([50], 1) -> [[-0.96148694]]
GET: ([50], 2) -> [[-0.15803918]]
GET: ([40], 2) -> [[-0.15803918]]
SET: ([40], 2) <- [[0.74907255]]
GET: ([50], 0) -> [[-0.30476415]]
GET: ([50], 1) -> [[-0.96148694]]
GET: ([50], 2) -> [[-0.10007073]]
Action 2: ([50]) -> ([60]), reward: -1.0
GET: ([60], 0) -> [[-0.30476415]]
GET: ([60], 1) -> [[-0.96148694]]
GET: ([60], 2) -> [[-0.10007073]]
GET: ([50], 2) -> [[-0.10007073]]
SET: ([50], 2) <- [[-0.9955675]]
GET: ([60], 0) -> [[-0.30476415]]
GET: ([60], 1) -> [[-0.96148694]]
GET: ([60], 2) -> [[-0.15626049]]
Action 2: ([60]) -> ([70]), reward: -1.0
GET: ([70], 0) -> [[-0.30476415]]
GET: ([70], 1) -> [[-0.96148694]]
GET: ([70], 2) -> [[-0.15626049]]
GET: ([60], 2) -> [[-0.15626049]]
SET: ([60], 2) <- [[-1.0492288]]
GET: ([70], 0) -> [[-0.30476415]]
GET: ([70], 1) -> [[-0.96148694

Increase input value space does not work; changing activation function to tanh does not work.

Next try deep learning again on the two-state learning.

In [11]:
STATE_ZERO_ARRAY = np.zeros(1, dtype=int)
TARGET_STATE = 5


class CircularWorld2Action(q_learning_v2.Environment):
    
    def __init__(self):
        super().__init__(state_array_size=1, action_space_size=2)

        self.debug_verbosity = 0
        
        # action encoding
        self._action_minus = 0
        self._action_plus = 1

        
    #@ Override
    def TakeAction(self, action: q_learning_v2.Action) -> q_learning_v2.Reward:
        current_state = self.GetState()
        new_state = current_state
        reward = 0
        if action == self._action_plus:
            if current_state < TARGET_STATE:
                reward = 1.0
            else:
                reward = -1.0
            new_state = current_state + 1
            if new_state == 11:
                new_state = STATE_ZERO_ARRAY
        else:
            if current_state > TARGET_STATE:
                reward = 1.0
            else:
                reward = -1.0
            new_state = current_state - 1
            if new_state == -1:
                new_state = STATE_ZERO_ARRAY + 10

        self._protected_SetState(new_state)
        if self.debug_verbosity >= 1:
            print('Action %s: (%s) -> (%s), reward: %s' % (
                action, current_state, new_state, reward))
        return reward

In [35]:
%%time

env = CircularWorld2Action()
qfunc = q_learning_impl_v2.MultiModelQFunction(
    env, (), (20, 20, 20), activation='tanh', learning_rate=0.9, discount_factor=0.95)
policy = q_learning_impl_v2.MaxValueWithRandomnessPolicy(certainty = 0.95)

# Train.
q_learning_v2.Run(env=env, qfunc=qfunc, policy=policy, num_of_runs=20000, debug_verbosity=0)

# Test.
q_learning_v2.Run(env=env, qfunc=qfunc, policy=policy, num_of_runs=10, debug_verbosity=10)

GET: ([5], 0) -> [[0.0344843]]
GET: ([5], 1) -> [[-1.0452266]]
Action 0: ([5]) -> ([4]), reward: -1.0
GET: ([4], 0) -> [[0.03448432]]
GET: ([4], 1) -> [[0.60384464]]
GET: ([5], 0) -> [[0.0344843]]
SET: ([5], 0) <- [[-0.3802644]]
GET: ([4], 0) -> [[0.02004706]]
GET: ([4], 1) -> [[0.60384464]]
Action 1: ([4]) -> ([5]), reward: 1.0
GET: ([5], 0) -> [[0.02004701]]
GET: ([5], 1) -> [[-1.0452266]]
GET: ([4], 1) -> [[0.60384464]]
SET: ([4], 1) <- [[0.9775246]]
GET: ([5], 0) -> [[0.02004701]]
GET: ([5], 1) -> [[-0.2684028]]
Action 0: ([5]) -> ([4]), reward: -1.0
GET: ([4], 0) -> [[0.02004706]]
GET: ([4], 1) -> [[1.4096581]]
GET: ([5], 0) -> [[0.02004701]]
SET: ([5], 0) <- [[0.3072623]]
GET: ([4], 0) -> [[0.0300447]]
GET: ([4], 1) -> [[1.4096581]]
Action 1: ([4]) -> ([5]), reward: 1.0
GET: ([5], 0) -> [[0.03004467]]
GET: ([5], 1) -> [[-0.2684028]]
GET: ([4], 1) -> [[1.4096581]]
SET: ([4], 1) <- [[1.066654]]
GET: ([5], 0) -> [[0.03004467]]
GET: ([5], 1) -> [[-0.765144]]
Action 0: ([5]) -> ([4]),

The two-state learning *sometimes* works; I think the 3-state learning does not work because the reward function is very singular for the "stay" action. Try to make it less singular to see if it works.

In [21]:
STATE_ZERO_ARRAY = np.zeros(1, dtype=int)
TARGET_STATE = 5


class CircularWorldContinuous(q_learning_v2.Environment):
    
    def __init__(self):
        super().__init__(state_array_size=1, action_space_size=3)

        self.debug_verbosity = 0
        
        # action encoding
        self._action_minus = 0
        self._action_stay = 1
        self._action_plus = 2

        
    #@ Override
    def TakeAction(self, action: q_learning_v2.Action) -> q_learning_v2.Reward:
        current_state = self.GetState()
        new_state = current_state
        reward = 0
        if action == self._action_plus:
            if current_state < TARGET_STATE:
                reward = 1.0
            else:
                reward = -1.0
            new_state = current_state + 1
            if new_state == 11:
                new_state = STATE_ZERO_ARRAY
        elif action == self._action_minus:
            if current_state > TARGET_STATE:
                reward = 1.0
            else:
                reward = -1.0
            new_state = current_state - 1
            if new_state == -1:
                new_state = STATE_ZERO_ARRAY + 10
        else:
            reward = -abs(current_state - TARGET_STATE) + 4

        self._protected_SetState(new_state)
        if self.debug_verbosity >= 1:
            print('Action %s: (%s) -> (%s), reward: %s' % (
                action, current_state, new_state, reward))
        return reward

In [39]:
%%time

env = CircularWorldContinuous()
qfunc = q_learning_impl_v2.MultiModelQFunction(
    env, (), (20, 20, 20), activation='relu', learning_rate=0.9, discount_factor=0.95)
policy = q_learning_impl_v2.MaxValueWithRandomnessPolicy(certainty = 0.95)

# Train.
q_learning_v2.Run(env=env, qfunc=qfunc, policy=policy, num_of_runs=2000, debug_verbosity=0)

# Test.
q_learning_v2.Run(env=env, qfunc=qfunc, policy=policy, num_of_runs=10, debug_verbosity=10)

GET: ([6], 0) -> [[-19372.236]]
GET: ([6], 1) -> [[59.997402]]
GET: ([6], 2) -> [[18.348532]]
Action 1: ([6]) -> ([6]), reward: [3]
GET: ([6], 0) -> [[-19372.236]]
GET: ([6], 1) -> [[59.997402]]
GET: ([6], 2) -> [[18.348532]]
GET: ([6], 1) -> [[59.997402]]
SET: ([6], 1) <- [[59.99751883]]
GET: ([6], 0) -> [[-19372.236]]
GET: ([6], 1) -> [[59.998745]]
GET: ([6], 2) -> [[18.348532]]
<use random choice>
Action 2: ([6]) -> ([7]), reward: -1.0
GET: ([7], 0) -> [[-19372.236]]
GET: ([7], 1) -> [[59.998745]]
GET: ([7], 2) -> [[18.348532]]
GET: ([6], 2) -> [[18.348532]]
SET: ([6], 2) <- [[52.23378]]
GET: ([7], 0) -> [[-19372.236]]
GET: ([7], 1) -> [[59.998745]]
GET: ([7], 2) -> [[19.026237]]
Action 1: ([7]) -> ([7]), reward: [2]
GET: ([7], 0) -> [[-19372.236]]
GET: ([7], 1) -> [[59.998745]]
GET: ([7], 2) -> [[19.026237]]
GET: ([7], 1) -> [[59.998745]]
SET: ([7], 1) <- [[59.09879999]]
GET: ([7], 0) -> [[-19372.236]]
GET: ([7], 1) -> [[50.416096]]
GET: ([7], 2) -> [[19.026237]]
Action 1: ([7]) ->

In [2]:
STATE_ZERO_ARRAY = np.zeros(1, dtype=int)
TARGET_STATE = 5


class CircularWorldAllActionContinuous(q_learning_v2.Environment):
    
    def __init__(self):
        super().__init__(state_array_size=1, action_space_size=3)

        self.debug_verbosity = 0
        
        # action encoding
        self._action_minus = 0
        self._action_stay = 1
        self._action_plus = 2

        
    #@ Override
    def TakeAction(self, action: q_learning_v2.Action) -> q_learning_v2.Reward:
        current_state = self.GetState()
        new_state = current_state
        if action == self._action_plus:
            new_state = current_state + 1
            if new_state == 11:
                new_state = STATE_ZERO_ARRAY
        elif action == self._action_minus:
            new_state = current_state - 1
            if new_state == -1:
                new_state = STATE_ZERO_ARRAY + 10
                
        reward = -(abs(current_state - TARGET_STATE)**2) + 5

        self._protected_SetState(new_state)
        if self.debug_verbosity >= 1:
            print('Action %s: (%s) -> (%s), reward: %s' % (
                action, current_state, new_state, reward))
        return reward

In [45]:
%%time

env = CircularWorldAllActionContinuous()
qfunc = q_learning_impl_v2.MultiModelQFunction(
    env, (), (20, 20, 20), activation='relu', learning_rate=0.9, discount_factor=0.95)
policy = q_learning_impl_v2.MaxValueWithRandomnessPolicy(certainty = 0.9)

# Train.
q_learning_v2.Run(env=env, qfunc=qfunc, policy=policy, num_of_runs=3000, debug_verbosity=0)

# Test.
q_learning_v2.Run(env=env, qfunc=qfunc, policy=policy, num_of_runs=10, debug_verbosity=10)

GET: ([6], 0) -> [[-154.04008]]
GET: ([6], 1) -> [[-152.46977]]
GET: ([6], 2) -> [[-129.05731]]
Action 2: ([6]) -> ([7]), reward: [4]
GET: ([7], 0) -> [[-154.04008]]
GET: ([7], 1) -> [[-152.46977]]
GET: ([7], 2) -> [[-129.05731]]
GET: ([6], 2) -> [[-129.05731]]
SET: ([6], 2) <- [[-119.64973297]]
GET: ([7], 0) -> [[-154.04008]]
GET: ([7], 1) -> [[-152.46977]]
GET: ([7], 2) -> [[-128.86916]]
Action 2: ([7]) -> ([8]), reward: [1]
GET: ([8], 0) -> [[-154.04008]]
GET: ([8], 1) -> [[-152.46977]]
GET: ([8], 2) -> [[-128.86916]]
GET: ([7], 2) -> [[-128.86916]]
SET: ([7], 2) <- [[-122.17004375]]
GET: ([8], 0) -> [[-154.04008]]
GET: ([8], 1) -> [[-152.46977]]
GET: ([8], 2) -> [[-128.73517]]
Action 2: ([8]) -> ([9]), reward: [-4]
GET: ([9], 0) -> [[-154.04008]]
GET: ([9], 1) -> [[-152.46977]]
GET: ([9], 2) -> [[-128.73517]]
GET: ([8], 2) -> [[-128.73517]]
SET: ([8], 2) <- [[-126.54208469]]
GET: ([9], 0) -> [[-154.04008]]
GET: ([9], 1) -> [[-152.46977]]
GET: ([9], 2) -> [[-128.6913]]
Action 2: ([9

In [7]:
%%time

env = CircularWorld2Action()
qfunc = q_learning_impl_v2.MultiModelQFunctionBatchWrite(
    env, (), (20, 20, 20), 1000, activation='relu', learning_rate=0.9, discount_factor=0.95)
policy = policy_impl.MaxValueWithRandomnessPolicy(certainty = 0.95)

# Train.
q_learning_v2.Run(env=env, qfunc=qfunc, policy=policy, num_of_runs=10000, debug_verbosity=0)

# Test.
q_learning_v2.Run(env=env, qfunc=qfunc, policy=policy, num_of_runs=20, debug_verbosity=10)

GET: ([5], 0) -> [[-0.80804]]
GET: ([5], 1) -> [[-1.3298868]]
Action 0: ([5]) -> ([4]), reward: -1.0
GET: ([4], 0) -> [[-0.80804]]
GET: ([4], 1) -> [[0.4455886]]
GET: ([5], 0) -> [[-0.80804]]
[PENDING] SET: ([5], 0) <- [[-0.59982574]]
GET: ([4], 0) -> [[-0.80804]]
GET: ([4], 1) -> [[0.4455886]]
Action 1: ([4]) -> ([5]), reward: 1.0
GET: ([5], 0) -> [[-0.80804]]
GET: ([5], 1) -> [[-1.3298868]]
GET: ([4], 1) -> [[0.4455886]]
[PENDING] SET: ([4], 1) <- [[0.25368464]]
GET: ([5], 0) -> [[-0.80804]]
GET: ([5], 1) -> [[-1.3298868]]
Action 0: ([5]) -> ([4]), reward: -1.0
GET: ([4], 0) -> [[-0.80804]]
GET: ([4], 1) -> [[0.4455886]]
GET: ([5], 0) -> [[-0.80804]]
[PENDING] SET: ([5], 0) <- [[-0.59982574]]
GET: ([4], 0) -> [[-0.80804]]
GET: ([4], 1) -> [[0.4455886]]
Action 1: ([4]) -> ([5]), reward: 1.0
GET: ([5], 0) -> [[-0.80804]]
GET: ([5], 1) -> [[-1.3298868]]
GET: ([4], 1) -> [[0.4455886]]
[PENDING] SET: ([4], 1) <- [[0.25368464]]
GET: ([5], 0) -> [[-0.80804]]
GET: ([5], 1) -> [[-1.3298868]]


In [9]:
%%time

env = CircularWorld2Action()
qfunc = q_learning_impl_v2.MultiModelQFunctionBatchWrite(
    env, (), (20, 20, 20), 1000, activation='relu', learning_rate=0.9, discount_factor=0.95)
policy = policy_impl.MaxValueWithRandomnessPolicy(certainty = 0.9)

# Train.
q_learning_v2.Run(env=env, qfunc=qfunc, policy=policy, num_of_runs=20000, debug_verbosity=0)

# Test.
q_learning_v2.Run(env=env, qfunc=qfunc, policy=policy, num_of_runs=20, debug_verbosity=10)

GET: ([2], 0) -> [[-0.2477171]]
GET: ([2], 1) -> [[-0.26044938]]
Action 0: ([2]) -> ([1]), reward: -1.0
GET: ([1], 0) -> [[-0.2477171]]
GET: ([1], 1) -> [[-0.26044938]]
GET: ([2], 0) -> [[-0.2477171]]
[PENDING] SET: ([2], 0) <- [[-1.1365699]]
GET: ([1], 0) -> [[-0.2477171]]
GET: ([1], 1) -> [[-0.26044938]]
Action 0: ([1]) -> ([0]), reward: -1.0
GET: ([0], 0) -> [[-0.2477171]]
GET: ([0], 1) -> [[-0.26044938]]
GET: ([1], 0) -> [[-0.2477171]]
[PENDING] SET: ([1], 0) <- [[-1.1365699]]
GET: ([0], 0) -> [[-0.2477171]]
GET: ([0], 1) -> [[-0.26044938]]
Action 0: ([0]) -> ([10]), reward: -1.0
GET: ([10], 0) -> [[-0.2477171]]
GET: ([10], 1) -> [[-0.26044938]]
GET: ([0], 0) -> [[-0.2477171]]
[PENDING] SET: ([0], 0) <- [[-1.1365699]]
GET: ([10], 0) -> [[-0.2477171]]
GET: ([10], 1) -> [[-0.26044938]]
Action 0: ([10]) -> ([9]), reward: 1.0
GET: ([9], 0) -> [[-0.2477171]]
GET: ([9], 1) -> [[-0.26044938]]
GET: ([10], 0) -> [[-0.2477171]]
[PENDING] SET: ([10], 0) <- [[0.66343015]]
GET: ([9], 0) -> [[-0

In [64]:
%%time

env = CircularWorld()
qfunc = q_learning_impl_v2.MultiModelQFunction(
    env, (), (20, 20, 20), activation='relu', learning_rate=0.9, discount_factor=0.95)
policy = q_learning_impl_v2.MaxValueWithRandomnessPolicy(certainty = 0.95)

# Train.
q_learning_v2.Run(env=env, qfunc=qfunc, policy=policy, num_of_runs=2000, debug_verbosity=0)

# Test.
q_learning_v2.Run(env=env, qfunc=qfunc, policy=policy, num_of_runs=20, debug_verbosity=10)

GET: ([5], 0) -> [[-2.013604]]
GET: ([5], 1) -> [[-2.6139748]]
GET: ([5], 2) -> [[-1.5695232]]
Action 2: ([5]) -> ([6]), reward: -1.0
GET: ([6], 0) -> [[-1.5919058]]
GET: ([6], 1) -> [[-2.6139748]]
GET: ([6], 2) -> [[-1.5695232]]
GET: ([5], 2) -> [[-1.5695232]]
SET: ([5], 2) <- [[-2.3988945]]
GET: ([6], 0) -> [[-1.5919058]]
GET: ([6], 1) -> [[-2.6139748]]
GET: ([6], 2) -> [[-1.5861106]]
Action 2: ([6]) -> ([7]), reward: -1.0
GET: ([7], 0) -> [[-1.1701856]]
GET: ([7], 1) -> [[-2.6139748]]
GET: ([7], 2) -> [[-1.5861106]]
GET: ([6], 2) -> [[-1.5861106]]
SET: ([6], 2) <- [[-2.0591197]]
GET: ([7], 0) -> [[-1.1701856]]
GET: ([7], 1) -> [[-2.6139748]]
GET: ([7], 2) -> [[-1.5955708]]
Action 0: ([7]) -> ([6]), reward: 1.0
GET: ([6], 0) -> [[-1.5919058]]
GET: ([6], 1) -> [[-2.6139748]]
GET: ([6], 2) -> [[-1.5955708]]
GET: ([7], 0) -> [[-1.1701856]]
SET: ([7], 0) <- [[-0.578098]]
GET: ([6], 0) -> [[-1.105035]]
GET: ([6], 1) -> [[-2.6139748]]
GET: ([6], 2) -> [[-1.5955708]]
Action 0: ([6]) -> ([5]

In [65]:
%%time

env = CircularWorld()
qfunc = q_learning_impl_v2.MultiModelQFunction(
    env, (), (20, 20, 20), activation='relu', learning_rate=0.9, discount_factor=0.95)
policy = q_learning_impl_v2.MaxValueWithRandomnessPolicy(certainty = 0.95)

# Train.
q_learning_v2.Run(env=env, qfunc=qfunc, policy=policy, num_of_runs=2000, debug_verbosity=0)

# Test.
# q_learning_v2.Run(env=env, qfunc=qfunc, policy=policy, num_of_runs=20, debug_verbosity=10)

CPU times: user 2min 15s, sys: 595 ms, total: 2min 16s
Wall time: 2min 22s


In [66]:
%%time

env = CircularWorld()
qfunc = q_learning_impl_v2.MultiModelQFunction(
    env, (), (20, 20, 20), activation='relu', learning_rate=0.9, discount_factor=0.95)
policy = q_learning_impl_v2.MaxValueWithRandomnessPolicy(certainty = 0.95)

# Train.
# q_learning_v2.Run(env=env, qfunc=qfunc, policy=policy, num_of_runs=2000, debug_verbosity=0)

# Test.
q_learning_v2.Run(env=env, qfunc=qfunc, policy=policy, num_of_runs=20, debug_verbosity=10)

GET: ([0.], 0) -> [[0.]]
GET: ([0.], 1) -> [[0.]]
GET: ([0.], 2) -> [[0.]]
Action 0: ([0.]) -> ([10]), reward: -1.0
GET: ([10], 0) -> [[-1.7132287]]
GET: ([10], 1) -> [[0.]]
GET: ([10], 2) -> [[0.]]
GET: ([0.], 0) -> [[0.]]
SET: ([0.], 0) <- [[-0.9]]
GET: ([10], 0) -> [[-1.7312287]]
GET: ([10], 1) -> [[0.]]
GET: ([10], 2) -> [[0.]]
Action 1: ([10]) -> ([10]), reward: -1.0
GET: ([10], 0) -> [[-1.7312287]]
GET: ([10], 1) -> [[0.]]
GET: ([10], 2) -> [[0.]]
GET: ([10], 1) -> [[0.]]
SET: ([10], 1) <- [[-0.9]]
GET: ([10], 0) -> [[-1.7312287]]
GET: ([10], 1) -> [[-0.018]]
GET: ([10], 2) -> [[0.]]
Action 2: ([10]) -> ([0]), reward: -1.0
GET: ([0], 0) -> [[-0.018]]
GET: ([0], 1) -> [[-0.018]]
GET: ([0], 2) -> [[0.]]
GET: ([10], 2) -> [[0.]]
SET: ([10], 2) <- [[-0.9]]
GET: ([0], 0) -> [[-0.018]]
GET: ([0], 1) -> [[-0.018]]
GET: ([0], 2) -> [[-0.018]]
Action 0: ([0]) -> ([10]), reward: -1.0
GET: ([10], 0) -> [[-1.7312287]]
GET: ([10], 1) -> [[-0.018]]
GET: ([10], 2) -> [[-0.018]]
GET: ([0], 0) ->

In [4]:
%%time

env = CircularWorld()
qfunc = q_learning_impl_v2.MultiModelQFunctionMultiFitPerSet(
    env, (), (20, 20, 20), activation='relu', learning_rate=0.9, discount_factor=0.95)
policy = policy_impl.MaxValueWithRandomnessPolicy(certainty = 0.95)

# Train.
q_learning_v2.Run(env=env, qfunc=qfunc, policy=policy, num_of_runs=20000, debug_verbosity=0)

# Test.
q_learning_v2.Run(env=env, qfunc=qfunc, policy=policy, num_of_runs=20, debug_verbosity=10)

GET: ([7], 0) -> [[-6.485604]]
GET: ([7], 1) -> [[-5.0983176]]
GET: ([7], 2) -> [[-3.2187824]]
Action 2: ([7]) -> ([8]), reward: -1.0
GET: ([8], 0) -> [[-6.485604]]
GET: ([8], 1) -> [[-5.0983176]]
GET: ([8], 2) -> [[-3.2187824]]
GET: ([7], 2) -> [[-3.2187824]]
SET: ([7], 2) <- [[-3.973937]]
GET: ([8], 0) -> [[-6.485604]]
GET: ([8], 1) -> [[-5.0983176]]
GET: ([8], 2) -> [[-3.4697893]]
Action 2: ([8]) -> ([9]), reward: -1.0
GET: ([9], 0) -> [[-6.485604]]
GET: ([9], 1) -> [[-5.0983176]]
GET: ([9], 2) -> [[-3.4697893]]
GET: ([8], 2) -> [[-3.4697893]]
SET: ([8], 2) <- [[-4.213649]]
GET: ([9], 0) -> [[-6.485604]]
GET: ([9], 1) -> [[-5.0983176]]
GET: ([9], 2) -> [[-3.717042]]
Action 2: ([9]) -> ([10]), reward: -1.0
GET: ([10], 0) -> [[-6.485604]]
GET: ([10], 1) -> [[-5.0983176]]
GET: ([10], 2) -> [[-3.717042]]
GET: ([9], 2) -> [[-3.717042]]
SET: ([9], 2) <- [[-4.4497747]]
GET: ([10], 0) -> [[-6.485604]]
GET: ([10], 1) -> [[-5.0983176]]
GET: ([10], 2) -> [[-3.9605958]]
Action 2: ([10]) -> ([0]

Batch write single model:

In [15]:
%%time

env = CircularWorld2Action()
qfunc = q_learning_impl_v2.KerasModelQFunctionBatchWrite(
    env, (6, 20, 20), 1000, learning_rate=0.9, discount_factor=0.95)
policy = policy_impl.MaxValueWithRandomnessPolicy(certainty = 0.9)

# Train.
q_learning_v2.Run(env=env, qfunc=qfunc, policy=policy, num_of_runs=20000, debug_verbosity=0)

# Test.
q_learning_v2.Run(env=env, qfunc=qfunc, policy=policy, num_of_runs=20, debug_verbosity=10)

GET: ([5], 0) -> [[-0.35422117]]
GET: ([5], 1) -> [[-0.995958]]
Action 0: ([5]) -> ([4]), reward: -1.0
GET: ([4], 0) -> [[-0.6098675]]
GET: ([4], 1) -> [[0.4809596]]
GET: ([5], 0) -> [[-0.35422117]]
[PENDING] SET: ([5], 0) <- [[-0.52420163]]
GET: ([4], 0) -> [[-0.6098675]]
GET: ([4], 1) -> [[0.4809596]]
Action 1: ([4]) -> ([5]), reward: 1.0
GET: ([5], 0) -> [[-0.35422117]]
GET: ([5], 1) -> [[-0.995958]]
GET: ([4], 1) -> [[0.4809596]]
[PENDING] SET: ([4], 1) <- [[0.64523685]]
GET: ([5], 0) -> [[-0.35422117]]
GET: ([5], 1) -> [[-0.995958]]
Action 0: ([5]) -> ([4]), reward: -1.0
GET: ([4], 0) -> [[-0.6098675]]
GET: ([4], 1) -> [[0.4809596]]
GET: ([5], 0) -> [[-0.35422117]]
[PENDING] SET: ([5], 0) <- [[-0.52420163]]
GET: ([4], 0) -> [[-0.6098675]]
GET: ([4], 1) -> [[0.4809596]]
Action 1: ([4]) -> ([5]), reward: 1.0
GET: ([5], 0) -> [[-0.35422117]]
GET: ([5], 1) -> [[-0.995958]]
GET: ([4], 1) -> [[0.4809596]]
[PENDING] SET: ([4], 1) <- [[0.64523685]]
GET: ([5], 0) -> [[-0.35422117]]
GET: ([

In [16]:
%%time

env = CircularWorld2Action()
qfunc = q_learning_impl_v2.KerasModelQFunctionBatchWrite(
    env, (6, 20, 20), 1000, learning_rate=0.9, discount_factor=0.95)
policy = policy_impl.MaxValueWithRandomnessPolicy(certainty = 0.9)

# Train.
q_learning_v2.Run(env=env, qfunc=qfunc, policy=policy, num_of_runs=20000, debug_verbosity=0)

# Test.
q_learning_v2.Run(env=env, qfunc=qfunc, policy=policy, num_of_runs=19, debug_verbosity=10)

GET: ([5], 0) -> [[-0.7222531]]
GET: ([5], 1) -> [[-0.84897065]]
<use random choice>
Action 0: ([5]) -> ([4]), reward: -1.0
GET: ([4], 0) -> [[-0.6466589]]
GET: ([4], 1) -> [[0.2834884]]
GET: ([5], 0) -> [[-0.7222531]]
[PENDING] SET: ([5], 0) <- [[-0.7298428]]
GET: ([4], 0) -> [[-0.6466589]]
GET: ([4], 1) -> [[0.2834884]]
Action 1: ([4]) -> ([5]), reward: 1.0
GET: ([5], 0) -> [[-0.7222531]]
GET: ([5], 1) -> [[-0.84897065]]
GET: ([4], 1) -> [[0.2834884]]
[PENDING] SET: ([4], 1) <- [[0.31082246]]
GET: ([5], 0) -> [[-0.7222531]]
GET: ([5], 1) -> [[-0.84897065]]
Action 0: ([5]) -> ([4]), reward: -1.0
GET: ([4], 0) -> [[-0.6466589]]
GET: ([4], 1) -> [[0.2834884]]
GET: ([5], 0) -> [[-0.7222531]]
[PENDING] SET: ([5], 0) <- [[-0.7298428]]
GET: ([4], 0) -> [[-0.6466589]]
GET: ([4], 1) -> [[0.2834884]]
Action 1: ([4]) -> ([5]), reward: 1.0
GET: ([5], 0) -> [[-0.7222531]]
GET: ([5], 1) -> [[-0.84897065]]
GET: ([4], 1) -> [[0.2834884]]
[PENDING] SET: ([4], 1) <- [[0.31082246]]
GET: ([5], 0) -> [[-

In [20]:
%%time

env = CircularWorld()
qfunc = q_learning_impl_v2.KerasModelQFunctionBatchWrite(
    env, (6, 20, 20), 1000, learning_rate=0.9, discount_factor=0.95)
policy = policy_impl.MaxValueWithRandomnessPolicy(certainty = 0.9)

# Train.
q_learning_v2.Run(env=env, qfunc=qfunc, policy=policy, num_of_runs=20000, debug_verbosity=0)

# Test.
q_learning_v2.Run(env=env, qfunc=qfunc, policy=policy, num_of_runs=19, debug_verbosity=10)

GET: ([5], 0) -> [[-0.7379969]]
GET: ([5], 1) -> [[-0.77019966]]
GET: ([5], 2) -> [[-1.3602489]]
Action 0: ([5]) -> ([4]), reward: -1.0
GET: ([4], 0) -> [[-0.8525162]]
GET: ([4], 1) -> [[-0.88471895]]
GET: ([4], 2) -> [[0.4358304]]
GET: ([5], 0) -> [[-0.7379969]]
[PENDING] SET: ([5], 0) <- [[-0.60116464]]
GET: ([4], 0) -> [[-0.8525162]]
GET: ([4], 1) -> [[-0.88471895]]
GET: ([4], 2) -> [[0.4358304]]
Action 2: ([4]) -> ([5]), reward: 1.0
GET: ([5], 0) -> [[-0.7379969]]
GET: ([5], 1) -> [[-0.77019966]]
GET: ([5], 2) -> [[-1.3602489]]
GET: ([4], 2) -> [[0.4358304]]
[PENDING] SET: ([4], 2) <- [[0.31259573]]
GET: ([5], 0) -> [[-0.7379969]]
GET: ([5], 1) -> [[-0.77019966]]
GET: ([5], 2) -> [[-1.3602489]]
Action 0: ([5]) -> ([4]), reward: -1.0
GET: ([4], 0) -> [[-0.8525162]]
GET: ([4], 1) -> [[-0.88471895]]
GET: ([4], 2) -> [[0.4358304]]
GET: ([5], 0) -> [[-0.7379969]]
[PENDING] SET: ([5], 0) <- [[-0.60116464]]
GET: ([4], 0) -> [[-0.8525162]]
GET: ([4], 1) -> [[-0.88471895]]
GET: ([4], 2) -> 