In [81]:
ReloadProject('deep_learning')

notebook_init.py imported and reloaded
forwarded symbol: Activation
forwarded symbol: Dense
forwarded symbol: Dict
forwarded symbol: InputLayer
forwarded symbol: List
forwarded symbol: Model
forwarded symbol: Sequential
forwarded symbol: Tuple
reloaded: gym
forwarded symbol: gym
reloaded: keras
forwarded symbol: keras
reloaded: q_learning
forwarded symbol: q_learning
reloaded: q_learning_impl
forwarded symbol: q_learning_impl
reloaded: q_learning_impl_v2
forwarded symbol: q_learning_impl_v2
reloaded: q_learning_v2
forwarded symbol: q_learning_v2


## Environment Setup
Let's assume a world with 11 states: 0-10. Each time the agent and move +1 or -1, with 0-1 -> 10 and 10+1 -> 0. All actions that gets the agent closer to state "5" gets reward +1, otherwise gets reward -1.

In [4]:
STATE_ZERO_ARRAY = np.zeros(1, dtype=int)
TARGET_STATE = 5


class CircularWorld(q_learning_v2.Environment):
    
    def __init__(self):
        super().__init__(state_array_size=1, action_space_size=3)
                
        self.debug_verbosity = 0
        
        # action encoding
        self._action_minus = 0
        self._action_stay = 1
        self._action_plus = 2

        
    #@ Override
    def TakeAction(self, action: q_learning_v2.Action) -> q_learning_v2.Reward:
        current_state = self.GetState()
        new_state = current_state
        reward = 0
        if action == self._action_plus:
            if current_state < TARGET_STATE:
                reward = 1.0
            else:
                reward = -1.0
            new_state = current_state + 1
            if new_state == 11:
                new_state = STATE_ZERO_ARRAY
        elif action == self._action_minus:
            if current_state > TARGET_STATE:
                reward = 1.0
            else:
                reward = -1.0
            new_state = current_state - 1
            if new_state == -1:
                new_state = STATE_ZERO_ARRAY + 10
        else:
            if current_state != TARGET_STATE:
                reward = -1.0
            else:
                reward = 1.0

        self._protected_SetState(new_state)
        if self.debug_verbosity >= 1:
            print('Action %s: (%s) -> (%s), reward: %s' % (
                action, current_state, new_state, reward))
        return reward

Let's try out the environment.

In [3]:
env = CircularWorld()
env.debug_verbosity = 10
for _ in range(20):
    env.TakeAction(np.random.choice(env.GetActionSpace()))

Action 0: ([0.]) -> ([10]), reward: -1.0
Action 0: ([10]) -> ([9]), reward: 1.0
Action 0: ([9]) -> ([8]), reward: 1.0
Action 2: ([8]) -> ([9]), reward: -1.0
Action 1: ([9]) -> ([9]), reward: -1.0
Action 1: ([9]) -> ([9]), reward: -1.0
Action 2: ([9]) -> ([10]), reward: -1.0
Action 1: ([10]) -> ([10]), reward: -1.0
Action 2: ([10]) -> ([0]), reward: -1.0
Action 2: ([0]) -> ([1]), reward: 1.0
Action 2: ([1]) -> ([2]), reward: 1.0
Action 1: ([2]) -> ([2]), reward: -1.0
Action 2: ([2]) -> ([3]), reward: 1.0
Action 2: ([3]) -> ([4]), reward: 1.0
Action 2: ([4]) -> ([5]), reward: 1.0
Action 1: ([5]) -> ([5]), reward: 1.0
Action 1: ([5]) -> ([5]), reward: 1.0
Action 2: ([5]) -> ([6]), reward: -1.0
Action 2: ([6]) -> ([7]), reward: -1.0
Action 1: ([7]) -> ([7]), reward: -1.0


## Learning

### Single model approach

In [37]:
%%time

env = CircularWorld()
qfunc = q_learning_impl_v2.KerasModelQFunction(
    env, (20, 20, 20), learning_rate=0.9, discount_factor=0.9)
policy = q_learning_impl_v2.MaxValueWithRandomnessPolicy(certainty = 0.95)

# Training.
for _ in range(4000):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)
    
# Testing.
env.debug_verbosity = 5
qfunc.SetDebugVerbosity(5)
policy.debug_verbosity = 5
for _ in range(20):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)


GET: ([3], 0) -> [[-0.20680887]]
GET: ([3], 1) -> [[-0.20680887]]
GET: ([3], 2) -> [[0.8173575]]
<use random choice>
Action 0: ([3]) -> ([2]), reward: -1.0
GET: ([2], 0) -> [[-0.20680887]]
GET: ([2], 1) -> [[0.10594125]]
GET: ([2], 2) -> [[1.6265708]]
GET: ([3], 0) -> [[-0.20680887]]
SET: ([3], 0) <- [[0.39684144]]
GET: ([2], 0) -> [[-0.18448368]]
GET: ([2], 1) -> [[0.13334492]]
GET: ([2], 2) -> [[1.6477598]]
<use random choice>
Action 1: ([2]) -> ([2]), reward: -1.0
GET: ([2], 0) -> [[-0.18448368]]
GET: ([2], 1) -> [[0.13334492]]
GET: ([2], 2) -> [[1.6477598]]
GET: ([2], 1) -> [[0.13334492]]
SET: ([2], 1) <- [[0.44801992]]
GET: ([2], 0) -> [[-0.16990179]]
GET: ([2], 1) -> [[0.3333642]]
GET: ([2], 2) -> [[1.7631308]]
Action 2: ([2]) -> ([3]), reward: 1.0
GET: ([3], 0) -> [[-0.16990179]]
GET: ([3], 1) -> [[-0.16990179]]
GET: ([3], 2) -> [[0.97960705]]
GET: ([2], 2) -> [[1.7631308]]
SET: ([2], 2) <- [[1.8697947]]
GET: ([3], 0) -> [[-0.16578543]]
GET: ([3], 1) -> [[-0.16578543]]
GET: ([3]

#### Conclusion

This approach fails for model with structure (6, 6, 6) and for model with structure (20, 20, 20), probably because changing the value for one action also changes the values for other actions, making it either long to train the model or other tricks might be needed.

### Multi-head model approach
Next let's try to use a multi-head model. In this case each action has its own model and we only update the weights in the model the policy picked to avoid changing weight for the models for other actions. The models share some common layers to possibly support common features.

In [35]:
%%time

env = CircularWorld()
qfunc = q_learning_impl_v2.MultiModelQFunction(
    env, (), (20, 20, 20), learning_rate=0.9, discount_factor=0.9)
policy = q_learning_impl_v2.MaxValueWithRandomnessPolicy(certainty = 0.95)

# Train.
for _ in range(4000):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)

# Test.
env.debug_verbosity = 5
qfunc.debug_verbosity = 5
policy.debug_verbosity = 5
for _ in range(20):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)


GET: ([3], 0) -> [[0.05665702]]
GET: ([3], 1) -> [[1.788313]]
GET: ([3], 2) -> [[1.7116532]]
Action 1: ([3]) -> ([3]), reward: -1.0
GET: ([3], 0) -> [[0.05665702]]
GET: ([3], 1) -> [[1.788313]]
GET: ([3], 2) -> [[1.7116532]]
GET: ([3], 1) -> [[1.788313]]
SET: ([3], 1) <- [[0.7273648]]
GET: ([3], 0) -> [[0.05665702]]
GET: ([3], 1) -> [[1.767094]]
GET: ([3], 2) -> [[1.7116532]]
Action 1: ([3]) -> ([3]), reward: -1.0
GET: ([3], 0) -> [[0.05665702]]
GET: ([3], 1) -> [[1.767094]]
GET: ([3], 2) -> [[1.7116532]]
GET: ([3], 1) -> [[1.767094]]
SET: ([3], 1) <- [[0.70805556]]
GET: ([3], 0) -> [[0.05665702]]
GET: ([3], 1) -> [[1.7459133]]
GET: ([3], 2) -> [[1.7116532]]
Action 1: ([3]) -> ([3]), reward: -1.0
GET: ([3], 0) -> [[0.05665702]]
GET: ([3], 1) -> [[1.7459133]]
GET: ([3], 2) -> [[1.7116532]]
GET: ([3], 1) -> [[1.7459133]]
SET: ([3], 1) <- [[0.688781]]
GET: ([3], 0) -> [[0.05665702]]
GET: ([3], 1) -> [[1.7247707]]
GET: ([3], 2) -> [[1.7116532]]
Action 1: ([3]) -> ([3]), reward: -1.0
GET: (

Note: this approach does not work when the towers have the "wrong" shape, like (3, 3, 3), which is not a good model to fit a sophisticated function. See "keras_function_fit_discountinous" notebook.

In [39]:
%%time

env = CircularWorld()
qfunc = q_learning_impl_v2.MultiModelQFunction(
    env, (200,), (20, 20, 20), learning_rate=0.9, discount_factor=0.9)
policy = q_learning_impl_v2.MaxValueWithRandomnessPolicy(certainty = 0.95)

# Train.
for _ in range(4000):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)

# Test.
env.debug_verbosity = 5
qfunc.debug_verbosity = 5
policy.debug_verbosity = 5
for _ in range(20):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)


GET: ([5], 0) -> [[-0.27712864]]
GET: ([5], 1) -> [[-0.49954152]]
GET: ([5], 2) -> [[-1.7212956]]
Action 0: ([5]) -> ([4]), reward: -1.0
GET: ([4], 0) -> [[-0.27712864]]
GET: ([4], 1) -> [[-0.4995423]]
GET: ([4], 2) -> [[-0.10966682]]
GET: ([5], 0) -> [[-0.27712864]]
SET: ([5], 0) <- [[-1.016543]]
GET: ([4], 0) -> [[-0.2965436]]
GET: ([4], 1) -> [[-0.4995454]]
GET: ([4], 2) -> [[-0.10965633]]
Action 2: ([4]) -> ([5]), reward: 1.0
GET: ([5], 0) -> [[-0.2965436]]
GET: ([5], 1) -> [[-0.49954462]]
GET: ([5], 2) -> [[-1.7212946]]
GET: ([4], 2) -> [[-0.10965633]]
SET: ([4], 2) <- [[0.64883405]]
GET: ([5], 0) -> [[-0.2965575]]
GET: ([5], 1) -> [[-0.4997524]]
GET: ([5], 2) -> [[-0.35255474]]
Action 0: ([5]) -> ([4]), reward: -1.0
GET: ([4], 0) -> [[-0.2965575]]
GET: ([4], 1) -> [[-0.49973]]
GET: ([4], 2) -> [[0.76266855]]
GET: ([5], 0) -> [[-0.2965575]]
SET: ([5], 0) <- [[-0.31189424]]
GET: ([4], 0) -> [[-0.29696116]]
GET: ([4], 1) -> [[-0.49973005]]
GET: ([4], 2) -> [[0.76266855]]
Action 2: (

In [40]:
%%time

env = CircularWorld()
qfunc = q_learning_impl_v2.MultiModelQFunction(
    env, (), (20, 20, 20), learning_rate=0.9, discount_factor=0.9)
policy = q_learning_impl_v2.MaxValuePolicy()

# Train.
for _ in range(4000):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)

# Test.
env.debug_verbosity = 5
qfunc.debug_verbosity = 5
policy.debug_verbosity = 5
for _ in range(20):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)


GET: ([4], 0) -> [[-1.1329795]]
GET: ([4], 1) -> [[-2.8374398]]
GET: ([4], 2) -> [[-0.87958753]]
Action 2: ([4]) -> ([5]), reward: 1.0
GET: ([5], 0) -> [[-1.1795522]]
GET: ([5], 1) -> [[-3.3028107]]
GET: ([5], 2) -> [[-0.87958753]]
GET: ([4], 2) -> [[-0.87958753]]
SET: ([4], 2) <- [[0.09957534]]
GET: ([5], 0) -> [[-1.1795522]]
GET: ([5], 1) -> [[-3.3028107]]
GET: ([5], 2) -> [[-0.86000425]]
Action 2: ([5]) -> ([6]), reward: -1.0
GET: ([6], 0) -> [[-1.226125]]
GET: ([6], 1) -> [[-3.768182]]
GET: ([6], 2) -> [[-0.86000425]]
GET: ([5], 2) -> [[-0.86000425]]
SET: ([5], 2) <- [[-1.6826037]]
GET: ([6], 0) -> [[-1.226125]]
GET: ([6], 1) -> [[-3.768182]]
GET: ([6], 2) -> [[-0.87645626]]
Action 2: ([6]) -> ([7]), reward: -1.0
GET: ([7], 0) -> [[-1.2726977]]
GET: ([7], 1) -> [[-4.2335534]]
GET: ([7], 2) -> [[-0.87645626]]
GET: ([6], 2) -> [[-0.87645626]]
SET: ([6], 2) <- [[-1.6975752]]
GET: ([7], 0) -> [[-1.2726977]]
GET: ([7], 1) -> [[-4.2335534]]
GET: ([7], 2) -> [[-0.89287865]]
Action 2: ([7]

In [44]:
%%time

env = CircularWorld()
qfunc = q_learning_impl_v2.MultiModelQFunction(
    env, (), (40, 40, 40), learning_rate=0.9, discount_factor=0.9)
policy = q_learning_impl_v2.RandomActionPolicy()

# Train.
for _ in range(10000):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)

# Test.
env.debug_verbosity = 5
qfunc.debug_verbosity = 5
policy = q_learning_impl_v2.MaxValuePolicy()
policy.debug_verbosity = 5
for _ in range(20):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)


GET: ([10], 0) -> [[5.6680374]]
GET: ([10], 1) -> [[3.3310022]]
GET: ([10], 2) -> [[3.7245948]]
Action 0: ([10]) -> ([9]), reward: 1.0
GET: ([9], 0) -> [[5.1856256]]
GET: ([9], 1) -> [[3.3310022]]
GET: ([9], 2) -> [[3.7245948]]
GET: ([10], 0) -> [[5.6680374]]
SET: ([10], 0) <- [[5.6671605]]
GET: ([9], 0) -> [[5.183048]]
GET: ([9], 1) -> [[3.3310022]]
GET: ([9], 2) -> [[3.7245948]]
Action 0: ([9]) -> ([8]), reward: 1.0
GET: ([8], 0) -> [[4.6989355]]
GET: ([8], 1) -> [[3.3310022]]
GET: ([8], 2) -> [[3.7245948]]
GET: ([9], 0) -> [[5.183048]]
SET: ([9], 0) <- [[5.2244425]]
GET: ([8], 0) -> [[4.795692]]
GET: ([8], 1) -> [[3.3310022]]
GET: ([8], 2) -> [[3.7245948]]
Action 0: ([8]) -> ([7]), reward: 1.0
GET: ([7], 0) -> [[4.2973275]]
GET: ([7], 1) -> [[3.3310022]]
GET: ([7], 2) -> [[3.7245948]]
GET: ([8], 0) -> [[4.795692]]
SET: ([8], 0) <- [[4.860404]]
GET: ([7], 0) -> [[4.420882]]
GET: ([7], 1) -> [[3.3310022]]
GET: ([7], 2) -> [[3.7245948]]
Action 0: ([7]) -> ([6]), reward: 1.0
GET: ([6], 

In [45]:
%%time

env = CircularWorld()
qfunc = q_learning_impl_v2.MultiModelQFunction(
    env, (), (20, 20, 20), learning_rate=0.9, discount_factor=0.9)
policy = q_learning_impl_v2.MaxValueWithRandomnessPolicy(certainty = 0.95)

# Train.
for _ in range(4000):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)

# Test.
env.debug_verbosity = 5
qfunc.debug_verbosity = 5
policy.debug_verbosity = 5
for _ in range(20):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)


GET: ([6], 0) -> [[-0.27988243]]
GET: ([6], 1) -> [[-1.3076097]]
GET: ([6], 2) -> [[-1.069]]
Action 0: ([6]) -> ([5]), reward: 1.0
GET: ([5], 0) -> [[-1.4874269]]
GET: ([5], 1) -> [[-1.2247055]]
GET: ([5], 2) -> [[-1.069]]
GET: ([6], 0) -> [[-0.27988243]]
SET: ([6], 0) <- [[0.00612178]]
GET: ([5], 0) -> [[-1.049413]]
GET: ([5], 1) -> [[-1.2247055]]
GET: ([5], 2) -> [[-1.069]]
Action 0: ([5]) -> ([4]), reward: -1.0
GET: ([4], 0) -> [[-1.4944894]]
GET: ([4], 1) -> [[-1.1398618]]
GET: ([4], 2) -> [[-1.069]]
GET: ([5], 0) -> [[-1.049413]]
SET: ([5], 0) <- [[-1.8708313]]
GET: ([4], 0) -> [[-1.5116377]]
GET: ([4], 1) -> [[-1.1398618]]
GET: ([4], 2) -> [[-1.069]]
Action 2: ([4]) -> ([5]), reward: 1.0
GET: ([5], 0) -> [[-1.5229793]]
GET: ([5], 1) -> [[-1.2247055]]
GET: ([5], 2) -> [[-1.069]]
GET: ([4], 2) -> [[-1.069]]
SET: ([4], 2) <- [[-0.07278997]]
GET: ([5], 0) -> [[-1.5229793]]
GET: ([5], 1) -> [[-1.2247055]]
GET: ([5], 2) -> [[-1.0490758]]
<use random choice>
Action 0: ([5]) -> ([4]), re

In [46]:
%%time

env = CircularWorld()
qfunc = q_learning_impl_v2.MultiModelQFunction(
    env, (), (20, 20, 20), learning_rate=0.9, discount_factor=0.9)
policy = q_learning_impl_v2.MaxValueWithRandomnessPolicy(certainty = 0.95)

# Train.
for _ in range(200000):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)

# Test.
env.debug_verbosity = 5
qfunc.debug_verbosity = 5
policy.debug_verbosity = 5
for _ in range(20):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)


GET: ([5], 0) -> [[1.7226231]]
GET: ([5], 1) -> [[3.7842224]]
GET: ([5], 2) -> [[1.5472215]]
Action 1: ([5]) -> ([5]), reward: 1.0
GET: ([5], 0) -> [[1.7226231]]
GET: ([5], 1) -> [[3.7842224]]
GET: ([5], 2) -> [[1.5472215]]
GET: ([5], 1) -> [[3.7842224]]
SET: ([5], 1) <- [[4.3436418]]
GET: ([5], 0) -> [[1.7226231]]
GET: ([5], 1) -> [[3.7954106]]
GET: ([5], 2) -> [[1.5472215]]
Action 1: ([5]) -> ([5]), reward: 1.0
GET: ([5], 0) -> [[1.7226231]]
GET: ([5], 1) -> [[3.7954106]]
GET: ([5], 2) -> [[1.5472215]]
GET: ([5], 1) -> [[3.7954106]]
SET: ([5], 1) <- [[4.3538237]]
GET: ([5], 0) -> [[1.7226231]]
GET: ([5], 1) -> [[3.8065789]]
GET: ([5], 2) -> [[1.5472215]]
Action 1: ([5]) -> ([5]), reward: 1.0
GET: ([5], 0) -> [[1.7226231]]
GET: ([5], 1) -> [[3.8065789]]
GET: ([5], 2) -> [[1.5472215]]
GET: ([5], 1) -> [[3.8065789]]
SET: ([5], 1) <- [[4.3639865]]
GET: ([5], 0) -> [[1.7226231]]
GET: ([5], 1) -> [[3.817727]]
GET: ([5], 2) -> [[1.5472215]]
Action 1: ([5]) -> ([5]), reward: 1.0
GET: ([5], 0

In [47]:
%%time

env = CircularWorld()
qfunc = q_learning_impl_v2.MultiModelQFunction(
    env, (), (20, 20, 20), learning_rate=0.9, discount_factor=0.9)
policy = q_learning_impl_v2.RandomActionPolicy()

# Train.
for _ in range(400000):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)

# Test.
env.debug_verbosity = 5
qfunc.debug_verbosity = 5
policy = q_learning_impl_v2.MaxValuePolicy()
policy.debug_verbosity = 5
for _ in range(20):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)


GET: ([6], 0) -> [[nan]]
GET: ([6], 1) -> [[nan]]
GET: ([6], 2) -> [[nan]]
Action None: ([6]) -> ([6]), reward: -1.0
GET: ([6], 0) -> [[nan]]
GET: ([6], 1) -> [[nan]]
GET: ([6], 2) -> [[nan]]


TypeError: tuple indices must be integers or slices, not NoneType

In [48]:
%%time

env = CircularWorld()
qfunc = q_learning_impl_v2.MultiModelQFunction(
    env, (), (20, 20, 20), learning_rate=0.9, discount_factor=0.5)
policy = q_learning_impl_v2.MaxValueWithRandomnessPolicy(certainty = 0.95)

# Train.
for _ in range(2000):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)

# Test.
env.debug_verbosity = 5
qfunc.debug_verbosity = 5
policy.debug_verbosity = 5
for _ in range(20):
    s = env.GetState()
    a = policy.Decide(qfunc, s, env.GetActionSpace())
    r = env.TakeAction(a)
    s_new = env.GetState()
    qfunc.UpdateWithTransition(s, a, r, s_new)

GET: ([8], 0) -> [[-0.16123891]]
GET: ([8], 1) -> [[-1.2013512]]
GET: ([8], 2) -> [[-0.3959014]]
Action 0: ([8]) -> ([7]), reward: 1.0
GET: ([7], 0) -> [[-0.16123891]]
GET: ([7], 1) -> [[-1.1318822]]
GET: ([7], 2) -> [[-0.3959014]]
GET: ([8], 0) -> [[-0.16123891]]
SET: ([8], 0) <- [[0.8113186]]
GET: ([7], 0) -> [[-0.14178777]]
GET: ([7], 1) -> [[-1.1318822]]
GET: ([7], 2) -> [[-0.3959014]]
Action 0: ([7]) -> ([6]), reward: 1.0
GET: ([6], 0) -> [[-0.14178777]]
GET: ([6], 1) -> [[-1.0624131]]
GET: ([6], 2) -> [[-0.3959014]]
GET: ([7], 0) -> [[-0.14178777]]
SET: ([7], 0) <- [[0.8220167]]
GET: ([6], 0) -> [[-0.12251168]]
GET: ([6], 1) -> [[-1.0624131]]
GET: ([6], 2) -> [[-0.3959014]]
Action 0: ([6]) -> ([5]), reward: 1.0
GET: ([5], 0) -> [[-0.12251168]]
GET: ([5], 1) -> [[-0.99294406]]
GET: ([5], 2) -> [[-0.3959014]]
GET: ([6], 0) -> [[-0.12251168]]
SET: ([6], 0) <- [[0.8326186]]
GET: ([5], 0) -> [[-0.10340907]]
GET: ([5], 1) -> [[-0.99294406]]
GET: ([5], 2) -> [[-0.3959014]]
Action 0: ([5

#### Conclusion

This model works when using model of structure (20, 20, 20) with MaxValueWithRandomnessPolicy. Separating models for different actions is the key. It does not work when:
* A shared layer of size 200 is added.
* When using MaxValuePolicy.