In [31]:
ReloadProject('deep_learning')

notebook_init.py imported and reloaded
forwarded symbol: Activation
forwarded symbol: Dense
forwarded symbol: Dict
forwarded symbol: InputLayer
forwarded symbol: List
forwarded symbol: Model
forwarded symbol: Sequential
forwarded symbol: Tuple
reloaded: gym
forwarded symbol: gym
reloaded: keras
forwarded symbol: keras
reloaded: q_learning
forwarded symbol: q_learning
reloaded: q_learning_impl
forwarded symbol: q_learning_impl


## Memoization based learning
### Circular World
Let's assume a world with 11 states: 0-10. Each time the agent and move +1 or -1, with 0-1 -> 10 and 10+1 -> 0. All actions that gets the agent closer to state "5" gets reward +1, otherwise gets reward -1.

In [56]:
TARGET_STATE = 5

class IntState(q_learning_impl.HashableState):
    
    def __init__(self, state: int):
        self.value = state
        
    def __hash__(self):
        return hash(self.value)
        
    def __str__(self):
        return '%d' % self.value


class IntAction(q_learning_impl.HashableAction):
    
    def __init__(self, action: int):
        self.value = action
        
    def __hash__(self):
        return hash(self.value)
        
    def __str__(self):
        return '%d' % self.value


class CircularWorld(q_learning.Environment):
    
    def __init__(self):
        super().__init__()
        self._current_state = IntState(0)
        self._last_action = IntAction(0)
        self._last_reward = 0.0
        
        self._action_plus = IntAction(1)
        self._action_zero = IntAction(0)
        self._action_minus = IntAction(-1)
        
    def Print(self):
        print('At: %s (last action: %s; last reward: %s)' % (
            self._current_state, self._last_action, self._last_reward))
    
    #@ Override
    def GetActionSpace(self) -> List[IntAction]:
        return [self._action_plus, self._action_zero, self._action_minus]
        
    #@ Override
    def TakeAction(self, action: IntAction) -> None:
        if action == self._action_plus:
            if self._current_state.value < TARGET_STATE:
                self._last_reward = 1.0
            else:
                self._last_reward = -1.0
            
            new_state = self._current_state.value + 1
            if new_state == 11:
                new_state = 0
            self._current_state = IntState(new_state)
        elif action == self._action_minus:
            if self._current_state.value > TARGET_STATE:
                self._last_reward = 1.0
            else:
                self._last_reward = -1.0
            
            new_state = self._current_state.value - 1
            if new_state == -1:
                new_state = 10
            self._current_state = IntState(new_state)
        else:
            if self._current_state.value != TARGET_STATE:
                self._last_reward = -0.5
            else:
                self._last_reward = 1.0
        self._last_action = action

Let's try out the environment.

In [57]:
env = CircularWorld()
for _ in range(20):
    env.Print()
    env.TakeAction(np.random.choice(env.GetActionSpace()))

At: 0 (last action: 0; last reward: 0.0)
At: 10 (last action: -1; last reward: -1.0)
At: 9 (last action: -1; last reward: 1.0)
At: 8 (last action: -1; last reward: 1.0)
At: 9 (last action: 1; last reward: -1.0)
At: 9 (last action: 0; last reward: -0.5)
At: 8 (last action: -1; last reward: 1.0)
At: 9 (last action: 1; last reward: -1.0)
At: 10 (last action: 1; last reward: -1.0)
At: 10 (last action: 0; last reward: -0.5)
At: 0 (last action: 1; last reward: -1.0)
At: 0 (last action: 0; last reward: -0.5)
At: 1 (last action: 1; last reward: 1.0)
At: 1 (last action: 0; last reward: -0.5)
At: 0 (last action: -1; last reward: -1.0)
At: 0 (last action: 0; last reward: -0.5)
At: 0 (last action: 0; last reward: -0.5)
At: 10 (last action: -1; last reward: -1.0)
At: 10 (last action: 0; last reward: -0.5)
At: 0 (last action: 1; last reward: -1.0)


### Learning

In [63]:
%%time

env = CircularWorld()
qfunc = q_learning_impl.FiniteStateQFunction()
qfunc.SetLearningRate(0.9)
qfunc.SetDiscountFactor(0.9)
max_policy = q_learning_impl.MaxValueWithRandomnessPolicy()

for _ in range(20000):
#     env.Print()
#     qfunc.Print()
    s = env.GetCurrentState()
    a = max_policy.Decide(qfunc, s, env.GetActionSpace())
    env.TakeAction(a)
    s_new = env.GetCurrentState()
    qfunc.UpdateWithTransition(s, a, env.GetLastReward(), s_new, env.GetActionSpace())
    
for _ in range(10):
    env.Print()
#     qfunc.Print()
    s = env.GetCurrentState()
    a = max_policy.Decide(qfunc, s, env.GetActionSpace())
    env.TakeAction(a)
    s_new = env.GetCurrentState()
    qfunc.UpdateWithTransition(s, a, env.GetLastReward(), s_new, env.GetActionSpace())
    
# qfunc.Print()

At: 6 (last action: 1; last reward: -1.0)
At: 5 (last action: -1; last reward: 1.0)
At: 4 (last action: -1; last reward: -1.0)
At: 4 (last action: 0; last reward: -0.5)
At: 3 (last action: -1; last reward: -1.0)
At: 4 (last action: 1; last reward: 1.0)
At: 5 (last action: 1; last reward: 1.0)
At: 4 (last action: -1; last reward: -1.0)
At: 3 (last action: -1; last reward: -1.0)
At: 4 (last action: 1; last reward: 1.0)
CPU times: user 690 ms, sys: 0 ns, total: 690 ms
Wall time: 663 ms


In [64]:
for state in range(10):
    for action in (-1, 1, 0):
        print('(%d, %d): %s' % (state, action, qfunc.GetValue(IntState(state), IntAction(action))))

(0, -1): 7.999999999999995
(0, 1): 9.999999999999993
(0, 0): 8.499999999999995
(1, -1): 7.999999999999995
(1, 1): 9.999999999999993
(1, 0): 8.499999999999995
(2, -1): 7.999999999999995
(2, 1): 9.999999999999993
(2, 0): 8.499999999999995
(3, -1): 7.999999999999995
(3, 1): 9.999999999999993
(3, 0): 8.499999999999995
(4, -1): 7.999999999999995
(4, 1): 9.999999999999993
(4, 0): 8.499999999999995
(5, -1): 7.999999999999995
(5, 1): 7.999999999999995
(5, 0): 9.999999999999993
(6, -1): 9.999999999999993
(6, 1): 7.999999999999995
(6, 0): 8.499999999999995
(7, -1): 9.999999999999993
(7, 1): 7.999999999999995
(7, 0): 8.499999999999995
(8, -1): 9.999999999999993
(8, 1): 7.999999999999995
(8, 0): 8.499999999999995
(9, -1): 9.999999999999993
(9, 1): 7.999999999999995
(9, 0): 8.499999999999995


## Deep learning
First we need to modify the environment to generte state and action with numpy arrays.

In [33]:
TARGET_STATE = 5
ZERO_INT = np.zeros(1, dtype=int)


class NpCircularWorld(q_learning.Environment):
    
    def __init__(self):
        super().__init__()
        
        self._current_state = ZERO_INT
        self._last_action = ZERO_INT
        self._last_reward = 0.0
        
        self._action_zero = ZERO_INT
        self._action_plus = ZERO_INT + 1
        self._action_minus = ZERO_INT - 1
        
    def Print(self):
        print('At: %s (last action: %s; last reward: %s)' % (
            self._current_state, self._last_action, self._last_reward))
    
    #@ Override
    def GetActionSpace(self):
        return [self._action_plus, self._action_zero, self._action_minus]
        
    #@ Override
    def TakeAction(self, action) -> None:
        if action == self._action_plus:
            if self._current_state < TARGET_STATE:
                self._last_reward = 1.0
            else:
                self._last_reward = -1.0
            
            new_state = self._current_state + 1
            if new_state == ZERO_INT + 11:
                new_state = ZERO_INT
            self._current_state = new_state
        elif action == self._action_minus:
            if self._current_state > TARGET_STATE:
                self._last_reward = 1.0
            else:
                self._last_reward = -1.0
            
            new_state = self._current_state - 1
            if new_state == -1:
                new_state = ZERO_INT + 10
            self._current_state = new_state
        else:
            if self._current_state != TARGET_STATE:
                self._last_reward = -1.0
            else:
                self._last_reward = 0.0
        self._last_action = action

In [34]:
# Build a non-linear model.
model = Sequential()
model.add(Dense(20, activation='relu', input_dim=2))
model.add(Dense(6, activation='relu'))
model.add(Dense(6, activation='relu'))
model.add(Dense(6, activation='relu'))
model.add(Dense(6, activation='relu'))
model.add(Dense(1))

model.compile(optimizer='sgd', loss='mse')

In [35]:
%%time

env = NpCircularWorld()
qfunc = q_learning_impl.KerasModelQFunction(model)
qfunc.SetLearningRate(0.9)
qfunc.SetDiscountFactor(0.9)
max_policy = q_learning_impl.MaxValueWithRandomnessPolicy()

# Training
for _ in range(1000):
    s = env.GetCurrentState()
    a = max_policy.Decide(qfunc, s, env.GetActionSpace())
    env.TakeAction(a)
    s_new = env.GetCurrentState()
    qfunc.UpdateWithTransition(s, a, env.GetLastReward(), s_new, env.GetActionSpace())

    
# Testing
qfunc.debug = True
for _ in range(20):
    env.Print()
    s = env.GetCurrentState()
    a = max_policy.Decide(qfunc, s, env.GetActionSpace())
    env.TakeAction(a)
    s_new = env.GetCurrentState()
    qfunc.UpdateWithTransition(s, a, env.GetLastReward(), s_new, env.GetActionSpace())

At: [7] (last action: [0]; last reward: -1.0)
GET: ([7], [1]) -> [[19.786322]]
GET: ([7], [0]) -> [[19.786322]]
GET: ([7], [-1]) -> [[19.786322]]
GET: ([7], [1]) -> [[19.786322]]
GET: ([7], [0]) -> [[19.786322]]
GET: ([7], [-1]) -> [[19.786322]]
GET: ([7], [0]) -> [[19.786322]]
At: [7] (last action: [0]; last reward: -1.0)
GET: ([7], [1]) -> [[19.732706]]
GET: ([7], [0]) -> [[19.732706]]
GET: ([7], [-1]) -> [[19.732706]]
GET: ([8], [1]) -> [[19.732706]]
GET: ([8], [0]) -> [[19.732706]]
GET: ([8], [-1]) -> [[19.732706]]
GET: ([7], [1]) -> [[19.732706]]
At: [8] (last action: [1]; last reward: -1.0)
GET: ([8], [1]) -> [[19.679188]]
GET: ([8], [0]) -> [[19.679188]]
GET: ([8], [-1]) -> [[19.679188]]
GET: ([8], [1]) -> [[19.679188]]
GET: ([8], [0]) -> [[19.679188]]
GET: ([8], [-1]) -> [[19.679188]]
GET: ([8], [0]) -> [[19.679188]]
At: [8] (last action: [0]; last reward: -1.0)
GET: ([8], [1]) -> [[19.625765]]
GET: ([8], [0]) -> [[19.625765]]
GET: ([8], [-1]) -> [[19.625765]]
GET: ([8], [1]) -

Pretty good!

In [21]:
np.random.choice([ZERO_INT+1, ZERO_INT, ZERO_INT-1])

ValueError: a must be 1-dimensional

In [30]:
np.random.choice([ZERO_INT+1, ZERO_INT, ZERO_INT-1])

ValueError: a must be 1-dimensional

In [None]:
np.array()