# Config

In [1]:
import pandas as pd
import numpy as np
import time

ACTIONS = ['up', 'down', 'left', 'right']
LENGTH = 4
N_STATES = LENGTH * LENGTH
TERMINAL = (0, 3)
EPSILON = .9
MAX_EPISODE = 20
LAMBDA = .9
ALPHA = .1

# Initial Q-Table

In [2]:
def build_q_table():
    global N_STATES
    global ACTIONS
    table = pd.DataFrame(
        np.zeros((N_STATES, len(ACTIONS))),
        columns=ACTIONS
    )
    print(table)
    return table

build_q_table()

     up  down  left  right
0   0.0   0.0   0.0    0.0
1   0.0   0.0   0.0    0.0
2   0.0   0.0   0.0    0.0
3   0.0   0.0   0.0    0.0
4   0.0   0.0   0.0    0.0
5   0.0   0.0   0.0    0.0
6   0.0   0.0   0.0    0.0
7   0.0   0.0   0.0    0.0
8   0.0   0.0   0.0    0.0
9   0.0   0.0   0.0    0.0
10  0.0   0.0   0.0    0.0
11  0.0   0.0   0.0    0.0
12  0.0   0.0   0.0    0.0
13  0.0   0.0   0.0    0.0
14  0.0   0.0   0.0    0.0
15  0.0   0.0   0.0    0.0


Unnamed: 0,up,down,left,right
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0


# Actor
- ε-Greedy

In [3]:
def actor(state, q_table):
    state_act = q_table.iloc[state]
    if np.random.uniform() > EPSILON or state_act.all() == 0:
        act = np.random.choice(ACTIONS)
    else:
        act = state_act.argmax()
    return act

actor(0, build_q_table())

     up  down  left  right
0   0.0   0.0   0.0    0.0
1   0.0   0.0   0.0    0.0
2   0.0   0.0   0.0    0.0
3   0.0   0.0   0.0    0.0
4   0.0   0.0   0.0    0.0
5   0.0   0.0   0.0    0.0
6   0.0   0.0   0.0    0.0
7   0.0   0.0   0.0    0.0
8   0.0   0.0   0.0    0.0
9   0.0   0.0   0.0    0.0
10  0.0   0.0   0.0    0.0
11  0.0   0.0   0.0    0.0
12  0.0   0.0   0.0    0.0
13  0.0   0.0   0.0    0.0
14  0.0   0.0   0.0    0.0
15  0.0   0.0   0.0    0.0


'up'

# Enviroment Visual

In [4]:
def update_env(state, episode, step):
    view = np.array([['_ '] * LENGTH] * LENGTH)
    view[tuple(TERMINAL)] = '* '
    view[tuple(state)] = 'o '
    interaction = ''
    for v in view:
        interaction += ''.join(v) + '\n'
    message = 'EPISODE: {}, STEP: {}'.format(episode, step) 
    interaction += message
    if state == TERMINAL:
        print(interaction)
#         time.sleep(.5)
    else:
        print(interaction)
#         time.sleep(.5)
        
# for i in range(12):
#     update_env([int(i / 4), i % 4], 1, i)

# Enviroment Feedback

In [5]:
def get_env_feedback(state, action):
    reward = 0.
    a, b = state
    if action == 'up':
        a -= 1
        if a < 0:
            a = 0
        next_state = (a, b)
        if next_state == TERMINAL:
            reward = 1.
    elif action == 'down':
        a += 1
        if a >= LENGTH:
            a = LENGTH - 1
        next_state = (a, b)
    elif action == 'left':
        b -= 1
        if b < 0:
            b = 0
        next_state = (a, b)
    elif action == 'right':
        b += 1
        if b >= LENGTH:
            b = LENGTH - 1
        next_state = (a, b)
        if next_state == TERMINAL:
            reward = 1.
    return next_state, reward

# Run Game

In [6]:
def run():
    q_table = build_q_table()
    episode = 0
    while episode < MAX_EPISODE:
        state = (3, 0)
        step = 0
        update_env(state, episode, step)
        while state != TERMINAL:
            a, b = state
            act = actor(a * LENGTH + b, q_table)
            print(act)
            next_state, reward = get_env_feedback(state, act)
            na, nb = next_state
            q_predict = q_table.ix[a * LENGTH + b, act]
            if next_state != TERMINAL:
                q_target = reward + LAMBDA * q_table.iloc[na * LENGTH + nb].max()
            else:
                q_target = reward
            q_table.ix[a * LENGTH + b, act] += ALPHA * (q_target - q_predict)
            state = next_state
            step += 1
            update_env(state, episode, step)
        print()
        print(q_table)
        print()
        episode += 1
    return q_table
        

# Main

In [7]:
if __name__ == '__main__':
    q_table = run()

     up  down  left  right
0   0.0   0.0   0.0    0.0
1   0.0   0.0   0.0    0.0
2   0.0   0.0   0.0    0.0
3   0.0   0.0   0.0    0.0
4   0.0   0.0   0.0    0.0
5   0.0   0.0   0.0    0.0
6   0.0   0.0   0.0    0.0
7   0.0   0.0   0.0    0.0
8   0.0   0.0   0.0    0.0
9   0.0   0.0   0.0    0.0
10  0.0   0.0   0.0    0.0
11  0.0   0.0   0.0    0.0
12  0.0   0.0   0.0    0.0
13  0.0   0.0   0.0    0.0
14  0.0   0.0   0.0    0.0
15  0.0   0.0   0.0    0.0
_ _ _ * 
_ _ _ _ 
_ _ _ _ 
o _ _ _ 
EPISODE: 0, STEP: 0
down
_ _ _ * 
_ _ _ _ 
_ _ _ _ 
o _ _ _ 
EPISODE: 0, STEP: 1
up
_ _ _ * 
_ _ _ _ 
o _ _ _ 
_ _ _ _ 
EPISODE: 0, STEP: 2
left
_ _ _ * 
_ _ _ _ 
o _ _ _ 
_ _ _ _ 
EPISODE: 0, STEP: 3
up
_ _ _ * 
o _ _ _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 0, STEP: 4
up
o _ _ * 
_ _ _ _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 0, STEP: 5
left
o _ _ * 
_ _ _ _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 0, STEP: 6
left
o _ _ * 
_ _ _ _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 0, STEP: 7
right
_ o _ * 
_ _ _ _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 0, STEP: 8
do

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


_ _ _ * 
_ _ _ _ 
_ _ _ o 
_ _ _ _ 
EPISODE: 2, STEP: 23
left
_ _ _ * 
_ _ _ _ 
_ _ o _ 
_ _ _ _ 
EPISODE: 2, STEP: 24
left
_ _ _ * 
_ _ _ _ 
_ o _ _ 
_ _ _ _ 
EPISODE: 2, STEP: 25
up
_ _ _ * 
_ o _ _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 2, STEP: 26
right
_ _ _ * 
_ _ o _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 2, STEP: 27
down
_ _ _ * 
_ _ _ _ 
_ _ o _ 
_ _ _ _ 
EPISODE: 2, STEP: 28
up
_ _ _ * 
_ _ o _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 2, STEP: 29
up
_ _ o * 
_ _ _ _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 2, STEP: 30
down
_ _ _ * 
_ _ o _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 2, STEP: 31
up
_ _ o * 
_ _ _ _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 2, STEP: 32
up
_ _ o * 
_ _ _ _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 2, STEP: 33
up
_ _ o * 
_ _ _ _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 2, STEP: 34
left
_ o _ * 
_ _ _ _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 2, STEP: 35
up
_ o _ * 
_ _ _ _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 2, STEP: 36
up
_ o _ * 
_ _ _ _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 2, STEP: 37
left
o _ _ * 
_ _ _ _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 2, STEP: 38
up
o _ _ * 
_ _ _ _ 
_ _ _ _

_ _ _ * 
_ _ o _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 8, STEP: 4
right
_ _ _ * 
_ _ _ o 
_ _ _ _ 
_ _ _ _ 
EPISODE: 8, STEP: 5
up
_ _ _ o 
_ _ _ _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 8, STEP: 6

          up          down          left         right
0   0.000002  6.561000e-06  2.131669e-06  4.287023e-05
1   0.000091  6.541390e-04  5.867699e-06  3.249000e-02
2   0.039051  8.100000e-04  8.100000e-04  3.439000e-01
3   0.000000  0.000000e+00  0.000000e+00  0.000000e+00
4   0.000002  5.904900e-07  1.246590e-05  2.697300e-04
5   0.001654  3.490452e-05  6.561000e-06  2.408697e-02
6   0.039780  7.290000e-05  1.712245e-03  1.064405e-01
7   0.409510  2.195100e-03  8.580411e-03  3.249000e-02
8   0.000012  0.000000e+00  5.904900e-07  2.681199e-04
9   0.004335  1.254579e-05  5.904900e-07  1.385100e-04
10  0.000810  3.431403e-05  2.203318e-04  2.195100e-03
11  0.046341  0.000000e+00  7.290000e-05  2.195100e-03
12  0.000001  1.816146e-06  1.396565e-06  2.839827e-05
13  0.000230  4.480638e-06  9.121653e-07  2.827266e-

  




          up          down          left         right
0   0.000002  6.561000e-06  2.131669e-06  4.287023e-05
1   0.000091  6.541390e-04  5.867699e-06  3.249000e-02
2   0.039051  8.100000e-04  8.100000e-04  3.439000e-01
3   0.000000  0.000000e+00  0.000000e+00  0.000000e+00
4   0.000002  5.904900e-07  1.246590e-05  2.697300e-04
5   0.001654  3.490452e-05  6.561000e-06  3.125792e-02
6   0.039780  7.290000e-05  1.712245e-03  1.326523e-01
7   0.468559  2.195100e-03  8.580411e-03  3.249000e-02
8   0.000012  0.000000e+00  5.904900e-07  2.681199e-04
9   0.006069  1.254579e-05  5.904900e-07  1.385100e-04
10  0.000810  3.431403e-05  2.203318e-04  2.195100e-03
11  0.046341  0.000000e+00  7.290000e-05  2.195100e-03
12  0.000001  1.816146e-06  1.396565e-06  4.627511e-05
13  0.000597  4.480638e-06  9.121653e-07  2.827266e-07
14  0.000251  0.000000e+00  3.141407e-06  0.000000e+00
15  0.002195  0.000000e+00  0.000000e+00  0.000000e+00

_ _ _ * 
_ _ _ _ 
_ _ _ _ 
o _ _ _ 
EPISODE: 10, STEP: 0
right