# Config

In [1]:
import pandas as pd
import numpy as np
import time

ACTIONS = ['up', 'down', 'left', 'right']
LENGTH = 4
N_STATES = LENGTH * LENGTH
TERMINAL = (0, 3)
EPSILON = .9
MAX_EPISODE = 20
LAMBDA = .9
ALPHA = .1

# Initial Q-Table

In [2]:
def build_q_table():
    global N_STATES
    global ACTIONS
    table = pd.DataFrame(
        np.zeros((N_STATES, len(ACTIONS))),
        columns=ACTIONS
    )
    print(table)
    return table

build_q_table()

     up  down  left  right
0   0.0   0.0   0.0    0.0
1   0.0   0.0   0.0    0.0
2   0.0   0.0   0.0    0.0
3   0.0   0.0   0.0    0.0
4   0.0   0.0   0.0    0.0
5   0.0   0.0   0.0    0.0
6   0.0   0.0   0.0    0.0
7   0.0   0.0   0.0    0.0
8   0.0   0.0   0.0    0.0
9   0.0   0.0   0.0    0.0
10  0.0   0.0   0.0    0.0
11  0.0   0.0   0.0    0.0
12  0.0   0.0   0.0    0.0
13  0.0   0.0   0.0    0.0
14  0.0   0.0   0.0    0.0
15  0.0   0.0   0.0    0.0


Unnamed: 0,up,down,left,right
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0


# Actor
- ε-Greedy

In [3]:
def actor(state, q_table):
    state_act = q_table.iloc[state]
    if np.random.uniform() > EPSILON or state_act.all() == 0:
        act = np.random.choice(ACTIONS)
    else:
        act = state_act.argmax()
    return act

actor(0, build_q_table())

     up  down  left  right
0   0.0   0.0   0.0    0.0
1   0.0   0.0   0.0    0.0
2   0.0   0.0   0.0    0.0
3   0.0   0.0   0.0    0.0
4   0.0   0.0   0.0    0.0
5   0.0   0.0   0.0    0.0
6   0.0   0.0   0.0    0.0
7   0.0   0.0   0.0    0.0
8   0.0   0.0   0.0    0.0
9   0.0   0.0   0.0    0.0
10  0.0   0.0   0.0    0.0
11  0.0   0.0   0.0    0.0
12  0.0   0.0   0.0    0.0
13  0.0   0.0   0.0    0.0
14  0.0   0.0   0.0    0.0
15  0.0   0.0   0.0    0.0


'down'

# Enviroment Visual

In [4]:
def update_env(state, episode, step):
    view = np.array([['_ '] * LENGTH] * LENGTH)
    view[tuple(TERMINAL)] = '* '
    view[tuple(state)] = 'o '
    interaction = ''
    for v in view:
        interaction += ''.join(v) + '\n'
    message = 'EPISODE: {}, STEP: {}'.format(episode, step) 
    interaction += message
    if state == TERMINAL:
        print(interaction)
#         time.sleep(.5)
    else:
        print(interaction)
#         time.sleep(.5)
        
# for i in range(12):
#     update_env([int(i / 4), i % 4], 1, i)

# Enviroment Feedback

In [5]:
def get_env_feedback(state, action):
    reward = 0.
    a, b = state
    if action == 'up':
        a -= 1
        if a < 0:
            a = 0
        next_state = (a, b)
        if next_state == TERMINAL:
            reward = 1.
    elif action == 'down':
        a += 1
        if a >= LENGTH:
            a = LENGTH - 1
        next_state = (a, b)
    elif action == 'left':
        b -= 1
        if b < 0:
            b = 0
        next_state = (a, b)
    elif action == 'right':
        b += 1
        if b >= LENGTH:
            b = LENGTH - 1
        next_state = (a, b)
        if next_state == TERMINAL:
            reward = 1.
    return next_state, reward

# Run Game

In [6]:
def run():
    q_table = build_q_table()
    episode = 0
    while episode < MAX_EPISODE:
        state = (3, 0)
        step = 0
        update_env(state, episode, step)
        while state != TERMINAL:
            a, b = state
            act = actor(a * LENGTH + b, q_table)
            print(act)
            next_state, reward = get_env_feedback(state, act)
            na, nb = next_state
            q_predict = q_table.ix[a * LENGTH + b, act]
            if next_state != TERMINAL:
                q_target = reward + LAMBDA * q_table.iloc[na * LENGTH + nb].max()
            else:
                q_target = reward
            q_table.ix[a * LENGTH + b, act] += ALPHA * (q_target - q_predict)
            state = next_state
            step += 1
            update_env(state, episode, step)
        print()
        print(q_table)
        print()
        episode += 1
    return q_table
        

# Main

In [7]:
if __name__ == '__main__':
    q_table = run()

     up  down  left  right
0   0.0   0.0   0.0    0.0
1   0.0   0.0   0.0    0.0
2   0.0   0.0   0.0    0.0
3   0.0   0.0   0.0    0.0
4   0.0   0.0   0.0    0.0
5   0.0   0.0   0.0    0.0
6   0.0   0.0   0.0    0.0
7   0.0   0.0   0.0    0.0
8   0.0   0.0   0.0    0.0
9   0.0   0.0   0.0    0.0
10  0.0   0.0   0.0    0.0
11  0.0   0.0   0.0    0.0
12  0.0   0.0   0.0    0.0
13  0.0   0.0   0.0    0.0
14  0.0   0.0   0.0    0.0
15  0.0   0.0   0.0    0.0
_ _ _ * 
_ _ _ _ 
_ _ _ _ 
o _ _ _ 
EPISODE: 0, STEP: 0
up
_ _ _ * 
_ _ _ _ 
o _ _ _ 
_ _ _ _ 
EPISODE: 0, STEP: 1
right
_ _ _ * 
_ _ _ _ 
_ o _ _ 
_ _ _ _ 
EPISODE: 0, STEP: 2
down
_ _ _ * 
_ _ _ _ 
_ _ _ _ 
_ o _ _ 
EPISODE: 0, STEP: 3
left
_ _ _ * 
_ _ _ _ 
_ _ _ _ 
o _ _ _ 
EPISODE: 0, STEP: 4
up
_ _ _ * 
_ _ _ _ 
o _ _ _ 
_ _ _ _ 
EPISODE: 0, STEP: 5
left
_ _ _ * 
_ _ _ _ 
o _ _ _ 
_ _ _ _ 
EPISODE: 0, STEP: 6
up
_ _ _ * 
o _ _ _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 0, STEP: 7
up
o _ _ * 
_ _ _ _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 0, STEP: 8
up
o

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  



_ _ _ * 
_ _ _ _ 
_ _ o _ 
_ _ _ _ 
EPISODE: 2, STEP: 77
up
_ _ _ * 
_ _ o _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 2, STEP: 78
left
_ _ _ * 
_ o _ _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 2, STEP: 79
right
_ _ _ * 
_ _ o _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 2, STEP: 80
up
_ _ o * 
_ _ _ _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 2, STEP: 81
up
_ _ o * 
_ _ _ _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 2, STEP: 82
down
_ _ _ * 
_ _ o _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 2, STEP: 83
up
_ _ o * 
_ _ _ _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 2, STEP: 84
left
_ o _ * 
_ _ _ _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 2, STEP: 85
left
o _ _ * 
_ _ _ _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 2, STEP: 86
right
_ o _ * 
_ _ _ _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 2, STEP: 87
left
o _ _ * 
_ _ _ _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 2, STEP: 88
up
o _ _ * 
_ _ _ _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 2, STEP: 89
down
_ _ _ * 
o _ _ _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 2, STEP: 90
left
_ _ _ * 
o _ _ _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 2, STEP: 91
left
_ _ _ * 
o _ _ _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 2, STEP: 92
up
o _ _ * 
_ _ _ _ 

  




_ _ _ * 
_ _ _ _ 
_ _ _ _ 
o _ _ _ 
EPISODE: 9, STEP: 0
right
_ _ _ * 
_ _ _ _ 
_ _ _ _ 
_ o _ _ 
EPISODE: 9, STEP: 1
right
_ _ _ * 
_ _ _ _ 
_ _ _ _ 
_ _ o _ 
EPISODE: 9, STEP: 2
up
_ _ _ * 
_ _ _ _ 
_ _ o _ 
_ _ _ _ 
EPISODE: 9, STEP: 3
up
_ _ _ * 
_ _ o _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 9, STEP: 4
up
_ _ o * 
_ _ _ _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 9, STEP: 5
right
_ _ _ o 
_ _ _ _ 
_ _ _ _ 
_ _ _ _ 
EPISODE: 9, STEP: 6

              up          down          left     right
0   0.000000e+00  0.000000e+00  0.000000e+00  0.000810
1   8.100000e-04  0.000000e+00  0.000000e+00  0.017100
2   9.000000e-03  8.100000e-04  8.100000e-04  0.521703
3   0.000000e+00  0.000000e+00  0.000000e+00  0.000000
4   0.000000e+00  0.000000e+00  0.000000e+00  0.000000
5   0.000000e+00  1.246590e-05  0.000000e+00  0.000000
6   1.395079e-01  1.385100e-04  1.121931e-06  0.017100
7   2.710000e-01  8.100000e-04  1.539000e-03  0.017100
8   0.000000e+00  6.202576e-06  0.000000e+00  0.000000
9   0.000000e+00  0.000000e+