# Config

In [1]:
import pandas as pd
import numpy as np
import time

ACTIONS     = ['up', 'down', 'left', 'right']
LENGTH      = None
N_STATES    = None
START       = None
HOLE        = None
TERMINAL    = None
EPSILON     = None
MAX_EPISODE = None
LAMBDA      = None
ALPHA       = None
FIRST       = True

# Initial Q-Table

In [2]:
def build_q_table():
    global N_STATES
    global ACTIONS
    table = pd.DataFrame(
        np.zeros((N_STATES, len(ACTIONS))),
        columns=ACTIONS
    )
    print(table)
    return table

# build_q_table()

# Actor
- ε-Greedy

In [3]:
def actor(state, q_table):
    state_act = q_table.iloc[state]
    if np.random.uniform() > EPSILON or state_act.all() == 0:
        act = np.random.choice(ACTIONS)
    else:
        act = state_act.argmax()
    return act

# actor(0, build_q_table())

# Enviroment Visual

In [4]:
def update_env(state, episode, step):
    view = np.array([['_ '] * LENGTH] * LENGTH)
    view[tuple(TERMINAL)] = '* '
    view[HOLE] = 'X '
    view[tuple(state)] = 'o '
    interaction = ''
    for v in view:
        interaction += ''.join(v) + '\n'
    message = 'EPISODE: {}, STEP: {}'.format(episode, step) 
    interaction += message
    if state == TERMINAL:
        print(interaction)
#         time.sleep(.1)
    else:
        print(interaction)
#         time.sleep(.1)
        
# for i in range(12):
#     update_env([int(i / 4), i % 4], 1, i)

# Enviroment Feedback

In [5]:
def init_env():
    global HOLE
    global FIRST
    global START
    global TERMINAL
    start = START
    if FIRST:
        f = lambda: np.random.choice(range(LENGTH))
        hole = f(), f()
        while hole == START and hole == TERMINAL:
            hole = f(), f()
        HOLE = hole
        FIRST = False
    return start, False

def get_env_feedback(state, action):
    reward = 0.
    end = False
    a, b = state
    if action == 'up':
        a -= 1
        if a < 0:
            a = 0
        next_state = (a, b)
        if next_state == TERMINAL:
            reward = 1.
            end = True
        elif next_state == HOLE:
            reward = -1.
            end = True
    elif action == 'down':
        a += 1
        if a >= LENGTH:
            a = LENGTH - 1
        next_state = (a, b)
        if next_state == HOLE:
            reward = -1.
            end = True
    elif action == 'left':
        b -= 1
        if b < 0:
            b = 0
        next_state = (a, b)
        if next_state == HOLE:
            reward = -1.
            end = True
    elif action == 'right':
        b += 1
        if b >= LENGTH:
            b = LENGTH - 1
        next_state = (a, b)
        if next_state == TERMINAL:
            reward = 1.
            end = True
        elif next_state == HOLE:
            reward = -1.
            end = True
    return next_state, reward, end


# Run Game

In [6]:
def run():
    q_table = build_q_table()
    episode = 0
    while episode < MAX_EPISODE:
        state, end = init_env()
        step = 0
        update_env(state, episode, step)
        while not end:
            a, b = state
            act = actor(a * LENGTH + b, q_table)
            print(act)
            next_state, reward, end = get_env_feedback(state, act)
            na, nb = next_state
            q_predict = q_table.ix[a * LENGTH + b, act]
            if next_state != TERMINAL:
                q_target = reward + LAMBDA * q_table.iloc[na * LENGTH + nb].max()
            else:
                q_target = reward
            q_table.ix[a * LENGTH + b, act] += ALPHA * (q_target - q_predict)
            state = next_state
            step += 1
            update_env(state, episode, step)
        print()
        print(q_table)
        print()
        episode += 1
    return q_table
        

# Main

In [7]:
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    
    parser.add_argument('-l',
                        default='4',
                        dest="LENGTH",
                        help='input the length of the grid')

    parser.add_argument('-i',
                        default='20',
                        dest='ITERATION',
                        help='input the iteration of training')
    
    parser.add_argument('-f',
                        default='None',
                        help='ipython')

    args = parser.parse_args()
    
    LENGTH      = int(args.LENGTH)
    N_STATES    = LENGTH * LENGTH
    START       = (LENGTH - 1, 0)
    TERMINAL    = (0, LENGTH - 1)
    EPSILON     = .9
    MAX_EPISODE = int(args.ITERATION)
    LAMBDA      = .9
    ALPHA       = .1

    q_table = run()

     up  down  left  right
0   0.0   0.0   0.0    0.0
1   0.0   0.0   0.0    0.0
2   0.0   0.0   0.0    0.0
3   0.0   0.0   0.0    0.0
4   0.0   0.0   0.0    0.0
5   0.0   0.0   0.0    0.0
6   0.0   0.0   0.0    0.0
7   0.0   0.0   0.0    0.0
8   0.0   0.0   0.0    0.0
9   0.0   0.0   0.0    0.0
10  0.0   0.0   0.0    0.0
11  0.0   0.0   0.0    0.0
12  0.0   0.0   0.0    0.0
13  0.0   0.0   0.0    0.0
14  0.0   0.0   0.0    0.0
15  0.0   0.0   0.0    0.0
_ _ _ * 
_ _ _ _ 
_ _ _ _ 
o _ _ X 
EPISODE: 0, STEP: 0
left
_ _ _ * 
_ _ _ _ 
_ _ _ _ 
o _ _ X 
EPISODE: 0, STEP: 1
right
_ _ _ * 
_ _ _ _ 
_ _ _ _ 
_ o _ X 
EPISODE: 0, STEP: 2
right
_ _ _ * 
_ _ _ _ 
_ _ _ _ 
_ _ o X 
EPISODE: 0, STEP: 3
up
_ _ _ * 
_ _ _ _ 
_ _ o _ 
_ _ _ X 
EPISODE: 0, STEP: 4
left
_ _ _ * 
_ _ _ _ 
_ o _ _ 
_ _ _ X 
EPISODE: 0, STEP: 5
down
_ _ _ * 
_ _ _ _ 
_ _ _ _ 
_ o _ X 
EPISODE: 0, STEP: 6
right
_ _ _ * 
_ _ _ _ 
_ _ _ _ 
_ _ o X 
EPISODE: 0, STEP: 7
left
_ _ _ * 
_ _ _ _ 
_ _ _ _ 
_ o _ X 
EPISODE: 0, STEP

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  
  



up
_ _ _ * 
o _ _ _ 
_ _ _ _ 
_ _ _ X 
EPISODE: 9, STEP: 8
left
_ _ _ * 
o _ _ _ 
_ _ _ _ 
_ _ _ X 
EPISODE: 9, STEP: 9
left
_ _ _ * 
o _ _ _ 
_ _ _ _ 
_ _ _ X 
EPISODE: 9, STEP: 10
left
_ _ _ * 
o _ _ _ 
_ _ _ _ 
_ _ _ X 
EPISODE: 9, STEP: 11
left
_ _ _ * 
o _ _ _ 
_ _ _ _ 
_ _ _ X 
EPISODE: 9, STEP: 12
right
_ _ _ * 
_ o _ _ 
_ _ _ _ 
_ _ _ X 
EPISODE: 9, STEP: 13
down
_ _ _ * 
_ _ _ _ 
_ o _ _ 
_ _ _ X 
EPISODE: 9, STEP: 14
left
_ _ _ * 
_ _ _ _ 
o _ _ _ 
_ _ _ X 
EPISODE: 9, STEP: 15
down
_ _ _ * 
_ _ _ _ 
_ _ _ _ 
o _ _ X 
EPISODE: 9, STEP: 16
down
_ _ _ * 
_ _ _ _ 
_ _ _ _ 
o _ _ X 
EPISODE: 9, STEP: 17
right
_ _ _ * 
_ _ _ _ 
_ _ _ _ 
_ o _ X 
EPISODE: 9, STEP: 18
right
_ _ _ * 
_ _ _ _ 
_ _ _ _ 
_ _ o X 
EPISODE: 9, STEP: 19
left
_ _ _ * 
_ _ _ _ 
_ _ _ _ 
_ o _ X 
EPISODE: 9, STEP: 20
left
_ _ _ * 
_ _ _ _ 
_ _ _ _ 
o _ _ X 
EPISODE: 9, STEP: 21
left
_ _ _ * 
_ _ _ _ 
_ _ _ _ 
o _ _ X 
EPISODE: 9, STEP: 22
down
_ _ _ * 
_ _ _ _ 
_ _ _ _ 
o _ _ X 
EPISODE: 9, STEP: 23
left
_ _