# Reinforcement Learning


In [1]:
#import packages here
import numpy as np
import pandas as pd
import time

N_STATES = 6   # number of cells in the 1-D world, including the treasure cell 'T'
ACTIONS = ['left', 'right']     # actions available to the agent
EPSILON = 0.9   # greedy threshold (0 < ε < 1): a uniform draw above it triggers a random action
ALPHA = 0.1     # learning rate (0 < α ≤ 1)
GAMMA = 0.9    # discount factor (0 < γ < 1)
MAX_EPOCHES = 13   # number of training episodes to run
FRESH_TIME = 0.3    # pause, in seconds, after each rendered frame

In [19]:
#define the function here
def build_q_table(n_states, actions):
    """Create the initial Q-table with every state-action value set to 0.0.

    Parameters
    ----------
    n_states : int
        Number of states; rows are labelled 0 .. n_states - 1.
    actions : list of str
        Action names, used as the column labels.

    Returns
    -------
    pandas.DataFrame
        Float table of shape (n_states, len(actions)) filled with zeros.
    """
    zero_values = np.zeros((n_states, len(actions)))
    return pd.DataFrame(zero_values, index=range(n_states), columns=actions)

# Build the initial table from the notebook-wide settings and show it (all zeros).
q_table = build_q_table(N_STATES, ACTIONS)
print(q_table)

   left  right
0   0.0    0.0
1   0.0    0.0
2   0.0    0.0
3   0.0    0.0
4   0.0    0.0
5   0.0    0.0


In [175]:
#define the function here
# Given state and Q-table, choose action
def choose_action(state, q_table):
    """Select an action for ``state`` with an epsilon-greedy rule.

    A random action is returned when every Q-value stored for the state is
    still 0, or when a uniform draw from [0, 1) exceeds ``EPSILON``.
    Otherwise the action holding the largest Q-value is returned.

    Parameters
    ----------
    state : row label of ``q_table``
        The state the agent is currently in.
    q_table : pandas.DataFrame
        Q-values, one row per state and one column per action.

    Returns
    -------
    str
        The chosen action name (one of ``q_table.columns``).
    """
    # The draw happens first, before the row is inspected, so the random
    # stream is consumed in the same order on every call.
    draw = np.random.random()
    state_values = q_table.loc[state]
    nothing_learned = (state_values == 0).all()
    if not (nothing_learned or draw > EPSILON):
        # exploit: take the best known action for this state
        return state_values.idxmax()
    # explore: pick uniformly among all actions
    return np.random.choice(q_table.columns)

# Try the policy on state 0. `q_table` is the module-level table built in an
# earlier cell; while its row 0 is still all zeros the pick is random.
sample_action = choose_action(0, q_table)
print(sample_action)

left


In [43]:
#define the function here
def get_env_feedback(S_current, A, n_states=None):
    """Apply action ``A`` in state ``S_current`` and return the environment's response.

    The world is a row of ``n_states`` cells whose right-most cell holds the
    treasure. Stepping right from the cell next to the treasure ends the
    episode with reward 1; every other move yields reward 0. Moving left
    from cell 0 leaves the agent where it is.

    Parameters
    ----------
    S_current : int
        Index of the cell the agent currently occupies.
    A : str
        ``'right'`` moves right; any other value is treated as a move left.
    n_states : int, optional
        Width of the world. Defaults to the module-level ``N_STATES``.

    Returns
    -------
    tuple
        ``(S_next, R)`` where ``S_next`` is the next cell index, or the string
        ``'terminal'`` once the treasure is reached, and ``R`` is the reward.
    """
    if n_states is None:
        n_states = N_STATES  # fall back to the notebook-wide world width

    # Derive the boundary from the world width instead of hard-coding 4, so the
    # environment stays correct when the width changes.
    last_walkable = n_states - 2  # cell directly left of the treasure

    if A == 'right':
        if S_current == last_walkable:
            return 'terminal', 1
        return S_current + 1, 0

    # move left; the wall at cell 0 blocks the step
    if S_current == 0:
        return 0, 0
    return S_current - 1, 0

# Sanity check: stepping left from state 4 should yield state 3 with reward 0.
sample_action = 'left'
S_current = 4
sample_feedback = get_env_feedback(S_current, sample_action)
print(sample_feedback)

(3, 0)


In [45]:
def update_env(S, episode, step_counter):
    """Render the 1-D world for state ``S``, or the episode summary at the end.

    Parameters
    ----------
    S : int or str
        Current cell index, or ``'terminal'`` once the treasure was reached.
    episode : int
        Zero-based episode number; printed one-based in the summary.
    step_counter : int
        Number of steps taken so far in this episode.

    Returns
    -------
    None
        The function only prints and sleeps.
    """
    if S == 'terminal':
        # Episode finished: print the summary line and pause for 2 seconds.
        summary = '  Episode %s: total_steps = %s' % (episode+1, step_counter)
        print('{}\n'.format(summary), end='')
        time.sleep(2)
        return

    # Draw the corridor ('-' cells followed by the treasure 'T') with the
    # agent 'o' at its position. The leading '\r' redraws the same console line.
    cells = ['-' for _ in range(N_STATES-1)]
    cells.append('T')
    cells[S] = 'o'
    print('\r{}'.format(''.join(cells)), end='')
    time.sleep(FRESH_TIME)

In [181]:
#define the function here
def reinforce_learning():
    """Train a tabular Q-learning agent on the 1-D treasure world.

    Runs ``MAX_EPOCHES`` episodes. Each episode starts in state 0 and repeats
    "choose action -> observe next state and reward -> update Q-value" until
    the environment reports ``'terminal'``. The world is rendered through
    ``update_env`` after every step.

    Returns
    -------
    pandas.DataFrame
        The Q-table after the final episode, one row per state and one
        column per action.
    """
    q_table = build_q_table(N_STATES, ACTIONS)

    for episode in range(MAX_EPOCHES):
        step_counter = 0        # steps taken so far in this episode
        S_current = 0           # every episode starts at the left-most cell
        is_terminated = False   # set once the treasure is reached
        update_env(S_current, episode, step_counter)

        while not is_terminated:
            A = choose_action(S_current, q_table)
            S_next, reward = get_env_feedback(S_current, A)

            q_predict = q_table.loc[S_current, A]
            if S_next != 'terminal':
                # Bootstrap from the best Q-value of the next state.
                q_target = reward + GAMMA * q_table.loc[S_next, :].max()
            else:
                # No successor state to bootstrap from: the target is the
                # reward actually observed, not a hard-coded constant.
                q_target = reward
                is_terminated = True

            # Move the stored estimate a fraction ALPHA towards the target.
            q_table.loc[S_current, A] = q_predict + ALPHA * (q_target - q_predict)

            S_current = S_next
            step_counter += 1
            update_env(S_current, episode, step_counter)

    return q_table


In [183]:
# Entry point: train the agent, then print the learned Q-table.
if __name__ == "__main__":
    q_table = reinforce_learning()
    # The leading '\r\n' puts the heading on a line of its own, below the episode log.
    print('\r\nQ-table:\n')
    print(q_table)

----oT  Episode 1: total_steps = 5
----oT  Episode 2: total_steps = 25
----oT  Episode 3: total_steps = 34
----oT  Episode 4: total_steps = 11
----oT  Episode 5: total_steps = 6
----oT  Episode 6: total_steps = 5
----oT  Episode 7: total_steps = 5
----oT  Episode 8: total_steps = 7
----oT  Episode 9: total_steps = 5
----oT  Episode 10: total_steps = 7
----oT  Episode 11: total_steps = 5
----oT  Episode 12: total_steps = 5
----oT  Episode 13: total_steps = 5

Q-table:

       left     right
0  0.000000  0.004438
1  0.000000  0.027315
2  0.000000  0.123671
3  0.000000  0.379680
4  0.037197  0.745813
5  0.000000  0.000000
