# Q Learning Demo

In [1]:
import numpy as np
import pandas as pd
import time

np.random.seed(2)

N_STATES=6                  # The length of 1 dimensional world
ACTIONS=['left','right']    # Available actions
EPSILON=0.9                 # Greedy policy 
ALPHA=0.1                   # Learning rate
LAMBDA=0.9                  # Discount factor
MAX_EPISODES=13             # Maximum episodes
FRESH_TIME=0.1             # Fresh time for one move

In [2]:
def build_q_table(n_states, actions): # Create a Q table
    table=pd.DataFrame(np.zeros((n_states,len(actions))),columns=actions)
    return table

def choose_action(state,q_table):
    state_actions=q_table.iloc[state,:]
    if (np.random.uniform()>EPSILON) or (state_actions.all()==0):   # Act non greedy
        action_name=np.random.choice(ACTIONS)
    else:                                                           # Act greedy
        action_name=state_actions.argmax()
    return action_name

def get_env_feedback(S,A):
    # This is how agent will interact with the environment
    if A=='right':
        if S==N_STATES-2:
            S_='terminal'
            R=1
        else:
            S_=S+1
            R=0
    else:
        R=0
        if S ==0:
            S_=S
        else:
            S_=S-1
    return S_,R

def update_env(S,episode,step_counter):
    env_list=['-']*(N_STATES-1)+['T'] # '-----T' our env
    if S=='terminal':
        interaction='Episode %s: total_steps = %s' % (episode+1,step_counter)
        print('\r{}'.format(interaction),end='')
        time.sleep(2)
        print('\r                                                    ',end='')
    else:
        env_list[S]='o'
        interaction=''.join(env_list)
        print('\r{}'.format(interaction),end='')
        time.sleep(FRESH_TIME)

In [3]:
q_table=build_q_table(N_STATES,ACTIONS)

for episode in range(MAX_EPISODES):
    step_counter=0
    S=0
    is_terminated=False
    update_env(S,episode,step_counter)
    while not is_terminated:
        A=choose_action(S,q_table)
        
        if (A==1): A='right'
        elif (A==0): A='left'

        S_,R=get_env_feedback(S,A)  # Take action & get next state and reward
        q_predict=q_table.loc[S,A]
        if S_!='terminal':
            q_target=R+LAMBDA*q_table.iloc[S_,:].max()
        else:
            q_target=R
            is_terminated=True

        if (A==1): A='right'
        elif (A==0): A='left'
        else: pass

        q_table.loc[S,A]+=ALPHA*(q_target-q_predict)
        S=S_

        update_env(S,episode,step_counter+1)
        step_counter+=1

                                                    

In [4]:
q_table

Unnamed: 0,left,right
0,1e-06,0.005728
1,0.000271,0.032612
2,0.002454,0.111724
3,7.3e-05,0.343331
4,0.00081,0.745813
5,0.0,0.0
