In [1]:
#conda list #conda install pandas

In [2]:
"""
A simple example for Reinforcement Learning using table lookup Q-learning method.
An agent "o" is on the left of a 1 dimensional world, the treasure is on the rightmost location.
Run this program to see how the agent improves its strategy for finding the treasure.

View more on my tutorial page: https://morvanzhou.github.io/tutorials/
"""

import numpy as np
import pandas as pd
import time

In [3]:
# --- Reproducibility ---------------------------------------------------------
# Seed NumPy's legacy global generator. Both np.random.uniform and
# np.random.choice draw from that same generator, so the whole action sequence
# is repeatable when the notebook is run from a fresh kernel.
# https://numpy.org/doc/stable/reference/random/generated/numpy.random.seed.html
np.random.seed(2)

# --- Environment -------------------------------------------------------------
N_STATES = 6                    # length of the 1-dimensional world (number of cells)
ACTIONS = ['left', 'right']     # actions available to the agent
FRESH_TIME = 0.2                # pause (seconds) after drawing each move

# --- Q-learning hyper-parameters ---------------------------------------------
EPSILON = 0.9       # probability of acting greedily; otherwise the action is random
ALPHA = 0.1         # learning rate
GAMMA = 0.9         # discount factor applied to the next state's best Q-value
MAX_EPISODES = 13   # number of training episodes

In [4]:
# define q-table
def build_q_table(n_states, actions):
    """Create a Q-table with every entry initialised to zero.

    Parameters
    ----------
    n_states : int
        Number of rows (one per state).
    actions : list
        Column labels (one per action).

    Returns
    -------
    pandas.DataFrame
        Frame of shape (n_states, len(actions)) filled with 0.0, with a
        default integer row index and ``actions`` as the column labels.
    """
    n_actions = len(actions)
    zeros = np.zeros(shape=(n_states, n_actions))   # e.g. a 6x2 block of 0s
    return pd.DataFrame(data=zeros, columns=actions)

In [5]:
# define how to choose action
def choose_action(state, q_table):
    """Pick an action for ``state`` using an epsilon-greedy rule.

    Parameters
    ----------
    state : int
        Row position of the current state in ``q_table``.
    q_table : pandas.DataFrame
        Q-values, one row per state and one column per action.

    Returns
    -------
    The chosen action label: a random element of ACTIONS when exploring,
    otherwise the column label holding the largest Q-value in that row.
    """
    q_values = q_table.iloc[state, :]   # Q-values of every action in this state

    # One uniform draw per call decides whether to explore (drawn first, always).
    explore = np.random.uniform() > EPSILON
    # A row that is still all zeros carries no information, so act randomly too.
    untrained = (q_values == 0).all()

    if explore or untrained:
        return np.random.choice(ACTIONS)

    # Greedy choice. idxmax returns the label of the maximum (the first one on
    # ties); argmax has a different meaning in newer pandas versions.
    return q_values.idxmax()

关于pandas iloc function：
https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.iloc.html

关于pandas loc function：
https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.loc.html

关于np.random.uniform及其他常见随机数产生方法：
https://www.jianshu.com/p/6c6830deeabb

关于np.random.choice（从给定一维数组中随机选取）：
https://numpy.org/doc/stable/reference/random/generated/numpy.random.choice.html

关于pandas.Series.idxmax():
https://pandas.pydata.org/docs/reference/api/pandas.Series.idxmax.html#pandas.Series.idxmax

In [6]:
# define reward
def get_env_feedback(S, A):
    """Apply action ``A`` in state ``S`` and return the environment's response.

    Parameters
    ----------
    S : int
        Current position of the agent.
    A : str
        Action taken; 'right' moves right, any other value is treated as a
        move to the left.

    Returns
    -------
    tuple
        ``(S_, R)`` where ``S_`` is the next state (an int, or the string
        'terminal' once the treasure is reached) and ``R`` is the reward
        (1 for reaching the treasure, 0 otherwise).
    """
    if A != 'right':
        # Moving left never pays; position 0 is the wall, so the agent stays put.
        next_state = S if S == 0 else S - 1
        return next_state, 0

    if S == N_STATES - 2:
        # One step right from here lands on the treasure: episode ends, reward 1.
        return 'terminal', 1

    return S + 1, 0

In [7]:
# 
def update_env(S, episode, step_counter):
    """Redraw the 1-D world on the current console line and pause briefly.

    Parameters
    ----------
    S : int or str
        Agent position, or 'terminal' when the episode has just ended.
    episode : int
        Zero-based episode index (shown one-based in the summary line).
    step_counter : int
        Number of steps taken so far in this episode.
    """
    if S != 'terminal':
        # Track of N_STATES - 1 dashes with the treasure 'T' at the right end,
        # e.g. '-----T' for N_STATES = 6; the agent is drawn as 'o'.
        cells = ['-'] * (N_STATES - 1) + ['T']
        cells[S] = 'o'
        print('\r{}'.format(''.join(cells)), end='')
        time.sleep(FRESH_TIME)
        return

    # Episode finished: show the summary for 2 seconds, then blank the line.
    summary = 'Episode %s: total_steps = %s' % (episode+1, step_counter)
    print('\r{}'.format(summary), end='')
    time.sleep(2)
    print('\r                                ', end='')


In [8]:
def rl():
    """Train the agent with tabular Q-learning and return the learned Q-table.

    Runs MAX_EPISODES episodes. Each episode starts at state 0 and ends when
    the environment reports 'terminal'. The world is redrawn after every move,
    and the step count plus the current Q-table are printed after each episode.

    Returns
    -------
    pandas.DataFrame
        The Q-table after the final episode.
    """
    q_table = build_q_table(N_STATES, ACTIONS)

    for episode in range(MAX_EPISODES):
        S = 0               # every episode starts at the leftmost cell
        step_counter = 0
        update_env(S, episode, step_counter)

        done = False
        while not done:
            A = choose_action(S, q_table)
            S_, R = get_env_feedback(S, A)      # act, observe next state and reward
            q_predict = q_table.loc[S, A]       # current estimate of Q(S, A)

            done = S_ == 'terminal'
            if done:
                q_target = R                    # no future value past the terminal state
            else:
                q_target = R + GAMMA * q_table.iloc[S_, :].max()

            # Move Q(S, A) a fraction ALPHA of the way towards the target.
            q_table.loc[S, A] = q_predict + ALPHA * (q_target - q_predict)

            S = S_                              # advance to the next state
            step_counter += 1
            update_env(S, episode, step_counter)

        print('episode '+str(episode)+'  total steps: '+str(step_counter))
        print(q_table)

    return q_table

In [9]:
if __name__ == "__main__":
    # Train the agent; rl() prints the step count and Q-table after each episode.
    q_table = rl()
    # '\r\n' returns the cursor and starts a new line after the in-place animation.
    print('\r\nQ-table:\n')
    print(q_table)
# With N_STATES = 6 the world is drawn as: -----T

                                episode 0  total steps: 38
   left  right
0   0.0    0.0
1   0.0    0.0
2   0.0    0.0
3   0.0    0.0
4   0.0    0.1
5   0.0    0.0
                                episode 1  total steps: 22
   left  right
0   0.0  0.000
1   0.0  0.000
2   0.0  0.000
3   0.0  0.009
4   0.0  0.190
5   0.0  0.000
                                episode 2  total steps: 9
   left    right
0   0.0  0.00000
1   0.0  0.00000
2   0.0  0.00081
3   0.0  0.02520
4   0.0  0.27100
5   0.0  0.00000
                                episode 3  total steps: 5
   left     right
0   0.0  0.000000
1   0.0  0.000073
2   0.0  0.002997
3   0.0  0.047070
4   0.0  0.343900
5   0.0  0.000000
                                episode 4  total steps: 7
      left     right
0  0.00000  0.000007
1  0.00000  0.000572
2  0.00003  0.006934
3  0.00000  0.073314
4  0.00000  0.409510
5  0.00000  0.000000
                                episode 5  total steps: 5
      left     right
0  0.00000  0.000057
1  0.0

例子中的 Q(0, left)、Q(1, left) 等理论上也应该有非零的 value，但是由于 episode 数量少，而且 EPSILON = 0.9（大部分时候按贪婪策略选动作），
在 Q(0, right) 被 update 为非 0 值之后，还没有出现过在 state 0 / state 1 中走 non-greedy 分支并随机选到 left 的情况，所以它们很可能依然是 0。