Most of generalized policy iteration involves alternating policy evaluation with greedy policy improvement.  These methods, which act with the same (current best) policy that they are evaluating and improving, are on-policy methods.  Q-learning is an off-policy method.  

Q-learning looks similar to SARSA in the update step:

SARSA
$$
Q(s,a) \leftarrow Q(s,a) + \alpha \left[ r + \gamma Q(s^\prime,a^\prime)-Q(s,a) \right]
$$

Q-learn
$$
Q(s,a) \leftarrow Q(s,a) + \alpha \left[ r + \gamma \max_{a^\prime}Q(s^\prime,a^\prime)-Q(s,a) \right],
$$
where we are updating Q based on the max over all actions.
It seems like these should be the same, since isn't $a^\prime = \text{argmax}_{a} Q(s^\prime,a)$?  This is true when the next action is chosen greedily, but the difference is that because Q-learning is off-policy, we do not actually need to take $a^\prime$ next.  We use $\max_{a^\prime} Q(s^\prime,a^\prime)$ in the update even if we do not take that action next.  It does not matter what policy we follow; we can even choose random actions, although it may take a long time for the episode to finish.  If, however, we use a purely greedy policy for Q-learning, then the Q-learning and SARSA updates are the same.


In [1]:
def print_values(V, g):
    """Print the numbers in ``V`` as a text table shaped like grid ``g``.

    Args:
        V: mapping from an ``(i, j)`` position to a number. Positions that
            are missing from the mapping are printed as 0.
        g: object exposing ``width`` and ``height``. One line is printed for
            each ``i`` in ``range(g.width)``, with one cell for each ``j`` in
            ``range(g.height)``.
    """
    for row in range(g.width):
        print("-------------------------")
        cells = []
        for col in range(g.height):
            value = V.get((row, col), 0)
            # non-negative numbers get a leading space so they line up with
            # the minus sign of negative ones
            template = " %.2f|" if value >= 0 else "%.2f|"
            cells.append(template % value)
        print("".join(cells))
        
def print_policy(P, g):
    """Print the entries of ``P`` as a text table shaped like grid ``g``.

    Args:
        P: mapping from an ``(i, j)`` position to the value to show there
            (formatted with ``%s``). Missing positions show a single space.
        g: object exposing ``width`` and ``height``. One line is printed for
            each ``i`` in ``range(g.width)``, with one cell for each ``j`` in
            ``range(g.height)``.
    """
    for row in range(g.width):
        # the empty print terminates the previous row's cell line
        print("")
        print("----------------")
        line = "".join(
            " %s |" % P.get((row, col), ' ') for col in range(g.height)
        )
        # no newline after the cells, so the output ends without one
        print(line, end="")

In [2]:
import numpy as np


class Grid:  # Environment
    """Grid-world environment that tracks one agent position ``(i, j)``.

    ``i`` is the row index and ``j`` the column index. Actions 'U' and 'D'
    change ``i`` by -1 and +1; 'L' and 'R' change ``j`` by -1 and +1.
    ``set`` must be called before any method that reads ``rewards`` or
    ``actions``.
    """

    # (change in i, change in j) produced by each known action
    _STEP = {'U': (-1, 0), 'D': (1, 0), 'L': (0, -1), 'R': (0, 1)}

    def __init__(self, width, height, start):
        """Store the dimensions and put the agent at ``start`` (an (i, j) pair)."""
        self.width, self.height = width, height
        self.i, self.j = start[0], start[1]

    def set(self, rewards, actions):
        """Attach the reward table and the legal-action table.

        Args:
            rewards: dict of ``(i, j): r`` - (row, col): reward.
            actions: dict of ``(i, j): A`` - (row, col): collection of the
                actions allowed from that position.
        """
        self.rewards, self.actions = rewards, actions

    def set_state(self, s):
        """Move the agent directly to position ``s``."""
        self.i, self.j = s[0], s[1]

    def current_state(self):
        """Return the agent position as an ``(i, j)`` tuple."""
        return (self.i, self.j)

    def is_terminal(self, s):
        """Return True when ``s`` has no entry in the action table."""
        return s not in self.actions

    def move(self, action):
        """Apply ``action`` if it is legal here, then return the reward.

        An action that is not listed for the current position leaves the
        agent where it is. The returned reward is looked up for the position
        the agent ends on and defaults to 0.

        Raises:
            KeyError: if the current position has no entry in ``actions``.
        """
        legal_here = self.actions[self.current_state()]
        if action in legal_here:
            di, dj = self._STEP.get(action, (0, 0))
            self.i += di
            self.j += dj
        return self.rewards.get(self.current_state(), 0)

    def undo_move(self, action):
        """Step in the direction opposite to ``action``.

        No legality check is made; afterwards the position is asserted to be
        one of ``all_states()``, which should always hold.
        """
        di, dj = self._STEP.get(action, (0, 0))
        self.i -= di
        self.j -= dj
        assert self.current_state() in self.all_states()

    def game_over(self):
        """Return True when no actions are possible from the current position."""
        return self.current_state() not in self.actions

    def all_states(self):
        """Return the set of every position that has actions or a reward.

        Simple but possibly incomplete: a position with neither would be
        missed.
        """
        return set(self.actions) | set(self.rewards)


def standard_grid():
    """Build the basic grid world: rewards only on the two terminal cells.

    Layout (x = blocked cell, s = start position, number = reward for
    arriving at that cell):

        .  .  .  1
        .  x  . -1
        s  .  .  .

    Returns:
        A Grid with 3 rows and 4 columns, the agent at (2, 0), and the reward
        and action tables already attached.
    """
    n_rows, n_cols = 3, 4
    start = (2, 0)

    # reward for arriving at a cell; cells not listed give 0
    rewards = {(0, 3): 1, (1, 3): -1}

    # legal actions per cell; cells not listed (the two reward cells and the
    # blocked cell) have none
    actions = {
        (0, 0): ('D', 'R'),
        (0, 1): ('L', 'R'),
        (0, 2): ('L', 'D', 'R'),
        (1, 0): ('U', 'D'),
        (1, 2): ('U', 'D', 'R'),
        (2, 0): ('U', 'R'),
        (2, 1): ('L', 'R'),
        (2, 2): ('L', 'R', 'U'),
        (2, 3): ('L', 'U'),
    }

    grid = Grid(n_rows, n_cols, start)
    grid.set(rewards, actions)
    return grid


def negative_grid(step_cost=-0.1):
    """Build the standard grid with a per-move penalty.

    Every cell that has actions gets ``step_cost`` as its reward, so longer
    paths are penalised and the agent is pushed to minimise the number of
    moves.

    Args:
        step_cost: reward assigned to each of the nine non-terminal cells.

    Returns:
        The Grid from ``standard_grid()`` with its reward table extended.
    """
    grid = standard_grid()
    penalised_cells = (
        (0, 0), (0, 1), (0, 2),
        (1, 0), (1, 2),
        (2, 0), (2, 1), (2, 2), (2, 3),
    )
    for cell in penalised_cells:
        grid.rewards[cell] = step_cost
    return grid

In [3]:
def max_dict(d):
    """Return ``(key, value)`` for the largest value in dict ``d``.

    On ties the key seen first in iteration order wins, because only a
    strictly larger value replaces the current best. For an empty dict (or
    when no value exceeds negative infinity) the result is
    ``(None, float('-inf'))``.
    """
    best_key, best_val = None, float('-inf')
    for key in d:
        candidate = d[key]
        if candidate > best_val:
            best_key, best_val = key, candidate
    return best_key, best_val

In [5]:
# 10e-4 equals 1e-3; this constant is not referenced in the visible code
SMALL_ENOUGH = 10e-4
# discount factor applied to the next state's best Q value in the update
GAMMA = 0.9
# base step size; the training loop divides it by a per-(state, action) counter
ALPHA = 0.1
# every action name; Q gets an entry for each of these in every state
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')

# NOTE: the training loop further down is control, not just policy evaluation:
# it learns Q with a max-based update and then reads a greedy policy from it

def random_action(a, eps=0.1):
    """Return ``a`` most of the time, otherwise a uniformly random action.

    This is the epsilon-soft rule used to make sure all states get visited:
    with probability ``1 - eps`` the proposed action ``a`` is kept, and
    otherwise one of ALL_POSSIBLE_ACTIONS is drawn with ``np.random.choice``
    (the draw can return ``a`` again). With ``eps=0`` the proposed action is
    always kept.
    """
    keep_proposed = np.random.random() < (1 - eps)
    if not keep_proposed:
        return np.random.choice(ALL_POSSIBLE_ACTIONS)
    return a

In [6]:
# Q-learning on the step-penalty grid: learn Q(s, a) from episodes generated
# with a mostly random behaviour policy, then print the visit proportions,
# the greedy state values and the greedy policy.
grid=negative_grid()
print("rewards:")
print_values(grid.rewards, grid)

Q={}
returns={}  # not used anywhere in the visible code
states = grid.all_states()
# initialise Q(s, a) to 0 for every state (terminal ones included) and action
for s in states:
        Q[s]={}
        for a in ALL_POSSIBLE_ACTIONS:
            Q[s][a]=0

update_ct={} # number of updates made from each state; normalised to proportions below
# (it would be interesting to see which states are "attractive")
update_ct_sa={} # per-(state, action) divisor for the adaptive learning rate
for s in states:
    update_ct_sa[s]={}
    for a in ALL_POSSIBLE_ACTIONS:
        update_ct_sa[s][a]=1.0

# ideally we would run until convergence; here we run a fixed number of episodes
t=1.0
deltas=[] # biggest Q change seen in each episode (for debugging)
for it in range(1000):
    # t grows by 10e-3 (= 0.01) every 100 episodes, so the exploration
    # probability 0.95/t used below shrinks slowly
    if it % 100 ==0:
        t+=10e-3
    # with only 1000 episodes this prints just once, at it == 0
    if it %2000==0:
        print("iter "+str(it))

    # every episode starts from the start state (2, 0)
    s=(2,0)
    grid.set_state(s)
    a=max_dict(Q[s])[0] # greedy action for the start state

    biggest_change=0
    while not grid.game_over():
        # Here is the difference from SARSA. In SARSA you similarly randomise
        # the action (it comes from epsilon-greedy), but the update then uses
        # the Q value of the action you actually chose next, not the best one
        # as we do here.
        a=random_action(a,eps=0.95/t)
        # grid.move leaves the agent in place when the action is not legal
        # here, so s2 can equal s
        r=grid.move(a)
        s2=grid.current_state()
        # The two lines below are disabled because Q-learning is not required
        # to take the action that is used for scoring the next state.
#         a2=max_dict(Q[s2])[0] 
#         a2=random_action(a2,eps=0.95/t)

        # update Q as we go; the step size ALPHA/counter decays because the
        # counter for (s, a) grows by 0.005 on every update
        alp=ALPHA/update_ct_sa[s][a]
        update_ct_sa[s][a]+=.005
        old_q=Q[s][a]
        # a2 is the greedy action in s2 and max_q_s2 its Q value
        a2,max_q_s2=max_dict(Q[s2])
        Q[s][a]=Q[s][a]+alp*(r+GAMMA*max_q_s2-Q[s][a]) # use the max even if a2 is not the action taken next
        biggest_change=max(biggest_change,abs(old_q-Q[s][a]))
        update_ct[s]=update_ct.get(s,0)+1

        # now the next state becomes the current one; a2 is only the greedy
        # proposal and is passed through random_action at the top of the loop
        s=s2
        a=a2
    deltas.append(biggest_change)

# read the greedy policy and its state values off the learned Q, for the
# states that have actions
policy={}
V={}
for s in grid.actions.keys():
    a,max_q=max_dict(Q[s])
    policy[s]=a
    V[s]=max_q

print("update counts")
# convert the raw update counts into the fraction of all updates per state
total = np.sum(list(update_ct.values()))
for k,v in update_ct.items():
    update_ct[k]=float(v)/total
print_values(update_ct,grid)



print("values")
print("")
print_values(V,grid)
print("policy")
print_policy(policy,grid)
    

rewards:
-------------------------
-0.10|-0.10|-0.10| 1.00|
-------------------------
-0.10| 0.00|-0.10|-1.00|
-------------------------
-0.10|-0.10|-0.10|-0.10|
iter 0
update counts
-------------------------
 0.15| 0.12| 0.07| 0.00|
-------------------------
 0.18| 0.00| 0.05| 0.00|
-------------------------
 0.20| 0.12| 0.08| 0.04|
values

-------------------------
 0.62| 0.80| 1.00| 0.00|
-------------------------
 0.46| 0.00| 0.80| 0.00|
-------------------------
 0.31| 0.46| 0.62| 0.46|
policy

----------------
 R | R | R |   |
----------------
 U |   | U |   |
----------------
 U | R | U | L |