For the control problem, we will approximate Q (semi-gradient SARSA).  This requires approximating $|S| \times |A|$ values.  The basic idea is the same:

$$
\theta \leftarrow \theta + \alpha \left[r+ \gamma \hat Q (s^\prime,a^\prime)-\hat Q(s,a) \right] \frac{\partial \hat Q(s,a)}{\partial \theta} 
$$

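The feature vector for this will need to be different, and must include the action.  The suggested encoding is the state features $[r,c,r*r,c*c,r*c,1]$ (here $r$ and $c$ are the row and column of the state, not the reward) crossed with the one-hot action $[U,D,R,L]$, plus one bias term, giving $6 \times 4 + 1 = 25$ parameters.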

In [1]:
import numpy as np


class Grid:  # Environment
    """Grid world that tracks the agent's current (row, col) position.

    `rewards` and `actions` are attached afterwards with `set`. A state is
    treated as terminal when it has no entry in `actions`.
    """

    def __init__(self, width, height, start):
        """Store the grid dimensions and place the agent at `start` (row, col)."""
        self.width, self.height = width, height
        self.i, self.j = start[0], start[1]

    def set(self, rewards, actions):
        """Attach the reward table and the legal-action table.

        rewards: dict mapping (row, col) -> reward for arriving there.
        actions: dict mapping (row, col) -> collection of legal actions.
        """
        self.actions = actions
        self.rewards = rewards

    def set_state(self, s):
        """Move the agent directly to state `s` = (row, col)."""
        self.i, self.j = s[0], s[1]

    def current_state(self):
        """Return the agent's position as a (row, col) tuple."""
        return self.i, self.j

    def is_terminal(self, s):
        """Return True when no actions are defined for state `s`."""
        return s not in self.actions

    def move(self, action):
        """Apply `action` if it is legal here and return the resulting reward.

        An action that is not listed for the current state leaves the agent
        in place. The reward is looked up for the position after the move and
        defaults to 0 when the state has no reward entry.
        """
        here = (self.i, self.j)
        if action in self.actions[here]:
            # (row delta, col delta) per direction; anything else is a no-op.
            offsets = {'U': (-1, 0), 'D': (1, 0), 'R': (0, 1), 'L': (0, -1)}
            d_row, d_col = offsets.get(action, (0, 0))
            self.i += d_row
            self.j += d_col
        return self.rewards.get((self.i, self.j), 0)

    def undo_move(self, action):
        """Reverse the displacement of `action` (no legality check is made)."""
        offsets = {'U': (-1, 0), 'D': (1, 0), 'R': (0, 1), 'L': (0, -1)}
        d_row, d_col = offsets.get(action, (0, 0))
        # Subtracting the forward offset is the opposite of what move() does.
        self.i -= d_row
        self.j -= d_col
        # We should never end up outside the known states.
        assert self.current_state() in self.all_states()

    def game_over(self):
        """Return True when the agent sits in a state with no defined actions."""
        return self.is_terminal(self.current_state())

    def all_states(self):
        """Return every position that has either actions or a reward defined."""
        return set(self.actions.keys()).union(self.rewards.keys())


def standard_grid():
    """Build the basic grid world with two rewarding terminal states.

    Layout (x = wall, s = start, number = reward for arriving there):

        .  .  .  1
        .  x  . -1
        s  .  .  .

    Returns a Grid whose agent starts at (2, 0).
    """
    # Reward received on arrival; only the two terminal cells pay out.
    terminal_rewards = {(0, 3): 1, (1, 3): -1}
    # Legal actions per non-terminal cell (the wall at (1, 1) has no entry).
    legal_moves = {
        (0, 0): ('D', 'R'),
        (0, 1): ('L', 'R'),
        (0, 2): ('L', 'D', 'R'),
        (1, 0): ('U', 'D'),
        (1, 2): ('U', 'D', 'R'),
        (2, 0): ('U', 'R'),
        (2, 1): ('L', 'R'),
        (2, 2): ('L', 'R', 'U'),
        (2, 3): ('L', 'U'),
    }
    world = Grid(3, 4, (2, 0))
    world.set(terminal_rewards, legal_moves)
    return world


def negative_grid(step_cost=-0.1):
    """Return the standard grid with `step_cost` as the reward of every non-terminal cell.

    Penalising each move pushes the agent to reach a terminal state in as
    few steps as possible.
    """
    g = standard_grid()
    non_terminal_cells = (
        (0, 0), (0, 1), (0, 2),
        (1, 0), (1, 2),
        (2, 0), (2, 1), (2, 2), (2, 3),
    )
    for cell in non_terminal_cells:
        g.rewards[cell] = step_cost
    return g

In [2]:
def print_values(V, g):
    """Print the values in `V` as a table, one line of cells per grid row.

    V: dict mapping (row, col) -> number; missing cells are shown as 0.
    g: object whose `width` gives the number of rows printed and whose
       `height` gives the number of columns printed.
    """
    for row in range(g.width):
        print("-------------------------")
        cells = []
        for col in range(g.height):
            value = V.get((row, col), 0)
            # Pad non-negative numbers so they line up with the minus sign.
            template = " %.2f|" if value >= 0 else "%.2f|"
            cells.append(template % value)
        print("".join(cells))
        
def print_policy(P, g):
    """Print the action chosen in each cell as a table, one line per grid row.

    P: dict mapping (row, col) -> action label; missing cells print a blank.
    g: object whose `width` gives the number of rows printed and whose
       `height` gives the number of columns printed.

    No newline is written after the last row.
    """
    for row in range(g.width):
        print("")
        print("----------------")
        row_text = "".join(
            " %s |" % P.get((row, col), ' ') for col in range(g.height)
        )
        print(row_text, end="")

In [3]:
def max_dict(d):
    """Return the (key, value) pair of `d` with the largest value.

    The first key wins on ties. When no value exceeds negative infinity
    (for example when `d` is empty) the result is (None, float('-inf')).
    """
    best = (None, float('-inf'))
    for entry in d.items():
        # Strict comparison keeps the earliest entry among equal values.
        if entry[1] > best[1]:
            best = entry
    return best

In [4]:
# Convergence threshold. Note that 10e-4 evaluates to 1e-3, not 1e-4.
# NOTE(review): not referenced by any code visible in this notebook section.
SMALL_ENOUGH = 10e-4
# Discount factor applied to the bootstrapped next-step value in the TD target.
GAMMA = 0.9
# Base learning rate; the training loop divides it by a slowly growing factor.
ALPHA = 0.1
# Every action label; used for exploration and when scoring Q(s, a) per action.
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')

# NOTE(review): this comment used to read "this is only policy evaluation, not
# optimization"; the training cell below acts epsilon-greedily on the learned Q,
# so it performs control (SARSA), not just evaluation.

def random_action(a, eps=0.1):
    """Return `a` with probability 1 - eps, otherwise a uniformly random action.

    This is the epsilon-soft exploration step: it keeps every action
    reachable so all states can be visited. With eps=0 the given action is
    always returned.
    """
    draw = np.random.random()
    if draw < (1 - eps):
        return a
    # Exploration branch: sample from all actions, which may re-pick `a`.
    return np.random.choice(ALL_POSSIBLE_ACTIONS)

In [20]:
class Model:
    """Linear approximator for Q(s, a) with 25 weights in `theta`.

    The feature vector holds four 6-element blocks, one per action in the
    order U, D, R, L, followed by a constant bias entry. Only the block of
    the chosen action carries the state features; the others are zero.
    """

    def __init__(self):
        """Draw the initial weights from a scaled standard normal."""
        n_params = 25
        self.theta = np.random.randn(n_params) / np.sqrt(n_params)

    def za(self, ls, obsAct, desired_action):
        """Return `ls` as a list if the two actions match, else zeros of equal length."""
        matches = obsAct == desired_action
        return [value if matches else 0 for value in ls]

    def all_zero_actions(self, ls, o):
        """Lay out `ls` in the block for action `o`, zero elsewhere, plus a trailing 1."""
        layout = []
        for action in ('U', 'D', 'R', 'L'):
            layout.extend(self.za(ls, o, action))
        layout.append(1)
        return layout

    def s2x(self, s, a):
        """Turn state `s` = (row, col) and action `a` into the feature array."""
        row, col = s[0], s[1]
        # Shifted and scaled row, column, product and squares, plus a constant.
        state_features = [
            row - 1,
            col - 1.5,
            (row * col - 3) / 3,
            (row * row - 2) / 2,
            (col * col - 4.5) / 4.5,
            1,
        ]
        return np.array(self.all_zero_actions(state_features, a))

    def predict(self, s, a):
        """Return the approximate Q(s, a), the dot product of theta and the features."""
        features = self.s2x(s, a)
        return self.theta.dot(features)

    def grad(self, s, a):
        """Return the gradient of the prediction w.r.t. theta (the feature array)."""
        return self.s2x(s, a)
def getQs(model, s):
    """Return a dict mapping each possible action to the model's Q(s, action).

    Every entry of ALL_POSSIBLE_ACTIONS is scored, whether or not that
    action is legal in state `s`.
    """
    return {action: model.predict(s, action) for action in ALL_POSSIBLE_ACTIONS}

In [29]:
# Train a linear Q-function with semi-gradient SARSA on the step-penalised
# grid, then read off the greedy policy and the state values from it.
grid=negative_grid()
print("rewards:")
print_values(grid.rewards, grid)
# NOTE(review): the commented-out lines below are leftover scaffolding for a
# (state, action) -> index table; nothing uses it and it contains the typo
# `SA22IDX`, so it can probably be deleted.
# SA2IDX={}
# IDX=0
# states=grid.all_states()
# for s in states:
#     SA2IDX[s]={}
#     for a in ALL_POSSIBLE_ACTIONS:
#         SA22IDX[s][a]=IDX
model=Model()
# t divides the exploration rate and t2 divides the learning rate. Both start
# at 1.0 and grow by 0.01 every 100 episodes (10e-3 == .01), so they stay equal.
t=1.0
t2=1.0
# One entry per episode: the largest single-step L1 change in theta.
deltas=[]
for it in range(30000):
    if it % 100==0:
        t+=10e-3
        t2+=.01
    if it % 1000 == 0:
        print("iteration: "+str(it))
    # Decayed learning rate for this episode.
    alpha=ALPHA / t2
    #play an episode within this loop
    s=(2,0)
    grid.set_state(s)
    #set Q(s) so we can choose the first action
    Qs=getQs(model,s) #dict of actions and approx values
    a=max_dict(Qs)[0] #action with best value
    # Epsilon-greedy: keep the greedy action with probability 1 - 0.5/t.
    a=random_action(a,eps=0.5/t)
    biggest_change=0
    while not grid.game_over():
        r=grid.move(a)
        s2=grid.current_state()
        # Snapshot the weights so the size of this update can be measured.
        old_theta=model.theta.copy()
        if grid.is_terminal(s2):
            # Terminal next state: the target is the reward alone (no bootstrap).
            # s and a are not advanced here; the loop ends because the grid is
            # now in a terminal state.
            model.theta+=alpha*(r-model.predict(s,a))*model.grad(s,a)
        else:
            # Choose the next action first (on-policy), then bootstrap from
            # Q(s2, a2) in the TD target.
            Qs2=getQs(model,s2)
            a2=max_dict(Qs2)[0]
            a2 = random_action(a2,eps=0.5/t)
            model.theta+=alpha*(r+GAMMA*model.predict(s2,a2)-model.predict(s,a))*model.grad(s,a)
            s=s2
            a=a2
        biggest_change=max(biggest_change,np.abs(model.theta-old_theta).sum())
    deltas.append(biggest_change)
# Determine the greedy policy and the state values from the learned Q.
# Only states that have actions (non-terminal states) are evaluated.
# NOTE(review): getQs scores all four actions, so the reported action for a
# state is not guaranteed to be one of that state's legal moves.
policy = {}
V = {}
Q = {}
for s in grid.actions.keys():

    Qs = getQs(model, s)
    Q[s] = Qs
    a, max_q = max_dict(Qs)
    policy[s] = a
    V[s] = max_q
print("values")
print("")
print_values(V,grid)
print("policy")
print_policy(policy,grid)    
    

rewards:
-------------------------
-0.10|-0.10|-0.10| 1.00|
-------------------------
-0.10| 0.00|-0.10|-1.00|
-------------------------
-0.10|-0.10|-0.10|-0.10|
iteration: 0
iteration: 1000
iteration: 2000
iteration: 3000
iteration: 4000
iteration: 5000
iteration: 6000
iteration: 7000
iteration: 8000
iteration: 9000
iteration: 10000
iteration: 11000
iteration: 12000
iteration: 13000
iteration: 14000
iteration: 15000
iteration: 16000
iteration: 17000
iteration: 18000
iteration: 19000
iteration: 20000
iteration: 21000
iteration: 22000
iteration: 23000
iteration: 24000
iteration: 25000
iteration: 26000
iteration: 27000
iteration: 28000
iteration: 29000
values

-------------------------
 0.59| 0.78| 1.00| 0.00|
-------------------------
 0.39| 0.00| 0.64| 0.00|
-------------------------
 0.23| 0.09| 0.26| 0.81|
policy

----------------
 R | R | R |   |
----------------
 U |   | U |   |
----------------
 U | L | U | U |