In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
S = [(i,j) for i in range(1,8) for j in range(1,11)]

def random_state():
    idx = np.random.choice(range(70))
    return S[idx]

In [3]:
def idx(t):
    if t == 'u':
        return 0
    elif t == 'd':
        return 1
    elif t == 'l':
        return 2
    else:
        return 3

In [4]:
def u(s):
    if s == (4,8):
        return None
    elif s[0] > 1:
        return (s[0]-1,s[1])
    else:   return s
    
def d(s):
    if s == (4,8):
        return None
    elif s[0] < 7:
        return (s[0]+1,s[1])
    else:   return s
    
def l(s):
    if s == (4,8):
        return None
    elif s[1] > 1:
        return (s[0],s[1]-1)
    else:   return s
    
def r(s):
    if s == (4,8):
        return None
    elif s[1] < 10:
        return (s[0],s[1]+1)
    else:   return s
    
A = [u,d,l,r]
A_names = ['u','d','l','r']

def random_action():
    x = np.random.choice(range(4))
    return A[x].__name__


In [5]:
def reward(s,a):
    
    x = idx(a)
    s_prime = A[x](s)
    if s_prime == (4,8):
        return 10
    else: return -1


In [6]:
reward((3,5),'u')

-1

## SARSA with the random choice policy

In [7]:
def sarsa(s,a):
    if s == (4,8):
        return None
    else:
        x = idx(a)
        s_prime = A[x](s)
        rew = reward(s,a)
        a_prime = random_action()
    
        return (s,A_names[x],rew,s_prime,a_prime)

In [8]:
sarsa((3,8),'u')

((3, 8), 'u', -1, (2, 8), 'u')

In [9]:
q = {(s,a):0 for s in S for a in A_names}

In [10]:
%%time
alpha = 0.01
for k in range(100):
    for s in S:
        for a in A:
            while s != (4,8):
                (s,a,r,s_prime,a_prime) = sarsa(s,a)
                r = reward(s_prime,a_prime)
                q[s,a] = q[s,a] + alpha * (r + q[s_prime,a_prime] - q[s,a])
                (s,a) = (s_prime,a_prime)
    
    

Wall time: 11.7 s


In [11]:
q

{((1, 1), 'u'): -36.35655796270403,
 ((1, 1), 'd'): -36.72828081861755,
 ((1, 1), 'l'): -36.61272246762468,
 ((1, 1), 'r'): -36.157377310966524,
 ((1, 2), 'u'): -36.18429063850957,
 ((1, 2), 'd'): -36.207686959263114,
 ((1, 2), 'l'): -36.613014294033185,
 ((1, 2), 'r'): -34.71663313875126,
 ((1, 3), 'u'): -34.72989815211668,
 ((1, 3), 'd'): -34.55437188102447,
 ((1, 3), 'l'): -36.03359044986885,
 ((1, 3), 'r'): -32.51905607595808,
 ((1, 4), 'u'): -32.39438655150636,
 ((1, 4), 'd'): -32.36622917928946,
 ((1, 4), 'l'): -34.726393730879956,
 ((1, 4), 'r'): -29.540347865558097,
 ((1, 5), 'u'): -29.468646779719613,
 ((1, 5), 'd'): -28.771226307770085,
 ((1, 5), 'l'): -32.589789824841176,
 ((1, 5), 'r'): -26.174267590535177,
 ((1, 6), 'u'): -25.84050249392287,
 ((1, 6), 'd'): -25.288343427930165,
 ((1, 6), 'l'): -29.45865421888562,
 ((1, 6), 'r'): -22.82569348693783,
 ((1, 7), 'u'): -22.669296960076554,
 ((1, 7), 'd'): -20.53611949070727,
 ((1, 7), 'l'): -26.140970315540535,
 ((1, 7), 'r'): 

In [12]:
sarsa((1,1),u)

((1, 1), 'r', -1, (1, 2), 'r')

In [13]:
A_names[np.argmax([q[s,A_names[x]] for x in range(4)])]

'u'

In [14]:
optimal_actions = [ (s,A_names[np.argmax([q[s,A_names[x]] for x in range(4)])]) for s in S]

In [15]:
optimal_actions

[((1, 1), 'r'),
 ((1, 2), 'r'),
 ((1, 3), 'r'),
 ((1, 4), 'r'),
 ((1, 5), 'r'),
 ((1, 6), 'r'),
 ((1, 7), 'r'),
 ((1, 8), 'd'),
 ((1, 9), 'd'),
 ((1, 10), 'd'),
 ((2, 1), 'r'),
 ((2, 2), 'r'),
 ((2, 3), 'r'),
 ((2, 4), 'r'),
 ((2, 5), 'r'),
 ((2, 6), 'r'),
 ((2, 7), 'd'),
 ((2, 8), 'd'),
 ((2, 9), 'd'),
 ((2, 10), 'd'),
 ((3, 1), 'r'),
 ((3, 2), 'r'),
 ((3, 3), 'r'),
 ((3, 4), 'r'),
 ((3, 5), 'r'),
 ((3, 6), 'r'),
 ((3, 7), 'r'),
 ((3, 8), 'd'),
 ((3, 9), 'd'),
 ((3, 10), 'l'),
 ((4, 1), 'r'),
 ((4, 2), 'r'),
 ((4, 3), 'r'),
 ((4, 4), 'r'),
 ((4, 5), 'r'),
 ((4, 6), 'r'),
 ((4, 7), 'r'),
 ((4, 8), 'u'),
 ((4, 9), 'l'),
 ((4, 10), 'l'),
 ((5, 1), 'r'),
 ((5, 2), 'r'),
 ((5, 3), 'r'),
 ((5, 4), 'r'),
 ((5, 5), 'r'),
 ((5, 6), 'r'),
 ((5, 7), 'r'),
 ((5, 8), 'u'),
 ((5, 9), 'u'),
 ((5, 10), 'l'),
 ((6, 1), 'r'),
 ((6, 2), 'r'),
 ((6, 3), 'r'),
 ((6, 4), 'r'),
 ((6, 5), 'r'),
 ((6, 6), 'r'),
 ((6, 7), 'u'),
 ((6, 8), 'u'),
 ((6, 9), 'u'),
 ((6, 10), 'u'),
 ((7, 1), 'r'),
 ((7, 2), 'r'),
 (

## Windy Grid

### Windy Grid World Actions

In [16]:
def idx(t):
    if t == 'u':
        return 0
    elif t == 'd':
        return 1
    elif t == 'l':
        return 2
    else:
        return 3


def u(s):
    if s == (4,8):
        return s
    
    elif s[1] in [4,5,6,9] and s[0] > 2:
            return (s[0]-2,s[1])
        
    elif s[1] in [7,8] and s[0]>3:
            return (s[0]-3,s[1])
        
    elif s[1] in [7,8] and s[0]>2:
            return (s[0]-2,s[1])
        
    elif s[0] > 1:
        return (s[0]-1,s[1])
    else:   return s
    
def d(s):
    if s == (4,8):
        return s
    
    elif s[1] in [4,5,6,9]:
            return s
        
    elif s[1] in [7,8] and s[0]>1:
            return (s[0]-1,s[1])
        
    elif s[1] in [7,8] and s[0]==1:
            return s
        
    elif s[0] < 7:
        return (s[0]+1,s[1])
    
    else:   return s
    
def l(s):
    if s == (4,8):
        return s
    
    elif s[1] in [5,6,7,10] and s[0]>1:
            return (s[0]-1,s[1]-1)
        
    elif s[1] in [5,6,7,10] and s[0]==1:
            return (s[0],s[1]-1)
        
    elif s[1] in [8,9] and s[0]>2:
            return (s[0]-2,s[1]-1)
        
    elif s[1] in [8,9] and s[0]>1:
            return (s[0]-1,s[1]-1)
        
    elif s[1] in [8,9] and s[0]==1:
            return (s[0],s[1]-1)
        
    elif s[1] >1:
        return (s[0],s[1]-1)
    else:   return s
    
def r(s):
    if s == (4,8):
        return s
    
    elif s[1] in [3,4,5,8] and s[0]>1:
            return (s[0]-1,s[1]+1)
        
    elif s[1] in [3,4,5,8] and s[0]==1:
            return (s[0],s[1]+1)   
        
    elif s[1] in [6,7] and s[0]>2:
            return (s[0]-2,s[1]+1)
        
    elif s[1] in [6,7] and s[0]>1:
            return (s[0]-1,s[1]-1)
        
    elif s[1] in [6,7] and s[0] == 1:
            return (s[0],s[1]+1)
        
    elif s[1] < 10:
        return (s[0],s[1]+1)
    
    else:   return s
    
A = [u,d,l,r]
A_names = ['u','d','l','r']

def random_action():
    a =  np.random.choice(A_names)
    return a


    
        

    

In [17]:
def reward(s,a):
    
    x = idx(a)
    s_prime = A[x](s)
    if s_prime == (4,8):
        return 10
    else: return -1
    


In [18]:
q = {(s,a):0  for s in S for a in A_names}
q[((4,8),None)] = 0

In [19]:
def greedy_sarsa(s,a,q):
    if s == (4,8):
        return (s,a,0,s,None)
    else:
        x = idx(a)
        s_prime = A[x](s)
        if s_prime == (4,8):
            return (s,a,10,s_prime,None)
        r = reward(s,a)
        a_prime = A_names[np.argmax([q[s_prime,a] for a in A_names])]
    
        return (s,a,r,s_prime,a_prime)

In [20]:
greedy_sarsa((1,8),'d',q)

((1, 8), 'd', -1, (1, 8), 'u')

In [21]:
alpha = 0.01

for _ in range(100000):
    s = random_state()    
    a = np.random.choice(A_names)
        
    while s != (4,8):
        (s,a,r,s_prime,a_prime) = greedy_sarsa(s,a,q)
        q[s,a] = q[s,a] + alpha * (r + q[s_prime,a_prime] - q[s,a])
        (s,a) = (s_prime,a_prime)    


In [22]:
greedy_sarsa((1,8),'d',q)

((1, 8), 'd', -1, (1, 8), 'r')

In [23]:
q

{((1, 1), 'u'): -7.100275293220287,
 ((1, 1), 'd'): -7.082244493173881,
 ((1, 1), 'l'): -7.101707962064218,
 ((1, 1), 'r'): -6.000001115081442,
 ((1, 2), 'u'): -6.086245001750395,
 ((1, 2), 'd'): -6.091047304041736,
 ((1, 2), 'l'): -7.097877621391856,
 ((1, 2), 'r'): -5.000000000002443,
 ((1, 3), 'u'): -5.083442480012452,
 ((1, 3), 'd'): -5.087854658259557,
 ((1, 3), 'l'): -6.0817230792763635,
 ((1, 3), 'r'): -4.000000000000515,
 ((1, 4), 'u'): -4.159382875744306,
 ((1, 4), 'd'): -4.107455122470909,
 ((1, 4), 'l'): -5.156550885880542,
 ((1, 4), 'r'): -3.0000000000004716,
 ((1, 5), 'u'): -3.163005854761513,
 ((1, 5), 'd'): -3.2566964351965644,
 ((1, 5), 'l'): -4.166038039280099,
 ((1, 5), 'r'): -2.0000000000004494,
 ((1, 6), 'u'): -2.1765781263095363,
 ((1, 6), 'd'): -2.2266330149807083,
 ((1, 6), 'l'): -3.2479026548035073,
 ((1, 6), 'r'): -1.0000000000004272,
 ((1, 7), 'u'): -1.1606063992993718,
 ((1, 7), 'd'): -1.1566114363256388,
 ((1, 7), 'l'): -2.1128554042869556,
 ((1, 7), 'r'): -

In [24]:
optimal_actions = [ (s,A_names[np.argmax([q[s,a] for a in A_names])]) for s in S]

In [25]:
optimal_actions

[((1, 1), 'r'),
 ((1, 2), 'r'),
 ((1, 3), 'r'),
 ((1, 4), 'r'),
 ((1, 5), 'r'),
 ((1, 6), 'r'),
 ((1, 7), 'r'),
 ((1, 8), 'r'),
 ((1, 9), 'r'),
 ((1, 10), 'd'),
 ((2, 1), 'r'),
 ((2, 2), 'r'),
 ((2, 3), 'r'),
 ((2, 4), 'r'),
 ((2, 5), 'r'),
 ((2, 6), 'u'),
 ((2, 7), 'u'),
 ((2, 8), 'r'),
 ((2, 9), 'r'),
 ((2, 10), 'd'),
 ((3, 1), 'r'),
 ((3, 2), 'r'),
 ((3, 3), 'r'),
 ((3, 4), 'r'),
 ((3, 5), 'u'),
 ((3, 6), 'r'),
 ((3, 7), 'r'),
 ((3, 8), 'r'),
 ((3, 9), 'r'),
 ((3, 10), 'd'),
 ((4, 1), 'r'),
 ((4, 2), 'r'),
 ((4, 3), 'r'),
 ((4, 4), 'r'),
 ((4, 5), 'r'),
 ((4, 6), 'r'),
 ((4, 7), 'r'),
 ((4, 8), 'u'),
 ((4, 9), 'r'),
 ((4, 10), 'd'),
 ((5, 1), 'd'),
 ((5, 2), 'd'),
 ((5, 3), 'd'),
 ((5, 4), 'r'),
 ((5, 5), 'r'),
 ((5, 6), 'r'),
 ((5, 7), 'r'),
 ((5, 8), 'd'),
 ((5, 9), 'r'),
 ((5, 10), 'd'),
 ((6, 1), 'r'),
 ((6, 2), 'r'),
 ((6, 3), 'r'),
 ((6, 4), 'r'),
 ((6, 5), 'r'),
 ((6, 6), 'r'),
 ((6, 7), 'r'),
 ((6, 8), 'd'),
 ((6, 9), 'l'),
 ((6, 10), 'd'),
 ((7, 1), 'r'),
 ((7, 2), 'u'),
 (

## $\varepsilon$-Greedy SARSA

In [26]:
eps = 0.1
alpha = 0.1

In [27]:
q = {(s,a):0  for s in S for a in A_names}
q[(4,8),None] = 0

In [28]:
%%time
for k in range(1,100000):
    
#     print('episode: ',k)
    
    s = random_state()    
    a = np.random.choice(A_names)
    
    
    while s != (4,8):
        
        (s,a,r,s_prime,a_prime) = greedy_sarsa(s,a,q)

        u = np.random.random()
        if u < eps/k:
            a_prime = np.random.choice(A_names)
                
        
        q[s,a] = q[s,a] + alpha * (r + q[s_prime,a_prime] - q[s,a])

        (s,a) = (s_prime,a_prime)


Wall time: 12.2 s


In [29]:
q

{((1, 1), 'u'): -7.000000122814377,
 ((1, 1), 'd'): -7.000000179941302,
 ((1, 1), 'l'): -7.0000000282869985,
 ((1, 1), 'r'): -6.00000000000005,
 ((1, 2), 'u'): -6.000000676404984,
 ((1, 2), 'd'): -6.000000026554197,
 ((1, 2), 'l'): -7.000000035357257,
 ((1, 2), 'r'): -5.000000000000046,
 ((1, 3), 'u'): -5.000000050327884,
 ((1, 3), 'd'): -5.000000143683626,
 ((1, 3), 'l'): -6.000000027617678,
 ((1, 3), 'r'): -4.000000000000043,
 ((1, 4), 'u'): -4.00000003675898,
 ((1, 4), 'd'): -4.000000231188058,
 ((1, 4), 'l'): -5.000000037208088,
 ((1, 4), 'r'): -3.000000000000038,
 ((1, 5), 'u'): -3.000000000000038,
 ((1, 5), 'd'): -3.0000000138137404,
 ((1, 5), 'l'): -4.0000000457562175,
 ((1, 5), 'r'): -2.0000000000000364,
 ((1, 6), 'u'): -2.0000000034036534,
 ((1, 6), 'd'): -2.0000000266154627,
 ((1, 6), 'l'): -3.0000000124836603,
 ((1, 6), 'r'): -1.0000000000000342,
 ((1, 7), 'u'): -1.0000002531737886,
 ((1, 7), 'd'): -1.0000000048379865,
 ((1, 7), 'l'): -2.000000000588047,
 ((1, 7), 'r'): -3.3

In [30]:
optimal_actions = [ (s,A_names[np.argmax([q[s,a] for a in A_names])]) for s in S]
optimal_actions

[((1, 1), 'r'),
 ((1, 2), 'r'),
 ((1, 3), 'r'),
 ((1, 4), 'r'),
 ((1, 5), 'r'),
 ((1, 6), 'r'),
 ((1, 7), 'r'),
 ((1, 8), 'r'),
 ((1, 9), 'r'),
 ((1, 10), 'd'),
 ((2, 1), 'r'),
 ((2, 2), 'r'),
 ((2, 3), 'r'),
 ((2, 4), 'r'),
 ((2, 5), 'r'),
 ((2, 6), 'u'),
 ((2, 7), 'u'),
 ((2, 8), 'r'),
 ((2, 9), 'r'),
 ((2, 10), 'd'),
 ((3, 1), 'r'),
 ((3, 2), 'r'),
 ((3, 3), 'r'),
 ((3, 4), 'r'),
 ((3, 5), 'u'),
 ((3, 6), 'r'),
 ((3, 7), 'r'),
 ((3, 8), 'r'),
 ((3, 9), 'r'),
 ((3, 10), 'd'),
 ((4, 1), 'r'),
 ((4, 2), 'r'),
 ((4, 3), 'r'),
 ((4, 4), 'r'),
 ((4, 5), 'r'),
 ((4, 6), 'r'),
 ((4, 7), 'r'),
 ((4, 8), 'u'),
 ((4, 9), 'r'),
 ((4, 10), 'd'),
 ((5, 1), 'r'),
 ((5, 2), 'r'),
 ((5, 3), 'r'),
 ((5, 4), 'r'),
 ((5, 5), 'r'),
 ((5, 6), 'r'),
 ((5, 7), 'r'),
 ((5, 8), 'd'),
 ((5, 9), 'r'),
 ((5, 10), 'd'),
 ((6, 1), 'r'),
 ((6, 2), 'r'),
 ((6, 3), 'r'),
 ((6, 4), 'r'),
 ((6, 5), 'r'),
 ((6, 6), 'r'),
 ((6, 7), 'r'),
 ((6, 8), 'd'),
 ((6, 9), 'l'),
 ((6, 10), 'd'),
 ((7, 1), 'r'),
 ((7, 2), 'r'),
 (

## Q-Learning

In [31]:
eps = 0.1
alpha = 0.1

In [32]:
q = {(s,a):0  for s in S for a in A_names}
q[((4,8),None)] = 0

In [33]:
%%time
for k in range(1,10000):
    
    s = random_state()    
    
    while s != (4,8):
       
        a = A_names[np.argmax([ q[s,a] for a in A_names])]

        u = np.random.random()
        if u < eps/k:
            a = np.random.choice(A_names)
        
        
        s_prime = A[idx(a)](s)
        
        q[s,a] = q[s,a] + alpha * ((reward(s,a) + np.max([q[s_prime,a] for a in A_names]) - q[s,a]))

        s = s_prime


Wall time: 2.02 s


In [34]:
optimal_actions = [ (s,A_names[np.argmax([q[s,a] for a in A_names])]) for s in S]
optimal_actions

[((1, 1), 'r'),
 ((1, 2), 'r'),
 ((1, 3), 'r'),
 ((1, 4), 'r'),
 ((1, 5), 'r'),
 ((1, 6), 'r'),
 ((1, 7), 'r'),
 ((1, 8), 'r'),
 ((1, 9), 'r'),
 ((1, 10), 'd'),
 ((2, 1), 'd'),
 ((2, 2), 'd'),
 ((2, 3), 'd'),
 ((2, 4), 'r'),
 ((2, 5), 'r'),
 ((2, 6), 'u'),
 ((2, 7), 'd'),
 ((2, 8), 'd'),
 ((2, 9), 'r'),
 ((2, 10), 'd'),
 ((3, 1), 'r'),
 ((3, 2), 'r'),
 ((3, 3), 'r'),
 ((3, 4), 'r'),
 ((3, 5), 'u'),
 ((3, 6), 'r'),
 ((3, 7), 'r'),
 ((3, 8), 'l'),
 ((3, 9), 'r'),
 ((3, 10), 'd'),
 ((4, 1), 'u'),
 ((4, 2), 'r'),
 ((4, 3), 'u'),
 ((4, 4), 'u'),
 ((4, 5), 'r'),
 ((4, 6), 'r'),
 ((4, 7), 'd'),
 ((4, 8), 'u'),
 ((4, 9), 'r'),
 ((4, 10), 'd'),
 ((5, 1), 'd'),
 ((5, 2), 'd'),
 ((5, 3), 'd'),
 ((5, 4), 'r'),
 ((5, 5), 'r'),
 ((5, 6), 'r'),
 ((5, 7), 'd'),
 ((5, 8), 'd'),
 ((5, 9), 'r'),
 ((5, 10), 'd'),
 ((6, 1), 'r'),
 ((6, 2), 'r'),
 ((6, 3), 'r'),
 ((6, 4), 'r'),
 ((6, 5), 'r'),
 ((6, 6), 'r'),
 ((6, 7), 'r'),
 ((6, 8), 'r'),
 ((6, 9), 'l'),
 ((6, 10), 'd'),
 ((7, 1), 'r'),
 ((7, 2), 'u'),
 (

In [35]:
[q[(1,1),a] for a in A_names]

[-8.780899999999987,
 -8.685942255445841,
 -8.691006067094236,
 -6.000003580419904]

In [36]:
path = [(1,1)]
s = (1,1)
while s != (4,8):
    k = np.argmax([q[s,a] for a in A_names])
    s = A[k](s)
    path.append(s)

In [37]:
path

[(1, 1),
 (1, 2),
 (1, 3),
 (1, 4),
 (1, 5),
 (1, 6),
 (1, 7),
 (1, 8),
 (1, 9),
 (1, 10),
 (2, 10),
 (3, 10),
 (4, 10),
 (5, 10),
 (6, 10),
 (7, 10),
 (6, 9),
 (4, 8)]

## The Abyss

In [38]:
S = [(i,j) for i in range(1,8) for j in range(1,14)]

In [39]:
def idx(t):
    if t == 'u':
        return 0
    elif t == 'd':
        return 1
    elif t == 'l':
        return 2
    else:
        return 3


def u(state):
    if state[0] > 1:
        return (state[0]-1,state[1])
    else: return state

def d(state):
    if state[0] < 7:
        return (state[0]+1,state[1])
    else: return state
    
def l(state):
    if state[1]>1:
        return (state[0],state[1]-1)
    else: return state
    
def r(state):
    if state[1] < 13:
        return (state[0],state[1]+1)
    else: return state
    
A = [u,d,l,r]

A_names = ['u','d','l','r']

def random_action():
    a =  np.random.choice(A_names)
    return a

In [40]:
def reward(state):
    if state[0] == 7 and state[1] < 13 and state[1] > 1:
        return -100
    else: return -1

In [41]:
def pi(state):
    return np.random.choice(A_names)

In [42]:
q = {(s,a):0 for s in S for a in A_names}

In [43]:
q

{((1, 1), 'u'): 0,
 ((1, 1), 'd'): 0,
 ((1, 1), 'l'): 0,
 ((1, 1), 'r'): 0,
 ((1, 2), 'u'): 0,
 ((1, 2), 'd'): 0,
 ((1, 2), 'l'): 0,
 ((1, 2), 'r'): 0,
 ((1, 3), 'u'): 0,
 ((1, 3), 'd'): 0,
 ((1, 3), 'l'): 0,
 ((1, 3), 'r'): 0,
 ((1, 4), 'u'): 0,
 ((1, 4), 'd'): 0,
 ((1, 4), 'l'): 0,
 ((1, 4), 'r'): 0,
 ((1, 5), 'u'): 0,
 ((1, 5), 'd'): 0,
 ((1, 5), 'l'): 0,
 ((1, 5), 'r'): 0,
 ((1, 6), 'u'): 0,
 ((1, 6), 'd'): 0,
 ((1, 6), 'l'): 0,
 ((1, 6), 'r'): 0,
 ((1, 7), 'u'): 0,
 ((1, 7), 'd'): 0,
 ((1, 7), 'l'): 0,
 ((1, 7), 'r'): 0,
 ((1, 8), 'u'): 0,
 ((1, 8), 'd'): 0,
 ((1, 8), 'l'): 0,
 ((1, 8), 'r'): 0,
 ((1, 9), 'u'): 0,
 ((1, 9), 'd'): 0,
 ((1, 9), 'l'): 0,
 ((1, 9), 'r'): 0,
 ((1, 10), 'u'): 0,
 ((1, 10), 'd'): 0,
 ((1, 10), 'l'): 0,
 ((1, 10), 'r'): 0,
 ((1, 11), 'u'): 0,
 ((1, 11), 'd'): 0,
 ((1, 11), 'l'): 0,
 ((1, 11), 'r'): 0,
 ((1, 12), 'u'): 0,
 ((1, 12), 'd'): 0,
 ((1, 12), 'l'): 0,
 ((1, 12), 'r'): 0,
 ((1, 13), 'u'): 0,
 ((1, 13), 'd'): 0,
 ((1, 13), 'l'): 0,
 ((1, 13), 'r'):

In [44]:
for j in range(10000):
    s = (7,1)
    while s not in [(7,j) for j in range(2,14)]:
        k = np.argmax([q[s,a_prime] for a_prime in A_names])
        action = A[k]
        a = A_names[k]
        s_prime = action(s)
        q[s,a] = q[s,a] + 0.01 * (reward(s_prime) + np.max([q[s_prime,a_prime] for a_prime in A_names]) - q[s,a])
        s = s_prime
    

In [45]:
path = [(7,1)]
s = (7,1)
while s not in [(7,j) for j in range(2,14)]:
    k = np.argmax([q[s,a] for a in A_names])
    s = A[k](s)
    path.append(s)

In [46]:
path

[(7, 1),
 (6, 1),
 (6, 2),
 (6, 3),
 (6, 4),
 (6, 5),
 (6, 6),
 (6, 7),
 (6, 8),
 (6, 9),
 (6, 10),
 (6, 11),
 (6, 12),
 (6, 13),
 (7, 13)]

In [47]:
for j in range(10000):
    s = (7,1)
    while s not in [(7,j) for j in range(2,14)]:
        k = np.argmax([q[s,a_prime] for a_prime in A_names])
        action = A[k]
        action_prime = np.random.choice(A)
        s_prime = action(s)
        a_prime = action_prime.__name__
        q[s,a] = q[s,a] + 0.1 * (reward(s_prime) + q[s_prime,a_prime] - q[s,a])
        s = s_prime

In [48]:
path = [(7,1)]
s = (7,1)
while s not in [(7,j) for j in range(2,14)]:
    k = np.argmax([q[s,a] for a in A_names])
    s = A[k](s)
    path.append(s)

In [49]:
path

[(7, 1),
 (6, 1),
 (6, 2),
 (6, 3),
 (6, 4),
 (6, 5),
 (6, 6),
 (6, 7),
 (6, 8),
 (6, 9),
 (6, 10),
 (6, 11),
 (6, 12),
 (6, 13),
 (7, 13)]