# Exercise Sheet 9

In [2]:
import numpy as np
import itertools
import scipy.misc
import matplotlib.pyplot as plt
import sympy as sym

Link: [Branes with Brains: Exploring String Vacua with Deep Reinforcement Learning](https://arxiv.org/abs/1903.11616)

## Gridworld

* For a 5x5 Gridworld with no pitfalls and no exit, assume that we go up, down, left, right with equal probability. If we try to move over the boundary we remain at the same position. Write down the dynamics/transition probabilities explicitly.
* Using these probabilities generate 10 sample episodes.
* Calculate the return $G_t$ for each of these samples (assuming that you get a reward of 5 in the central cell and 1 in each corner).
* Calculate $V(s)$ for all states using the Bellman equation and via dynamic programming.
* How many non-vanishing entries can be found in each row of $P$ in a Markov Decision Process if the actions are deterministic?

### Solution

In [3]:
##Exercise 1 a)

# Draw a random start cell (x, y); each coordinate is an integer in {0, ..., 4}
# because the upper bound of randint is exclusive.
X = np.random.randint(low=0, high=5, size=2)

# dynamics:
def step(X, direction):
    """Return the position reached from ``X`` after one move in ``direction``.

    The grid is 5x5 with coordinates 0..4. ``X[0]`` is changed by 'east'/'west'
    and ``X[1]`` by 'north'/'south'. A move that would cross the boundary
    leaves the position unchanged, and so does an unrecognised direction.

    The input is never modified: a new list (or a new array when ``X`` is a
    NumPy array) is returned, so callers can keep using their start position.

    Parameters
    ----------
    X : sequence of two ints (list, tuple or numpy array)
        Current position.
    direction : str
        One of 'north', 'east', 'south', 'west'.

    Returns
    -------
    list or numpy.ndarray
        The new position, as a fresh object.
    """
    x, y = X[0], X[1]

    if direction == 'north' and y != 4:
        y += 1
    elif direction == 'east' and x != 4:
        x += 1
    elif direction == 'south' and y != 0:
        y -= 1
    elif direction == 'west' and x != 0:
        x -= 1

    # Copy instead of writing into X, so the caller's state stays intact.
    moved = np.array(X) if isinstance(X, np.ndarray) else list(X)
    moved[0], moved[1] = x, y
    return moved

def randomdirection():
    """Pick one of the four compass directions uniformly at random.

    Uses NumPy's global random state, so the result is reproducible after
    ``np.random.seed``.
    """
    compass = ['north', 'east', 'south', 'west']
    choice = np.random.choice(compass)
    return choice

def localreward(X):
    """Reward attached to grid position ``X``.

    Returns 1 on the four corners of the 5x5 grid, 5 on the central cell
    [2, 2] and 0 everywhere else. ``X`` is compared against list literals,
    so it should be a list or a NumPy array (a tuple never equals a list and
    would therefore always yield 0).
    """
    corners = ([0, 0], [4, 0], [0, 4], [4, 4])
    if any(np.all(X == corner) for corner in corners):
        return 1
    return 5 if np.all(X == [2, 2]) else 0

def run(X, epochs):
    """Simulate a random walk and return the visited positions.

    Starting from ``X``, the position is recorded and then one random move is
    taken; this is repeated ``epochs`` times. The returned array has shape
    ``(epochs, 2)`` with NumPy's default float dtype; row ``t`` is the
    position *before* the t-th move, so the position reached by the last move
    is not part of the history.
    """
    trajectory = np.empty((epochs, 2))
    position = X
    for t in range(epochs):
        trajectory[t] = position
        position = step(position, randomdirection())
    return trajectory

def reward(hist, gamma=0.9):
    """Discounted return of an episode.

    Sums ``localreward(state) * gamma**t`` over the states in ``hist``, where
    ``t`` is the index of the state in the history (starting at 0). An empty
    history gives 0.
    """
    total = 0
    for t, state in enumerate(hist):
        total += localreward(state) * gamma**t
    return total

In [4]:
##Exercise 1 b)
# Fix the seed of NumPy's global RNG so that Restart & Run All reproduces the
# same episodes (and therefore the same returns printed below).
SEED = 0
N_EPISODES = 10        # number of sample episodes asked for in the exercise
EPISODE_LENGTH = 1000  # number of recorded states per episode
np.random.seed(SEED)

# Each episode starts from an independent, uniformly random cell.
histories = [run(np.random.randint(0, 5, 2), EPISODE_LENGTH)
             for _ in range(N_EPISODES)]

##Exercise 1 c)
## gamma=1 or gamma=0.9 accepted
# Undiscounted return (gamma = 1): total reward collected along the episode.
rewards_undiscounted = [reward(hist, gamma=1.) for hist in histories]
print(rewards_undiscounted)
print()
# Discounted return (gamma = 0.9); `rewards` keeps its original final meaning.
rewards = [reward(hist, gamma=.9) for hist in histories]
print(rewards)

[332.0, 392.0, 393.0, 384.0, 369.0, 389.0, 294.0, 390.0, 389.0, 314.0]

[2.2368072248440645, 5.761941831211949, 1.5906685782459402, 2.261019597879168, 9.007424207766466, 8.510881502167194, 0.5764680436917781, 3.1783972279108927, 6.310584287826459, 5.013234330927414]


## Exercise 1 d)

## Two possible solutions:

Use the Bellman equation:

In [5]:
# Exact rationals, so the Bellman equations below have exact coefficients.
p = sym.Rational(1, 4)       # probability of each of the four moves
gamma = sym.Rational(9, 10)  # discount factor

In [6]:
# Symbolic value function: V[a, b] stands for the unknown value of cell (a, b).
V = sym.IndexedBase('V')
# Symbolic index objects named i and j.
i = sym.Idx('i')
j = sym.Idx('j')

In [7]:
# One Bellman equation per grid cell, stored as an expression that must vanish:
#   V[i, j] - p * sum over moves of (reward of successor + gamma * V[successor])
directions = ('north', 'east', 'south', 'west')
eqns = []
for i, j in itertools.product(range(5), range(5)):
    # A fresh [i, j] list is handed to step for every direction.
    successors = [step([i, j], direction) for direction in directions]
    expected = p*sum([localreward(s) + gamma*V[tuple(s)] for s in successors])
    eqns.append(V[i,j] - expected)

In [8]:
# Display the list of 25 Bellman expressions; each one is implicitly set equal to 0.
eqns

[11*V[0, 0]/20 - 9*V[0, 1]/40 - 9*V[1, 0]/40 - 1/2,
 -9*V[0, 0]/40 + 31*V[0, 1]/40 - 9*V[0, 2]/40 - 9*V[1, 1]/40 - 1/4,
 -9*V[0, 1]/40 + 31*V[0, 2]/40 - 9*V[0, 3]/40 - 9*V[1, 2]/40,
 -9*V[0, 2]/40 + 31*V[0, 3]/40 - 9*V[0, 4]/40 - 9*V[1, 3]/40 - 1/4,
 -9*V[0, 3]/40 + 11*V[0, 4]/20 - 9*V[1, 4]/40 - 1/2,
 -9*V[0, 0]/40 + 31*V[1, 0]/40 - 9*V[1, 1]/40 - 9*V[2, 0]/40 - 1/4,
 -9*V[0, 1]/40 - 9*V[1, 0]/40 + V[1, 1] - 9*V[1, 2]/40 - 9*V[2, 1]/40,
 -9*V[0, 2]/40 - 9*V[1, 1]/40 + V[1, 2] - 9*V[1, 3]/40 - 9*V[2, 2]/40 - 5/4,
 -9*V[0, 3]/40 - 9*V[1, 2]/40 + V[1, 3] - 9*V[1, 4]/40 - 9*V[2, 3]/40,
 -9*V[0, 4]/40 - 9*V[1, 3]/40 + 31*V[1, 4]/40 - 9*V[2, 4]/40 - 1/4,
 -9*V[1, 0]/40 + 31*V[2, 0]/40 - 9*V[2, 1]/40 - 9*V[3, 0]/40,
 -9*V[1, 1]/40 - 9*V[2, 0]/40 + V[2, 1] - 9*V[2, 2]/40 - 9*V[3, 1]/40 - 5/4,
 -9*V[1, 2]/40 - 9*V[2, 1]/40 + V[2, 2] - 9*V[2, 3]/40 - 9*V[3, 2]/40,
 -9*V[1, 3]/40 - 9*V[2, 2]/40 + V[2, 3] - 9*V[2, 4]/40 - 9*V[3, 3]/40 - 5/4,
 -9*V[1, 4]/40 - 9*V[2, 3]/40 + 31*V[2, 4]/40 - 9*V[3, 

Solve this linear system with Mathematica or your favorite symbolic computation software. The result is:

In [9]:
# Values of V(s) entered by hand (per the notebook text, obtained by solving the
# Bellman equations with external software). Note that this rebinds V from the
# symbolic IndexedBase to a numeric 5x5 array.
V = np.array([
    [3.63686, 3.33395, 3.22976, 3.33395, 3.63686],
    [3.33395, 3.50585, 4.45684, 3.50585, 3.33395],
    [3.22976, 4.45684, 4.01116, 4.45684, 3.22976],
    [3.33395, 3.50585, 4.45684, 3.50585, 3.33395],
    [3.63686, 3.33395, 3.22976, 3.33395, 3.63686],
])
print(V)

[[3.63686 3.33395 3.22976 3.33395 3.63686]
 [3.33395 3.50585 4.45684 3.50585 3.33395]
 [3.22976 4.45684 4.01116 4.45684 3.22976]
 [3.33395 3.50585 4.45684 3.50585 3.33395]
 [3.63686 3.33395 3.22976 3.33395 3.63686]]


Dynamic Programming:

In [10]:
# Initial guess for the iteration: value 1 in every cell. `updates` is a
# separate buffer of the same shape that receives the values of the next sweep.
gamma = 0.9  # discount factor, now a plain float
V = np.ones((5, 5))
updates = np.ones_like(V)

In [11]:
# Iterative policy evaluation for the uniformly random policy.
# Each sweep computes, for every cell, the average over the four moves of
# (reward of the successor cell + gamma * current value of the successor),
# writes it to `updates`, and only then copies the whole sweep into V
# (synchronous update).
p_move = 0.25  # plain float; avoids routing the arithmetic through the SymPy Rational `p`
directions = ('north', 'east', 'south', 'west')

# With gamma = 0.9 the error shrinks by at least a factor 0.9 per sweep, so
# 500 sweeps are far more than needed to reach floating-point accuracy.
for _ in range(500):
    for i, j in itertools.product(range(5), range(5)):
        # Compute each successor once and reuse it for reward and value lookup.
        successors = [step([i, j], direction) for direction in directions]
        updates[i, j] = p_move * sum(localreward(s) + gamma * V[tuple(s)]
                                     for s in successors)
    V[:] = updates  # in-place copy of the finished sweep

V

array([[3.63686455, 3.33394556, 3.22976082, 3.33394556, 3.63686455],
       [3.33394556, 3.50585377, 4.45684061, 3.50585377, 3.33394556],
       [3.22976082, 4.45684061, 4.01115655, 4.45684061, 3.22976082],
       [3.33394556, 3.50585377, 4.45684061, 3.50585377, 3.33394556],
       [3.63686455, 3.33394556, 3.22976082, 3.33394556, 3.63686455]])

The two results coincide.