In [2]:
import numpy as np
import matplotlib.pyplot as plt

<img src="images/Sutton-p76-GridWorld.png" width="800px" height="400px">

In [39]:
class GridWorld:
    def __init__(self):
        # self.worldValues = np.random.rand(4,4) * 5
        self.worldValues = np.zeros((4,4))
        self.worldValues[0,0] = 0
        self.worldValues[3,3] = 0
        self.currentState = int(np.random.rand()*8 + 2) ## The current state is randomly generated, but the world knows where you are
        self.actions = [0, 1, 2, 3]
        self.actionNames = {0:"up", 1:"down", 2:"left", 3:"right"}
        
    def step(self,action):
        self.currentState = GridWorld.move(self.currentState, action)
        if self.currentState == 0 or self.currentState == 15:
            reward = 1
        else:
            reward = -1
        return self.currentState, reward
    def get_value(self, state_no):
        x, y = GridWorld.state_coordinates(state_no)
        return self.worldValues[x,y]
    def set_value(self, state_no, value):
        x, y = GridWorld.state_coordinates(state_no)
        if state_no == 0 or state_no == 15:
            self.worldValues[x,y] = 0
        else:
            self.worldValues[x,y] = value
    
    @staticmethod
    def general_step(state, action):
        new_state = GridWorld.move(state, action)
        if new_state == 0 or new_state == 15:
            reward = 1
        else:
            reward = -1
        return new_state, reward
    @staticmethod
    def state_number(x,y):
        position = x*4 + y
        return position
    @staticmethod
    def state_coordinates(number):
        y = number % 4
        x = number // 4
        return (x,y)
    @staticmethod
    def move(s,a):
        x, y = GridWorld.state_coordinates(s)
        if x == 0 and a == 0: # Changing the x-coordinates
            x = 0
        elif x == 3 and a == 1:
            x = 3
        else:
            if a == 0:
                x = x - 1
            elif a == 1:
                x = x + 1
        
        if y == 0 and a == 2: # Changing the y-coordinates
            y = 0
        elif y == 3 and a == 3:
            y = 3
        else:
            if a == 2:
                y = y - 1
            elif a == 3:
                y = y + 1
        return GridWorld.state_number(x,y)

In [40]:
world = GridWorld()
world.worldValues

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [41]:
GridWorld.general_step(1,2)
GridWorld.general_step(0,0) # Up in state 0 -> (0, 1)

(0, 1)

In [42]:
class Agent:
    def __init__(self):
        pass
    def action(self): ## Implementing the equiprobable policy
        return np.random.randint(4)
    def policy(self):
        return 0.25

<img src="images/Sutton-PolicyEvaluation.png" width="800px" height="400px">

In [60]:
def areDeltasLargerThanTheta(d, t, while_counter):
    if while_counter < 2: ## Turning the while to a do..while loop
        return True
    for i in range(d.shape[0]):
        if abs(d[i]) > abs(t):
            return True
    return False
#def iterative_policy_evaluation():
if __name__ == "__main__":
    world = GridWorld()
    agent = Agent()
    theta = 1e-10 ## Every element in delta will be compared to this value
    deltas = np.zeros(15) # [0, 0 ...]
    while_count = 0
    while areDeltasLargerThanTheta(deltas, theta, while_count):
        ################
        k = while_count
        if k == 0 or k == 1 or k == 2 or k == 3 or k % 10 == 0:
            print("K =", k)
            print("--------")
            print(world.worldValues)
        ################
        deltas = np.zeros(15) # [0, 0 ...] Zeroing deltas every time
        while_count+=1
        for state in range(1,15):
            v = world.get_value(state) ## v <- V(s)
            new_v = 0
            for action in range(len(world.actions)):                   ## Generating the \sum \pi(a|s)
            #This is an undicounted update (no \gamma)
                new_s, r = GridWorld.general_step(state, action)       ## p(s',r | s,a) 
                new_v += agent.policy() * (r + world.get_value(new_s)) ## Adding the value of every action to the value of the state
            world.set_value(state, new_v)                              ## V(s) <- Expression
            deltas[state] = max(deltas[state], abs(new_v - v))         ## \delta <- max(\delta, |v - V(s)|)
print("Number of main loops, Last K =", while_count)
print("--------")
print(world.worldValues)

K = 0
--------
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
K = 1
--------
[[ 0.         -0.5        -1.125      -1.28125   ]
 [-0.5        -1.25       -1.59375    -1.71875   ]
 [-1.125      -1.59375    -1.796875   -1.37890625]
 [-1.28125    -1.71875    -1.37890625  0.        ]]
K = 2
--------
[[ 0.         -1.21875    -2.3046875  -2.64648438]
 [-1.21875    -2.40625    -3.05664062 -3.20019531]
 [-2.3046875  -3.05664062 -3.21777344 -2.44921875]
 [-2.64648438 -3.20019531 -2.44921875  0.        ]]
K = 3
--------
[[ 0.         -1.98242188 -3.49755859 -3.99768066]
 [-1.98242188 -3.51953125 -4.35876465 -4.50146484]
 [-3.49755859 -4.35876465 -4.4039917  -3.33866882]
 [-3.99768066 -4.50146484 -3.33866882  0.        ]]
K = 10
--------
[[  0.          -6.47871183 -10.06161583 -11.26202333]
 [ -6.47871183  -9.22317257 -10.64122233 -10.72526226]
 [-10.06161583 -10.64122233  -9.78987681  -7.3634176 ]
 [-11.26202333 -10.72526226  -7.3634176    0.        ]]
K = 20
--------
[[  0.     

<img src="images/GridWorld_values.jpeg" width="400px" height="800px">

In [29]:
deltas

array([0.00000000e+00, 5.02460296e-11, 7.64401875e-11, 8.80078233e-11,
       5.02460296e-11, 6.79918344e-11, 8.30340241e-11, 9.09210485e-11,
       7.64401875e-11, 8.30340241e-11, 9.09139430e-11, 9.56816848e-11,
       8.80149287e-11, 9.09281539e-11, 9.56745794e-11, 9.88791271e-11])

<img src="images/Exercise4.2_Sutton.jpeg" width="1000px" height="100px">

### Here we are trying to solve the second part, where the dynamic of the state 13 has changed too. This requires some additions to the GridWorld definition.

In [99]:
class GridWorld_4_2: ## We call the new state, state 17 for convenience.
    def __init__(self):
        #self.worldValues = np.random.rand(4,4) * 5
        self.worldValues = np.zeros((4,4))
        self.worldValues[0,0] = 0
        self.worldValues[3,3] = 0
        self.V_s_17 = 0 ## Here's the place we init our new state
        self.currentState = int(np.random.rand()*8 + 2) ## The current state is randomly generated, but the world knows where you are
        self.actions = [0, 1, 2, 3]
        self.actionNames = {0:"up", 1:"down", 2:"left", 3:"right"}
    
    def print_world(self):
        new_vals = np.zeros((5,4))
        for i in range(4):
            for j in range(4):
                new_vals[i,j] = self.worldValues[i,j]
        new_vals[4,2] = self.V_s_17
        print(new_vals)
        
        
    def step(self,action):
        self.currentState = GridWorld_4_2.move(self.currentState, action)
        if self.currentState == 0 or self.currentState == 15:
            reward = 1
        else:
            reward = -1
        return self.currentState, reward
    def get_value(self, state_no):
        if state_no == 17:
            return self.V_s_17
        x, y = GridWorld_4_2.state_coordinates(state_no)
        return self.worldValues[x,y]
    def set_value(self, state_no, value):
        if state_no == 17:
            self.V_s_17 = value
        else:
            x, y = GridWorld_4_2.state_coordinates(state_no)
            self.worldValues[x,y] = value
    
    @staticmethod
    def general_step(state, action):
        new_state = GridWorld_4_2.move(state, action)
        if new_state == 0 or new_state == 15:
            reward = 1
        else:
            reward = -1
        return new_state, reward
    @staticmethod
    def state_number(x,y):
        position = x*4 + y
        return position
    @staticmethod
    def state_coordinates(number):
        y = number % 4
        x = number // 4
        return (x,y)
    @staticmethod
    def move(s,a):
        """
        Here's we define the dynamic of our env.
        """
        # actionNames = {0:"up", 1:"down", 2:"left", 3:"right"}
        ########## Addede Dynamics
        if s == 17 and a == 2: # --> 12
            return 12
        if s == 17 and a == 0: # --> 13
            return 13
        if s == 17 and a == 3: # --> 14
            return 14
        if s == 17 and a == 1: # --> 17
            return 17
        if s == 13 and a == 1: # --> 17
            return 17
        ####
        x, y = GridWorld_4_2.state_coordinates(s)
        if x == 0 and a == 0: # Changing the x-coordinates
            x = 0
        elif x == 3 and a == 1:
            x = 3
        else:
            if a == 0:
                x = x - 1
            elif a == 1:
                x = x + 1
        
        if y == 0 and a == 2: # Changing the y-coordinates
            y = 0
        elif y == 3 and a == 3:
            y = 3
        else:
            if a == 2:
                y = y - 1
            elif a == 3:
                y = y + 1
        return GridWorld_4_2.state_number(x,y)

In [100]:
GridWorld_4_2.general_step(state=13,action=1)
w = GridWorld_4_2()

In [101]:
w.print_world()

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [105]:
def areDeltasLargerThanTheta(d, t, while_counter):
    if while_counter < 2: ## Turning the while to a do..while loop
        return True
    for i in range(d.shape[0]):
        if abs(d[i]) > abs(t):
            return True
    return False
#def iterative_policy_evaluation():
if __name__ == "__main__":
    world = GridWorld_4_2()
    agent = Agent()
    theta = 1e-10 ## Every element in delta will be compared to this value
    deltas = np.zeros(18) # [0, 0 ...]
    while_count = 0
    
    state_numbers = list(np.arange(1,15))
    state_numbers.append(17) # [1, 2, ... 14, 17]
    
    while areDeltasLargerThanTheta(deltas, theta, while_count):
        ################
        k = while_count
        if k == 0 or k == 1 or k == 2 or k == 3 or k % 10 == 0:
            print("K =", k)
            print("--------")
            world.print_world()
        ################
        deltas = np.zeros(18) # [0, 0 ...] Zeroing deltas every time
        while_count+=1
        
        for state in state_numbers:
            v = world.get_value(state) ## v <- V(s)
            new_v = 0
            for action in range(len(world.actions)):                   ## Generating the \sum \pi(a|s)
            #This is an undicounted update (no \gamma)
                new_s, r = GridWorld_4_2.general_step(state, action)       ## p(s',r | s,a) 
                new_v += agent.policy() * (r + world.get_value(new_s)) ## Adding the value of every action to the value of the state
            world.set_value(state, new_v)                              ## V(s) <- Expression
            deltas[state] = max(deltas[state], abs(new_v - v))         ## \delta <- max(\delta, |v - V(s)|)
print("Number of main loops, Last K =", while_count)
print("--------")
world.print_world()

K = 0
--------
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
K = 1
--------
[[ 0.         -0.5        -1.125      -1.28125   ]
 [-0.5        -1.25       -1.59375    -1.71875   ]
 [-1.125      -1.59375    -1.796875   -1.37890625]
 [-1.28125    -1.71875    -1.37890625  0.        ]
 [ 0.          0.         -2.09472656  0.        ]]
K = 2
--------
[[ 0.         -1.21875    -2.3046875  -2.64648438]
 [-1.21875    -2.40625    -3.05664062 -3.20019531]
 [-2.3046875  -3.05664062 -3.21777344 -2.44921875]
 [-2.64648438 -3.29418945 -2.47271729  0.        ]
 [ 0.          0.         -3.62702942  0.        ]]
K = 3
--------
[[ 0.         -1.98242188 -3.49755859 -3.99768066]
 [-1.98242188 -3.51953125 -4.35876465 -4.50146484]
 [-3.49755859 -4.38226318 -4.41574097 -3.34160614]
 [-4.0211792  -4.62579727 -3.37856388  0.        ]
 [ 0.          0.         -4.91314244  0.        ]]
K = 10
--------
[[  0.          -6.49205999 -10.07643369 -11.27478552]
 [ -6.50667112  -9.255473