In [20]:
pip install rlgridworld

Collecting rlgridworld
  Downloading rlgridworld-0.1004-py3-none-any.whl (5.9 kB)
Collecting gym>=0.26.2 (from rlgridworld)
  Downloading gym-0.26.2.tar.gz (721 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m721.7/721.7 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: gym
  Building wheel for gym (pyproject.toml) ... [?25l[?25hdone
  Created wheel for gym: filename=gym-0.26.2-py3-none-any.whl size=827620 sha256=4c08d8840f57a713c3864f478ee89d63648ea65b7b2837fce06492f87829e659
  Stored in directory: /root/.cache/pip/wheels/b9/22/6d/3e7b32d98451b4cd9d12417052affbeeeea012955d437da1da
Successfully built gym
Installing collected packages: gym, rlgridworld
  Attempting uninstall: gym
    Found existing installation: gym 0.25.2
    Uninstalling gym-0.25.2:
    

In [26]:
class Grid:
    def __init__(self, width, height, start):
        self.width = width
        self.height = height
        self.i = start[0]
        self.j = start[1]

    def set(self, rewards, actions):
        self.rewards = rewards
        self.actions = actions

    def set_state(self, s):
        self.i = s[0]
        self.j = s[1]

    def current_state(self):
        return (self.i, self.j)

    def is_terminal(self, s):
        return s not in self.actions

    def game_over(self):
        return (self.i, self.j) not in self.actions

    def move(self, action):
        x = self.rewards.get((self.i, self.j), -1)
        if action in self.actions[self.i, self.j]:
            if action == 'U' and self.i != 0:
                self.i -= 1
            elif action == 'D' and self.i != self.height - 1:
                self.i += 1
            elif action == 'R' and self.j != self.width - 1:
                self.j += 1
            elif action == 'L' and self.j != 0:
                self.j -= 1
        return x

    def all_states(self):
        return set(list(self.actions.keys()) + list(self.rewards.keys()))

    def undo_move(self, action):
        if action == 'U':
            self.i += 1
        elif action == 'D':
            self.i -= 1
        elif action == 'R':
            self.j -= 1
        elif action == 'L':
            self.j += 1

def grid():
    grd = Grid(4, 4, (0, 0))
    rewards = {(3, 3): 0, (0, 0): 0}
    actions = {
        (0, 0): ('D','R'),
        (0, 1): ('L','D','R'),
        (0, 2): ('L','D', 'R'),
        (0, 3): ('L','D','R'),
        (1, 0): ('U', 'D', 'R'),
        (1, 1): ('L', 'U', 'D', 'R'),
        (1, 2): ('L', 'U', 'D', 'R'),
        (1, 3): ('L', 'U', 'D'),
        (2, 0): ('U', 'D', 'R'),
        (2, 1): ('L', 'U', 'D', 'R'),
        (2, 2): ('L', 'U', 'D', 'R'),
        (2, 3): ('L', 'U', 'D'),
        (3, 0): ('L', 'U', 'D', 'R'),
        (3, 1): ('L', 'U', 'R'),
        (3, 2): ('L', 'U',  'R'),
        (3, 3): ('L', 'U',  'R'),

    }
    grd.set(rewards, actions)
    return grd


In [28]:
def iterative_policy_evaluation(grid, policy, discount_factor=1.0, theta=0.0001):
    V = {state: 0 for state in grid.all_states()}
    while True:
        delta = 0
        for state in grid.all_states():
            v = V[state]
            if not grid.is_terminal(state):
                action = policy[state]
                next_state = grid.move(action)
                reward = grid.rewards.get(next_state, 0)
                if next_state in V:  # Ensure next_state exists in V
                    V[state] = reward + discount_factor * V[next_state]
                    delta = max(delta, abs(v - V[state]))
                grid.undo_move(action)
        if delta < theta:
            break
    return V


In [29]:
grid_world = grid()
random_policy = {(i, j): np.random.choice(grid_world.actions[(i, j)]) for i in range(4) for j in range(4)}
values = iterative_policy_evaluation(grid_world, random_policy)
print("Values:", values)

Values: {(0, 1): 0, (1, 2): 0, (2, 1): 0, (0, 0): 0, (3, 1): 0, (1, 1): 0, (0, 3): 0, (2, 0): 0, (3, 0): 0, (2, 3): 0, (0, 2): 0, (3, 3): 0, (2, 2): 0, (1, 0): 0, (3, 2): 0, (1, 3): 0}
