In [1]:
import numpy as np
import random

In [2]:
class GridGenerator:
    """Random grid world surrounded by a one-cell wall.

    The world is stored in ``self.grid`` as a float array of shape
    ``(grid_height + 2, grid_length + 2)`` with these cell codes:

    * ``-1`` -- wall (the outermost rows and columns)
    * ``0``  -- free cell
    * ``1``  -- forbidden cell
    * ``2``  -- target cell

    Parameters
    ----------
    grid_height, grid_length : int
        Size of the interior, walls excluded.
    forbidden_num : int, optional
        Number of forbidden cells. ``None`` (the default) selects
        ``grid_height * grid_length // 3``; an explicit ``0`` is honoured.
    target_num : int
        Number of target cells.

    Forbidden and target cells are drawn without replacement, so the grid
    holds exactly the requested number of each, clamped to the number of
    cells that are actually available.
    """

    def __init__(self, grid_height, grid_length, forbidden_num=None, target_num=1):
        self.grid_height, self.grid_length = grid_height, grid_length
        self.target_num = target_num
        # Compare against None: a truthiness test would silently replace an
        # explicit forbidden_num=0 with the default.
        if forbidden_num is None:
            forbidden_num = grid_height * grid_length // 3
        self.forbidden_num = forbidden_num

        self.grid = np.zeros((grid_height + 2, grid_length + 2))
        # Mark the outer frame as wall.
        self.grid[0, :] = self.grid[-1, :] = -1
        self.grid[:, 0] = self.grid[:, -1] = -1

        self._generate_forbidden()
        self._generate_target()

    def _interior_cells(self, code):
        """Return the interior ``(row, col)`` positions currently holding ``code``."""
        return [
            (row, col)
            for row in range(1, self.grid_height + 1)
            for col in range(1, self.grid_length + 1)
            if self.grid[row, col] == code
        ]

    def _generate_forbidden(self):
        """Mark ``forbidden_num`` distinct free interior cells as forbidden (1)."""
        free = self._interior_cells(0)
        count = min(max(self.forbidden_num, 0), len(free))
        for row, col in random.sample(free, count):
            self.grid[row, col] = 1

    def _generate_target(self):
        """Mark ``target_num`` distinct interior cells as targets (2).

        Free cells are used first; forbidden cells are overwritten only when
        there are not enough free cells left.
        """
        free = self._interior_cells(0)
        blocked = self._interior_cells(1)
        # random.sample(seq, len(seq)) yields a shuffled copy of seq.
        candidates = random.sample(free, len(free)) + random.sample(blocked, len(blocked))
        for row, col in candidates[:max(self.target_num, 0)]:
            self.grid[row, col] = 2

In [3]:
class Action:
    """Deterministic transition and reward model for a walled grid.

    ``grid`` must expose ``grid_height``, ``grid_length`` and a 2-D array
    ``grid`` of cell codes. Calling the instance with a position and an
    action symbol returns ``(next_row, next_col, reward)``.
    """

    # (symbol, (row_delta, col_delta)); any other symbol means "stay put".
    _MOVES = (
        ('⬅️', (0, -1)),
        ('➡️', (0, 1)),
        ('⬆️', (-1, 0)),
        ('⬇️', (1, 0)),
    )

    # (cell code, reward); any other code is worth 0.
    _REWARDS = ((-1, -100), (1, -300), (2, 500))

    def __init__(self, grid):
        self.grid = grid

    def __call__(self, position, action):
        """Apply ``action`` at ``position``.

        The reward is always that of the cell the move points at. If that
        cell lies outside the interior the agent stays where it is but still
        receives the reward of the cell it tried to enter.
        """
        row, col = position
        for symbol, (d_row, d_col) in self._MOVES:
            if action == symbol:
                break
        else:
            # Unrecognised symbol: remain in place.
            return (row, col, self.value_func((row, col)))

        nxt = (row + d_row, col + d_col)
        reward = self.value_func(nxt)
        if self._check_boundary(*nxt):
            return (nxt[0], nxt[1], reward)
        return (row, col, reward)

    def _check_boundary(self, x, y):
        """Return True when ``(x, y)`` is an interior (non-wall) coordinate."""
        return 0 < x <= self.grid.grid_height and 0 < y <= self.grid.grid_length

    def value_func(self, end):
        """Return the immediate reward for ending up on cell ``end``."""
        x, y = end
        end_status = self.grid.grid[x, y]
        for code, reward in self._REWARDS:
            if end_status == code:
                return reward
        return 0

In [4]:
class Policy:
    """Table of one action symbol per grid cell, initialised at random.

    ``policy_grid`` has the same shape as the padded world
    (``grid_height + 2`` by ``grid_length + 2``). Cells whose world code is
    ``-1`` carry a marker symbol; every other cell gets a random action.
    """

    def __init__(self, grid):
        self.grid = grid
        padded_shape = (self.grid.grid_height + 2, self.grid.grid_length + 2)
        self.policy_grid = np.empty(padded_shape, dtype='U5')
        self._random_policy()

    def _random_policy(self):
        """Fill ``policy_grid`` in row-major order, one random draw per non-wall cell."""
        candidates = ['⬅️', '➡️', '⬆️', '⬇️', '⭕️']
        world = self.grid.grid
        for row, col in np.ndindex(self.grid.grid_height + 2, self.grid.grid_length + 2):
            if world[row, col] == -1:
                symbol = '❌'
            else:
                symbol = random.choice(candidates)
            self.policy_grid[row, col] = symbol

In [5]:
class GridPolicyIteration:
    """Policy iteration on a randomly generated grid world.

    Parameters
    ----------
    grid_height, grid_length, forbidden_num, target_num
        Forwarded unchanged to ``GridGenerator``.
    gamma : float
        Discount applied to the value of the successor cell.
    theta : float
        Evaluation stops once the summed absolute change of the value table
        between two consecutive sweeps is no larger than this.

    Attributes
    ----------
    state_value_aft : numpy.ndarray or None
        Latest value estimate (``None`` until the first evaluation).
    state_value_bef : numpy.ndarray or None
        Value estimate from the sweep before ``state_value_aft``.
    """

    # Candidate actions, in the order ties are broken (first one wins).
    ACTIONS = ('⬅️', '➡️', '⬆️', '⬇️', '⭕️')

    def __init__(self, grid_height, grid_length, forbidden_num=None, target_num=1, gamma=0.95, theta=0.1):
        self.state_value_bef, self.state_value_aft = None, None
        self.grid = GridGenerator(grid_height, grid_length, forbidden_num, target_num)
        self.action_func = Action(self.grid)
        self.policy = Policy(self.grid)

        self.gamma = gamma
        self.theta = theta

    def _init_state_value(self):
        """Reset both value tables; they start different so the first sweep always runs."""
        shape = (self.grid.grid_height + 2, self.grid.grid_length + 2)
        self.state_value_bef = np.zeros(shape)
        self.state_value_aft = np.ones(shape)

    def policy_evaluation(self):
        """Estimate the value of the current policy by repeated full sweeps.

        Each sweep updates every interior cell from the previous sweep's
        table. Wall cells are never updated.
        NOTE(review): termination relies on successive sweeps getting closer;
        presumably this requires gamma < 1 -- confirm before using gamma >= 1.
        """
        self._init_state_value()

        while np.sum(np.abs(self.state_value_bef - self.state_value_aft)) > self.theta:
            self.state_value_bef = self.state_value_aft.copy()
            for i in range(1, self.grid.grid_height + 1):
                for j in range(1, self.grid.grid_length + 1):
                    x, y, immediate_reward = self.action_func((i, j), self.policy.policy_grid[i, j])
                    self.state_value_aft[i, j] = immediate_reward + self.gamma * self.state_value_bef[x, y]

    def policy_improvement(self):
        """Make the policy greedy with respect to the latest value estimate.

        Uses ``state_value_aft`` (the final result of evaluation) rather than
        the one-sweep-older ``state_value_bef``. If no evaluation has been run
        yet, one is performed first instead of failing on a ``None`` table.
        """
        if self.state_value_aft is None:
            self.policy_evaluation()
        values = self.state_value_aft

        for i in range(1, self.grid.grid_height + 1):
            for j in range(1, self.grid.grid_length + 1):
                best_action, max_action_value = None, -np.inf
                for action in self.ACTIONS:
                    x, y, immediate_reward = self.action_func((i, j), action)
                    action_value = immediate_reward + self.gamma * values[x, y]
                    # Strict comparison: the earliest action wins ties.
                    if action_value > max_action_value:
                        best_action, max_action_value = action, action_value
                if best_action is not None:
                    self.policy.policy_grid[i, j] = best_action
                

In [6]:
# Build a 20 x 12 world with two targets and start from its random policy.
grid_RL = GridPolicyIteration(20, 12, gamma=0.87, target_num=2)
new_policy = grid_RL.policy.policy_grid.copy()

# Alternate evaluation and improvement until a round leaves the policy unchanged.
while True:
    old_policy = new_policy
    grid_RL.policy_evaluation()
    grid_RL.policy_improvement()
    new_policy = grid_RL.policy.policy_grid.copy()
    if np.array_equal(old_policy, new_policy):
        break

print(grid_RL.grid.grid)
print(grid_RL.policy.policy_grid)
    

[[-1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.]
 [-1.  1.  0.  0.  0.  1.  0.  0.  0.  0.  0.  1.  0. -1.]
 [-1.  1.  0.  0.  1.  1.  0.  0.  0.  0.  0.  0.  1. -1.]
 [-1.  0.  0.  0.  0.  2.  0.  1.  0.  0.  1.  0.  0. -1.]
 [-1.  1.  1.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0. -1.]
 [-1.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0. -1.]
 [-1.  1.  0.  0.  0.  0.  0.  1.  1.  1.  0.  0.  0. -1.]
 [-1.  1.  0.  0.  0.  1.  0.  1.  0.  1.  0.  1.  0. -1.]
 [-1.  0.  0.  0.  0.  0.  0.  0.  1.  1.  1.  1.  0. -1.]
 [-1.  1.  0.  1.  0.  1.  0.  0.  1.  1.  0.  0.  1. -1.]
 [-1.  0.  0.  0.  1.  1.  0.  0.  1.  0.  0.  0.  0. -1.]
 [-1.  0.  0.  0.  0.  0.  0.  1.  1.  0.  1.  1.  1. -1.]
 [-1.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. -1.]
 [-1.  0.  0.  0.  0.  1.  1.  0.  1.  1.  2.  1.  0. -1.]
 [-1.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0. -1.]
 [-1.  0.  0.  1.  0.  0.  0.  1.  1.  0.  0.  0.  0. -1.]
 [-1.  0.  1.  0.  0.  1.  0.  0.  0.  0.  1.  1.  1. -1