In [23]:


import gridworlds           # import to trigger registration of the environment
import gymnasium as gym
import numpy as np

# create instance
env = gym.make("gridworld-v0")
env.reset()

# test example
sum_rewards = 0
for i in range(10000):
    _, rew, _, _, _ = env.step(env.action_space.sample())
    sum_rewards += rew

print("Summed rewards over 10.000 episodes: ", sum_rewards)
env.reset()


Summed rewards over 10.000 episodes:  -117.0


(array([0, 0]), {})

In [24]:
class Policy:
    def __init__(self, env):
        self.policy = np.full((5, 5, 4), 0.25)
        print(self.policy)

    def act(self, state):
        return np.random.choice(4, p=self.policy[state[0], state[1]])
    
    def update(self, state, action):
        self.policy[state[0], state[1]] = np.zeros(4)
        self.policy[state[0], state[1], action] = 1
        
p = Policy(env)


[[[0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]]

 [[0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]]

 [[0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]]

 [[0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]]

 [[0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]]]


In [25]:
UP = 0
RIGHT = 1
DOWN = 2
LEFT = 3
def step_simulation(state, action: np.integer) -> tuple[np.ndarray, float, bool, bool, dict]:
       
        row, col = state  # Current position
        reward = get_reward(state, action)  # Calculate reward

        # Check for teleportation conditions based on special grid positions
        if row == 0 and col == 1:
            row, col = [4, 1]  # Teleport from (0, 1) to (4, 1)
        elif row == 0 and col == 3:
            row, col = [2, 3]  # Teleport from (0, 3) to (2, 3)
        else:
            # Update position based on action
            if action == UP:
                row = max(row - 1, 0)
            elif action == DOWN:
                row = min(row + 1, 5 - 1)
            elif action == RIGHT:
                col = min(col + 1, 5 - 1)
            elif action == LEFT:
                col = max(col - 1, 0)

        new_state = np.array([row, col])  # New position after action
        

        return new_state, reward, False, False, {}

def get_reward(state: np.ndarray, action: np.integer) -> float:
        row, col = state  # Current position
        # Define the size of the grid and initialize the reward matrix
        n = 5
        reward_matrix = np.zeros([n, n])
        reward_matrix[0, 1] = 10  # Positive reward at position (0, 1)
        reward_matrix[0, 3] = 5  # Positive reward at position (0, 3)
        reward = reward_matrix[row, col]  # Base reward from reward matrix

        # Apply penalty if attempting to move outside borders
        if at_border(state) and reward == 0:
            if row == 0 and action == UP:
                reward = -1.0
            if row == 5 - 1 and action == DOWN:
                reward = -1.0
            if col == 0 and action == LEFT:
                reward = -1.0
            if col == 5 - 1 and action == RIGHT:
                reward = -1.0

        return reward

def at_border(state) -> bool:
    # Check if the agent is at the border of the grid
    row, col = state
    return row == 0 or row == 5 - 1 or col == 0 or col == 5 - 1



In [26]:
MAX_RANGE = 1000
class PolicyEvaluation:
    def __init__(self, env, policy, gamma=0.9, theta=0.0001):
        self.env = env
        self.policy = policy
        self.gamma = gamma
        self.theta = theta
      
        
    def evaluate(self):
        delta = 1
        for j in range(MAX_RANGE):
            V = np.zeros((5, 5))
            if delta < self.theta:
                return V
            delta = 0
            for x in range(5):
                for y in range(5):
                    state = (x, y)
                    v=V[x,y]
                    for i in range(self.policy.policy.shape[2]):
                        next_state, reward, _, _, _  = step_simulation(state, i)
                        V[x, y] += self.policy.policy[x, y, i] * (reward + self.gamma * V[next_state[0], next_state[1]])
                    delta=max(delta, abs(v-V[x,y]))
        return V
    

In [27]:
Evaluation = PolicyEvaluation(env, p)
print(Evaluation.evaluate())

[[-0.55625    10.          2.          5.          0.56875   ]
 [-0.40331641  2.15925381  0.93583211  1.33556222  0.20726322]
 [-0.36116408  0.40457019  0.30159052  0.36835937 -0.10999222]
 [-0.34954585  0.01238048  0.07064347  0.09877564 -0.25809209]
 [-0.67427088 -0.39829858 -0.32014607 -0.29480783 -0.70972414]]
