In [None]:
import numpy as np
import random

class GridWorld:
    """Square grid world with a random start cell, goal cell and obstacles.

    Cell codes stored in ``grid``: 0 = free, -1 = obstacle, 1 = start, 2 = goal.

    Actions understood by ``step``: 0 -> x - 1, 1 -> x + 1, 2 -> y - 1,
    3 -> y + 1. Moves are clamped at the border; any other action value
    leaves the position unchanged.

    Notes:
        * Obstacle positions are drawn with replacement, so the number of
          distinct obstacle cells may be smaller than ``num_obstacles``.
        * Nothing here checks that the goal is reachable from the start.
    """

    def __init__(self, size=100, num_obstacles=1000):
        """Build a ``size`` x ``size`` grid using the global ``random`` module.

        Args:
            size: Side length of the grid; must be at least 2 so that start
                and goal can be different cells.
            num_obstacles: Number of obstacle draws (duplicates possible).

        Raises:
            ValueError: If ``size`` is smaller than 2.
        """
        # With a single cell the goal can never differ from the start and the
        # re-draw loop below would never terminate.
        if size < 2:
            raise ValueError("size must be at least 2 so start and goal can differ")

        self.size = size
        self.grid = np.zeros((size, size))
        self.start = (random.randint(0, size - 1), random.randint(0, size - 1))
        self.goal = (random.randint(0, size - 1), random.randint(0, size - 1))

        # Re-draw the goal until it is a different cell than the start.
        while self.goal == self.start:
            self.goal = (random.randint(0, size - 1), random.randint(0, size - 1))

        for _ in range(num_obstacles):
            x, y = random.randint(0, size - 1), random.randint(0, size - 1)
            # Never place an obstacle on the start or goal cell.
            while (x, y) == self.start or (x, y) == self.goal:
                x, y = random.randint(0, size - 1), random.randint(0, size - 1)
            self.grid[x, y] = -1

        self.grid[self.start] = 1
        self.grid[self.goal] = 2
        self.current_state = self.start

    def reset(self):
        """Move the agent back to the start cell and return that cell."""
        self.current_state = self.start
        return self.current_state

    def step(self, action):
        """Apply ``action`` from the current cell.

        Returns:
            A ``(state, reward, done)`` tuple:
            * target cell is an obstacle: the agent stays put, reward -1;
            * target cell is the goal: reward 100 and ``done`` is True;
            * otherwise: reward -0.1.
        """
        x, y = self.current_state
        if action == 0:
            x = max(0, x - 1)
        elif action == 1:
            x = min(self.size - 1, x + 1)
        elif action == 2:
            y = max(0, y - 1)
        elif action == 3:
            y = min(self.size - 1, y + 1)

        # Bumping into an obstacle costs more than a normal move and does not
        # change the agent's position.
        if self.grid[x, y] == -1:
            return self.current_state, -1, False

        self.current_state = (x, y)

        if self.current_state == self.goal:
            return self.current_state, 100, True

        return self.current_state, -0.1, False

In [None]:
class ValueIteration:
    """Tabular value iteration for a grid environment.

    The environment is expected to expose ``size``, ``grid`` (with -1 marking
    obstacle cells), ``goal``, ``current_state`` and a
    ``step(action) -> (next_state, reward, done)`` method that acts from
    ``current_state``.

    Attributes:
        V: ``(size, size)`` array of state values.
        policy: ``(size, size)`` integer array holding the greedy action
            (0-3) for every non-obstacle cell.
    """

    def __init__(self, env, discount_factor=0.99, theta=1e-4):
        """Store the environment and allocate zeroed value/policy tables.

        Args:
            env: Environment to plan for.
            discount_factor: Discount applied to the value of the next state.
            theta: Convergence threshold on the largest per-sweep value change.
        """
        self.env = env
        self.discount_factor = discount_factor
        self.theta = theta
        self.V = np.zeros((env.size, env.size))
        self.policy = np.zeros((env.size, env.size), dtype=int)

    def _simulate(self, state, action):
        """Return ``env.step(action)`` as if the agent stood on ``state``.

        ``env.step`` always acts from ``env.current_state``, so the
        environment is temporarily placed on ``state`` and put back
        afterwards; planning therefore never moves the real agent.
        """
        saved_state = self.env.current_state
        self.env.current_state = state
        try:
            return self.env.step(action)
        finally:
            self.env.current_state = saved_state

    def _action_values(self, state):
        """Return the one-step look-ahead value of each of the 4 actions."""
        values = []
        for action in range(4):
            (next_x, next_y), reward, done = self._simulate(state, action)
            # A terminal transition has no future: the goal reward is already
            # contained in ``reward`` and must not be counted a second time.
            future = 0.0 if done else self.discount_factor * self.V[next_x, next_y]
            values.append(reward + future)
        return values

    def value_iteration(self, max_iterations=None):
        """Sweep all states in place until the values converge, then extract the policy.

        Args:
            max_iterations: Optional cap on the number of sweeps. ``None``
                (the default) iterates until the largest change in a sweep
                drops below ``theta``.
        """
        sweeps = 0
        while max_iterations is None or sweeps < max_iterations:
            delta = 0.0
            for x in range(self.env.size):
                for y in range(self.env.size):
                    if self.env.grid[x, y] == -1:  # Obstacle
                        continue
                    old_value = self.V[x, y]
                    self.V[x, y] = self.get_best_value((x, y))
                    delta = max(delta, abs(old_value - self.V[x, y]))
            sweeps += 1
            if delta < self.theta:
                break
        self.extract_policy()

    def get_best_value(self, state):
        """Return the Bellman-optimal backup for ``state``.

        Obstacle cells are worth 0 and the goal cell is pinned to 100; every
        other cell gets the best one-step look-ahead value over the 4 actions.
        """
        x, y = state
        if self.env.grid[x, y] == -1:  # Obstacle
            return 0
        if (x, y) == self.env.goal:
            return 100
        return max(self._action_values((x, y)))

    def extract_policy(self):
        """Fill ``policy`` with the greedy action for every non-obstacle cell.

        Ties are resolved in favour of the lowest action index.
        """
        for x in range(self.env.size):
            for y in range(self.env.size):
                if self.env.grid[x, y] == -1:
                    continue
                # np.argmax returns the first maximum, i.e. the lowest action.
                self.policy[x, y] = int(np.argmax(self._action_values((x, y))))

In [None]:
def test_policy(env, policy):
    state = env.reset()
    done = False
    while not done:
        x, y = state
        action = policy[x, y]
        state, _, done = env.step(action)
        print(f"Current state: {state}")
    print("Goal reached!")

# Seed the RNG so the randomly generated world (start, goal, obstacles) and
# therefore the whole run is reproducible under Restart & Run All.
SEED = 42
random.seed(SEED)

# Build the default world.
env = GridWorld()

# Plan with value iteration, then roll out the resulting greedy policy.
vi = ValueIteration(env)

vi.value_iteration()

test_policy(env, vi.policy)

This ran for 1.5 hours and crashed the computer.