In [None]:
import numpy as np
import random

class GridWorld:
    """Square grid environment with random obstacles, a start cell and a goal cell.

    Cell codes stored in ``grid``: 0 = free, -1 = obstacle, 1 = start, 2 = goal.
    States are ``(x, y)`` tuples where ``x`` indexes the first grid axis and
    ``y`` the second.

    Actions understood by :meth:`step`:
        0: x - 1, 1: x + 1, 2: y - 1, 3: y + 1.
    """

    # Coordinate change applied by each action id.
    _MOVES = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1)}

    # Number of obstacle layouts tried before giving up on finding one in
    # which the goal can be reached from the start.
    _MAX_LAYOUT_ATTEMPTS = 100

    def __init__(self, size=100, num_obstacles=1000, seed=None):
        """Build a random world.

        Args:
            size: Side length of the grid; must be at least 2 so that start
                and goal can be different cells.
            num_obstacles: Exact number of distinct obstacle cells to place.
                Must be between 0 and ``size * size - 2`` (start and goal are
                never obstacles).
            seed: Optional seed for a private ``random.Random``. When ``None``
                the module-level ``random`` generator is used, so a global
                ``random.seed(...)`` still controls the layout.

        Raises:
            ValueError: If ``size`` or ``num_obstacles`` is out of range.
            RuntimeError: If no layout with a reachable goal was found within
                ``_MAX_LAYOUT_ATTEMPTS`` tries (obstacle density too high).
        """
        if size < 2:
            # With a single cell the goal could never differ from the start
            # and the re-draw loop below would never terminate.
            raise ValueError(f"size must be at least 2, got {size}")
        placeable = size * size - 2
        if not 0 <= num_obstacles <= placeable:
            raise ValueError(
                f"num_obstacles must be between 0 and {placeable}, got {num_obstacles}"
            )

        rng = random if seed is None else random.Random(seed)

        self.size = size
        self.start = (rng.randint(0, size - 1), rng.randint(0, size - 1))
        self.goal = (rng.randint(0, size - 1), rng.randint(0, size - 1))
        while self.goal == self.start:
            self.goal = (rng.randint(0, size - 1), rng.randint(0, size - 1))

        # Every cell that may hold an obstacle (all but start and goal).
        candidates = [
            (x, y)
            for x in range(size)
            for y in range(size)
            if (x, y) != self.start and (x, y) != self.goal
        ]

        for _ in range(self._MAX_LAYOUT_ATTEMPTS):
            grid = np.zeros((size, size))
            # sample() draws without replacement, so exactly num_obstacles
            # distinct cells are blocked.
            for cell in rng.sample(candidates, num_obstacles):
                grid[cell] = -1
            # A walled-off goal would make every episode endless; redraw.
            if self._goal_reachable(grid):
                break
        else:
            raise RuntimeError(
                f"could not place {num_obstacles} obstacles on a {size}x{size} grid "
                "while keeping the goal reachable from the start"
            )

        grid[self.start] = 1
        grid[self.goal] = 2
        self.grid = grid

        self.current_state = self.start

    def _goal_reachable(self, grid):
        """Return True if ``self.goal`` can be reached from ``self.start`` on ``grid``.

        Iterative flood fill over the four-neighbourhood, skipping obstacle
        cells (value -1).
        """
        visited = {self.start}
        frontier = [self.start]
        while frontier:
            x, y = frontier.pop()
            if (x, y) == self.goal:
                return True
            for nx, ny in ((x - 1, y), (x + 1, y), (x, y - 1), (x, y + 1)):
                inside = 0 <= nx < self.size and 0 <= ny < self.size
                if inside and (nx, ny) not in visited and grid[nx, ny] != -1:
                    visited.add((nx, ny))
                    frontier.append((nx, ny))
        return False

    def reset(self):
        """Move the agent back to the start cell and return that state."""
        self.current_state = self.start
        return self.current_state

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done)``.

        Moves are clamped to the grid, so walking into a border leaves the
        agent in place. An action id outside 0-3 also leaves the agent in
        place and is scored like an ordinary move.

        Returns:
            tuple: ``(state, reward, done)`` where
                * moving into an obstacle keeps the old state, reward -1;
                * arriving at the goal gives reward 100 and ``done=True``;
                * any other move gives reward -0.1.
        """
        x, y = self.current_state
        dx, dy = self._MOVES.get(action, (0, 0))
        # Clamp to the grid so border moves are no-ops rather than errors.
        x = min(max(x + dx, 0), self.size - 1)
        y = min(max(y + dy, 0), self.size - 1)

        if self.grid[x, y] == -1:
            # Blocked: the agent does not move and pays a larger penalty.
            return self.current_state, -1, False

        self.current_state = (x, y)

        if self.current_state == self.goal:
            return self.current_state, 100, True

        return self.current_state, -0.1, False

In [2]:
class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    ``Q`` maps each state seen so far to a numpy array holding one value per
    entry of ``action_space`` (same order). Rows are created lazily, filled
    with zeros, the first time a state is looked up.
    """

    def __init__(self, state_space, action_space, learning_rate=0.1, discount_factor=0.99, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01):
        """Store the hyper-parameters and start with an empty Q-table.

        Args:
            state_space: Stored as given; not used by the agent itself.
            action_space: Sequence of hashable actions the agent may return.
            learning_rate: Step size applied to the TD error.
            discount_factor: Weight of the bootstrapped next-state value.
            epsilon: Initial probability of picking a random action.
            epsilon_decay: Factor applied to ``epsilon`` by :meth:`decay_epsilon`.
            epsilon_min: Lower bound for ``epsilon``.
        """
        self.state_space = state_space
        self.action_space = action_space
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.Q = {}
        # Column of the Q row that belongs to each action, so actions do not
        # have to be the integers 0..n-1.
        self._action_index = {action: column for column, action in enumerate(action_space)}

    def _q_row(self, state):
        """Return the Q-value array for ``state``, creating a zero row if needed."""
        row = self.Q.get(state)
        if row is None:
            row = self.Q[state] = np.zeros(len(self.action_space))
        return row

    def get_action(self, state):
        """Pick an action epsilon-greedily: random with probability ``epsilon``."""
        if np.random.rand() <= self.epsilon:
            return random.choice(self.action_space)
        return self.get_best_action(state)

    def get_best_action(self, state):
        """Return the action from ``action_space`` with the highest Q-value.

        Ties are broken uniformly at random. ``np.argmax`` would always pick
        the first action, which makes every unseen (all-zero) state greedily
        choose ``action_space[0]``.
        """
        row = self._q_row(state)
        best_columns = np.flatnonzero(row == row.max()).tolist()
        return self.action_space[random.choice(best_columns)]

    def update_q_value(self, state, action, reward, next_state):
        """Apply one Q-learning update for the transition.

        ``Q[s][a] += lr * (reward + gamma * max(Q[s']) - Q[s][a])``

        Raises:
            KeyError: If ``action`` is not an element of ``action_space``.
        """
        row = self._q_row(state)
        column = self._action_index[action]
        # Read the bootstrap value before writing: state and next_state may
        # be the same row (e.g. after bumping into a wall).
        td_target = reward + self.discount_factor * self._q_row(next_state).max()
        td_error = td_target - row[column]
        row[column] += self.learning_rate * td_error

    def decay_epsilon(self):
        """Multiply ``epsilon`` by ``epsilon_decay``, never going below ``epsilon_min``."""
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

In [None]:
def train_agent(env, agent, episodes=1000, max_steps=10_000):
    """Train ``agent`` on ``env`` with one Q-update per environment step.

    Args:
        env: Environment exposing ``reset() -> state`` and
            ``step(action) -> (next_state, reward, done)``.
        agent: Agent exposing ``get_action(state)``,
            ``update_q_value(state, action, reward, next_state)`` and
            ``decay_epsilon()``.
        episodes: Number of episodes to run.
        max_steps: Upper bound on steps per episode. An episode that has not
            finished by then is cut off, so a goal that is never found cannot
            hang training. Pass ``None`` for no limit.

    Progress is printed every 100 episodes.
    """
    for episode in range(episodes):
        state = env.reset()
        done = False
        steps_taken = 0
        while not done and (max_steps is None or steps_taken < max_steps):
            action = agent.get_action(state)
            next_state, reward, done = env.step(action)
            agent.update_q_value(state, action, reward, next_state)
            state = next_state
            steps_taken += 1
        # Exploration is reduced once per episode, finished or truncated.
        agent.decay_epsilon()
        if episode % 100 == 0:
            print(f"Episode {episode} completed")

# Seed both generators used by GridWorld and QLearningAgent (`random` and
# `np.random`) so that Restart & Run All rebuilds the same grid and repeats
# the same training run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

env = GridWorld()
agent = QLearningAgent(state_space=env.size**2, action_space=[0, 1, 2, 3])

train_agent(env, agent)

Episode 0 completed
Episode 100 completed
Episode 200 completed
Episode 300 completed
Episode 400 completed
Episode 500 completed
Episode 600 completed
Episode 700 completed
Episode 800 completed
Episode 900 completed


In [None]:
def test_agent(env, agent):
    state = env.reset()
    done = False
    while not done:
        action = agent.get_best_action(state)
        state, _, done = env.step(action)
        print(f"Current state: {state}")
    print("Goal reached!")

# Evaluate the trained agent: roll out its greedy policy (get_best_action,
# no epsilon exploration) from env's start state, printing each state visited.
test_agent(env, agent)

The test code ran for more than 1.5 hours and slowed down the system.