In [62]:
import gym
import numpy as np
import pickle

# Define the maze environment
class MazeEnv(gym.Env):
    """Grid maze in which an agent walks from ``start`` to ``goal``.

    The maze is indexed as ``maze[row, col]`` and must expose ``shape``; a cell
    whose value equals 1 is a wall. States are ``(row, col)`` tuples.

    Actions: 0 = up (row - 1), 1 = down (row + 1), 2 = left (col - 1),
    3 = right (col + 1). Moves that would leave the grid are clamped to the
    border, so the agent stays where it is.

    Args:
        maze: 2-D array-like supporting ``.shape`` and tuple indexing.
        start: ``(row, col)`` the agent is placed on by ``reset()``.
        goal: ``(row, col)`` that ends the episode when entered.
    """
    metadata = {'render.modes': ['human']}

    def __init__(self, maze, start=(0, 0), goal=(4, 4)):
        self.maze = maze
        self.start = tuple(start)
        self.goal = tuple(goal)
        # Define the state up front so step() works even if the caller has not
        # called reset() yet (previously that raised AttributeError).
        self.state = self.start

    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward, done, info)``.

        Rewards: -1 when the target cell is a wall (the agent does not move),
        1 when the goal is entered (``done`` becomes True), 0 otherwise.

        Raises:
            ValueError: if ``action`` is not one of 0, 1, 2, 3.
        """
        row, col = self.state
        last_row = self.maze.shape[0] - 1
        last_col = self.maze.shape[1] - 1

        if action == 0:  # move up
            next_state = (max(row - 1, 0), col)
        elif action == 1:  # move down
            next_state = (min(row + 1, last_row), col)
        elif action == 2:  # move left
            next_state = (row, max(col - 1, 0))
        elif action == 3:  # move right
            next_state = (row, min(col + 1, last_col))
        else:
            # Without this branch an unknown action left next_state unbound
            # and surfaced as a confusing UnboundLocalError.
            raise ValueError(f"invalid action {action!r}; expected 0, 1, 2 or 3")

        reward = 0
        done = False

        if self.maze[next_state] == 1:
            # Walls are impassable: stay put and penalise the attempt.
            next_state = self.state
            reward = -1
        elif next_state == self.goal:
            reward = 1
            done = True

        self.state = next_state
        return next_state, reward, done, {}

    def reset(self):
        """Move the agent back to ``start`` and return that state."""
        self.state = self.start
        return self.state

    def render(self, mode='human'):
        """Rendering is not implemented; this is a no-op."""
        pass

In [63]:
# Define the Q-learning algorithm
class QLearning:
    """Tabular Q-learning agent for a grid environment with four actions.

    The environment must provide ``maze.shape`` (rows, cols), ``reset()``
    returning a hashable ``(row, col)`` state, and ``step(action)`` returning
    ``(next_state, reward, done, info)``.

    Args:
        env: the environment to learn in.
        alpha: learning rate of the Q-value update.
        gamma: discount factor applied to the next state's best Q-value.
        epsilon: probability of picking a uniformly random action in training.
        epochs: number of outer training rounds.
        episodes: number of episodes per epoch (``epochs * episodes`` in total).
    """

    def __init__(self, env, alpha=0.1, gamma=0.9, epsilon=0.1, epochs=1000, episodes=50):
        self.env = env
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epochs = epochs
        self.episodes = episodes
        # One row of four action values per grid cell, all starting at zero.
        rows, cols = env.maze.shape[0], env.maze.shape[1]
        self.q_table = {
            (i, j): [0.0] * 4
            for i in range(rows)
            for j in range(cols)
        }

    def train(self):
        """Run ``epochs * episodes`` epsilon-greedy episodes, updating the Q-table.

        Each episode runs until the environment reports ``done``.
        """
        for _ in range(self.epochs):
            for _ in range(self.episodes):
                state = self.env.reset()
                done = False
                while not done:
                    if np.random.uniform(0, 1) < self.epsilon:
                        action = np.random.choice(range(4))
                    else:
                        action = np.argmax(self.q_table[state])
                    next_state, reward, done, _ = self.env.step(action)
                    old_q = self.q_table[state][action]
                    # A terminal transition has no future return, so do not
                    # bootstrap from the next state's values once done.
                    next_q = 0.0 if done else np.max(self.q_table[next_state])
                    new_q = old_q + self.alpha * (reward + self.gamma * next_q - old_q)
                    self.q_table[state][action] = new_q
                    state = next_state

    def test(self, max_steps=None):
        """Follow the greedy policy from the start, printing each visited state.

        Args:
            max_steps: upper bound on the number of steps taken. Defaults to
                the number of states in the Q-table: in a deterministic
                environment a greedy run that has not finished by then is
                revisiting states and would never terminate.
        """
        if max_steps is None:
            max_steps = len(self.q_table)

        state = self.env.reset()
        for _ in range(max_steps):
            action = np.argmax(self.q_table[state])
            state, _, done, _ = self.env.step(action)
            print(state)
            if done:
                return
        # Previously this situation was an endless loop.
        print(f"Stopped after {max_steps} steps without reaching a terminal state")
    
# Load the saved maze grid, wrap it in the environment, and attach a
# Q-learning agent that uses the default hyperparameters.
env = MazeEnv(np.load('/content/sample_maze.npy'))
agent = QLearning(env)

In [64]:
# Train the Q-learning agent: runs agent.epochs * agent.episodes
# epsilon-greedy episodes and updates agent.q_table in place.
agent.train()

# Test the Q-learning agent: follow the greedy (argmax) policy from the
# start state and print every state reached.
agent.test()

(0, 1)
(0, 2)
(1, 2)
(2, 2)
(2, 3)
(2, 4)
(3, 4)
(4, 4)


In [66]:
# Persist the trained agent (including its environment and Q-table).
# The context manager closes the file even if pickling raises part-way.
# NOTE: pickle stores instances by reference to their class, so loading this
# file requires the MazeEnv and QLearning definitions to be available, and
# pickle.load must only be used on files from a trusted source.
with open('saved_model', 'wb') as f:
    pickle.dump(agent, f)