In [7]:
import numpy as np
import gym_maze
import gym
from gym import wrappers
import time

class Agent(object):
    """Tabular Q-learning agent for the ``maze-sample-5x5-v0`` gym environment.

    The agent owns the environment and a Q-table indexed by the two integer
    components of an observation followed by the action index.
    """

    def __init__(self):
        """Create the environment and an all-zero Q-table."""
        self.iter_max = 10000  # number of training episodes

        self.initial_lr = 1.0  # learning rate at the start of training
        self.min_lr = 0.003  # floor for the decayed learning rate
        self.discount_rate = .83
        self.t_max = 100  # maximum number of steps in one episode
        self.eps = 0.1  # probability of taking a random (exploratory) action

        self.env = gym.make('maze-sample-5x5-v0')

        # Size of each state axis of the Q-table.
        # NOTE(review): presumably the side length of the 5x5 maze -- confirm
        # that both observation components always fall in [0, 5).
        self.n_states = 5
        self.n_actions = self.env.action_space.n

        # Size the action axis from the environment instead of a literal 4 so
        # the table cannot disagree with the environment's action space.
        self.q_table = np.zeros(
            (self.n_states, self.n_states, self.n_actions))

    def run_episode(self, policy=None, render=False):
        """Play one episode and return its discounted return.

        Args:
            policy: 2-D table giving the action for each state, indexed as
                ``policy[a][b]``. When ``None``, actions are sampled from the
                environment's action space.
            render: When true, render the environment before every step.

        Returns:
            The sum of ``discount_rate ** step * reward`` over the steps
            taken (at most ``t_max``).
        """
        obs = self.env.reset()
        total_reward = 0
        for step_idx in range(self.t_max):
            if render:
                self.env.render()
            if policy is None:
                action = self.env.action_space.sample()
            else:
                a, b = self.obs_to_state(obs)
                action = int(policy[a][b])
            obs, reward, done, _ = self.env.step(action)
            total_reward += self.discount_rate ** step_idx * reward
            if done:
                break
        return total_reward

    def obs_to_state(self, obs):
        """Map an observation to a pair of integer Q-table indices.

        The first two components of ``obs`` are converted with ``int()``
        (truncation toward zero), turning a possibly non-integer observation
        into discrete indices.

        Returns:
            A tuple ``(a, b)`` of Python ints.
        """
        a = int(obs[0])
        b = int(obs[1])
        return a, b

    def update_q_table(self):
        """Train the Q-table with epsilon-greedy Q-learning.

        Runs ``iter_max`` episodes of at most ``t_max`` steps each, printing
        the undiscounted episode reward every 20th episode.

        Returns:
            The trained Q-table (the same array stored on ``self.q_table``).
        """
        for i in range(self.iter_max):
            obs = self.env.reset()
            total_reward = 0
            # eta: learning rate, multiplied by 0.85 once every 100 episodes
            # and never allowed to drop below min_lr.
            eta = max(self.min_lr, self.initial_lr * (0.85 ** (i // 100)))
            for _ in range(self.t_max):
                a, b = self.obs_to_state(obs)
                if np.random.uniform(0, 1) < self.eps:
                    # Explore. Cast to a plain int so both branches hand the
                    # environment the same action type.
                    action = int(np.random.choice(self.n_actions))
                else:
                    # Exploit the current estimate.
                    action = int(np.argmax(self.q_table[a, b]))
                obs, reward, done, _ = self.env.step(action)
                total_reward += reward

                # Q-learning update: move Q(s, a) toward
                # reward + discount * max_a' Q(s', a').
                a_, b_ = self.obs_to_state(obs)
                td_target = (
                    reward
                    + self.discount_rate * np.max(self.q_table[a_, b_]))
                self.q_table[a, b, action] = (
                    self.q_table[a, b, action]
                    + eta * (td_target - self.q_table[a, b, action]))

                if done:
                    break
            if i % 20 == 0:
                print('Iteration #%d -- Total reward = %f' %(i+1, total_reward))
        return self.q_table

    def solution_policy(self, table):
        """Derive the greedy policy from a Q-table and report its score.

        Args:
            table: Q-table whose third axis (axis 2) is the action axis.

        Returns:
            2-D array holding the arg-max action for every state. The mean
            discounted return over 10 evaluation episodes is printed.
        """
        policy = np.argmax(table, axis=2)
        policy_scores = [self.run_episode(policy, False) for _ in range(10)]
        print("Average score of solution = ", np.mean(policy_scores))
        return policy

    def animation(self, policy, count):
        """Render ``count`` episodes played with ``policy``, then close the env."""
        for _ in range(count):
            self.run_episode(policy, True)
        self.env.close()
        


In [8]:
def main():
    """Train an Agent, evaluate its arg-max policy, then render 10 episodes."""
    learner = Agent()
    # Training runs first; its Q-table is handed straight to the evaluation.
    greedy_policy = learner.solution_policy(learner.update_q_table())
    learner.animation(greedy_policy, 10)

# Run the train / evaluate / animate pipeline only when this file is executed
# as a script, not when it is imported.
if __name__ == '__main__':
    main()

Iteration #1 -- Total reward = -0.400000
Iteration #21 -- Total reward = 0.932000
Iteration #41 -- Total reward = 0.932000
Iteration #61 -- Total reward = 0.928000
Iteration #81 -- Total reward = 0.932000
Iteration #101 -- Total reward = 0.928000
Iteration #121 -- Total reward = 0.936000
Iteration #141 -- Total reward = 0.920000
Iteration #161 -- Total reward = 0.936000
Iteration #181 -- Total reward = 0.928000
Iteration #201 -- Total reward = 0.940000
Iteration #221 -- Total reward = 0.940000
Iteration #241 -- Total reward = 0.940000
Iteration #261 -- Total reward = 0.924000
Iteration #281 -- Total reward = 0.928000
Iteration #301 -- Total reward = 0.936000
Iteration #321 -- Total reward = 0.924000
Iteration #341 -- Total reward = 0.928000
Iteration #361 -- Total reward = 0.936000
Iteration #381 -- Total reward = 0.928000
Iteration #401 -- Total reward = 0.924000
Iteration #421 -- Total reward = 0.932000
Iteration #441 -- Total reward = 0.924000
Iteration #461 -- Total reward = 0.9360

Iteration #3901 -- Total reward = 0.936000
Iteration #3921 -- Total reward = 0.936000
Iteration #3941 -- Total reward = 0.928000
Iteration #3961 -- Total reward = 0.932000
Iteration #3981 -- Total reward = 0.940000
Iteration #4001 -- Total reward = 0.932000
Iteration #4021 -- Total reward = 0.924000
Iteration #4041 -- Total reward = 0.932000
Iteration #4061 -- Total reward = 0.940000
Iteration #4081 -- Total reward = 0.928000
Iteration #4101 -- Total reward = 0.932000
Iteration #4121 -- Total reward = 0.932000
Iteration #4141 -- Total reward = 0.936000
Iteration #4161 -- Total reward = 0.940000
Iteration #4181 -- Total reward = 0.940000
Iteration #4201 -- Total reward = 0.936000
Iteration #4221 -- Total reward = 0.924000
Iteration #4241 -- Total reward = 0.940000
Iteration #4261 -- Total reward = 0.940000
Iteration #4281 -- Total reward = 0.928000
Iteration #4301 -- Total reward = 0.936000
Iteration #4321 -- Total reward = 0.940000
Iteration #4341 -- Total reward = 0.924000
Iteration #

Iteration #8001 -- Total reward = 0.940000
Iteration #8021 -- Total reward = 0.928000
Iteration #8041 -- Total reward = 0.936000
Iteration #8061 -- Total reward = 0.940000
Iteration #8081 -- Total reward = 0.908000
Iteration #8101 -- Total reward = 0.940000
Iteration #8121 -- Total reward = 0.932000
Iteration #8141 -- Total reward = 0.928000
Iteration #8161 -- Total reward = 0.940000
Iteration #8181 -- Total reward = 0.936000
Iteration #8201 -- Total reward = 0.932000
Iteration #8221 -- Total reward = 0.928000
Iteration #8241 -- Total reward = 0.940000
Iteration #8261 -- Total reward = 0.936000
Iteration #8281 -- Total reward = 0.940000
Iteration #8301 -- Total reward = 0.936000
Iteration #8321 -- Total reward = 0.916000
Iteration #8341 -- Total reward = 0.924000
Iteration #8361 -- Total reward = 0.940000
Iteration #8381 -- Total reward = 0.940000
Iteration #8401 -- Total reward = 0.932000
Iteration #8421 -- Total reward = 0.928000
Iteration #8441 -- Total reward = 0.924000
Iteration #