In [54]:
# gym-maze-trustycoder83
import gym_maze
import gym
import numpy as np
import math
import random
import os

class Agent(object):
    def __init__(self):

        self.env = gym.make('maze-sample-5x5-v0')
        self.dim = 5
        self.os_n = self.dim * self.dim      # |S|
        self.as_n = self.env.action_space.n  # |A|

        self.c = 5
        # self.delta = .01
        self.delta = .1
        # self.epsilon = .01
        self.epsilon = .1
        self.discount_rate = .83
        self.learning_rate = 0.1
        self.difference_estimation = []

        initial_value = math.log(1. * self.c * self.os_n * self.as_n / self.delta)
        self.Q_up = np.zeros((self.dim, self.dim, self.as_n)) + initial_value
        self.Q_down = np.zeros((self.dim, self.dim, self.as_n)) - initial_value
        self.n = np.ones([self.dim, self.dim, self.as_n])
        self.U = [list([[k for k in range (self.as_n)] for j in range(self.dim)]) for i in range(self.dim)]
        self.iteration = 0
        self.t_max = 300


    def model_free_action_elimination(self):
        s1, s2 = self.reset()
        while True:
            
            for i in range(self.t_max):
                self.U[s1][s2] = list()

                # obtain U-set
                for a in range(self.as_n):
                    
                    if self.Q_up[s1][s2][a] >= np.max(self.Q_down[s1][s2]):
                        self.U[s1][s2].append(a)

                a = random.sample(self.U[s1][s2], 1)[0]
                s1_next, s2_next, r, done, _ = self.step(a)

                # count(选中次数)
                n = float(self.n[s1][s2][a])
                # print(self.Q_up[s][a], self.Q_down[s][a])
                    
                # Update.
                self.Q_up[s1][s2][a] = (1. - self.learning_rate) * self.Q_up[s1][s2][a] + self.learning_rate * (r + self.discount_rate * np.max(self.Q_up[s1_next][s2_next]) + self.beta(n))
                self.Q_down[s1][s2][a] = (1. - self.learning_rate) * self.Q_down[s1][s2][a] + self.learning_rate * (r + self.discount_rate * np.max(self.Q_down[s1_next][s2_next]) - self.beta(n))
                self.n[s1][s2][a] += 1

                s1, s2 = s1_next, s2_next
                # print(self.Q_up[s][a], self.Q_down[s][a])

                if done or i == self.t_max-1:
                    print('iteration:%d' % self.iteration)
                    print('iteration done! step: %d reward: %d, state: %d %d' % (i, r, s1, s2))
                    s1, s2 = self.reset()
                    break

            self.iteration += 1
            
            print('action set%s' %self.U[s1][s2])
            self.check_stopping_conditions()
            print('\n')

    def step(self, a):
        s, r, done, _ = self.env.step(a)
        # 将奖励大于0
        r = max(r, 0.)
        return s[0], s[1], r, done, _

    def reset(self):
        s = self.env.reset()
        return s[0], s[1]
                
    def check_stopping_conditions(self):
        for s1 in range(self.dim):
            for s2 in range(self.dim):
                for a in self.U[s1][s2]:
                    former = math.fabs(self.Q_up[s1][s2][a] - self.Q_down[s1][s2][a])
                    if former >= self.epsilon * (1-self.discount_rate) / 2.:
                        print('%f >= %f' % ((former, self.epsilon * (1-self.discount_rate) / 2)))
                        # print(np.absolute(self.Q_up - self.Q_down),np.mean(np.absolute(self.Q_up - self.Q_down)))
                        # exit()
                        self.difference_estimation.append(former)
                        return

        print('satified done')
        exit(0)
    
    
    def beta(self, k):
        return math.sqrt(math.log(1. * self.c * k * k * self.os_n * self.as_n / self.delta) / (1. * k))
    
    def animation(self, policy, count):
        for _ in range(count):
            self.run_episode(policy, True)
        self.env.close()
        
    def solution_policy(self,table):
        policy = np.argmax(table, axis=2)
        policy_scores = [self.run_episode(policy, False) for _ in range(10)]
        print("Return of solution = ", np.mean(policy_scores))
        return policy
    
    def run_episode(self, policy, render=False):
        obs = self.env.reset()
        total_reward = 0
        step_idx = 0
        for _ in range(self.t_max):
            if render:
                self.env.render()

            a,b = obs[0],obs[1]
            action = policy[a][b]
            action = int(action)
            obs, reward, done, _ = self.env.step(action)
            total_reward += self.discount_rate ** step_idx * reward
            step_idx += 1
            if done:
                break
        
        return total_reward

In [55]:
agent = Agent()
agent.model_free_action_elimination()

iteration:0
iteration done! step: 299 reward: 0, state: 3 0
action set[0, 1, 2, 3]
15.385987 >= 0.008500


iteration:1
iteration done! step: 299 reward: 0, state: 0 0
action set[0, 1, 2, 3]
14.213451 >= 0.008500


iteration:2
iteration done! step: 57 reward: 1, state: 4 4
action set[0, 1, 2, 3]
13.993814 >= 0.008500


iteration:3
iteration done! step: 299 reward: 0, state: 2 0
action set[0, 1, 2, 3]
12.593879 >= 0.008500


iteration:4
iteration done! step: 299 reward: 0, state: 1 4
action set[0, 1, 2, 3]
11.292354 >= 0.008500


iteration:5
iteration done! step: 299 reward: 0, state: 1 1
action set[0, 1, 2, 3]
10.719478 >= 0.008500


iteration:6
iteration done! step: 299 reward: 0, state: 3 4
action set[0, 1, 2, 3]
9.761528 >= 0.008500


iteration:7
iteration done! step: 219 reward: 1, state: 4 4
action set[0, 1, 2, 3]
9.712061 >= 0.008500


iteration:8
iteration done! step: 197 reward: 1, state: 4 4
action set[0, 1, 2, 3]
9.658980 >= 0.008500


iteration:9
iteration done! step: 299 rew

iteration:89
iteration done! step: 299 reward: 0, state: 0 2
action set[0, 1, 2, 3]
2.785337 >= 0.008500


iteration:90
iteration done! step: 109 reward: 1, state: 4 4
action set[0, 1, 2, 3]
2.780015 >= 0.008500


iteration:91
iteration done! step: 190 reward: 1, state: 4 4
action set[0, 1, 2, 3]
2.761519 >= 0.008500


iteration:92
iteration done! step: 299 reward: 0, state: 3 1
action set[0, 1, 2, 3]
2.743807 >= 0.008500


iteration:93
iteration done! step: 299 reward: 0, state: 3 1
action set[0, 1, 2, 3]
2.729486 >= 0.008500


iteration:94
iteration done! step: 299 reward: 0, state: 2 0
action set[0, 1, 2, 3]
2.707940 >= 0.008500


iteration:95
iteration done! step: 192 reward: 1, state: 4 4
action set[0, 1, 2, 3]
2.704249 >= 0.008500


iteration:96
iteration done! step: 299 reward: 0, state: 2 0
action set[0, 1, 2, 3]
2.675184 >= 0.008500


iteration:97
iteration done! step: 299 reward: 0, state: 1 2
action set[0, 1, 2, 3]
2.675184 >= 0.008500


iteration:98
iteration done! step: 96

iteration:171
iteration done! step: 299 reward: 0, state: 1 0
action set[0, 1, 2, 3]
2.087365 >= 0.008500


iteration:172
iteration done! step: 299 reward: 0, state: 1 0
action set[0, 1, 2, 3]
2.084513 >= 0.008500


iteration:173
iteration done! step: 202 reward: 1, state: 4 4
action set[0, 1, 2, 3]
2.081096 >= 0.008500


iteration:174
iteration done! step: 64 reward: 1, state: 4 4
action set[0, 1, 2, 3]
2.080192 >= 0.008500


iteration:175
iteration done! step: 299 reward: 0, state: 2 3
action set[0, 1, 2, 3]
2.072776 >= 0.008500


iteration:176
iteration done! step: 227 reward: 1, state: 4 4
action set[0, 1, 2, 3]
2.067605 >= 0.008500


iteration:177
iteration done! step: 299 reward: 0, state: 0 3
action set[0, 1, 2, 3]
2.064438 >= 0.008500


iteration:178
iteration done! step: 286 reward: 1, state: 4 4
action set[0, 1, 2, 3]
2.061508 >= 0.008500


iteration:179
iteration done! step: 299 reward: 0, state: 2 0
action set[0, 1, 2, 3]
2.058887 >= 0.008500


iteration:180
iteration done!

iteration:248
iteration done! step: 160 reward: 1, state: 4 4
action set[0, 1, 2, 3]
1.802424 >= 0.008500


iteration:249
iteration done! step: 299 reward: 0, state: 0 2
action set[0, 1, 2, 3]
1.796469 >= 0.008500


iteration:250
iteration done! step: 299 reward: 0, state: 2 2
action set[0, 1, 2, 3]
1.792378 >= 0.008500


iteration:251
iteration done! step: 299 reward: 0, state: 2 1
action set[0, 1, 2, 3]
1.791414 >= 0.008500


iteration:252
iteration done! step: 299 reward: 0, state: 2 2
action set[0, 1, 2, 3]
1.791414 >= 0.008500


iteration:253
iteration done! step: 299 reward: 0, state: 2 2
action set[0, 1, 2, 3]
1.788160 >= 0.008500


iteration:254
iteration done! step: 222 reward: 1, state: 4 4
action set[0, 1, 2, 3]
1.785926 >= 0.008500


iteration:255
iteration done! step: 271 reward: 1, state: 4 4
action set[0, 1, 2, 3]
1.782139 >= 0.008500


iteration:256
iteration done! step: 299 reward: 0, state: 1 0
action set[0, 1, 2, 3]
1.771729 >= 0.008500


iteration:257
iteration done

iteration:329
iteration done! step: 299 reward: 0, state: 2 0
action set[0, 1, 2, 3]
1.629677 >= 0.008500


iteration:330
iteration done! step: 299 reward: 0, state: 0 2
action set[0, 1, 2, 3]
1.628485 >= 0.008500


iteration:331
iteration done! step: 299 reward: 0, state: 1 4
action set[0, 1, 2, 3]
1.627552 >= 0.008500


iteration:332
iteration done! step: 299 reward: 0, state: 3 0
action set[0, 1, 2, 3]
1.625612 >= 0.008500


iteration:333
iteration done! step: 299 reward: 0, state: 1 2
action set[0, 1, 2, 3]
1.621611 >= 0.008500


iteration:334
iteration done! step: 299 reward: 0, state: 1 0
action set[0, 1, 2, 3]
1.617242 >= 0.008500


iteration:335
iteration done! step: 153 reward: 1, state: 4 4
action set[0, 1, 2, 3]
1.617242 >= 0.008500


iteration:336
iteration done! step: 299 reward: 0, state: 3 0
action set[0, 1, 2, 3]
1.615711 >= 0.008500


iteration:337
iteration done! step: 252 reward: 1, state: 4 4
action set[0, 1, 2, 3]
1.614612 >= 0.008500


iteration:338
iteration done

KeyboardInterrupt: 

In [56]:
Q_upper = agent.Q_up
Q_lower = agent.Q_down
solution_policy = agent.solution_policy(Q_upper)

Return of solution =  0.0390269831338038


In [57]:
agent.animation(solution_policy, 50)