In [None]:
import gym
import matplotlib.pyplot as plt
import numpy as np
import copy

In [None]:
class On_policy_MC:
    """Tabular on-policy Monte Carlo control with an epsilon-greedy policy.

    Episodes are simulated from the environment's transition table ``env.P``
    (not via ``env.step``), and Q(s, a) is estimated with an every-visit,
    incremental-mean update over the sampled returns.
    """

    def __init__(self, env, start_state, gamma=0.8):
        """Build the value tables, the initial uniform policy and the model arrays.

        Args:
            env: Environment exposing ``observation_space.n``, ``action_space.n``
                and ``P[s][a]`` as a list of
                ``(probability, next_state, reward, done)`` tuples.
            start_state: State index every sampled episode starts from.
            gamma: Discount factor applied to returns.
        """
        self.state_size = env.observation_space.n
        self.action_size = env.action_space.n

        # Action-value table Q(s, a).
        self.qvalue = np.zeros((self.state_size, self.action_size))
        # Visit counts N(s, a) used by the incremental mean.
        self.n = np.zeros((self.state_size, self.action_size))

        # Index lists handed to np.random.choice when sampling.
        self.states = list(range(self.state_size))
        self.actions = list(range(self.action_size))

        self.gamma = gamma
        self.epsilon = 0.5
        self.env = env
        self.start_state = start_state

        # Behaviour policy starts uniform; Greedy_Pi is filled by get_greedy_policy().
        self.Pi = np.full((self.state_size, self.action_size), 1 / self.action_size)
        self.Greedy_Pi = np.zeros((self.state_size, self.action_size))
        self.cur_state = 0
        self.cur_action = 0
        self.old_policy = np.ones((self.state_size, self.action_size))

        # Model arrays: P_ssa[a, s, s'] = P(s' | s, a), r_sa[s, a] = E[r | s, a].
        self.P_ssa = np.zeros((self.action_size, self.state_size, self.state_size))
        self.r_sa = np.zeros((self.state_size, self.action_size))
        for s in range(self.state_size):
            for a in range(self.action_size):
                for probability, next_state, reward, _done in self.env.P[s][a]:
                    self.P_ssa[a][s][next_state] += probability
                    self.r_sa[s][a] += probability * reward

    def reset(self):
        """Zero the action-value table and the visit counts."""
        self.qvalue = np.zeros((self.state_size, self.action_size))
        self.n = np.zeros((self.state_size, self.action_size))

    def explore_init(self):
        """Draw a uniformly random (state, action) pair for exploring starts.

        Returns:
            Tuple ``(s0, a0)`` of state and action indices.
        """
        # Uniform sampling needs no explicit `p`; the previous (1, n)-shaped
        # `p` made np.random.choice raise because `p` must be 1-dimensional.
        s0 = np.random.choice(self.states)
        a0 = np.random.choice(self.actions)
        return s0, a0

    def sample_action(self, state):
        """Sample an action for ``state`` from the current policy ``Pi``."""
        return np.random.choice(self.actions, p=self.Pi[state, :])

    def step(self, action):
        """Simulate one transition from ``self.cur_state`` using ``env.P``.

        When the model lists several outcomes for (state, action), one is
        sampled according to its probability; a single outcome is used as is.
        ``self.cur_state`` is not modified here.

        Returns:
            Tuple ``(next_state, reward, done)``.
        """
        transitions = self.env.P[self.cur_state][action]
        if len(transitions) == 1:
            outcome = 0
        else:
            probabilities = [t[0] for t in transitions]
            outcome = np.random.choice(len(transitions), p=probabilities)
        _, next_state, r_next, done = transitions[outcome]
        return next_state, r_next, done

    def _greedy_actions(self):
        """Return, per state, the first action index maximising Q(s, a)."""
        return np.argmax(self.qvalue, axis=1)

    def update_policy(self):
        """Make ``Pi`` deterministic-greedy with respect to ``qvalue`` (in place)."""
        self.Pi[:] = 0
        self.Pi[np.arange(self.state_size), self._greedy_actions()] = 1

    def get_greedy_policy(self):
        """Fill ``Greedy_Pi`` with the one-hot greedy policy and return it."""
        self.Greedy_Pi[:] = 0
        self.Greedy_Pi[np.arange(self.state_size), self._greedy_actions()] = 1
        return self.Greedy_Pi

    def update_epsilon_greedy(self):
        """Make ``Pi`` epsilon-greedy with respect to ``qvalue`` (in place)."""
        explore_prob = self.epsilon / self.action_size
        self.Pi[:] = explore_prob
        self.Pi[np.arange(self.state_size), self._greedy_actions()] = \
            explore_prob + (1 - self.epsilon)

    def MC_learning(self, num_episodes=int(1e5), max_steps=100, update_every=200,
                    epsilon_decay=0.99, verbose=False):
        """Run Monte Carlo control from ``start_state``.

        Args:
            num_episodes: Number of episodes to sample.
            max_steps: Maximum number of transitions per episode.
            update_every: Improve the policy after this many episodes.
            epsilon_decay: Factor applied to ``epsilon`` at each policy update.
            verbose: When True, print the state trajectory of every episode.
        """
        for episode in range(1, num_episodes + 1):
            # Sample one trajectory under the current policy.
            state_traj, action_traj, reward_traj = [], [], []
            self.cur_state = self.start_state
            done = False
            steps = 0
            while not done and steps < max_steps:
                cur_action = self.sample_action(self.cur_state)
                state_traj.append(self.cur_state)
                action_traj.append(cur_action)
                next_state, reward, done = self.step(cur_action)
                reward_traj.append(reward)
                self.cur_state = next_state
                steps += 1
            if verbose:
                print("state_traj", state_traj)

            # Walk the trajectory backwards so the discounted return can be
            # accumulated incrementally while every visit updates Q.
            g = 0
            for s, a, r in zip(reversed(state_traj), reversed(action_traj),
                               reversed(reward_traj)):
                self.n[s, a] += 1.0
                g = g * self.gamma + r
                self.qvalue[s, a] = \
                    (self.qvalue[s, a] * (self.n[s, a] - 1) + g) / self.n[s, a]

            # Periodic policy improvement.
            if episode % update_every == 0:
                self.old_policy = copy.deepcopy(self.Pi)
                self.update_epsilon_greedy()
                self.epsilon = self.epsilon * epsilon_decay
                # Counts restart, so the next visit to (s, a) overwrites Q(s, a)
                # with that visit's return before averaging resumes.
                self.n = np.zeros((self.state_size, self.action_size))

In [None]:
def q_ana_evaluate(Pi, r_sa, P_ssa, gamma):
    """Compute the action-value function of a policy in closed form.

    Solves the Bellman expectation equation ``V = C_pi + gamma * P_pi @ V``
    for V and then evaluates
    ``Q(s, a) = r(s, a) + gamma * sum_s' P(s' | s, a) * V(s')``.

    Args:
        Pi: Policy array of shape (state_size, action_size), ``Pi[s, a]`` being
            the probability of action ``a`` in state ``s``. Not modified.
        r_sa: Expected immediate reward, shape (state_size, action_size).
        P_ssa: Transition probabilities indexed ``[a, s, s']``,
            shape (action_size, state_size, state_size).
        gamma: Discount factor.

    Returns:
        Array of shape (state_size, action_size) holding Q(s, a).

    Raises:
        numpy.linalg.LinAlgError: If ``I - gamma * P_pi`` is singular
            (for example gamma == 1 with a row-stochastic ``P_pi``).
    """
    # Work on float views/copies so the caller's policy is never overwritten.
    Pi = np.asarray(Pi, dtype=float)
    r_sa = np.asarray(r_sa, dtype=float)
    P_ssa = np.asarray(P_ssa, dtype=float)
    state_size = r_sa.shape[0]

    # P_pi[s, s'] = sum_a Pi[s, a] * P_ssa[a, s, s']
    P_pi = np.einsum("sa,ast->st", Pi, P_ssa)
    # C_pi[s] = sum_a Pi[s, a] * r_sa[s, a]
    C_pi = np.sum(Pi * r_sa, axis=1)

    # The discount must enter the linear system; without it the matrix
    # I - P_pi is singular for any row-stochastic P_pi.
    M = np.eye(state_size) - gamma * P_pi
    V = np.linalg.solve(M, C_pi)

    # One-step look-ahead for every (s, a) pair.
    q_value = r_sa + gamma * np.einsum("ast,t->sa", P_ssa, V)
    return q_value

# q_real_value = q_ana_evaluate(on_policy_MC.Pi,on_policy_MC.r_sa,on_policy_MC.P_ssa, gamma=0.8)

In [None]:
# Alternative environment kept for reference:
# env = gym.make('FrozenLake-v1', desc=None, map_name="8x8", is_slippery=True, render_mode="rgb_array_list")
env = gym.make("Taxi-v3", render_mode="rgb_array_list")

# Seeded reset; element 0 of the returned value is used as the start state.
state = env.reset(seed=42, options={})[0]
on_policy_MC = On_policy_MC(gamma=0.5, start_state=state, env=env)

# Show the initial policy and the expected-reward table built from env.P.
print("initial policy", on_policy_MC.Pi)
print(on_policy_MC.r_sa)

In [None]:
# Train the agent, then show the learned policy and Q-table rounded to 2 decimals.
on_policy_MC.MC_learning()
print("Final policy:\n", np.round(on_policy_MC.Pi, decimals=2))
print("current qvalue: \n", np.round(on_policy_MC.qvalue, decimals=2))


In [None]:
# q_real_value = q_ana_evaluate(on_policy_MC.Pi,on_policy_MC.r_sa,on_policy_MC.P_ssa,gamma=0.8)
# print("real qvalue:\n",np.around(q_real_value,2))
# print("值函数差的范数：\n",np.linalg.norm(on_policy_MC.qvalue-q_real_value))

In [None]:
# Visit counts and the current exploration factor.
print("访问频次",on_policy_MC.n)
print("探索因子：",on_policy_MC.epsilon)
# Greedy_Pi is only populated by get_greedy_policy(); reading the attribute
# without calling it would print the all-zero initial array.
print("贪婪策略:",on_policy_MC.get_greedy_policy())

In [None]:
# Most probable action per state under the learned policy.
greedy_action = on_policy_MC.Pi.argmax(axis=1)
greedy_action

In [None]:
frame = []
action_list = []
# NOTE(review): `state` comes from the earlier env.reset(...) cell and the
# environment is assumed not to have been stepped since — confirm before re-running.
print(state)

# Roll out the greedy policy in the real environment.
while True:
    # Pick the action prescribed by the policy for the current state.
    action = greedy_action[state]
    print("state:", state)
    print("action:", action)
    action_list.append(action)

    # Advance the environment by one step.
    state, reward, terminated, truncated, info = env.step(action)

    # Stop when the episode is finished.
    if terminated:
        print('done')
        break
    # Also stop when the environment cuts the episode short; otherwise a
    # policy that never reaches the goal would keep this loop running forever.
    if truncated:
        print('truncated')
        break

# Store what env.render() returns after the rollout, then release the environment.
frame.append(env.render())
env.close()

In [None]:
# Convert the collected render output to an array and drop singleton axes.
frames = np.squeeze(np.array(frame))
len(frames[0])

In [None]:
# import os
# for i in range(frames.shape[0] - 1):
#     plt.imshow(frames[i])
#     # plt.title('action = {0}'.format(num_to_actual[action_list[i]]), fontsize=22)
#     # 去除坐标轴
#     plt.axis('off')
    
#     # 去除周围的白边
#     plt.tight_layout(pad=0)
#     plt.savefig(os.path.join('./result/on_policy', 'frame_{0}.png'.format(i)))
#     # plt.show()

In [None]:
# import imageio
# def compose_gif(frame):
#     imageio.mimsave("result/on_policy/on_policy.gif", frames, duration=500)

# compose_gif(frame)