In [None]:
import gym
import matplotlib.pyplot as plt
import numpy as np
import copy

In [None]:
class Off_policy_MC:
    """Tabular off-policy Monte Carlo control with weighted importance sampling.

    The agent never calls ``env.step``; it reads the environment's transition
    table ``P[state][action] -> [(probability, next_state, reward, done), ...]``
    and simulates episodes from it. Episodes are generated with an
    epsilon-greedy behaviour policy, and the action values of a (less
    exploratory) epsilon-greedy target policy are estimated from them.

    Rewards are reshaped before use: ``-1 -> -0.1``, ``-10 -> -1`` and every
    other value ``-> 1000``.

    Args:
        env: Environment exposing ``observation_space.n``, ``action_space.n``
            and a transition table ``P`` (directly or through ``unwrapped``).
        start_state: State index every simulated episode starts from.
        gamma: Discount factor used when accumulating returns.
    """

    def __init__(self, env, start_state, gamma=0.8):
        # Action-value table Q(s, a).
        self.state_size = env.observation_space.n
        self.action_size = env.action_space.n
        self.qvalue = np.zeros((self.state_size, self.action_size))

        # Cumulative importance weights per state-action pair.
        self.C = np.zeros((self.state_size, self.action_size))
        self.states = list(range(self.state_size))     # state indices
        self.actions = list(range(self.action_size))   # action indices
        self.gamma = gamma
        self.env = env
        self.epsilon = 0.5
        self.start_state = start_state

        # Behaviour (sampling) policy: starts uniform.
        self.behaviour_Pi = (1/self.action_size) * np.ones((self.state_size, self.action_size))
        # Target policy: starts as a random deterministic policy.
        self.target_Pi = np.zeros((self.state_size, self.action_size))
        uniform = np.full(self.action_size, 1/self.action_size)
        for i in range(self.state_size):
            j = np.random.choice(self.actions, p=uniform)
            self.target_Pi[i, j] = 1
        self.Greedy_Pi = np.zeros((self.state_size, self.action_size))
        self.cur_state = 1
        self.cur_action = 0
        self.old_policy = np.ones((self.state_size, self.action_size))

        # Dense model built from the transition table:
        #   P_ssa[a, s, s'] = P(s' | s, a),  r_sa[s, a] = expected shaped reward.
        self.P_ssa = np.zeros((self.action_size, self.state_size, self.state_size))
        self.r_sa = np.zeros((self.state_size, self.action_size))
        model = self._model
        for i in range(self.state_size):
            for j in range(self.action_size):
                for probability, next_state, reward, done in model[i][j]:
                    self.P_ssa[j][i][next_state] += probability
                    self.r_sa[i][j] += probability * self._shape_reward(reward)

    @property
    def _model(self):
        """Transition table of the environment (looked up on the unwrapped env if present)."""
        base = getattr(self.env, "unwrapped", self.env)
        return base.P if hasattr(base, "P") else self.env.P

    @staticmethod
    def _shape_reward(reward):
        """Map a raw environment reward to the shaped reward used for learning."""
        if reward == -1:
            return -0.1
        if reward == -10:
            return -1
        return 1000

    def _set_epsilon_greedy(self, policy, epsilon):
        """Overwrite ``policy`` in place with the epsilon-greedy policy w.r.t. ``self.qvalue``."""
        policy[:, :] = epsilon / self.action_size
        best = np.argmax(self.qvalue, axis=1)
        policy[np.arange(self.state_size), best] = epsilon / self.action_size + (1 - epsilon)

    def reset(self):
        """Clear the action-value table and the cumulative importance weights."""
        self.qvalue = np.zeros((self.state_size, self.action_size))
        self.C = np.zeros((self.state_size, self.action_size))

    def sample_action(self, state):
        """Draw an action for ``state`` from the behaviour policy."""
        action = np.random.choice(self.actions, p=self.behaviour_Pi[state, :])
        return action

    def step(self, action):
        """Simulate one transition from ``self.cur_state`` using the transition table.

        When several outcomes are listed for the state-action pair, one is
        sampled according to its probability; a single outcome is taken
        directly (no random number is consumed). ``self.cur_state`` is not
        modified here.

        Returns:
            ``(next_state, shaped_reward, done)``.
        """
        outcomes = self._model[self.cur_state][action]
        if len(outcomes) > 1:
            probabilities = [outcome[0] for outcome in outcomes]
            index = np.random.choice(len(outcomes), p=probabilities)
        else:
            index = 0
        probability, next_state, reward, done = outcomes[index]
        return next_state, self._shape_reward(reward), done

    def update_target_policy(self):
        """Make the target policy epsilon-greedy w.r.t. Q, with a tenth of the behaviour epsilon."""
        self._set_epsilon_greedy(self.target_Pi, self.epsilon / 10)

    def update_behaviour_policy(self):
        """Make the behaviour policy epsilon-greedy w.r.t. Q, using ``self.epsilon``."""
        self._set_epsilon_greedy(self.behaviour_Pi, self.epsilon)

    def get_greedy_policy(self):
        """Return the deterministic greedy policy w.r.t. Q as a one-hot matrix."""
        self.Greedy_Pi[:, :] = 0
        best = np.argmax(self.qvalue, axis=1)
        self.Greedy_Pi[np.arange(self.state_size), best] = 1
        return self.Greedy_Pi

    def _sample_episode(self, max_steps):
        """Roll out one episode from ``start_state`` under the behaviour policy."""
        state_traj, action_traj, reward_traj = [], [], []
        self.cur_state = self.start_state
        done = False
        while not done and len(state_traj) < max_steps:
            cur_action = self.sample_action(self.cur_state)
            state_traj.append(self.cur_state)
            action_traj.append(cur_action)
            next_state, reward, done = self.step(cur_action)
            reward_traj.append(reward)
            self.cur_state = next_state
        return state_traj, action_traj, reward_traj

    def _update_from_episode(self, state_traj, action_traj, reward_traj):
        """Backward weighted-importance-sampling update of Q from one episode."""
        g = 0
        W = 1
        for t in reversed(range(len(state_traj))):
            s, a = state_traj[t], action_traj[t]
            # Accumulate the importance weight seen for (s, a).
            self.C[s, a] += W
            # Discounted return from step t onwards.
            g = g * self.gamma + reward_traj[t]
            # Incremental weighted average towards the return.
            self.qvalue[s, a] += (W / self.C[s, a]) * (g - self.qvalue[s, a])
            W = W * self.target_Pi[s, a] / self.behaviour_Pi[s, a]
            if W == 0:
                # Earlier steps would get zero weight (and 0/0 if C is still 0).
                break

    def Off_MC_learning(self, num_episodes=20000, max_steps=50, update_every=200, verbose=True):
        """Run off-policy Monte Carlo control.

        Args:
            num_episodes: Number of simulated episodes.
            max_steps: Maximum number of steps per episode.
            update_every: Every this many episodes, epsilon is multiplied by
                0.99, both policies are refreshed from Q and ``C`` is zeroed.
            verbose: When true, print the state trajectory of every episode.
        """
        self.update_target_policy()
        self.update_behaviour_policy()
        for episode in range(1, num_episodes + 1):
            state_traj, action_traj, reward_traj = self._sample_episode(max_steps)
            if verbose:
                print("state_traj:", state_traj)

            self._update_from_episode(state_traj, action_traj, reward_traj)

            # Periodic policy improvement.
            if episode % update_every == 0:
                self.old_policy = self.target_Pi.copy()
                self.epsilon = self.epsilon * 0.99
                self.update_target_policy()
                self.update_behaviour_policy()
                self.C = np.zeros((self.state_size, self.action_size))


In [None]:
def q_ana_evaluate(Pi, r_sa, P_ssa, gamma=0.5):
    """Compute the exact action-value function of a policy from a known model.

    Solves the Bellman expectation equation ``V = C_pi + gamma * P_pi V`` as a
    linear system and then evaluates
    ``q(s, a) = r(s, a) + gamma * sum_s' P(s' | s, a) V(s')``.

    Args:
        Pi: Policy matrix of shape ``(state_size, action_size)`` with
            ``Pi[s, a] = pi(a | s)``. It is not modified.
        r_sa: Expected immediate reward, shape ``(state_size, action_size)``.
        P_ssa: Transition probabilities indexed ``[a, s, s']``.
        gamma: Discount factor (defaults to the previously hard-coded 0.5).

    Returns:
        Array of shape ``(state_size, action_size)`` holding ``q(s, a)``.
    """
    Pi = np.asarray(Pi, dtype=float)
    r_sa = np.asarray(r_sa, dtype=float)
    P_ssa = np.asarray(P_ssa, dtype=float)
    state_size = Pi.shape[0]

    # P_pi[s, s'] = sum_a pi(a|s) * P(s'|s, a)
    P_pi = np.einsum('sa,ast->st', Pi, P_ssa)
    # C_pi[s] = sum_a pi(a|s) * r(s, a)
    C_pi = np.sum(Pi * r_sa, axis=1)

    # Solve (I - gamma * P_pi) V = C_pi without forming an explicit inverse.
    V = np.linalg.solve(np.eye(state_size) - gamma * P_pi, C_pi)

    # One-step look-ahead; the successor value is discounted so that
    # sum_a pi(a|s) q(s, a) == V(s).
    q_value = r_sa + gamma * np.einsum('ast,t->sa', P_ssa, V)
    return q_value


In [None]:
# env = gym.make('FrozenLake-v1', desc=None, map_name="4x4", is_slippery=True, render_mode="rgb_array_list")
# Seed both NumPy (used by the agent's sampling) and the environment reset so
# that the start state and the training run are reproducible on re-execution.
SEED = 0
np.random.seed(SEED)

env = gym.make('Taxi-v3', render_mode="rgb_array_list")
# reset() returns (observation, info); keep only the initial state index.
state = env.reset(seed=SEED)[0]

# Train the agent on episodes simulated from the environment's transition table.
off_policy_MC = Off_policy_MC(env=env, start_state = state, gamma=0.5)
off_policy_MC.Off_MC_learning()

print(off_policy_MC.get_greedy_policy())
print("估计值函数：\n",np.around(off_policy_MC.qvalue,2))
print("Final policy:\n",np.around(off_policy_MC.target_Pi,2))


In [None]:
# q_real=q_ana_evaluate(off_policy_MC.target_Pi,off_policy_MC.r_sa,off_policy_MC.P_ssa)
# print("真实值函数：\n",np.around(q_real,2))
# print("值函数差的范数：\n",np.linalg.norm(off_policy_MC.qvalue-q_real))

In [None]:
# Show the accumulated importance weights, rounded to one decimal place.
print("访问频次：\n", np.round(off_policy_MC.C, decimals=1))

# Most probable action of the learned target policy for every state.
greedy_action = off_policy_MC.target_Pi.argmax(axis=1)
greedy_action

In [None]:
frame = []
# state = env.reset()[0]
action_list = []
print(state)
# Roll out the greedy policy in the real environment.
while True:
    # Pick the action prescribed by the learned policy.
    action = greedy_action[state]
    print("state:", state)
    print("action:", action)
    action_list.append(action)

    # Advance the environment by one step.
    state, reward, terminated, truncated, info = env.step(action)

    # Stop when the task is solved ...
    if terminated:
        print('done')
        break
    # ... or when the environment cuts the episode off (e.g. time limit);
    # without this check a policy that never reaches the goal loops forever.
    if truncated:
        print('truncated')
        break

# With render_mode="rgb_array_list", render() is expected to return the list of
# frames collected since the last reset — TODO confirm for the installed gym version.
frame.append(env.render())
env.close()

In [None]:
# Stack the rendered frames into one array and drop singleton axes.
frames = np.squeeze(np.array(frame))
len(frames[0])

In [None]:
import os
# Make sure the output directory exists so savefig does not fail on a fresh checkout.
output_dir = './result/off_policy'
os.makedirs(output_dir, exist_ok=True)

# Every frame except the last one is written out.
for i in range(frames.shape[0] - 1):
    plt.imshow(frames[i])
    # plt.title('action = {0}'.format(num_to_actual[action_list[i]]), fontsize=22)
    # Hide the axes.
    plt.axis('off')

    # Remove the white border around the image.
    plt.tight_layout(pad=0)
    plt.savefig(os.path.join(output_dir, 'frame_{0}.png'.format(i)))
    # plt.show()

In [None]:
import imageio
def compose_gif(frame, path="result/off_policy/off_policy.gif", duration=500):
    """Write a sequence of images to an animated GIF.

    Args:
        frame: Sequence (or stacked array) of images to encode, one per GIF frame.
        path: Destination file; its parent directory is created if missing.
        duration: Per-frame duration passed through to ``imageio.mimsave``.
            NOTE(review): the unit (seconds vs. milliseconds) depends on the
            installed imageio version — confirm.
    """
    import os

    out_dir = os.path.dirname(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # Use the argument rather than a notebook global so the function is reusable.
    imageio.mimsave(path, frame, duration=duration)

# Pass the stacked frame array (the data the GIF was previously built from).
compose_gif(frames)