# 第一步：安装并导入必要的库
# Шаг 1: Установите и импортируйте необходимые библиотеки

In [14]:
# 安装必要的库
!pip install gym numpy matplotlib

# 导入必要的库
import gym
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint





# 第二步：定义策略迭代算法
# Шаг 2: Определите алгоритм итерации стратегии

In [15]:
class PolicyIterationAgent:
    """Tabular policy iteration for a discrete environment with a known model.

    The environment must expose ``observation_space.n``, ``action_space.n``
    and a transition table ``P`` such that ``P[state][action]`` is an iterable
    of ``(prob, next_state, reward, done)`` tuples. The table is looked up on
    ``env.unwrapped`` when that attribute exists, because wrapper classes do
    not always forward ``P``.

    Attributes:
        policy: array of shape (n_states, n_actions); each row holds the
            action probabilities for one state. Starts uniform and becomes
            one-hot after the first improvement step.
        value_function: array of shape (n_states,) with the current state
            value estimates. Starts at zero.
    """

    def __init__(self, env, gamma=0.99, theta=1e-6, max_iterations=1000):
        """Store hyper-parameters and initialise a uniform policy.

        Args:
            env: environment providing the spaces and transition table
                described in the class docstring.
            gamma: discount factor applied to the value of the next state.
            theta: policy evaluation stops once the largest value change in a
                sweep falls below this threshold.
            max_iterations: cap on evaluation sweeps per evaluation call and
                on evaluation/improvement rounds in ``policy_iteration``.
        """
        self.env = env
        self.gamma = gamma
        self.theta = theta
        self.max_iterations = max_iterations

        # Resolve the transition table once, from the innermost environment
        # when the object is wrapped.
        self._transitions = getattr(env, "unwrapped", env).P

        n_states = env.observation_space.n
        n_actions = env.action_space.n
        self.policy = np.ones([n_states, n_actions]) / n_actions
        self.value_function = np.zeros(n_states)

    def _action_value(self, state, action):
        """Return the expected one-step return of taking ``action`` in ``state``."""
        total = 0.0
        for prob, next_state, reward, done in self._transitions[state][action]:
            # A transition flagged ``done`` ends the episode, so nothing is
            # collected afterwards: do not bootstrap from the successor.
            future = 0.0 if done else self.gamma * self.value_function[next_state]
            total += prob * (reward + future)
        return total

    def policy_evaluation(self):
        """Update ``value_function`` in place to the values of the current policy.

        Sweeps all states repeatedly until the largest change in a sweep is
        below ``theta`` or ``max_iterations`` sweeps have been performed.
        """
        n_states = self.env.observation_space.n
        for _ in range(self.max_iterations):
            delta = 0.0
            for state in range(n_states):
                # Actions with zero probability contribute nothing; skipping
                # them avoids needless work once the policy is one-hot.
                new_value = sum(
                    action_prob * self._action_value(state, action)
                    for action, action_prob in enumerate(self.policy[state])
                    if action_prob > 0
                )
                delta = max(delta, abs(new_value - self.value_function[state]))
                self.value_function[state] = new_value
            if delta < self.theta:
                break

    def policy_improvement(self):
        """Make the policy greedy with respect to ``value_function``.

        Returns:
            True if no state's greedy action changed, False otherwise.
        """
        n_states = self.env.observation_space.n
        n_actions = self.env.action_space.n
        policy_stable = True
        for state in range(n_states):
            chosen_action = np.argmax(self.policy[state])
            action_values = np.array(
                [self._action_value(state, action) for action in range(n_actions)]
            )
            best_action = np.argmax(action_values)
            if chosen_action != best_action:
                policy_stable = False
            greedy_row = np.zeros(n_actions)
            greedy_row[best_action] = 1.0
            self.policy[state] = greedy_row
        return policy_stable

    def policy_iteration(self):
        """Alternate evaluation and improvement until the policy is stable.

        Prints the round at which the policy stopped changing, or a notice if
        ``max_iterations`` rounds passed without reaching a stable policy.
        """
        for i in range(self.max_iterations):
            self.policy_evaluation()
            if self.policy_improvement():
                print(f"Policy iteration converged at iteration {i+1}")
                break
        else:
            print(
                f"Policy iteration did not converge within "
                f"{self.max_iterations} iterations"
            )

    def visualize_policy(self):
        """Print the highest-probability action for every state, one per line."""
        print("Policy (state -> action):")
        for state in range(self.env.observation_space.n):
            print(f"{state} -> {np.argmax(self.policy[state])}")

    def visualize_value_function(self):
        """Print the current state value estimates."""
        print("Value Function:")
        print(self.value_function)



# 第三步：训练并评估策略迭代代理
# Шаг 3: Обучите и оцените агента итерации стратегии

In [16]:
# Create the Taxi-v3 environment and an agent bound to it.
env = gym.make("Taxi-v3")
agent = PolicyIterationAgent(env)

# Run policy iteration, then print the resulting policy (one line per state)
# and the value function.
agent.policy_iteration()
agent.visualize_policy()
agent.visualize_value_function()


Policy iteration converged at iteration 13
Policy (state -> action):
0 -> 4
1 -> 4
2 -> 4
3 -> 4
4 -> 0
5 -> 0
6 -> 0
7 -> 0
8 -> 0
9 -> 0
10 -> 0
11 -> 0
12 -> 0
13 -> 0
14 -> 0
15 -> 0
16 -> 5
17 -> 0
18 -> 0
19 -> 0
20 -> 3
21 -> 3
22 -> 3
23 -> 3
24 -> 0
25 -> 0
26 -> 0
27 -> 0
28 -> 0
29 -> 0
30 -> 0
31 -> 0
32 -> 0
33 -> 0
34 -> 0
35 -> 0
36 -> 3
37 -> 0
38 -> 0
39 -> 0
40 -> 0
41 -> 0
42 -> 0
43 -> 0
44 -> 2
45 -> 2
46 -> 2
47 -> 2
48 -> 0
49 -> 0
50 -> 0
51 -> 0
52 -> 0
53 -> 0
54 -> 0
55 -> 0
56 -> 0
57 -> 2
58 -> 0
59 -> 0
60 -> 0
61 -> 0
62 -> 0
63 -> 0
64 -> 2
65 -> 2
66 -> 2
67 -> 2
68 -> 0
69 -> 0
70 -> 0
71 -> 0
72 -> 0
73 -> 0
74 -> 0
75 -> 0
76 -> 0
77 -> 2
78 -> 0
79 -> 0
80 -> 0
81 -> 0
82 -> 0
83 -> 0
84 -> 4
85 -> 4
86 -> 4
87 -> 4
88 -> 0
89 -> 0
90 -> 0
91 -> 0
92 -> 0
93 -> 0
94 -> 0
95 -> 0
96 -> 0
97 -> 5
98 -> 0
99 -> 0
100 -> 1
101 -> 1
102 -> 1
103 -> 1
104 -> 0
105 -> 0
106 -> 0
107 -> 0
108 -> 0
109 -> 0
110 -> 0
111 -> 0
112 -> 0
113 -> 0
114 -> 0
115 ->

# 第四步：运行训练好的策略
# Шаг 4: Запустите обученную стратегию

In [17]:
def run_agent(env, agent, episodes=5):
    """Roll out the agent's greedy policy and report the reward per episode.

    Works with both environment API shapes: ``reset()`` returning either the
    observation or an ``(observation, info)`` tuple, and ``step()`` returning
    either ``(obs, reward, done, info)`` or
    ``(obs, reward, terminated, truncated, info)``. Observations are expected
    to be integer state indices into ``agent.policy``, so a tuple returned by
    ``reset()`` is always read as ``(observation, info)``.

    Args:
        env: environment to interact with; ``render()`` is called after each step.
        agent: object whose ``policy[state]`` gives per-action scores; the
            highest-scoring action is taken.
        episodes: number of episodes to run.

    Returns:
        List with the total reward collected in each episode.
    """
    episode_rewards = []
    for episode in range(episodes):
        reset_result = env.reset()
        # Newer APIs return (observation, info); older ones the bare observation.
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        done = False
        total_reward = 0
        while not done:
            action = int(np.argmax(agent.policy[state]))
            step_result = env.step(action)
            if len(step_result) == 5:
                state, reward, terminated, truncated, _info = step_result
                # The episode is over on either a terminal state or a time limit.
                done = terminated or truncated
            else:
                state, reward, done, _info = step_result
            total_reward += reward
            env.render()
        print(f"Episode {episode + 1}: Total Reward: {total_reward}")
        episode_rewards.append(total_reward)
    return episode_rewards

# Re-create the environment and agent and run policy iteration again
# (this rebinds the notebook-level names `env` and `agent`).
env = gym.make("Taxi-v3")
agent = PolicyIterationAgent(env)
agent.policy_iteration()

# Roll out the learned policy for the default number of episodes, then
# close the environment.
run_agent(env, agent)
env.close()




Policy iteration converged at iteration 13


  if not isinstance(terminated, (bool, np.bool8)):
If you want to render in human mode, initialize the environment in this way: gym.make('EnvName', render_mode='human') and don't call the render method.
See here for more information: https://www.gymlibrary.ml/content/api/
  deprecation(


Episode 1: Total Reward: 7
Episode 2: Total Reward: 10
Episode 3: Total Reward: 9
Episode 4: Total Reward: 11
Episode 5: Total Reward: 6
