In [57]:
import numpy as np

In [58]:
# 定义网格世界
class GridWorld:
    def __init__(self, size=4, goal=(3, 3), reward=1):
        self.size = size
        self.goal = goal
        self.reward = reward
        self.actions = [(0, 1), (0, -1), (1, 0), (-1, 0)]  # 右、左、下、上
        self.discount = 0.9

    def is_valid(self, state):
        return 0 <= state[0] < self.size and 0 <= state[1] < self.size

    def step(self, state, action):
        next_state = (state[0] + action[0], state[1] + action[1])
        if not self.is_valid(next_state):
            next_state = state  # 如果超出边界，保持原位
        reward = self.reward if next_state == self.goal else 0
        done = next_state == self.goal
        return next_state, reward, done

In [66]:
# 价值迭代算法
def value_iteration(grid_world, threshold=1e-4):
    # 初始化状态价值函数
    V = np.zeros((grid_world.size, grid_world.size))
    delta = float('inf')

    while delta > threshold:
        delta = 0
        for i in range(grid_world.size):
            for j in range(grid_world.size):
                state = (i, j)
                if state == grid_world.goal:
                    continue  # 终止状态的价值为0，无需更新
                v = V[state]
                # 计算每个动作的期望价值
                action_values = []
                for action in grid_world.actions:
                    next_state, reward, _ = grid_world.step(state, action)
                    action_values.append(reward + grid_world.discount * V[next_state])
                V[state] = max(action_values)
                delta = max(delta, abs(v - V[state]))
    
    # 从价值函数中提取最优策略
    policy = np.zeros((grid_world.size, grid_world.size), dtype=int)
    for i in range(grid_world.size):
        for j in range(grid_world.size):
            state = (i, j)
            if state == grid_world.goal:
                continue
            action_values = []
            for action in grid_world.actions:
                next_state, reward, _ = grid_world.step(state, action)
                action_values.append(reward + grid_world.discount * V[next_state])
            print(i, j, action_values)
            policy[state] = np.argmax(action_values)

    return V, policy


In [67]:
grid_world = GridWorld(goal=(2, 3))
V, policy = value_iteration(grid_world)

print("状态价值函数 V:")
print(V)

print("\n最优策略（动作索引）:")
print(policy)

# 将动作索引转换为可读的动作
action_names = {0: "→", 1: "←", 2: "↓", 3: "↑"}
policy_readable = np.vectorize(action_names.get)(policy)
print("\n最优策略（可读形式）:")
print(policy_readable)

0 0 [0.6561000000000001, 0.5904900000000002, 0.6561000000000001, 0.5904900000000002]
0 1 [0.7290000000000001, 0.5904900000000002, 0.7290000000000001, 0.6561000000000001]
0 2 [0.81, 0.6561000000000001, 0.81, 0.7290000000000001]
0 3 [0.81, 0.7290000000000001, 0.9, 0.81]
1 0 [0.7290000000000001, 0.6561000000000001, 0.7290000000000001, 0.5904900000000002]
1 1 [0.81, 0.6561000000000001, 0.81, 0.6561000000000001]
1 2 [0.9, 0.7290000000000001, 0.9, 0.7290000000000001]
1 3 [0.9, 0.81, 1.0, 0.81]
2 0 [0.81, 0.7290000000000001, 0.6561000000000001, 0.6561000000000001]
2 1 [0.9, 0.7290000000000001, 0.7290000000000001, 0.7290000000000001]
2 2 [1.0, 0.81, 0.81, 0.81]
3 0 [0.7290000000000001, 0.6561000000000001, 0.6561000000000001, 0.7290000000000001]
3 1 [0.81, 0.6561000000000001, 0.7290000000000001, 0.81]
3 2 [0.9, 0.7290000000000001, 0.81, 0.9]
3 3 [0.9, 0.81, 0.9, 1.0]
状态价值函数 V:
[[0.6561 0.729  0.81   0.9   ]
 [0.729  0.81   0.9    1.    ]
 [0.81   0.9    1.     0.    ]
 [0.729  0.81   0.9    1. 

In [68]:
V

array([[0.6561, 0.729 , 0.81  , 0.9   ],
       [0.729 , 0.81  , 0.9   , 1.    ],
       [0.81  , 0.9   , 1.    , 0.    ],
       [0.729 , 0.81  , 0.9   , 1.    ]])

In [62]:
V.shape

(4, 4)

In [63]:
V[0]

array([0.6561, 0.729 , 0.81  , 0.9   ])

In [64]:
V[:, -1]

array([0.9, 1. , 0. , 1. ])

In [65]:
np.argmax([0, 2, 3, 2])

2