#### 数值上证明蒙特卡洛误差分解为若干个时序差分误差的和

In [2]:
import numpy as np

In [7]:
def generate_episode(T=5, reward_scale=1.0, seed=None):
    """
    随机生成一个回合的奖励序列。这里假设到 T 步就结束，简化处理。
    """
    if seed is not None:
        np.random.seed(seed)
    # 生成 T 步的奖励序列
    rewards = np.random.randn(T) * reward_scale
    return rewards


In [11]:
def random_value_function(T=5, seed=None):
    """
    随机生成一个用于对比的价值函数（长度 T + 1，最后一个视作终止状态时的价值为 0）。
    """
    if seed is not None:
        np.random.seed(seed)
    # 对于简化，这里假设 V 有 T+1 个“状态”（索引 0~T），最后一个状态价值设为 0
    V = np.random.randn(T+1)
    V[-1] = 0.0  # 终止状态价值
    return V

In [16]:
T = 5
gamma = 0.9
seed = 42
rewards = generate_episode(T, reward_scale=1.0, seed=seed)
V = random_value_function(T, seed=seed)
print("rewards: ", rewards)
print("V: ", V)

rewards:  [ 0.49671415 -0.1382643   0.64768854  1.52302986 -0.23415337]
V:  [ 0.49671415 -0.1382643   0.64768854  1.52302986 -0.23415337  0.        ]


1. 计算蒙特卡洛误差


In [18]:
G = 0.0
for k, r in enumerate(rewards):
    G += (gamma ** k) * r

mc_error = G - V[0]
print(f"蒙特卡洛误差: {mc_error}")

蒙特卡洛误差: 1.3568505809729647


2. 计算TD误差


In [19]:
td_err_sum = 0.0
for k, r in enumerate(rewards):
    td_error  = r + gamma * V[k+1] - V[k]
    td_err_sum += td_error

print(f"时序差分误差: {td_err_sum}")


时序差分误差: 1.6184706467527779


In [20]:
print("difference: ", mc_error - td_err_sum)

difference:  -0.2616200657798131
