#### DQN

* 深度Q网络定义
* Replay buffer实现
* 目标网络实现
* Q值迭代
* 目标网络更新

In [1]:
! pip install "gymnasium[atari, accept-rom-license]"
# 如果用zsh console请用这个命令 pip install gymnasium\[all\]
! pip install matplotlib
! pip install numpy
! pip install torch



#### 1.引入包并注册ale

In [2]:
import copy
import random
import numpy as np

import gymnasium as gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
#from replaybuffer import ReplayBuffer
import ale_py

gym.register_envs(ale_py)

#### 2.深度Q网络定义

三层感知机网络：

1. 状态空间 -> 128；ReLU
2. 128 -> 128；ReLU
3. 128 -> 行为空间

In [3]:
class QNetwork(nn.Module):
    """Three-layer perceptron that outputs one Q-value per action.

    The input width is ``env.observation_space.shape[0]`` and the output width
    is ``env.action_space.n``. Both hidden layers have 128 units followed by ReLU.
    """

    def __init__(self, env):
        """Build the layer stack using the sizes read from ``env``."""
        super().__init__()

        obs_dim = env.observation_space.shape[0]
        n_actions = env.action_space.n
        hidden = 128

        # Layers are listed in input-to-output order and stored under the
        # attribute name `network`.
        layers = [
            nn.Linear(obs_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_actions),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return the output of the layer stack for input ``x``."""
        q_values = self.network(x)
        return q_values

#### 3. 定义经验回放缓存

1. 经验回放缓存中存储(状态，行为，下时刻状态，奖励，回合是否结束)
2. 当经验回放缓存已满，则覆盖最早插入的数据
3. 实现随机采样的功能，并以tensor的形式返回

In [4]:
class ReplayBuffer:
	"""Fixed-capacity circular store of transitions with uniform random sampling.

	Each row holds (state, action, next_state, reward, not_done), where
	``not_done`` is ``1 - done``. Once ``max_size`` rows have been written,
	new rows overwrite the oldest ones.
	"""

	def __init__(self, state_dim, action_dim, max_size=int(1e6), device="cpu"):
		"""Allocate zero-filled NumPy storage for ``max_size`` transitions.

		``action_dim`` is accepted but not used: a single action value is
		stored per row. ``device`` is where sampled tensors are placed.
		"""
		self.max_size = max_size
		self.device = device
		self.ptr = 0   # row that the next add() writes to
		self.size = 0  # number of rows filled so far, capped at max_size

		state_shape = (max_size, state_dim)
		column_shape = (max_size, 1)
		self.state = np.zeros(state_shape)
		self.next_state = np.zeros(state_shape)
		self.action = np.zeros(column_shape)
		self.reward = np.zeros(column_shape)
		self.not_done = np.zeros(column_shape)

	def add(self, state, action, next_state, reward, done):
		"""Write one transition at the current write position and advance it."""
		slot = self.ptr
		self.state[slot] = state
		self.next_state[slot] = next_state
		self.action[slot] = action
		self.reward[slot] = reward
		self.not_done[slot] = 1. - done

		# Wrap around to row 0 when the end of the storage is reached.
		self.ptr = (slot + 1) % self.max_size
		self.size = min(self.size + 1, self.max_size)

	def sample(self, batch_size):
		"""Draw ``batch_size`` rows uniformly (with replacement) from the filled rows.

		Returns a tuple ``(state, action, next_state, reward, not_done)`` of
		tensors on ``self.device``; ``action`` is int64, the rest float32.
		"""
		rows = np.random.randint(0, self.size, size=batch_size)

		def as_tensor(column, dtype):
			return torch.tensor(column[rows], device=self.device, dtype=dtype)

		return (
			as_tensor(self.state, torch.float32),
			as_tensor(self.action, torch.int64),
			as_tensor(self.next_state, torch.float32),
			as_tensor(self.reward, torch.float32),
			as_tensor(self.not_done, torch.float32),
		)

#### 4.1 定义超参
* learning_rate:深度学习学习率
* buffer_size:经验回放缓存容量
* total_timesteps:总训练步数
* epsilon:ε-贪心策略中随机选择行为（探索）的概率
* gamma:折扣因子
* tau:目标网络更新幅度
* learning_starts:从该时间步后开始训练
* train_frequency:每N个时间步训练一次
* log_frequency:输出结果的频率（时间步）
* target_frequency:更新目标网络的频率（时间步）
* batch_size:每次训练网络的数据量

In [5]:
# --- Optimisation ---
learning_rate = 3e-4         # step size passed to the optimizer
batch_size = 256             # transitions drawn from the replay buffer per update

# --- Replay buffer and run length ---
buffer_size = 100_000        # replay buffer capacity, in transitions
total_timesteps = 2_000_000  # total number of environment steps

# --- Q-learning ---
epsilon = 0.01               # probability of taking a random action
gamma = 0.99                 # discount factor
tau = 1.0                    # target-network mixing weight (1.0 copies the weights)

# --- Schedules, all measured in environment steps ---
learning_starts = 10000      # updates begin after this step
train_frequency = 4          # update the Q-network every N steps
target_frequency = 1000      # update the target network every N steps
log_frequency = 500          # print training statistics every N steps

#### 4.2 构建环境和网络

* 定义训练设备（有GPU时使用cuda，否则使用cpu）
* 创建环境
* 定义Q值网络
* 定义目标网络
* 定义优化器
* 实例化缓存

In [6]:
# Use CUDA when it is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Boxing, created with obs_type="ram".
env = gym.make("ALE/Boxing-v5", obs_type="ram")

# Online Q-network, and a deep copy of it that serves as the target network.
q_network = QNetwork(env).to(device)
target_network = copy.deepcopy(q_network)

# Only the online network's parameters are handed to the optimizer.
optimizer = optim.Adam(q_network.parameters(), lr=learning_rate)

buffer = ReplayBuffer(
    state_dim=env.observation_space.shape[0],
    action_dim=env.action_space.n,
    max_size=buffer_size,
    device=device,
)

A.L.E: Arcade Learning Environment (version 0.11.1+2750686)
[Powered by Stella]


#### 4.3 开始训练
1. 初始化环境
2. 根据网络选择行为
3. 应用行为后环境推进
4. 将转移信息存入经验回放缓存
5. 每隔一段时间训练Q值网络
6. 每隔一段时间更新目标网络

In [7]:
# DQN training loop: act epsilon-greedily, store each transition in the replay
# buffer, periodically fit the Q-network to one-step TD targets computed with the
# target network, and periodically update the target network.
obs, _ = env.reset()
total_rewards = []  # return of every finished episode
total_reward = 0    # return accumulated so far in the current episode
for step in range(total_timesteps):

    # Epsilon-greedy action selection.
    if random.random() < epsilon:
        actions = env.action_space.sample()
    else:
        # Acting needs no gradients; no_grad avoids building an autograd graph
        # on every environment step.
        with torch.no_grad():
            obs_tensor = torch.tensor(obs, device=device, dtype=torch.float32).unsqueeze(dim=0)
            q_values = q_network(obs_tensor)
        actions = torch.argmax(q_values, dim=1).cpu().numpy()

    # The greedy branch yields a one-element array; unwrap it to a Python scalar.
    if isinstance(actions, np.ndarray):
        actions = actions.item()

    next_obs, rewards, terminations, truncations, infos = env.step(actions)
    total_reward += rewards

    # Only `terminations` is stored as the done flag, so a truncated episode
    # still bootstraps from next_obs in the TD target below.
    buffer.add(obs, actions, next_obs, rewards, terminations)
    obs = next_obs

    # Either end condition starts a new episode and records the finished return.
    if terminations or truncations:
        obs, _ = env.reset()
        total_rewards.append(total_reward)
        total_reward = 0

    # Training.
    if step > learning_starts:
        if step % train_frequency == 0:
            buffer_obs, act, next_buffer_obs, rew, cont = buffer.sample(batch_size)

            # TD target: r + gamma * max_a' Q_target(s', a'), with the bootstrap
            # term zeroed where `cont` (the stored not-done flag) is 0.
            with torch.no_grad():
                target_max = target_network(next_buffer_obs).max(dim=1, keepdim=True)[0]
                td_target = rew + gamma * target_max * cont

            # Q(s, a) of the online network for the actions that were taken.
            old_val = q_network(buffer_obs).gather(1, act)
            loss = F.mse_loss(old_val, td_target)

            if step % log_frequency == 0:
                # Mean over the last 100 finished episodes; nan until one has finished
                # (guarded so np.mean is never called on an empty list).
                avg_reward = np.mean(total_rewards[-100:]) if total_rewards else float("nan")
                print('td_loss: {}\t q_values: {}\t step: {}, avg_rewards: {}'.format(loss.item(), old_val.mean().item(), step, avg_reward))

            # optimize the model
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # update target network: target <- tau * online + (1 - tau) * target
        if step % target_frequency == 0:
            with torch.no_grad():
                for target_param, param in zip(target_network.parameters(), q_network.parameters()):
                    target_param.copy_(tau * param + (1.0 - tau) * target_param)

td_loss: 1.5051190853118896	 q_values: 10.80508041381836	 step: 10500, avg_rewards: -11.0
td_loss: 1.134813666343689	 q_values: 10.915023803710938	 step: 11000, avg_rewards: -13.666666666666666
td_loss: 0.5540050268173218	 q_values: 11.329310417175293	 step: 11500, avg_rewards: -13.666666666666666
td_loss: 0.5722718834877014	 q_values: 11.132790565490723	 step: 12000, avg_rewards: -13.666666666666666
td_loss: 0.3582562506198883	 q_values: 11.981813430786133	 step: 12500, avg_rewards: -13.666666666666666
td_loss: 0.3659170866012573	 q_values: 11.944762229919434	 step: 13000, avg_rewards: -12.857142857142858
td_loss: 0.331169456243515	 q_values: 12.395763397216797	 step: 13500, avg_rewards: -12.857142857142858
td_loss: 0.3016524314880371	 q_values: 12.254671096801758	 step: 14000, avg_rewards: -12.857142857142858
td_loss: 0.2946300506591797	 q_values: 12.326803207397461	 step: 14500, avg_rewards: -13.625
td_loss: 0.1768348067998886	 q_values: 12.378885269165039	 step: 15000, avg_rewards:

KeyboardInterrupt: 

#### 实验五练习

1. 不修改超参和网络结构，将DQN算法修改为Double DQN
2. 不修改超参，将DQN算法修改为Dueling DQN