# 作业二：实现Word2Vec的连续词袋模型
姓名：
学号：

In [3]:
# !pip install numpy tqdm

需要Python版本大于等于3.6，并检查是否已安装所依赖的第三方库。（若没有安装可以执行上面的代码块）

In [4]:
import importlib
import sys

assert sys.version_info[0] == 3
assert sys.version_info[1] >= 6

requirements = ["numpy", "tqdm"]
_OK = True

for name in requirements:
    try:
        importlib.import_module(name)
    except ImportError:
        print(f"Require: {name}")
        _OK = False

if not _OK:
    exit(-1)
else:
    print("All libraries are satisfied.")


All libraries are satisfied.


## 辅助代码

该部分包含：用于给句子分词的分词器`tokenizer`、用于构造数据的数据集类`Dataset`和用于构建词表的词表类`Vocab`。

> 注: 该部分无需实现。

### 分词器

该分词器会：
1. 将所有字母转为小写；
2. 将句子分为连续的字母序列（word）

In [5]:
import re
from typing import List


def tokenizer(line: str) -> List[str]:
    line = line.lower()
    tokens = list(filter(lambda x: len(x) > 0, re.split(r"\W", line)))
    return tokens


print(tokenizer("It's  useful. "))


['it', 's', 'useful']


### 数据集类

通过设定窗长`window_size`，该数据集类会读取`corpus`中的行，并解析返回`(context, target)`元组。

假如一个句子序列为`a b c d e`，且此时`window_size=2`，`Dataset`会返回：

```
([b, c], a)
([a, c, d], b)
([a, b, d, e], c)
([b, c, e], d)
([c, d], e)
```

In [6]:
class Dataset:
    def __init__(self, corpus: str, window_size: int):
        """
        :param corpus: 语料路径
        :param window_size: 窗口长度
        """
        self.corpus = corpus
        self.window_size = window_size

    def __iter__(self):
        with open(self.corpus, encoding="utf8") as f:
            for line in f:
                tokens = tokenizer(line)
                if len(tokens) <= 1:
                    continue
                for i, target in enumerate(tokens):
                    left_context = tokens[max(0, i - self.window_size): i]
                    right_context = tokens[i + 1: i + 1 + self.window_size]
                    context = left_context + right_context
                    yield context, target

    def __len__(self):
        """ 统计样本语料中的样本个数 """
        len_ = getattr(self, "len_", None)
        if len_ is not None:
            return len_

        len_ = 0
        for _ in iter(self):
            len_ += 1

        setattr(self, "len_", len_)
        return len_


In [7]:
debug_dataset = Dataset("./data/debug.txt", window_size=3)
print(len(debug_dataset))

for i, pair in enumerate(iter(debug_dataset)):
    print(pair)
    if i >= 10:
        break

del debug_dataset

50
(['want', 'to', 'go'], 'i')
(['i', 'to', 'go', 'home'], 'want')
(['i', 'want', 'go', 'home'], 'to')
(['i', 'want', 'to', 'home'], 'go')
(['want', 'to', 'go'], 'home')
(['want', 'to', 'play'], 'i')
(['i', 'to', 'play'], 'want')
(['i', 'want', 'play'], 'to')
(['i', 'want', 'to'], 'play')
(['like', 'eating'], 'i')
(['i', 'eating'], 'like')


### 词表类

`Vocab`可以用`token_to_idx`把token(str)映射为索引(int)，也可以用`idx_to_token`找到索引对应的token。

实例化`Vocab`有两种方法：
1. 读取`corpus`构建词表。
2. 通过调用`Vocab.load_vocab`，可以从已训练的中构建`Vocab`实例。

In [8]:
import os
import warnings
from collections import Counter
from typing import Dict, Tuple


class Vocab:
    VOCAB_FILE = "vocab.txt"
    UNK = "<unk>"

    def __init__(self, corpus: str = None, max_vocab_size: int = -1):
        """
        :param corpus:         语料文件路径
        :param max_vocab_size: 最大词表数量，-1表示不做任何限制
        """
        self._token_to_idx: Dict[str, int] = {}
        self.token_freq: List[Tuple[str, int]] = []

        if corpus is not None:
            self.build_vocab(corpus, max_vocab_size)

    def build_vocab(self, corpus: str, max_vocab_size: int = -1):
        """ 统计词频，并保留高频词 """
        counter = Counter()
        with open(corpus, encoding="utf8") as f:
            for line in f:
                tokens = tokenizer(line)
                counter.update(tokens)

        print(f"总Token数: {sum(counter.values())}")

        # 将找到的词按照词频从高到低排序
        self.token_freq = [(self.UNK, 1)] + sorted(counter.items(),
                                                   key=lambda x: x[1], reverse=True)
        if max_vocab_size > 0:
            self.token_freq = self.token_freq[:max_vocab_size]

        print(f"词表大小: {len(self.token_freq)}")

        for i, (token, _freq) in enumerate(self.token_freq):
            self._token_to_idx[token] = i

    def __len__(self):
        return len(self.token_freq)

    def __contains__(self, token: str):
        return token in self._token_to_idx

    def token_to_idx(self, token: str, warn: bool = False) -> int:
        """ Map the token to index """
        token = token.lower()
        if token not in self._token_to_idx:
            if warn:
                warnings.warn(f"{token} => {self.UNK}")
            token = self.UNK
        return self._token_to_idx[token]

    def idx_to_token(self, idx: int) -> str:
        """ Map the index to token """
        assert 0 <= idx < len(self)
        return self.token_freq[idx][0]

    def save_vocab(self, path: str):
        with open(os.path.join(path, self.VOCAB_FILE), "w", encoding="utf8") as f:
            lines = [f"{token} {freq}" for token, freq in self.token_freq]
            f.write("\n".join(lines))

    @classmethod
    def load_vocab(cls, path: str):
        vocab = cls()

        with open(os.path.join(path, cls.VOCAB_FILE), encoding="utf8") as f:
            lines = f.read().split("\n")

        for i, line in enumerate(lines):
            token, freq = line.split()
            vocab.token_freq.append((token, int(freq)))
            vocab._token_to_idx[token] = i

        return vocab


In [9]:
debug_vocab = Vocab("./data/debug.txt")
print(debug_vocab.token_freq)
del debug_vocab


总Token数: 50
词表大小: 21
[('<unk>', 1), ('want', 6), ('to', 6), ('go', 4), ('i', 3), ('home', 3), ('play', 3), ('like', 3), ('eating', 3), ('he', 3), ('she', 3), ('it', 2), ('is', 2), ('we', 2), ('useful', 1), ('awful', 1), ('can', 1), ('read', 1), ('books', 1), ('will', 1), ('now', 1)]


## Word2Vec实现

本节将实现Word2Vec的CBOW模型，为了便于实现，本实验不引入`Hierarchical Softmax`和` Negative Sampling`等加速技巧，若同学们对这些技术感兴趣，可参考：[word2vec Parameter Learning Explained](https://arxiv.org/pdf/1411.2738.pdf)。

TODO: 实现one-hot向量构建函数(1分)

需求：指定词向量的维度和需要置1的索引，返回类型为`np.ndarray`的one-hot行向量。

In [10]:
import numpy as np


def one_hot(dim: int, idx: int) -> np.ndarray:
    # TODO: 实现one-hot函数（1分）
    zero = np.zeros(dim)
    zero[idx] = 1
    return zero


print(one_hot(4, 1))
print(one_hot(10, 0))
print(one_hot(5, 4))

[0. 1. 0. 0.]
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 1.]


TODO：实现softmax(2分)

> 注意数值溢出的可能

In [11]:
def softmax(x: np.ndarray) -> np.ndarray:
    # TODO: 实现softmax函数（2分）
    _max = np.max(x)
    x -= _max
    exp_sum = np.sum(np.exp(x))
    return np.exp(x) / exp_sum


print(softmax(np.array([i for i in range(10)])))

# 验证上面的 softmax 结果
import torch
import torch.nn.functional as F
x = torch.tensor([i for i in range(10)], dtype=torch.float32)
print("ground truth:", F.softmax(x, dim=0))

[7.80134161e-05 2.12062451e-04 5.76445508e-04 1.56694135e-03
 4.25938820e-03 1.15782175e-02 3.14728583e-02 8.55520989e-02
 2.32554716e-01 6.32149258e-01]


gt: tensor([7.8013e-05, 2.1206e-04, 5.7645e-04, 1.5669e-03, 4.2594e-03, 1.1578e-02,
        3.1473e-02, 8.5552e-02, 2.3255e-01, 6.3215e-01])


TODO：CBOW类，请补全`train_one_step`中的代码。

推荐按照TODO描述的步骤来实现（预计15行代码），也可在保证结果正确的前提下按照自己的思路来实现。

> tips: 建议使用numpy的向量化操作代替Python循环。
> 比如同样是实现两个向量`a`和`b`的内积，`np.dot(a,b)`的运行效率可达纯Python实现的函数的百倍以上。同样的，向量外积也推荐使用`np.outer(a,b)`。具体的函数功能可参考Numpy文档。

In [59]:
import os
import pickle
import time

from tqdm import tqdm


class CBOW:
    def __init__(self, vocab: Vocab, vector_dim: int):
        self.vocab = vocab
        self.vector_dim = vector_dim

        self.U = np.random.uniform(-1, 1, (len(self.vocab), self.vector_dim))  # vocab_size x vector_dim
        self.V = np.random.uniform(-1, 1, (self.vector_dim, len(self.vocab)))  # vector_dim x vocab_size

    def train(self, corpus: str, window_size: int, train_epoch: int, learning_rate: float, save_path: str = None):
        dataset = Dataset(corpus, window_size)
        start_time = time.time()

        for epoch in range(1, train_epoch + 1):
            self.train_one_epoch(epoch, dataset, learning_rate)
            if save_path is not None:
                self.save_model(save_path)

        end_time = time.time()
        print(f"总耗时 {end_time - start_time:.2f}s")

    def train_one_epoch(self, epoch: int, dataset: Dataset, learning_rate: float):
        steps, total_loss = 0, 0.0

        with tqdm(iter(dataset), total=len(dataset), desc=f"Epoch {epoch}", ncols=80) as pbar:
            for sample in pbar:
                # sample 就是最开始定义的 __iter__ 返回的元组
                # 第一位是前后文, 第二位是目标词
                context_tokens, target_token = sample
                loss = self.train_one_step(context_tokens, target_token, learning_rate)
                total_loss += loss
                steps += 1
                if steps % 10 == 0:
                    pbar.set_postfix({"Avg. loss": f"{total_loss / steps:.2f}"})

        return total_loss / steps

    def train_one_step(self, context_tokens: List[str], target_token: str, learning_rate: float) -> float:
        """
        :param context_tokens:  目标词周围的词
        :param target_token:    目标词
        :param learning_rate:   学习率
        :return:    loss值 (标量)
        """
        C = len(context_tokens)

        # TODO: 构造输入向量和目标向量（3分）
        # context: 构造输入向量 (8,)
        # target:  目标one-hot向量 (21,)
        context_indices = [self.vocab.token_to_idx(token) for token in context_tokens]
        context = np.mean(self.U[context_indices], axis=0)
        target = one_hot(len(self.vocab), self.vocab.token_to_idx(target_token))

        # TODO: 前向步骤（3分）
        # U.shape=(21, 8), V.shape=(8, 21)
        out = np.dot(self.V.T, context)
        y = softmax(out)

        # TODO: 计算loss（3分）
        loss = -np.log(y[self.vocab.token_to_idx(target_token)])

        # TODO: 更新参数（3分）
        e = y - target  # e 是 (21,)
        dV = np.outer(context, e)  # dV 是 (8, 21)
        dcontext = np.dot(self.V, e)  # dcontext 是 (8,)
        ### 注意是前面先算完所有梯度再更新
        self.U[context_indices] -= learning_rate * (dcontext / C)
        self.V -= learning_rate * dV

        return loss

    def similarity(self, token1: str, token2: str):
        """ 计算两个词的相似性 """
        v1 = self.U[self.vocab.token_to_idx(token1)]
        v2 = self.U[self.vocab.token_to_idx(token2)]
        v1 = v1 / np.linalg.norm(v1)
        v2 = v2 / np.linalg.norm(v2)
        return np.dot(v1, v2)

    def most_similar_tokens(self, token: str, n: int):
        """ 召回与token最相似的n个token """
        norm_U = self.U / np.linalg.norm(self.U, axis=1, keepdims=True)

        idx = self.vocab.token_to_idx(token, warn=True)
        v = norm_U[idx]

        cosine_similarity = np.dot(norm_U, v)
        nbest_idx = np.argsort(cosine_similarity)[-n:][::-1]

        results = []
        for idx in nbest_idx:
            _token = self.vocab.idx_to_token(idx)
            results.append((_token, cosine_similarity[idx]))

        return results

    def save_model(self, path: str):
        """ 将模型保存到`path`路径下，如果不存在`path`会主动创建 """
        os.makedirs(path, exist_ok=True)
        self.vocab.save_vocab(path)

        with open(os.path.join(path, "wv.pkl"), "wb") as f:
            param = {"U": self.U, "V": self.V}
            pickle.dump(param, f)

    @classmethod
    def load_model(cls, path: str):
        """ 从`path`加载模型 """
        vocab = Vocab.load_vocab(path)

        with open(os.path.join(path, "wv.pkl"), "rb") as f:
            param = pickle.load(f)

        U, V = param["U"], param["V"]
        model = cls(vocab, U.shape[1])
        model.U, model.V = U, V

        return model


## 测试

测试部分可用于验证CBOW实现的正确性，此部分的结果不计入总分。

### 测试1

本测试可用于调试，最终一个epoch的平均loss约为0.5，并且“i”、“he”和“she”的相似性较高。

In [60]:
import random


def test1():
    random.seed(42)
    np.random.seed(42)

    vocab = Vocab(corpus="./data/debug.txt")
    cbow = CBOW(vocab, vector_dim=8)
    cbow.train(corpus="./data/debug.txt", window_size=3,
               train_epoch=10, learning_rate=1.0)

    print(cbow.most_similar_tokens("i", 5))
    print(cbow.most_similar_tokens("he", 5))
    print(cbow.most_similar_tokens("she", 5))


test1()


总Token数: 50
词表大小: 21


Epoch 1: 100%|████████████████| 50/50 [00:00<00:00, 8334.93it/s, Avg. loss=2.89]
Epoch 2: 100%|████████████████| 50/50 [00:00<00:00, 6245.61it/s, Avg. loss=1.54]
Epoch 3: 100%|████████████████| 50/50 [00:00<00:00, 8282.92it/s, Avg. loss=1.05]
Epoch 4: 100%|████████████████| 50/50 [00:00<00:00, 8333.28it/s, Avg. loss=0.82]
Epoch 5: 100%|████████████████| 50/50 [00:00<00:00, 8333.61it/s, Avg. loss=0.76]
Epoch 6: 100%|████████████████| 50/50 [00:00<00:00, 8331.62it/s, Avg. loss=0.67]
Epoch 7: 100%|████████████████| 50/50 [00:00<00:00, 6251.20it/s, Avg. loss=0.53]
Epoch 8: 100%|████████████████| 50/50 [00:00<00:00, 5000.12it/s, Avg. loss=0.54]
Epoch 9: 100%|████████████████| 50/50 [00:00<00:00, 5543.33it/s, Avg. loss=0.52]
Epoch 10: 100%|███████████████| 50/50 [00:00<00:00, 6251.20it/s, Avg. loss=0.50]

总耗时 0.09s
[('i', 0.9999999999999999), ('he', 0.9925540605382073), ('she', 0.966337856762682), ('<unk>', 0.635699270231461), ('is', 0.3974123537637737)]
[('he', 0.9999999999999999), ('i', 0.9925540605382073), ('she', 0.98580084006031), ('<unk>', 0.6171017925293523), ('is', 0.35823278721958063)]
[('she', 1.0), ('he', 0.98580084006031), ('i', 0.966337856762682), ('<unk>', 0.501227926206575), ('is', 0.38698246680231174)]





### 测试2

本测试将会在`treebank.txt`上训练词向量，为了加快训练流程，实验只保留高频的4000词，且词向量维度为20。

在每个epoch结束后，会在`data/treebank.txt`中测试词向量的召回能力。如下所示，`data/treebank.txt`中每个样例为`word`以及对应的同义词，同义词从wordnet中获取。

```
[
  "about",
  [
    "most",
    "virtually",
    "around",
    "almost",
    "near",
    "nearly",
    "some"
  ]
]
```

> 本阶段预计消耗25分钟，具体时间与`train_one_step`代码实现有关

> 最后一个epoch平均loss降至5.1左右，并且在同义词上的召回率约为20%左右

In [61]:
import json
import time


def calculate_recall_rate(model: CBOW, word_synonyms: List[Tuple[str, List[str]]], topn: int) -> float:
    """ 测试CBOW的召回率 """
    hit, total = 0, 1e-9
    for word, synonyms in word_synonyms:
        synonyms = set(synonyms)
        recalled = set([w for w, _ in model.most_similar_tokens(word, topn)])
        hit += len(synonyms & recalled)
        total += len(synonyms)

    print(f"Recall rate: {hit / total:.2%}")
    return hit / total


def test2():
    random.seed(42)
    np.random.seed(42)

    corpus = "./data/treebank.txt"
    lr = 1e-1
    topn = 40

    vocab = Vocab(corpus, max_vocab_size=4000)
    model = CBOW(vocab, vector_dim=20)

    dataset = Dataset(corpus, window_size=4)

    with open("data/synonyms.json", encoding="utf8") as f:
        word_synonyms: List[Tuple[str, List[str]]] = json.load(f)

    for epoch in range(1, 11):
        model.train_one_epoch(epoch, dataset, learning_rate=lr)
        calculate_recall_rate(model, word_synonyms, topn)

start = time.time()
test2()
end = time.time()
print("Total time:", round(end-start, 3))


总Token数: 205068
词表大小: 4000


Epoch 1: 100%|████████| 205058/205058 [02:11<00:00, 1554.99it/s, Avg. loss=5.99]


Recall rate: 8.28%


Epoch 2: 100%|█████████| 205058/205058 [03:45<00:00, 907.54it/s, Avg. loss=5.59]


Recall rate: 12.43%


Epoch 3: 100%|████████| 205058/205058 [03:05<00:00, 1105.81it/s, Avg. loss=5.44]


Recall rate: 13.61%


Epoch 4: 100%|████████| 205058/205058 [02:47<00:00, 1223.87it/s, Avg. loss=5.34]


Recall rate: 15.98%


Epoch 5: 100%|████████| 205058/205058 [02:33<00:00, 1331.80it/s, Avg. loss=5.26]


Recall rate: 16.57%


Epoch 6: 100%|████████| 205058/205058 [02:42<00:00, 1258.04it/s, Avg. loss=5.20]


Recall rate: 18.93%


Epoch 7: 100%|████████| 205058/205058 [02:46<00:00, 1231.84it/s, Avg. loss=5.15]


Recall rate: 19.82%


Epoch 8: 100%|████████| 205058/205058 [02:50<00:00, 1205.47it/s, Avg. loss=5.11]


Recall rate: 20.12%


Epoch 9: 100%|████████| 205058/205058 [02:46<00:00, 1233.37it/s, Avg. loss=5.08]


Recall rate: 19.82%


Epoch 10: 100%|███████| 205058/205058 [02:35<00:00, 1321.10it/s, Avg. loss=5.05]


Recall rate: 19.82%
Total time: 1686.635


**实验总结**

**CBOW 的训练过程**

在每轮训练中，遍历数据集，对于每一组 `sample` 都会进行 `train_one_step` 来计算这一组的 loss，并更新模型参数

首先计算 `context_indices = [self.vocab.token_to_idx(token) for token in context_tokens]`，是输入上下文的索引位置，`self.U[context_indices]` 能方便取出对应位置的权重，用来构建输入和更新参数。输入是取 `context_indices` 上 U 的参数的直接平均

`target` 是目标词的单热向量，把 `token_to_idx(target_tokne)` 这个位置传入 `one_hot` 即可。由于这个ground truth只有一个位置非0，计算交叉熵损失时只需要计算这个位置的预测值负对数。

前向步骤理论上为 $VUx$，在这里已经通过 `context_indices` 取出了 $U$ 中对应索引的权重，所以实际 `context` 对应计算好的 $Ux$。则预测值为 `np.dot(V.T, context)`，再经过 `softmax` 转化成概率向量，和 `target` 计算交叉熵损失（由于 ground truth 是单个位置为1的全0向量，可以直接计算该位置的负对数）

下面计算梯度反向传播。由于本质上 $Ux$ 的结果已经由 `U[context_indices]` 给出，也就是 `context`，所以 $\cfrac{\partial E}{\partial V}=eh^\top=$ `np.outer(context, e)`；$\cfrac{\partial E}{\partial U}=V^\top e x^\top=$ `np.dot(V, e)`，由于 `e` 已经包含了哪些选择的 `x` 位置信息，更新 U 的时候只需要更新 `U[context_indices]` 即可


**实验结果**

在下面的实验中，展示了不同学习率的训练效果（固定训练轮数 10 轮），参数选取情形包括 $0.1,~0.01,~0.001,~0.0001$，以及 $0.1\rightarrow 0.001$ 的衰减学习率（每两个Epoch除以2）

其中 $\alpha\leqslant 1\times10^{-2}$ 的效果都显著不足，下面只选取了其中 $0.01$ 的情形，其余没有绘制在下面的折线图中，省略的 loss 和 recall rate 展示在附录的训练日志中。

* Loss-Epoch

    ![](https://s21.ax1x.com/2024/11/29/pA5R0lF.png)

* Recall Rate-Epoch

    ![](https://s21.ax1x.com/2024/11/29/pA5RwSU.png)

* $\alpha=0.01$ 的长 epoch 训练效果（30epoch）

    可见从 10 轮开始，recall rate就已经稳定在 20%~21% 之间，loss则极缓慢地下降，说明从这里开始已经没有收敛空间了。但考虑到在前面实验中，后面几轮衰减 $\alpha$ 也没有足够的提升，不妨可以猜想 20%~21% 的 recall rate 是这个模型参数量在这个训练集上的极限了。
    
    ![](https://s21.ax1x.com/2024/11/30/pA5btRs.png)

**附录：日志信息**

> 部分实验经过 torch 重写，torch 的所有相关 SEED 也设为 42

1. `lr` 逐轮下降（每两个epoch除以2）
    ```
    Epoch 1 (lr=0.1): 100%|███████████████████| 205058/205058 [01:19<00:00, 2567.64it/s, Avg. loss=5.99]
    Recall rate: 7.40%
    Epoch 2 (lr=0.05): 100%|██████████████████| 205058/205058 [01:20<00:00, 2560.30it/s, Avg. loss=5.58]
    Recall rate: 9.17%
    Epoch 3 (lr=0.05): 100%|██████████████████| 205058/205058 [01:20<00:00, 2549.98it/s, Avg. loss=5.47]
    Recall rate: 9.47%
    Epoch 4 (lr=0.025): 100%|█████████████████| 205058/205058 [01:20<00:00, 2557.60it/s, Avg. loss=5.38]
    Recall rate: 10.65%
    Epoch 5 (lr=0.025): 100%|█████████████████| 205058/205058 [01:19<00:00, 2574.88it/s, Avg. loss=5.34]
    Recall rate: 10.65%
    Epoch 6 (lr=0.0125): 100%|████████████████| 205058/205058 [01:19<00:00, 2579.14it/s, Avg. loss=5.29]
    Recall rate: 10.95%
    Epoch 7 (lr=0.0125): 100%|████████████████| 205058/205058 [01:19<00:00, 2583.04it/s, Avg. loss=5.28]
    Recall rate: 11.83%
    Epoch 8 (lr=0.00625): 100%|███████████████| 205058/205058 [01:19<00:00, 2572.64it/s, Avg. loss=5.26]
    Recall rate: 11.24%
    Epoch 9 (lr=0.00625): 100%|███████████████| 205058/205058 [01:19<00:00, 2577.23it/s, Avg. loss=5.25]
    Recall rate: 11.83%
    Epoch 10 (lr=0.003125): 100%|█████████████| 205058/205058 [01:19<00:00, 2579.53it/s, Avg. loss=5.24]
    Recall rate: 11.54%
    Total time: 800.002
    ```
2. `lr=1e-1`
    ```
    Epoch 1: 100%|████████| 205058/205058 [02:14<00:00, 1529.93it/s, Avg. loss=5.99]
    Recall rate: 8.28%
    Epoch 2: 100%|████████| 205058/205058 [02:41<00:00, 1273.33it/s, Avg. loss=5.59]
    Recall rate: 12.43%
    Epoch 3: 100%|████████| 205058/205058 [02:34<00:00, 1326.35it/s, Avg. loss=5.44]
    Recall rate: 13.61%
    Epoch 4: 100%|████████| 205058/205058 [02:45<00:00, 1238.54it/s, Avg. loss=5.34]
    Recall rate: 15.98%
    Epoch 5: 100%|████████| 205058/205058 [02:35<00:00, 1321.23it/s, Avg. loss=5.26]
    Recall rate: 16.57%
    Epoch 6: 100%|████████| 205058/205058 [02:24<00:00, 1420.50it/s, Avg. loss=5.20]
    Recall rate: 18.93%
    Epoch 7: 100%|████████| 205058/205058 [02:56<00:00, 1161.47it/s, Avg. loss=5.15]
    Recall rate: 19.82%
    Epoch 8: 100%|████████| 205058/205058 [02:29<00:00, 1375.01it/s, Avg. loss=5.11]
    Recall rate: 20.12%
    Epoch 9: 100%|████████| 205058/205058 [02:23<00:00, 1431.64it/s, Avg. loss=5.08]
    Recall rate: 19.82%
    Epoch 10: 100%|███████| 205058/205058 [02:48<00:00, 1218.77it/s, Avg. loss=5.05]
    Recall rate: 19.82%
    Total time: 1552.772
    ```
3. `lr=1e-2`
    ```
    Epoch 1: 100%|████████| 205058/205058 [02:24<00:00, 1417.90it/s, Avg. loss=6.63]
    Recall rate: 3.55%
    Epoch 2: 100%|████████| 205058/205058 [02:18<00:00, 1484.17it/s, Avg. loss=6.13]
    Recall rate: 3.85%
    Epoch 3: 100%|████████| 205058/205058 [02:32<00:00, 1343.80it/s, Avg. loss=5.99]
    Recall rate: 4.44%
    Epoch 4: 100%|████████| 205058/205058 [02:31<00:00, 1350.84it/s, Avg. loss=5.89]
    Recall rate: 4.73%
    Epoch 5: 100%|████████| 205058/205058 [02:23<00:00, 1433.64it/s, Avg. loss=5.82]
    Recall rate: 5.62%
    Epoch 6: 100%|████████| 205058/205058 [02:23<00:00, 1431.04it/s, Avg. loss=5.76]
    Recall rate: 7.10%
    Epoch 7: 100%|████████| 205058/205058 [02:14<00:00, 1529.85it/s, Avg. loss=5.71]
    Recall rate: 7.10%
    Epoch 8: 100%|████████| 205058/205058 [02:12<00:00, 1546.11it/s, Avg. loss=5.66]
    Recall rate: 7.10%
    Epoch 9: 100%|████████| 205058/205058 [02:11<00:00, 1563.37it/s, Avg. loss=5.62]
    Recall rate: 7.99%
    Epoch 10: 100%|███████| 205058/205058 [02:11<00:00, 1564.87it/s, Avg. loss=5.59]
    Recall rate: 8.28%
    Total time: 1403.121
    ```
4. `lr=1e-3`
   ```
    Epoch 1 (lr=0.001): 100%|█████████████████| 205058/205058 [01:20<00:00, 2532.15it/s, Avg. loss=7.76]
    Recall rate: 1.18%
    Epoch 2 (lr=0.001): 100%|█████████████████| 205058/205058 [01:20<00:00, 2548.52it/s, Avg. loss=7.01]
    Recall rate: 1.18%
    Epoch 3 (lr=0.001): 100%|█████████████████| 205058/205058 [01:20<00:00, 2537.13it/s, Avg. loss=6.75]
    Recall rate: 1.18%
    Epoch 4 (lr=0.001): 100%|█████████████████| 205058/205058 [01:20<00:00, 2539.97it/s, Avg. loss=6.59]
    Recall rate: 1.18%
    Epoch 5 (lr=0.001): 100%|█████████████████| 205058/205058 [01:21<00:00, 2529.76it/s, Avg. loss=6.49]
    Recall rate: 1.48%
    Epoch 6 (lr=0.001): 100%|█████████████████| 205058/205058 [01:21<00:00, 2521.30it/s, Avg. loss=6.41]
    Recall rate: 1.48%
    Epoch 7 (lr=0.001): 100%|█████████████████| 205058/205058 [01:21<00:00, 2502.41it/s, Avg. loss=6.36]
    Recall rate: 1.78%
    Epoch 8 (lr=0.001): 100%|█████████████████| 205058/205058 [01:20<00:00, 2535.05it/s, Avg. loss=6.31]
    Recall rate: 2.07%
    Epoch 9 (lr=0.001): 100%|█████████████████| 205058/205058 [01:20<00:00, 2537.18it/s, Avg. loss=6.27]
    Recall rate: 2.07%
    Epoch 10 (lr=0.001): 100%|████████████████| 205058/205058 [01:20<00:00, 2537.86it/s, Avg. loss=6.24]
    Recall rate: 2.37%
    Total time: 811.283
   ```
5. `lr=1e-1` 30 epoch 训练，到后面发生震荡，几乎不再进一步收敛
    ```
    Epoch 1 (lr=0.1): 100%|███████████████████| 205058/205058 [01:21<00:00, 2500.74it/s, Avg. loss=5.99]
    Recall rate: 7.40%
    Epoch 2 (lr=0.1): 100%|███████████████████| 205058/205058 [01:21<00:00, 2507.55it/s, Avg. loss=5.59]
    Recall rate: 10.06%
    Epoch 3 (lr=0.1): 100%|███████████████████| 205058/205058 [01:23<00:00, 2467.67it/s, Avg. loss=5.44]
    Recall rate: 11.83%
    Epoch 4 (lr=0.1): 100%|███████████████████| 205058/205058 [01:21<00:00, 2516.63it/s, Avg. loss=5.34]
    Recall rate: 13.31%
    Epoch 5 (lr=0.1): 100%|███████████████████| 205058/205058 [01:21<00:00, 2526.68it/s, Avg. loss=5.26]
    Recall rate: 17.16%
    Epoch 6 (lr=0.1): 100%|███████████████████| 205058/205058 [01:20<00:00, 2541.01it/s, Avg. loss=5.20]
    Recall rate: 18.64%
    Epoch 7 (lr=0.1): 100%|███████████████████| 205058/205058 [01:20<00:00, 2535.92it/s, Avg. loss=5.15]
    Recall rate: 18.64%
    Epoch 8 (lr=0.1): 100%|███████████████████| 205058/205058 [01:21<00:00, 2508.93it/s, Avg. loss=5.11]
    Recall rate: 18.93%
    Epoch 9 (lr=0.1): 100%|███████████████████| 205058/205058 [01:21<00:00, 2524.33it/s, Avg. loss=5.07]
    Recall rate: 19.82%
    Epoch 10 (lr=0.1): 100%|██████████████████| 205058/205058 [01:21<00:00, 2513.85it/s, Avg. loss=5.04]
    Recall rate: 18.93%
    Epoch 11 (lr=0.1): 100%|██████████████████| 205058/205058 [01:22<00:00, 2489.61it/s, Avg. loss=5.02]
    Recall rate: 19.23%
    Epoch 12 (lr=0.1): 100%|██████████████████| 205058/205058 [01:22<00:00, 2500.58it/s, Avg. loss=4.99]
    Recall rate: 20.12%
    Epoch 13 (lr=0.1): 100%|██████████████████| 205058/205058 [01:21<00:00, 2510.05it/s, Avg. loss=4.97]
    Recall rate: 20.12%
    Epoch 14 (lr=0.1): 100%|██████████████████| 205058/205058 [01:21<00:00, 2521.31it/s, Avg. loss=4.96]
    Recall rate: 20.12%
    Epoch 15 (lr=0.1): 100%|██████████████████| 205058/205058 [01:21<00:00, 2521.02it/s, Avg. loss=4.94]
    Recall rate: 20.12%
    Epoch 16 (lr=0.1): 100%|██████████████████| 205058/205058 [01:22<00:00, 2498.96it/s, Avg. loss=4.93]
    Recall rate: 20.41%
    Epoch 17 (lr=0.1): 100%|██████████████████| 205058/205058 [01:21<00:00, 2502.08it/s, Avg. loss=4.91]
    Recall rate: 19.82%
    Epoch 18 (lr=0.1): 100%|██████████████████| 205058/205058 [01:21<00:00, 2518.19it/s, Avg. loss=4.90]
    Recall rate: 20.41%
    Epoch 19 (lr=0.1): 100%|██████████████████| 205058/205058 [01:22<00:00, 2488.03it/s, Avg. loss=4.89]
    Recall rate: 20.41%
    Epoch 20 (lr=0.1): 100%|██████████████████| 205058/205058 [01:22<00:00, 2495.08it/s, Avg. loss=4.88]
    Recall rate: 21.01%
    Epoch 21 (lr=0.1): 100%|██████████████████| 205058/205058 [01:11<00:00, 2884.52it/s, Avg. loss=4.87]
    Recall rate: 21.01%
    Epoch 22 (lr=0.1): 100%|██████████████████| 205058/205058 [01:12<00:00, 2845.63it/s, Avg. loss=4.87]
    Recall rate: 21.60%
    Epoch 23 (lr=0.1): 100%|██████████████████| 205058/205058 [01:12<00:00, 2812.21it/s, Avg. loss=4.86]
    Recall rate: 21.60%
    Epoch 24 (lr=0.1): 100%|██████████████████| 205058/205058 [01:11<00:00, 2872.27it/s, Avg. loss=4.85]
    Recall rate: 21.30%
    Epoch 25 (lr=0.1): 100%|██████████████████| 205058/205058 [01:12<00:00, 2846.53it/s, Avg. loss=4.85]
    Recall rate: 21.30%
    Epoch 26 (lr=0.1): 100%|██████████████████| 205058/205058 [01:12<00:00, 2838.94it/s, Avg. loss=4.84]
    Recall rate: 21.01%
    Epoch 27 (lr=0.1): 100%|██████████████████| 205058/205058 [01:12<00:00, 2820.65it/s, Avg. loss=4.84]
    Recall rate: 21.01%
    Epoch 28 (lr=0.1): 100%|██████████████████| 205058/205058 [01:13<00:00, 2803.21it/s, Avg. loss=4.83]
    Recall rate: 20.71%
    Epoch 29 (lr=0.1): 100%|██████████████████| 205058/205058 [01:13<00:00, 2795.12it/s, Avg. loss=4.83]
    Recall rate: 20.71%
    Epoch 30 (lr=0.1): 100%|██████████████████| 205058/205058 [01:12<00:00, 2838.62it/s, Avg. loss=4.82]
    Recall rate: 21.01%
    ```