## 1. 测试一下工具包

In [1]:
# Environment smoke test: import the core packages and report their versions.
import torch, torchtext
print(torch.__version__, torchtext.__version__)
# Report whether PyTorch can see a CUDA device.
print("cuda is available: ", torch.cuda.is_available())
# Import check for torchdata; IterableWrapper is not referenced again in the visible cells.
from torchdata.datapipes.iter import IterableWrapper

  from .autonotebook import tqdm as notebook_tqdm


2.0.0+cu117 0.15.1+cpu
cuda is available:  True


## 2. 初始数据

1. Build data processing pipeline to convert the raw text strings into Tensor that can be used to train the model
2. Shuffle and iterate the data with DataLoader

In [2]:
import torch
from torchtext.datasets import AG_NEWS


train_iter = iter(AG_NEWS(split="train"))
# iter() builds an iterator object; each call to its __next__() method
# (e.g. through next(train_iter)) produces the next sample.
# split="train" selects the training split of the dataset.

使用时遇到了无法连接到githubusercontent的问题，解决方案为[csdn blog](https://blog.csdn.net/qq_32618327/article/details/106681211)

In [3]:
# Peek at the first training sample; the output below shows a (label, text) pair.
next(train_iter)

(3,
 "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")

# 3. 为处理数据做准备

初始的文本处理方法：包括vocab，词向量，tokenizer
1. tokenizer:首先，将输入的文本按照一定规则切分成一系列的token；然后，在字典中查表，将每个token用一个整数编号来表示；最后，将字典中不存在的字（词）用特殊标识符（‘UNK’）表示，并赋予相应编号。
2. yield:是Python 的一个关键字，含有 yield 的函数称为生成器函数，调用它不会立即执行函数体，而是返回一个生成器（generator）。生成器是一种特殊类型的迭代器，它允许你延迟计算结果，这在处理大数据或者创建复杂数据结构时特别有用，因为你不需要一次性将所有的数据都存储在内存中。这种函数并不直接返回一个值，而是生成一系列的值：每次对生成器调用 next()（或在 for 循环中迭代）时，函数体会从上次 yield 暂停的地方继续执行，直到遇到下一个 yield，因此可以依次产生许多结果，而不是单个值。

In [4]:
# use function: build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Tokenizer plus a fresh handle on the training split
tokenizer = get_tokenizer("basic_english")
train_iter = AG_NEWS(split="train")


def yield_tokens(data_iter):
    """Lazily yield the token list of every ``(label, text)`` pair in ``data_iter``.

    The label is ignored; each text is tokenized with the module-level
    ``tokenizer`` at the moment the item is requested.
    """
    yield from (tokenizer(sample_text) for _, sample_text in data_iter)


# ``specials`` (List[str], optional) names tokens that must be part of the
# vocabulary, such as UNK, PAD, BOS and EOS; only "<unk>" is registered here.
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])

# Index returned when an out-of-vocabulary token is looked up.
vocab.set_default_index(vocab["<unk>"])


In [5]:
# vocab converts a list of tokens into integers
# A token missing from the vocabulary (here '@') falls back to the default
# index, i.e. the index of "<unk>"; the output below shows both mapping to 0.
vocab(['here', 'is', 'an', 'example','@', "<unk>"])

[475, 21, 30, 5297, 0, 0]

In [6]:
def text_pipeline(x):
    """Tokenize the sentence ``x`` and map its tokens to a list of vocabulary indices."""
    return vocab(tokenizer(x))


def label_pipeline(x):
    """Convert a label to ``int`` and shift it down by one (1..N becomes 0..N-1)."""
    return int(x) - 1

In [25]:
# code in eda.py
# NOTE(review): eda is a project-local module that is not visible here. Judging
# from the usage below, eda() returns an indexable collection of augmented
# variants of the input sentence — confirm against eda.py.
# NOTE(review): get_only_chars is imported but not used in the visible cells,
# and this cell's execution count (25) is out of sequence with its neighbours.
from eda import eda, get_only_chars


# Sample news paragraph used to compare the raw and the augmented encodings.
ex_text_str = "MEMPHIS, Tenn. – Four days ago, Jon Rahm was \
    enduring the season’s worst weather conditions on Sunday at The \
    Open on his way to a closing 75 at Royal Portrush, which \
    considering the wind and the rain was a respectable showing. \
    Thursday’s first round at the WGC-FedEx St. Jude Invitational \
    was another story. With temperatures in the mid-80s and hardly any \
    wind, the Spaniard was 13 strokes better in a flawless round. \
    Thanks to his best putting performance on the PGA Tour, Rahm \
    finished with an 8-under 62 for a three-stroke lead, which \
    was even more impressive considering he’d never played the \
    front nine at TPC Southwind."


# Encode the original paragraph as vocabulary indices.
a = text_pipeline(ex_text_str)
# Take the element at index 2 of eda()'s result as the augmented text and encode it too.
b_str = eda(ex_text_str)[2]
b = text_pipeline(b_str)
print(ex_text_str)
print(b_str)
# Last expression of the cell: display both index sequences as tensors.
torch.tensor(a), torch.tensor(b)

MEMPHIS, Tenn. – Four days ago, Jon Rahm was     enduring the season’s worst weather conditions on Sunday at The     Open on his way to a closing 75 at Royal Portrush, which     considering the wind and the rain was a respectable showing.     Thursday’s first round at the WGC-FedEx St. Jude Invitational     was another story. With temperatures in the mid-80s and hardly any     wind, the Spaniard was 13 strokes better in a flawless round.     Thanks to his best putting performance on the PGA Tour, Rahm     finished with an 8-under 62 for a three-stroke lead, which     was even more impressive considering he’d never played the     front nine at TPC Southwind.
memphis tenn four days ago jon rahm was live the seasons worst weather term on sunday at the open on his way to a closing at purple portrush which considering the wind and the rainfall was a respectable showing thursdays first round at the wgc fedex st st jude invitational was another narrative with temperatures in the mid s and har

(tensor([ 3096,     3,  6691,     1,     0,   161,   301,   349,     3,  4774,
             0,    35, 12839,     2,     0,  1598,  1424,  2056,    10,    91,
            20,     2,   149,    10,    32,   249,     4,     5,  1914,  2642,
            20,  1703, 83143,     3,   103,  1879,     2,  3030,     8,     2,
          1594,    35,     5, 19246,  1646,     1,     0,    47,   426,    20,
             2,     0,   425,     1, 24223,  5965,    35,   206,  1855,     1,
            18,  6640,     7,     2,     0,     8,  5666,   457,  3030,     3,
             2,  6434,    35,   732,  4994,   637,     7,     5, 14920,   426,
             1,  1806,     4,    32,   502,  2072,   847,    10,     2,  2076,
           712,     3,     0,  2034,    18,    30, 15250,  4062,    11,     5,
         12098,   170,     3,   103,    35,   331,    46,  3934,  1879,     0,
           972,  1258,     2,  1450,   799,    20, 15192,     0,     1]),
 tensor([ 3096,  6691,   161,   301,   349,  4774,     0,

# 4. 生成数据集迭代器

使用DataLoader时，map-style数据集需要实现`__getitem__`和`__len__`方法；由于每个句子的长度不同且没有做填充，这里还需要自行定义`collate_fn`来把一个batch的样本拼接起来。

In [8]:
from torch.utils.data import DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def collate_batch(batch):
    """Merge a list of ``(label, text)`` samples into three tensors on ``device``.

    Returns:
        A tuple ``(labels, tokens, offsets)`` where ``labels`` holds the
        pipeline-converted labels, ``tokens`` is the 1-D concatenation of every
        sample's token indices (no padding), and ``offsets`` holds the start
        position of each sample inside ``tokens``.
    """
    labels, encoded_texts, lengths = [], [], []
    for raw_label, raw_text in batch:
        labels.append(label_pipeline(raw_label))
        token_ids = torch.tensor(text_pipeline(raw_text), dtype=torch.int64)
        encoded_texts.append(token_ids)
        # Remember how many tokens this sentence contributed.
        lengths.append(token_ids.size(0))

    label_tensor = torch.tensor(labels, dtype=torch.int64)
    # Start positions are the running sum of the lengths, shifted right by one:
    # the first sample starts at 0 and the last length is not needed.
    start_positions = torch.tensor(([0] + lengths)[:-1]).cumsum(dim=0)
    # Sentences differ in length and are not padded, so they are laid out
    # back to back in a single one-dimensional tensor.
    flat_tokens = torch.cat(encoded_texts)
    return label_tensor.to(device), flat_tokens.to(device), start_positions.to(device)

In [9]:
# Re-create the training split and wrap it in a DataLoader that builds batches
# of 16 samples through collate_batch.
# NOTE(review): `dataloader` is not referenced again in the visible cells; the
# training cell further down constructs its own loaders.
train_iter = AG_NEWS(split="train")
dataloader = DataLoader(train_iter, batch_size=16, shuffle=True, collate_fn=collate_batch)

# 5. 定义分类模型

使用EmbeddingBag layer和线性层作为分类器。
1. EmbeddingBag layer对每个bag中的embeddings做聚合，可选均值（mean）、求和（sum）或最大值（max），默认为mean，详细的用法见[Doc](https://pytorch.org/docs/stable/generated/torch.nn.EmbeddingBag.html#torch.nn.EmbeddingBag)。
2. ``nn.EmbeddingBag`` module requires no padding here since the text lengths are saved in offsets.

In [10]:
import torch.nn as nn

class TextCLSModel(nn.Module):
    """Text classifier: an ``nn.EmbeddingBag`` followed by one linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: width of each embedding vector.
        num_class: number of output classes.
    """

    def __init__(self, vocab_size, embed_dim, num_class) -> None:
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False)
        self.fc = nn.Linear(embed_dim, num_class)
        self.initrange = 0.5
        self.init_weights()

    def init_weights(self):
        """Draw both weight matrices from U(-initrange, initrange) and zero the bias."""
        bound = self.initrange
        # Embedding table first, then the linear layer.
        for layer in (self.embedding, self.fc):
            layer.weight.data.uniform_(-bound, bound)
        # The linear layer's bias starts at zero.
        self.fc.bias.data.zero_()

    def forward(self, text, offsets=None):
        """Return class scores for ``text``.

        A one-dimensional ``text`` index tensor must be accompanied by
        ``offsets`` marking where each sample starts.
        """
        pooled = self.embedding(text, offsets)
        scores = self.fc(pooled)
        return scores

In [11]:
# Number of classes = number of distinct labels seen in the training split.
num_class = len({label for label, _ in train_iter})
vocab_size = len(vocab)  # size of the vocabulary
print("词汇表长度为: ", vocab_size)
emsize = 64
model = TextCLSModel(vocab_size, emsize, num_class).to(device)

词汇表长度为:  95811


### 准备训练
`torch.nn.utils.clip_grad_norm_`：梯度裁剪，将所有参数梯度的总范数限制在给定阈值以内，防止梯度爆炸

In [12]:
import time

def train(dataloader, optimizer, criterion, epoch:int):
    """Run one training epoch over ``dataloader`` using the module-level ``model``.

    Args:
        dataloader: iterable yielding ``(label, text, offsets)`` batches.
        optimizer: optimizer whose ``zero_grad()`` / ``step()`` drive the update.
        criterion: loss callable invoked as ``criterion(pred_label, label)``.
        epoch: epoch number; used only in the progress log.

    Every ``log_interval`` batches the running accuracy is printed and the
    counters are reset, so each log line covers only the batches since the
    previous one.
    """
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        pred_label = model(text, offsets)
        loss = criterion(pred_label, label)
        loss.backward()

        # Gradient clipping (max norm 0.1) to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (pred_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f}".format(
                    epoch, idx, len(dataloader), total_acc / total_count
                )
            )
            total_acc, total_count = 0, 0


def evaluate(dataloader):
    """Return the accuracy of the module-level ``model`` over ``dataloader``.

    The model is switched to eval mode and gradients are disabled. Each batch
    is expected to unpack as ``(label, text, offsets)``. The result is the
    number of correct predictions divided by the number of samples seen.
    """
    model.eval()
    correct = 0
    seen = 0

    with torch.no_grad():
        for label, text, offsets in dataloader:
            scores = model(text, offsets)
            correct += (scores.argmax(1) == label).sum().item()
            seen += label.size(0)
    return correct / seen
        


### 切分数据集

In [14]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

# Hyperparameters
EPOCHS = 3  # epoch
LR = 5  # learning rate
BATCH_SIZE = 64  # batch size for training
SPLIT_SEED = 42  # fixes the train/validation split so reruns use the same validation set

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)

# Multiply the learning rate by gamma on every scheduler.step() call
# (step_size is a count of steps, so it is passed as an int).
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.1)

total_accu = None
train_iter, test_iter = AG_NEWS()

# Convert iterable-style dataset to map-style dataset
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

# Hold out 10% of the training data for validation; a seeded generator keeps
# the split identical across runs.
num_train = int(len(train_dataset) * 0.9)
split_train_, split_valid_ = random_split(
    train_dataset,
    [num_train, len(train_dataset) - num_train],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

train_dataloader = DataLoader(
    split_train_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)
# Evaluation only counts correct predictions, so sample order is irrelevant
# and shuffling is skipped for the validation and test loaders.
valid_dataloader = DataLoader(
    split_valid_, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch
)
test_dataloader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch
)

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader, optimizer, criterion, epoch)
    accu_val = evaluate(valid_dataloader)

    if total_accu is not None and total_accu > accu_val:
        # Validation accuracy fell below the recorded value: treat the
        # learning rate as too large and decay it.
        scheduler.step()
    else:
        total_accu = accu_val

    print("-" * 59)
    print(
        "| end of epoch {:3d} | time: {:5.2f}s | "
        "valid accuracy {:8.3f} ".format(
            epoch, time.time() - epoch_start_time, accu_val
        )
    )
    print("-" * 59)


| epoch   1 |   500/ 1688 batches | accuracy    0.933
| epoch   1 |  1000/ 1688 batches | accuracy    0.933
| epoch   1 |  1500/ 1688 batches | accuracy    0.932
-----------------------------------------------------------
| end of epoch   1 | time:  5.26s | valid accuracy    0.935 
-----------------------------------------------------------
| epoch   2 |   500/ 1688 batches | accuracy    0.941
| epoch   2 |  1000/ 1688 batches | accuracy    0.938
| epoch   2 |  1500/ 1688 batches | accuracy    0.934
-----------------------------------------------------------
| end of epoch   2 | time:  5.13s | valid accuracy    0.932 
-----------------------------------------------------------
| epoch   3 |   500/ 1688 batches | accuracy    0.948
| epoch   3 |  1000/ 1688 batches | accuracy    0.950
| epoch   3 |  1500/ 1688 batches | accuracy    0.952
-----------------------------------------------------------
| end of epoch   3 | time:  5.10s | valid accuracy    0.936 
-------------------------------

### 在测试集实验

In [15]:
# Measure accuracy on the held-out test loader and print it.
accu_test = evaluate(test_dataloader)
print("test accuracy {:8.3f}".format(accu_test))

test accuracy    0.907


### 在其他的随机news上测试

In [17]:
# Human-readable names for the 1-based labels returned by predict().
ag_news_label = {1:"world", 2:"sports", 3:"Business", 4: "Sci/Tec"}

def predict(text, text_pipeline):
    """Classify a single raw text with the module-level ``model``.

    Args:
        text: the raw string to classify.
        text_pipeline: callable mapping a string to a list of vocabulary indices.

    Returns:
        The predicted label as a 1-based int (a key of ``ag_news_label``).
    """
    # Build the inputs on whatever device the model currently lives on, so the
    # function works without first moving the model to the CPU.
    model_device = next(model.parameters()).device
    with torch.no_grad():
        # dtype is pinned to int64, matching collate_batch; without it an empty
        # token list would produce a float32 tensor (torch.tensor([]) default).
        token_ids = torch.tensor(
            text_pipeline(text), dtype=torch.int64, device=model_device
        )
        # A single offset of 0 means the whole tensor is one text.
        offsets = torch.tensor([0], dtype=torch.int64, device=model_device)
        output = model(token_ids, offsets)
        return output.argmax(1).item() + 1

# Sample news paragraph to classify.
ex_text_str = "MEMPHIS, Tenn. – Four days ago, Jon Rahm was \
    enduring the season’s worst weather conditions on Sunday at The \
    Open on his way to a closing 75 at Royal Portrush, which \
    considering the wind and the rain was a respectable showing. \
    Thursday’s first round at the WGC-FedEx St. Jude Invitational \
    was another story. With temperatures in the mid-80s and hardly any \
    wind, the Spaniard was 13 strokes better in a flawless round. \
    Thanks to his best putting performance on the PGA Tour, Rahm \
    finished with an 8-under 62 for a three-stroke lead, which \
    was even more impressive considering he’d never played the \
    front nine at TPC Southwind."

# Move the model to the CPU for this single-sample prediction.
model = model.to("cpu")

# Look up the predicted 1-based label in ag_news_label and print its name.
print("This is a %s news" % ag_news_label[predict(ex_text_str, text_pipeline)])

This is a sports news
