问答模型

In [19]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
from torch import functional
from torch.utils.data import DataLoader, Dataset
# Load the tokenizer and model
from sacrebleu.metrics import BLEU

tokenizer = T5Tokenizer.from_pretrained("Langboat/mengzi-t5-base")
model = T5ForConditionalGeneration.from_pretrained("Langboat/mengzi-t5-base")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)



T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [20]:
print(device)
model

cuda


T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

数据集处理

In [21]:
import json
class QADataSet(Dataset):
    def __init__(self, file_path):
        super().__init__()
        self.data_path = file_path
        self.data = self.load_data()
    
    def load_data(self):
        data = []
        with open(self.data_path, 'rt', encoding='utf-8') as f:
            for line in f:
                json_data = json.loads(line)
                question = json_data["question"]
                context = json_data["context"]
                data.append({
                    "input": f"问题是:{question},文章:{context}",
                    "answer": json_data["answer"]
                })
        return data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
    

In [22]:
from torch.utils.data import random_split
dataset = QADataSet('DuReaderQG/train.json')
train_dataset, valid_dataset = random_split(dataset, [int(len(dataset) * 0.8), len(dataset) - int(len(dataset) * 0.8)])
test_dataset = QADataSet('DuReaderQG/dev.json')
print(len(train_dataset), len(test_dataset))

11616 984


In [23]:
maxlength = 0
for data in train_dataset:
    inputs = tokenizer(data["input"],truncation=True, padding=True, max_length=10240, return_tensors="pt")
    maxlength = max(maxlength, inputs.input_ids.shape[1])
    inputs = tokenizer(data["answer"],truncation=True, padding=True, max_length=10240, return_tensors="pt")
    maxlength = max(maxlength, inputs.input_ids.shape[1])

for data in test_dataset:
    inputs = tokenizer(data["input"],truncation=True, padding=True, max_length=10240, return_tensors="pt")
    maxlength = max(maxlength, inputs.input_ids.shape[1])
    inputs = tokenizer(data["answer"],truncation=True, padding=True, max_length=10240, return_tensors="pt")
    maxlength = max(maxlength, inputs.input_ids.shape[1])
maxlength

1187

In [24]:
inputs = tokenizer(train_dataset[0]["input"], truncation=True, padding=True, max_length=1280, return_tensors="pt")
print(inputs)
print(tokenizer.convert_ids_to_tokens(inputs.input_ids[0]))

{'input_ids': tensor([[ 8080,    13,   205,  7153,   264,   698,  3368,  2158,  3653,    98,
             3,  1385,    13,   225,  1362,    51,   205,  7153,   264,   698,
          3368,  2158,    44,   153,   314,   273,  1993, 25753,     3,   285,
           273,  2550,    71,   907,   424,    33,  8934,     3,   205,  7153,
           264,   698,  3368,  2158,  3653,    98,  9994,   136,    17,  6186,
             3,   562, 19585,     7,   688,   231,  3932,  5017,    51,   205,
          7153,   264,   698,    44,   379,   173,  2158,     3,   379,   108,
          2158,  5293,  4504,    64,  8493,  2927,    24,   165,    50,  1084,
            70, 19991, 20346,     3, 17666,  2139,   142,    30,  4734,    51,
          1364,  2137,  1748,     7,  2635,   562,  1563,  2092, 17489,   947,
             7,  1176,  1748, 19258,  2586,   205,  7153,   264,   698,  3368,
          2158,    44,   266,   647,   341,  1919,  1527, 23404,    22,  1364,
          6765,  1701,     7, 27038, 2

In [25]:
def collate_fn(batch):
    inputs = []
    answers = []
    for b in batch:
        inputs.append(b["input"])
        answers.append(b["answer"])

    batch_data = tokenizer(inputs, truncation=True, padding=True, max_length=1280, return_tensors="pt")

    with tokenizer.as_target_tokenizer():
        answer_token = tokenizer(answers, truncation=True, padding=True, max_length=1280, return_tensors="pt").input_ids
        
        batch_data['decoder_input_ids'] = model.prepare_decoder_input_ids_from_labels(answer_token)
        eos_token_id = torch.where(answer_token == tokenizer.eos_token_id)[1]
        for idx, eos_id in enumerate(eos_token_id):
            answer_token[idx][eos_id + 1:] = -100  # Mask out the tokens after the EOS token
        batch_data['labels'] = answer_token
    
    return batch_data

In [26]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

In [27]:
batch = next(iter(train_loader))
print(batch.keys())
print('batch shape:', {k: v.shape for k, v in batch.items()})
print(batch['decoder_input_ids'][0])
print(batch['labels'][0])

dict_keys(['input_ids', 'attention_mask', 'decoder_input_ids', 'labels'])
batch shape: {'input_ids': torch.Size([32, 606]), 'attention_mask': torch.Size([32, 606]), 'decoder_input_ids': torch.Size([32, 11]), 'labels': torch.Size([32, 11])}
tensor([    0,   629, 15781,     1,     0,     0,     0,     0,     0,     0,
            0])
tensor([  629, 15781,     1,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100])




训练

In [45]:
from tqdm.auto import tqdm
def train_loop(dataloader,model,optimizer,epoch, lr_scheduler,total_loss,device):
    progress_bar = tqdm(range(len(dataloader)))
    progress_bar.set_description(f'loss: {0:>7f}')
    finish_batch_num = (epoch-1) * len(dataloader)
    model.train()
    
    for batch,data in enumerate(dataloader,start=1):
        data = data.to(device)
        output = model(**data)
        loss = output.loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        progress_bar.set_description(f'loss: {total_loss / (finish_batch_num + batch):>7f}')
        progress_bar.update(1)
    
    return total_loss


In [46]:
# bleu1
blue_1 = BLEU(max_ngram_order=1)
blue_2 = BLEU(max_ngram_order=2)
blue_3 = BLEU(max_ngram_order=3)
blue_4 = BLEU()

In [47]:
blue_2

<sacrebleu.metrics.bleu.BLEU at 0x1e3dbc92d10>

In [48]:
import numpy as np
def test_loop(dataloader,tokenizer,model,device):
    labels = []
    predictions = []
    model.eval()
    for data in dataloader:
        data = data.to(device)
        with torch.no_grad():
            output = model.generate(data["input_ids"],
                attention_mask=data["attention_mask"],
                max_length=1280,
                num_beams=4,
                no_repeat_ngram_size=2,
            )
        if isinstance(output, tuple):
            output = output[0]
        
        decoded_preds = tokenizer.batch_decode(output, skip_special_tokens=True)
        predictions += [' '.join(pred.strip()) for pred in decoded_preds]

        label_token = data["labels"].cpu().numpy()
        label_token = np.where(label_token == -100, tokenizer.pad_token_id, label_token)
        decoded_label = tokenizer.batch_decode(label_token, skip_special_tokens=True)

        labels += [' '.join(label.strip()) for label in decoded_label]
    
    bleu1 = blue_1.corpus_score(predictions, [labels]).score
    bleu2 = blue_2.corpus_score(predictions, [labels]).score
    bleu3 = blue_3.corpus_score(predictions, [labels]).score
    bleu4 = blue_4.corpus_score(predictions, [labels]).score
    print(f"BLEU-1: {bleu1:.2f}, BLEU-2: {bleu2:.2f}, BLEU-3: {bleu3:.2f}, BLEU-4: {bleu4:.2f}")
    return bleu1, bleu2, bleu3, bleu4



In [50]:
from matplotlib import pyplot as plt

lr = 1e-4
epochs = 10
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
from transformers import get_scheduler
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_loader) * epochs,
)
loss_history = []
maxbleusum = 0.0
for epoch in range(epochs):
    print(f"Epoch {epoch}/{epochs}")
    total_loss = 0.0
    total_loss = train_loop(train_loader, model, optimizer, epoch + 1, lr_scheduler, total_loss, device)
    loss_history.append(total_loss)
    bleu1, bleu2, bleu3, bleu4 = test_loop(valid_loader, tokenizer, model, device)
    bleusum = (bleu1 + bleu2 + bleu3 + bleu4) / 4
    if bleusum > maxbleusum or epoch == 0:
        maxbleusum = bleusum
        model.save_pretrained(f"mengzi-t5-base-finetuned-epoch-{epoch}")
        print(f"Model saved at epoch {epoch} with BLEU sum: {bleusum:.2f}")

# 绘制 loss 曲线
plt.figure(figsize=(10, 5))
plt.plot(loss_history, marker='o')
plt.xlabel("Epoch")
plt.ylabel("Average Loss")
plt.title("Training Loss Curve")
plt.show()

Epoch 0/10


  0%|          | 0/363 [00:00<?, ?it/s]



OutOfMemoryError: CUDA out of memory. Tried to allocate 98.00 MiB. GPU 0 has a total capacty of 23.99 GiB of which 0 bytes is free. Of the allocated memory 33.81 GiB is allocated by PyTorch, and 4.03 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF