In [1]:
# Transformers installation
! pip install transformers datasets evaluate accelerate
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->a

In [None]:
#常见的问答任务有两种类型：

#抽取式：从给定的语境中提取答案。
#抽象式：从语境中生成能正确回答问题的答案

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [5]:
#加载数据集SQuAD，这是一个广泛用于问答任务的英文数据集
from datasets import load_dataset

squad = load_dataset("squad", split="train[:5000]")#split="train[:5000]表示从训练集中选取前 5000 个样本

In [6]:
squad = squad.train_test_split(test_size=0.2)#划分训练集和测试集，分割后的squad变为一个包含两个子集的字典

In [7]:
squad["train"][0]

{'id': '56cbfbdd6d243a140015ee36',
 'title': 'Frédéric_Chopin',
 'context': 'In the spring of 1834, Chopin attended the Lower Rhenish Music Festival in Aix-la-Chapelle with Hiller, and it was there that Chopin met Felix Mendelssohn. After the festival, the three visited Düsseldorf, where Mendelssohn had been appointed musical director. They spent what Mendelssohn described as "a very agreeable day", playing and discussing music at his piano, and met Friedrich Wilhelm Schadow, director of the Academy of Art, and some of his eminent pupils such as Lessing, Bendemann, Hildebrandt and Sohn. In 1835 Chopin went to Carlsbad, where he spent time with his parents; it was the last time he would see them. On his way back to Paris, he met old friends from Warsaw, the Wodzińskis. He had made the acquaintance of their daughter Maria in Poland five years earlier, when she was eleven. This meeting prompted him to stay for two weeks in Dresden, when he had previously intended to return to Paris via Le

In [None]:
#answers：答案标记的起始位置以及答案文本。
#context：模型需要从中提取答案的背景信息。
#question：模型需要回答的问题。

In [8]:
#加载分词器
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [9]:
#将原始文本数据转换为模型可接受的输入格式
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]#提取问题，并且去除问题前后空格
    inputs = tokenizer(#分词
        questions,#问题
        examples["context"],#上下文
        max_length=384,#最大序列长度
        truncation="only_second",#仅截断上下文，不截断问题
        return_offsets_mapping=True,#返回每个token在原始文本中的起始和结束字符位置
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")#字符偏移映射
    answers = examples["answers"]#原始答案
    start_positions = []#存储答案起始token位置
    end_positions = []#结束位置

    #遍历每个样本，计算答案位置
    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]#答案在原文中的起始字符位置
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        #确定上下文的token范围
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        # 找到上下文的结束token索引
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # 如果答案不完全在上下文中，标记为(0, 0)，可能是上下文被截断
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            #找到起止位置
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions#结果包含分词后的输入，还有这里新添加的起止位置
    inputs["end_positions"] = end_positions
    return inputs

In [10]:
#应用于整个数据集
tokenized_squad = squad.map(preprocess_function,
                batched=True, #启用批处理
                remove_columns=squad["train"].column_names)#删除不需要的原始列

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [11]:
#创建一个样本批次，DefaultDataCollator不做任何额外处理
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

In [12]:
#加载DistilBERT模型
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
#训练参数配置
training_args = TrainingArguments(
    output_dir="my_awesome_qa_model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
    report_to="none"
)

#初始化
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
)
#开始训练
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,2.351314
2,2.711000,1.716369
3,2.711000,1.618855


TrainOutput(global_step=750, training_loss=2.2851683349609373, metrics={'train_runtime': 478.2832, 'train_samples_per_second': 25.09, 'train_steps_per_second': 1.568, 'total_flos': 1175877900288000.0, 'train_loss': 2.2851683349609373, 'epoch': 3.0})

In [None]:
trainer.push_to_hub()

In [14]:
#进行测试
question = "How many programming languages does BLOOM support?"
context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages."

In [17]:
from transformers import pipeline

question_answerer = pipeline("question-answering", model="stevhliu/my_awesome_qa_model")
question_answerer(question=question, context=context)

config.json:   0%|          | 0.00/561 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/360 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Device set to use cuda:0


{'score': 0.20582705736160278,
 'start': 10,
 'end': 95,
 'answer': '176 billion parameters and can generate text in 46 languages natural languages and 13'}

In [19]:
#以下手动实现
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_qa_model")
inputs = tokenizer(question, context, return_tensors="pt")

In [21]:
import torch
from transformers import AutoModelForQuestionAnswering

model = AutoModelForQuestionAnswering.from_pretrained("stevhliu/my_awesome_qa_model")
with torch.no_grad():
    outputs = model(**inputs)

In [22]:
answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()

In [23]:
predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens)

'176 billion parameters and can generate text in 46 languages natural languages and 13'