# QA阅读理解

## Step1 导入相关包

In [None]:
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer, DefaultDataCollator

## Step2 数据集加载

In [None]:
# Online option: download the "cmrc2018" dataset directly with load_dataset.
# datasets = load_dataset("cmrc2018", cache_dir="data")
# Offline option (used here): read a DatasetDict previously saved to the local "mrc_data" directory.
datasets = DatasetDict.load_from_disk(dataset_dict_path="mrc_data")
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'context', 'question', 'answers'],
        num_rows: 10142
    })
    validation: Dataset({
        features: ['id', 'context', 'question', 'answers'],
        num_rows: 3219
    })
    test: Dataset({
        features: ['id', 'context', 'question', 'answers'],
        num_rows: 1002
    })
})

In [None]:
# Look at one raw training record to see what the id / context / question / answers fields contain.
first_train_record = datasets["train"][0]
first_train_record

## Step3 数据预处理

In [None]:
# Load the tokenizer that matches the checkpoint fine-tuned later in this notebook.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="hfl/chinese-macbert-base",
)
tokenizer

In [None]:
# Take the first 10 training examples and measure how many tokens each context
# produces, to decide whether (and how aggressively) contexts must be truncated.
sample_dataset = datasets["train"].select(range(10))
# The tokenizer expects a string or a list of strings, not a Dataset object, so
# pass the "context" column explicitly (list() keeps this working when the
# column is returned as a lazy sequence rather than a plain list).
tokenizer_examples = tokenizer(text=list(sample_dataset["context"]))
# Token count per sampled context (special tokens included).
[len(input_ids) for input_ids in tokenizer_examples["input_ids"]]


In [None]:
# Encode the sampled rows as (question, context) pairs.
# Only the context (second segment) may be truncated; every pair is padded to 512 tokens,
# and character offsets are requested so answers can be located in token space later.
questions = sample_dataset["question"]
contexts = sample_dataset["context"]
tokenized_examples = tokenizer(
    text=questions,
    text_pair=contexts,
    truncation="only_second",
    padding="max_length",
    max_length=512,
    return_offsets_mapping=True,
)
tokenized_examples.keys()

In [None]:
# TODO: `tokenizer` is an object created through AutoTokenizer, so the IDE shortcut (Ctrl+Alt+L)
# cannot open its source; the documentation is only reachable through the object's __call__ method.
help(tokenizer.__call__)


以下参数说明来源于 https://huggingface.co/docs/transformers/en/main_classes/tokenizer ；在 PyCharm 中也可以对 tokenizer 对象执行 `help(tokenizer.__call__)` 查看同样的文档。
    Args:
        text (`str`, `list[str]`, `list[list[str]]`, *optional*):
            The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
            (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
            `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
        text_pair (`str`, `list[str]`, `list[list[str]]`, *optional*):
            The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
            (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
            `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).


以本例中的调用为例：

    tokenized_examples = tokenizer(text=sample_dataset["question"],
                                   text_pair=sample_dataset["context"],
                                   return_offsets_mapping=True,
                                   max_length=512, truncation="only_second", padding="max_length")
    tokenized_examples.keys()

各参数的含义：

- text: 这是你要处理的主要输入文本。可以是一个句子，也可以是多个句子组成的列表。在问答任务中，这通常是“问题”。

- text_pair: 这是与 text 配对的第二个输入文本。当你的任务需要两个文本输入时使用，比如问答（问题+上下文）或句子对分类。在问答任务中，这通常是“上下文”。

- text_target: 这是目标文本，主要用于序列到序列（Seq2Seq）模型，比如翻译或摘要任务。它代表你希望模型生成的“正确答案”文本。例如，在翻译任务中，这是翻译后的目标语言句子。

- text_pair_target: 与 text_target 类似，这是配对的目标文本。使用场景较少，但在一些复杂的多输入多输出任务中可能会用到。

- add_special_tokens: 一个开关，决定是否要自动添加模型的特殊标记（比如 [CLS]、[SEP]、<s>、</s>）。默认是打开的（True），因为大多数模型都需要这些特殊标记来理解句子的开始、结束和分隔。

- padding: 控制填充操作。因为一个批次（batch）中的句子长度不同，需要将短句子用特殊的填充符 [PAD] 补齐，使它们的长度与批次中最长的句子一致，这样才能形成规整的张量输入给模型。

- return_offsets_mapping（bool，可选，默认为 False）: 让分词器额外返回一个“偏移量映射表”，这个映射表精确地记录了每一个 Token（词元）对应其在原始字符串中的起始和结束字符索引。
  比如对字符串 "I love Hugging Face"，词元 'Face' 的偏移量是 (15, 19)：起始索引 15（包含），结束索引 19（不包含），即对应原始字符串中下标 15 到 18 的字符 'Face'。

In [None]:
# Show the offset mapping of the first example (one (start_char, end_char) pair per token)
# together with the number of entries it contains.
first_offsets = tokenized_examples["offset_mapping"][0]
print(first_offsets, len(first_offsets))

In [None]:
# Move the offset mapping out of the encoding into its own variable.
# pop() removes the "offset_mapping" key from `tokenized_examples`, so this cell cannot be
# re-run on its own: re-run the tokenization cell first to restore the key.
offset_mapping = tokenized_examples.pop("offset_mapping")

In [None]:
# Walk through the sampled examples and convert each answer's character span
# into a (start_token_pos, end_token_pos) pair, printing the result for inspection.
for idx, offset in enumerate(offset_mapping):
    answer = sample_dataset[idx]["answers"]
    # Character span of the first answer; end_char is exclusive.
    start_char = answer["answer_start"][0]
    end_char = start_char + len(answer["text"][0])

    # Strategy: find the first and last context tokens, then close in on the
    # answer from both sides.
    sequence_ids = tokenized_examples.sequence_ids(idx)
    context_start = sequence_ids.index(1)
    # First position after the context whose sequence id is None, minus one.
    context_end = sequence_ids.index(None, context_start) - 1

    # The answer does not overlap the kept context at all (half-open interval test).
    if offset[context_end][1] <= start_char or offset[context_start][0] >= end_char:
        start_token_pos = 0
        end_token_pos = 0
    else:
        # From the left: first context token starting at or after start_char.
        token_id = context_start
        while token_id <= context_end and offset[token_id][0] < start_char:
            token_id += 1
        start_token_pos = token_id
        # From the right: last context token ending at or before end_char.
        token_id = context_end
        while token_id >= context_start and offset[token_id][1] > end_char:
            token_id -= 1
        end_token_pos = token_id
        # If the answer starts inside the last kept token (or does not line up
        # with token boundaries) the two scans cross; treat that as "no answer"
        # rather than emitting an inverted span.
        if start_token_pos > end_token_pos:
            start_token_pos = 0
            end_token_pos = 0

    print(answer, start_char, end_char, context_start, context_end, start_token_pos, end_token_pos)
    print("token answer decode:", tokenizer.decode(tokenized_examples["input_ids"][idx][start_token_pos: end_token_pos + 1]))

In [None]:
def process_func(examples, *, max_length=384):
    """Tokenize a batch of question/context pairs and attach answer-span labels.

    Each question is paired with its context. Only the context may be truncated
    (``truncation="only_second"``) and every pair is padded to ``max_length``
    tokens. The character span of the first answer of each example is then
    mapped to token indices through the tokenizer's offset mapping.

    Args:
        examples: Batch (dict of lists) with ``"question"``, ``"context"`` and
            ``"answers"`` columns. Each ``answers`` entry is a dict holding
            ``"text"`` and ``"answer_start"`` lists; only the first answer is used.
        max_length: Length in tokens that every encoded pair is truncated or
            padded to. Keyword-only; defaults to 384.

    Returns:
        The tokenizer output with ``"offset_mapping"`` removed and two added
        columns, ``"start_positions"`` and ``"end_positions"``. Both are 0 for
        an example that has no answer, or whose answer does not fall on kept
        context tokens (for instance because it was truncated away).
    """
    tokenized_examples = tokenizer(text=examples["question"],
                                   text_pair=examples["context"],
                                   return_offsets_mapping=True,
                                   max_length=max_length, truncation="only_second", padding="max_length")
    offset_mapping = tokenized_examples.pop("offset_mapping")
    start_positions = []
    end_positions = []
    for idx, offset in enumerate(offset_mapping):
        answer = examples["answers"][idx]
        # Rows without an annotated answer are labelled (0, 0) instead of
        # raising IndexError on the [0] lookups below.
        if len(answer["answer_start"]) == 0 or len(answer["text"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue
        # Character span of the first answer; end_char is exclusive.
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        # Strategy: find the first and last context tokens, then close in on
        # the answer from both sides.
        sequence_ids = tokenized_examples.sequence_ids(idx)
        context_start = sequence_ids.index(1)
        context_end = sequence_ids.index(None, context_start) - 1
        # The answer does not overlap the kept context at all (half-open interval test).
        if offset[context_end][1] <= start_char or offset[context_start][0] >= end_char:
            start_token_pos = 0
            end_token_pos = 0
        else:
            # From the left: first context token starting at or after start_char.
            token_id = context_start
            while token_id <= context_end and offset[token_id][0] < start_char:
                token_id += 1
            start_token_pos = token_id
            # From the right: last context token ending at or before end_char.
            token_id = context_end
            while token_id >= context_start and offset[token_id][1] > end_char:
                token_id -= 1
            end_token_pos = token_id
            # The scans cross when the answer starts inside the last kept token
            # or does not line up with token boundaries; never emit start > end.
            if start_token_pos > end_token_pos:
                start_token_pos = 0
                end_token_pos = 0
        start_positions.append(start_token_pos)
        end_positions.append(end_token_pos)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples

In [None]:
# Apply process_func to every split in batches. The raw columns are dropped so that only
# the tokenizer outputs and the start/end position labels remain.
raw_columns = datasets["train"].column_names
tokenied_datasets = datasets.map(
    process_func,
    remove_columns=raw_columns,
    batched=True,
)
tokenied_datasets

## Step4 加载模型

In [None]:
# Load the pretrained checkpoint with a question-answering head on top.
model = AutoModelForQuestionAnswering.from_pretrained(
    pretrained_model_name_or_path="hfl/chinese-macbert-base",
)

## Step5 配置TrainingArguments

In [None]:
# Training configuration: outputs go to "models_for_qa"; the model is evaluated and
# checkpointed once per epoch, and metrics are logged every 50 steps.
args = TrainingArguments(
    output_dir="models_for_qa",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="epoch",
)

## Step6 配置Trainer

In [None]:
# Wire the model, the tokenized train/validation splits and the training arguments together.
# NOTE(review): newer transformers releases appear to deprecate `tokenizer=` in favour of
# `processing_class=` -- confirm against the installed version before changing it.
trainer = Trainer(
    args=args,
    model=model,
    train_dataset=tokenied_datasets["train"],
    eval_dataset=tokenied_datasets["validation"],
    data_collator=DefaultDataCollator(),
    tokenizer=tokenizer,
)

## Step7 模型训练

In [None]:
# Start fine-tuning with the Trainer built in the previous step.
trainer.train()

## Step8 模型预测

In [None]:
from transformers import pipeline

# Build a question-answering pipeline around the fine-tuned model.
# Use the device the model already lives on instead of hard-coding GPU 0, so the
# cell also works on machines without CUDA (device=0 would fail there).
pipe = pipeline("question-answering", model=model, tokenizer=tokenizer, device=model.device)

In [None]:
# Quick sanity check of the pipeline on a tiny hand-written example
# (question: "Where does Xiao Ming work?", context: "Xiao Ming works in Beijing.").
pipe(question="小明在哪里上班？", context="小明在北京上班。")