In [57]:
from datasets import load_dataset
from transformers import (
    BertTokenizerFast,
    BertPreTrainedModel,
    BertModel,
    TrainingArguments,
    Trainer,
)
from torch.utils.data import DataLoader
import torch.nn as nn

In [58]:
dataset = load_dataset("squad_v2")
print("数据集结构:", dataset)

# 查看训练集的第一条样本
example = dataset["train"][0]
print("\n第一条样本:", example)

数据集结构: DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})

第一条样本: {'id': '56be85543aeaaa14008c9063', 'title': 'Beyoncé', 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number

In [59]:
def process_example(example):
    context = example["context"]
    question = example["question"]
    answers = example["answers"]
    
    is_impossible = len(answers["text"]) == 0
    
    if is_impossible:
        return {
            "context": context,
            "question": question,
            "answer_start": None,
            "answer_text": None,
        }
    else:
        answer_start = answers["answer_start"][0]
        answer_text = answers["text"][0]
        return {
            "context": context,
            "question": question,
            "answer_start": answer_start,
            "answer_text": answer_text,
        }


tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

def tokenize_func(target,tokenizer):
    tokenized_inputs = tokenizer(
        target["question"],
        target["context"],
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = tokenized_inputs.pop("offset_mapping")

    tokenized_inputs["start_positions"] = []
    tokenized_inputs["end_positions"] = []
    tokenized_inputs["is_impossible"]=[]

    for i,offsets in enumerate(offset_mapping):
        input_ids=tokenized_inputs["input_ids"][i]
        cls_index=input_ids.index(tokenizer.cls_token_id)

        if target["answer_start"] is None:
            tokenized_inputs["start_positions"].append(cls_index)
            tokenized_inputs["end_positions"].append(cls_index)
            tokenized_inputs["is_impossible"].append(True)
            
        else:
            start_char=target["answer_start"]
            end_char = start_char + len(target["answer_text"])
            token_start_index = 0
            while offsets[token_start_index][0] < start_char:
                token_start_index += 1
            
            token_end_index = len(input_ids) - 1
            while offsets[token_end_index][1] > end_char:
                token_end_index -= 1
            
            if (offsets[token_start_index][0] <= start_char and 
                offsets[token_end_index][1] >= end_char):
                tokenized_inputs["start_positions"].append(token_start_index)
                tokenized_inputs["end_positions"].append(token_end_index)
                tokenized_inputs["is_impossible"].append(False)
            else:
                tokenized_inputs["start_positions"].append(cls_index)
                tokenized_inputs["end_positions"].append(cls_index)
                tokenized_inputs["is_impossible"].append(True)
    return tokenized_inputs

train_data=dataset["train"].map(process_example,num_proc=6)
test_data=dataset["validation"].map(process_example,num_proc=6)

train_encoding=train_data.map(lambda examples: tokenize_func(examples, tokenizer=tokenizer),batched=True,num_proc=6)
test_encoding=test_data.map(lambda examples: tokenize_func(examples, tokenizer=tokenizer),batched=True,num_proc=6)
print(train_encoding)


Map (num_proc=6):   0%|          | 0/130319 [00:00<?, ? examples/s]

NameError: name 'tokenize_func' is not defined

Map (num_proc=6):   0%|          | 0/130319 [00:00<?, ? examples/s]

NameError: name 'tokenize_func' is not defined