##### load dataset

In [8]:
# Load the SQuAD benchmark; set `squad_v2` to True to load the "squad_v2" dataset instead.
from datasets import load_dataset, load_metric
squad_v2 = False
squad_name = "squad_v2" if squad_v2 else "squad"
datasets = load_dataset(squad_name)

Using the latest cached version of the module from C:\Users\jeomin\.cache\huggingface\modules\datasets_modules\datasets\squad\d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453 (last modified on Sat Mar 25 14:33:01 2023) since it couldn't be found locally at squad., or remotely on the Hugging Face Hub.
Found cached dataset squad (C:/Users/jeomin/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
100%|██████████| 2/2 [00:00<00:00, 45.45it/s]


In [23]:
# Display the loaded DatasetDict (split names, feature names and row counts).
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [2]:
# Load our own dataset from a local JSON file and split it into train/test.
# https://huggingface.co/docs/datasets/loading#from-local-files
from datasets import load_dataset
dataset_file = 'data/our_training_dataset.json'
# The records loaded from the file are read from the "train" split of the
# returned DatasetDict; keep them under their own name so that re-running
# this cell always starts from the same data.
full_dataset = load_dataset("json", data_files=dataset_file)['train']
# Hold out 10% of the examples as a "test" split. The seed is fixed so the
# split (and therefore every downstream result) is reproducible across runs.
datasets = full_dataset.train_test_split(test_size=0.1, seed=42)

Downloading and preparing dataset json/default to C:/Users/jeomin/.cache/huggingface/datasets/json/default-6004272bb7647219/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files: 100%|██████████| 1/1 [00:00<?, ?it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<?, ?it/s]
                                                        

Dataset json downloaded and prepared to C:/Users/jeomin/.cache/huggingface/datasets/json/default-6004272bb7647219/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00, 500.16it/s]


In [3]:
# Display the train/test DatasetDict built from our own data.
datasets

DatasetDict({
    train: Dataset({
        features: ['title', 'context', 'question', 'id', 'answers'],
        num_rows: 948
    })
    test: Dataset({
        features: ['title', 'context', 'question', 'id', 'answers'],
        num_rows: 106
    })
})

##### data preprocess tokenize
1. Some examples in a dataset may have a very long context that exceeds the maximum input length of the model. To deal with longer sequences, truncate only the context by setting `truncation="only_second"`.
2. Next, map the start and end positions of the answer to the original context by setting `return_offsets_mapping=True`.
3. With the mapping in hand, you can now find the start and end tokens of the answer. Use the `sequence_ids` method to find which part of the offset corresponds to the question and which corresponds to the context.

In [5]:
from transformers import (
    AutoTokenizer,
    BertForQuestionAnswering,
    DefaultDataCollator,
    PreTrainedTokenizerFast,
    Trainer,
    TrainingArguments,
)

# Hyperparameters.
model_checkpoint = 'bert-large-uncased-whole-word-masking-finetuned-squad'
max_length = 384  # maximum length of a feature (question and context)
doc_stride = 64  # authorized overlap between two parts of the context when it has to be split

# Load the tokenizer that matches the checkpoint. The preprocessing step asks the
# tokenizer for offset mappings, so make sure we really got a "fast" tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
assert isinstance(tokenizer, PreTrainedTokenizerFast)


In [6]:
"""
https://huggingface.co/docs/transformers/main/en/training#train-in-native-pytorch
https://huggingface.co/docs/transformers/main/en/tasks/question_answering
"""
def _answer_token_span(offsets, sequence_ids, start_char, end_char):
    """Return the ``(start_token, end_token)`` label for one tokenized feature.

    Args:
        offsets: per-token ``(char_start, char_end)`` pairs for the feature.
        sequence_ids: per-token sequence ids for the feature; tokens whose id
            is ``1`` belong to the context.
        start_char: character index where the answer starts in the context.
        end_char: character index just past the end of the answer.

    Returns:
        The indices of the first and last tokens of the answer, or ``(0, 0)``
        when the answer is not fully contained in this feature's context span.
    """
    n_tokens = len(sequence_ids)

    # Locate the first and last context tokens of this feature.
    idx = 0
    while idx < n_tokens and sequence_ids[idx] != 1:
        idx += 1
    if idx == n_tokens:
        # No context token at all: nothing can be labelled.
        return 0, 0
    context_start = idx
    while idx < n_tokens and sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    # The answer must lie entirely inside the span. An answer that only
    # partially overlaps it would otherwise be labelled with a token outside
    # the context (context_start - 1 or context_end + 1).
    if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
        return 0, 0

    # Walk forward to the last token that starts at or before the answer start.
    idx = context_start
    while idx <= context_end and offsets[idx][0] <= start_char:
        idx += 1
    start_token = idx - 1

    # Walk backward to the first token that ends at or after the answer end.
    idx = context_end
    while idx >= context_start and offsets[idx][1] >= end_char:
        idx -= 1
    end_token = idx + 1

    return start_token, end_token


def preprocess_function(examples):
    """Tokenize a batch of QA examples and attach answer token labels.

    Relies on the notebook-level ``tokenizer``, ``max_length`` and
    ``doc_stride`` variables.

    Args:
        examples: a batch with the keys ``"question"``, ``"context"`` and
            ``"answers"``; each answer is a mapping with ``"text"`` and
            ``"answer_start"`` lists, of which only the first entry is used.

    Returns:
        The tokenizer output with ``"start_positions"`` and ``"end_positions"``
        added, one entry per feature. Long contexts produce several features
        per example. Features that do not fully contain the answer, and
        examples without any answer, are labelled ``(0, 0)``.
    """
    # Surrounding whitespace in a question is not useful and eats into the
    # space left for the context, so strip it.
    questions = [q.strip() for q in examples["question"]]

    # Tokenize with truncation and padding, but keep the overflows using a
    # stride. One example may therefore yield several features, each with a
    # context that overlaps the context of the previous feature a bit.
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        return_offsets_mapping=True,
        return_overflowing_tokens=True,
        stride=doc_stride,
        padding="max_length",
    )
    # Map from each feature back to the example it was cut from.
    sample_mapping = inputs.pop("overflow_to_sample_mapping")
    # Map from each token to its character positions in the original text.
    offset_mapping = inputs.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = examples["answers"][sample_mapping[i]]

        if not answer["answer_start"]:
            # Example without an answer (e.g. SQuAD v2): nothing to locate.
            start_token, end_token = 0, 0
        else:
            start_char = answer["answer_start"][0]
            end_char = start_char + len(answer["text"][0])
            start_token, end_token = _answer_token_span(
                offset, inputs.sequence_ids(i), start_char, end_char
            )

        start_positions.append(start_token)
        end_positions.append(end_token)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


In [10]:
"""
To apply this function on all the sentences (or pairs of sentences) in our dataset, 
we just use the map method of our dataset object we created earlier. 
This will apply the function on all the elements of all the splits in dataset, so our training,
validation and testing data will be preprocessed in one single command. 
Since our preprocessing changes the number of samples, we need to remove the old columns when applying it.
"""
# Preprocess every split in one batched pass. The original columns are dropped
# because preprocessing changes the number of rows (one example can yield
# several features).
original_columns = datasets["train"].column_names
tokenized_datasets = datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=original_columns,
)


Map:  98%|█████████▊| 86000/87599 [00:58<00:00, 3168.75 examples/s]

##### train

In [None]:
"""
1.Define your training hyperparameters in TrainingArguments. 
The only required parameter is output_dir which specifies where to save your model. 
Set push_to_hub=True to push this model to the Hub 
(you need to be signed in to Hugging Face to upload your model); it is left as False here.
"""
# Training configuration. The same per-device batch size is used for training
# and for evaluation.
batch_size = 8
training_args = TrainingArguments(
    output_dir="comp_qa_model",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    push_to_hub=False,
)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# load the model
model = BertForQuestionAnswering.from_pretrained(model_checkpoint)

# 2. Pass the training arguments to Trainer along with the model, dataset,
#    tokenizer, and data collator.

# The SQuAD DatasetDict calls its held-out split "validation", while the custom
# dataset produced by train_test_split calls it "test"; use whichever exists.
eval_split = "validation" if "validation" in tokenized_datasets else "test"
train_features = tokenized_datasets["train"]
eval_features = tokenized_datasets[eval_split]

# Work on a shuffled subset of at most `max_samples` features per split. The
# size is capped at the split length so that select() never goes out of range
# on small datasets.
max_samples = 1000
small_train_dataset = train_features.shuffle(seed=42).select(
    range(min(max_samples, len(train_features)))
)
small_eval_dataset = eval_features.shuffle(seed=42).select(
    range(min(max_samples, len(eval_features)))
)

# NOTE: despite its name this is a data collator, not a DataLoader.
dataloader = DefaultDataCollator()
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    data_collator=dataloader,
)

In [None]:
"""
3.fine tune the model.
"""
# Run the fine-tuning loop; as the last expression of the cell, the value
# returned by train() is displayed as the cell output.
trainer.train()

In [None]:
# Optional: persist the fine-tuned model to a local directory.
save_dir = "test-squad-trained"
trainer.save_model(save_dir)

preprocess
https://huggingface.co/docs/transformers/main/en/preprocessing
https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/preprocessing.ipynb

fine_tune
https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/training.ipynb
https://huggingface.co/docs/transformers/main/en/training#train-in-native-pytorch

qa
https://github.com/huggingface/notebooks/blob/main/examples/question_answering.ipynb
https://huggingface.co/docs/transformers/main/en/tasks/question_answering
https://github.com/huggingface/transformers/blob/main/examples/pytorch/question-answering/run_qa.py