In [1]:
import json
from datasets import Dataset

# Load your JSON file
with open('./train-v1.1.json') as f:
    data = json.load(f)

# Convert your data to a list of dictionaries
examples = []
for item in data['data']:
    for paragraph in item['paragraphs']:
        context = paragraph['context']
        for qa in paragraph['qas']:
            question = qa['question']
            answer = qa['answers'][0]  # Using the first answer for training
            examples.append({
                'context': context,
                'question': question,
                'answers': {
                    'text': answer['text'],
                    'answer_start': answer['answer_start']
                }
            })

# Create a Hugging Face Dataset object
squad_dataset = Dataset.from_list(examples)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoTokenizer

model_name = "bert-base-uncased"  # or any other pre-trained model
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    inputs = tokenizer(
        examples['question'],
        examples['context'],
        max_length=384,
        truncation="only_second",  # Truncate context, not question
        return_offsets_mapping=True,
        padding="max_length"
    )
    start_positions = []
    end_positions = []

    for i, offsets in enumerate(inputs["offset_mapping"]):
        answer = examples["answers"][i]
        start_char = answer["answer_start"]
        end_char = start_char + len(answer["text"])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

        # If the answer is out of context range, set positions to 0
        if start_char < inputs["offset_mapping"][i][context_start][0] or end_char > inputs["offset_mapping"][i][context_end][1]:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Find the token indices for the answer
            start_positions.append(
                next(idx for idx, mapping in enumerate(inputs["offset_mapping"][i]) if mapping[0] <= start_char < mapping[1])
            )
            end_positions.append(
                next(idx for idx, mapping in enumerate(inputs["offset_mapping"][i]) if mapping[0] < end_char <= mapping[1])
            )

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

tokenized_dataset = squad_dataset.map(preprocess_function, batched=True, remove_columns=squad_dataset.column_names)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Map: 100%|██████████| 87599/87599 [01:07<00:00, 1289.45 examples/s]


In [3]:
from transformers import AutoModelForQuestionAnswering

model = AutoModelForQuestionAnswering.from_pretrained(model_name)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer
)

trainer.train()

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md






  attn_output = torch.nn.functional.scaled_dot_product_attention(
  3%|▎         | 500/16425 [09:11<4:53:11,  1.10s/it]

{'loss': 2.8586, 'grad_norm': 17.295013427734375, 'learning_rate': 1.9391171993911722e-05, 'epoch': 0.09}


  6%|▌         | 1000/16425 [18:25<4:43:35,  1.10s/it]

{'loss': 1.8559, 'grad_norm': 20.600038528442383, 'learning_rate': 1.8782343987823442e-05, 'epoch': 0.18}


  8%|▊         | 1261/16425 [23:17<4:39:06,  1.10s/it]