In [11]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer, AutoTokenizer
import torch
from datasets import load_dataset

In [12]:
# Select the compute device. The notebook was written for Apple-Silicon (MPS),
# but probing availability keeps it runnable on CUDA or CPU-only machines
# instead of failing once tensors are moved to a backend that does not exist.
_mps_backend = getattr(torch.backends, "mps", None)  # attribute may be absent on older torch builds
if _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Prints "Using MPS backend." on MPS hardware, exactly as before.
print(f"Using {device.type.upper()} backend.")


Using MPS backend.


In [13]:
# Load the `squad` dataset and replace each split with a small subset that is
# shuffled with a fixed seed (1000 training / 500 validation examples).
squad_dataset = load_dataset('squad')
for split_name, subset_size in (('train', 1000), ('validation', 500)):
    shuffled_split = squad_dataset[split_name].shuffle(seed=42)
    squad_dataset[split_name] = shuffled_split.select(range(subset_size))

# Copy of the subsets with an extra "length" column: character count of each context.
dataset_with_length = squad_dataset.map(lambda example: {"length": len(example["context"])})

In [14]:
# Tokenizer for the pretrained checkpoint; used by prepare_features and at evaluation time.
pretrained_checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(pretrained_checkpoint)




In [15]:
def prepare_features(examples):
    """Tokenize a batch of question/context pairs and label every span with its answer tokens.

    The question is encoded first and the context second; only the context is
    truncated. Contexts that do not fit into ``max_length=512`` are split into
    overlapping spans (``stride=128``), so one input example may produce several
    features. Every feature is padded to ``max_length``.

    Relies on the module-level ``tokenizer``.

    Args:
        examples: Batch (dict of lists) with ``question``, ``context`` and
            ``answers`` entries. Each ``answers`` item holds parallel ``text``
            and ``answer_start`` lists, where ``answer_start`` is a character
            offset into the context. Only the first answer is used for labels.

    Returns:
        The tokenizer output without ``overflow_to_sample_mapping`` and
        ``offset_mapping``, extended with ``start_positions`` and
        ``end_positions``. These are the token indices of the answer inside the
        span, or the index of the [CLS] token when the example has no answer or
        the answer is not fully contained in the span.
    """
    tokenized_examples = tokenizer(
        examples['question'],
        examples['context'],
        truncation="only_second",
        max_length=512,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # feature index -> index of the example it was cut from
    sample_mapping = tokenized_examples.pop('overflow_to_sample_mapping')
    # per feature: (char_start, char_end) for every token
    offset_mapping = tokenized_examples.pop('offset_mapping')

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        answers = examples['answers'][sample_mapping[i]]
        input_ids = tokenized_examples['input_ids'][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # No annotated answer: point both labels at [CLS].
        if len(answers['answer_start']) == 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        start_char = answers['answer_start'][0]
        end_char = start_char + len(answers['text'][0])

        # Sequence id 1 marks tokens of the second sequence, i.e. the context.
        sequence_ids = tokenized_examples.sequence_ids(i)
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - sequence_ids[::-1].index(1) - 1

        # The answer must lie completely inside this span. A partially
        # overlapping answer previously slipped through and was labeled with
        # the [SEP] token next to the context window.
        answer_in_span = (
            offsets[context_start][0] <= start_char
            and offsets[context_end][1] >= end_char
        )
        if not answer_in_span:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Both scans stay inside [context_start, context_end]: special and
        # padding tokens carry offset (0, 0), which would otherwise satisfy the
        # comparisons and drag the label into the padding region.
        token_start_index = context_start
        while token_start_index <= context_end and offsets[token_start_index][0] <= start_char:
            token_start_index += 1
        start_positions.append(token_start_index - 1)

        token_end_index = context_end
        while token_end_index >= context_start and offsets[token_end_index][1] >= end_char:
            token_end_index -= 1
        end_positions.append(token_end_index + 1)

    tokenized_examples['start_positions'] = start_positions
    tokenized_examples['end_positions'] = end_positions
    return tokenized_examples


In [16]:
# Tokenize every split in batches; the raw text columns are dropped because one
# example can expand into several features, so the row counts no longer match.
tokenized_datasets = squad_dataset.map(
    prepare_features,
    batched=True,
    remove_columns=squad_dataset["train"].column_names,
)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [17]:
# Columns returned as torch tensors: model inputs plus the answer-span labels.
tensor_columns = ['input_ids', 'attention_mask', 'token_type_ids', 'start_positions', 'end_positions']

train_dataset = tokenized_datasets['train']
eval_dataset = tokenized_datasets['validation']
for split_dataset in (train_dataset, eval_dataset):
    split_dataset.set_format(type='torch', columns=tensor_columns)

In [18]:
# Pretrained encoder with a question-answering head, moved to the selected device.
model = AutoModelForQuestionAnswering.from_pretrained('bert-base-cased').to(device)

# Fine-tuning hyper-parameters, collected in one place.
training_config = {
    'output_dir': './results',
    'evaluation_strategy': "epoch",  # evaluate on eval_dataset after every epoch
    'learning_rate': 3e-5,
    'per_device_train_batch_size': 8,
    'per_device_eval_batch_size': 8,
    'num_train_epochs': 2,
    'weight_decay': 0.01,
    'logging_dir': './logs',
    'logging_steps': 50,
    'save_total_limit': 2,
    'fp16': False,
}
training_args = TrainingArguments(**training_config)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,3.3941,2.505551
2,1.873,2.299401


TrainOutput(global_step=250, training_loss=2.8919654235839842, metrics={'train_runtime': 250.3456, 'train_samples_per_second': 7.989, 'train_steps_per_second': 0.999, 'total_flos': 522593513472000.0, 'train_loss': 2.8919654235839842, 'epoch': 2.0})

In [19]:
# Write the fine-tuned model and the tokenizer files into the same directory.
saved_model_dir = './models/test_1k_model'
trainer.save_model(saved_model_dir)
tokenizer.save_pretrained(saved_model_dir)

('./models/test_qa_model/tokenizer_config.json',
 './models/test_qa_model/special_tokens_map.json',
 './models/test_qa_model/vocab.txt',
 './models/test_qa_model/added_tokens.json',
 './models/test_qa_model/tokenizer.json')

In [20]:
# Run the Trainer's evaluation on the tokenized validation features and print the metrics.
eval_results = trainer.evaluate(eval_dataset=eval_dataset)
print(f"Evaluation Results: {eval_results}")

Evaluation Results: {'eval_loss': 2.299401044845581, 'eval_runtime': 15.4637, 'eval_samples_per_second': 32.398, 'eval_steps_per_second': 4.074, 'epoch': 2.0}


In [31]:
def evaluate_and_count_correct_answers(trainer, eval_dataset, tokenizer, verbose=True):
    """Count exact-match answers predicted by the trainer's model and print the accuracy.

    Each example is tokenized on its own (question first, context second,
    truncated to 512 tokens). The predicted span runs from the arg-max start
    logit to the arg-max end logit (inclusive), is decoded back to text and
    lower-cased. It counts as correct when it equals one of the lower-cased
    reference answers.

    Args:
        trainer: Object exposing the model to evaluate as ``trainer.model``.
            The model and the device it lives on are taken from here rather
            than from notebook globals, so the function evaluates what it is given.
        eval_dataset: Sized iterable of raw examples with ``question``,
            ``context`` and ``answers["text"]`` entries (not the tokenized features).
        tokenizer: Tokenizer used both to encode the inputs and to decode the
            predicted token span.
        verbose: When True (default, the previous behavior) print every example
            together with its predicted and reference answers.

    Returns:
        None. The summary ("Correct Answers: x/y" and "Accuracy: ...") is printed.
    """
    qa_model = trainer.model
    model_device = next(qa_model.parameters()).device

    correct_count = 0
    total_count = len(eval_dataset)

    # Inference must run in eval mode (e.g. dropout off); the previous mode is
    # restored afterwards so that training can continue unchanged.
    was_training = qa_model.training
    qa_model.eval()
    try:
        with torch.no_grad():
            for example in eval_dataset:
                if verbose:
                    print(example)
                inputs = tokenizer(
                    example["question"], example["context"],
                    return_tensors="pt", truncation=True, max_length=512
                ).to(model_device)

                outputs = qa_model(**inputs)
                start_index = int(torch.argmax(outputs.start_logits))
                # +1 because the slice end is exclusive, the predicted end token is not.
                end_index = int(torch.argmax(outputs.end_logits)) + 1

                # An end before the start yields an empty slice, i.e. an empty prediction.
                predicted_answer = tokenizer.decode(inputs["input_ids"][0][start_index:end_index]).lower()
                correct_answers = [answer.lower() for answer in example["answers"]["text"]]
                if verbose:
                    print(f"Predicted Answer: {predicted_answer}")
                    print(f"Correct Answers: {correct_answers}")
                if predicted_answer in correct_answers:
                    correct_count += 1
    finally:
        qa_model.train(was_training)

    # An empty dataset reports 0.00 instead of raising ZeroDivisionError.
    accuracy = correct_count / total_count if total_count else 0.0
    print(f"Correct Answers: {correct_count}/{total_count}")
    print(f"Accuracy: {accuracy:.2f}")

# Evaluate the model and count correct answers

In [32]:
# Exact-match check on the raw (untokenized) validation examples.
evaluate_and_count_correct_answers(
    trainer=trainer,
    eval_dataset=squad_dataset['validation'],
    tokenizer=tokenizer,
)

{'id': '572759665951b619008f8884', 'title': 'Private_school', 'context': 'Private schooling in the United States has been debated by educators, lawmakers and parents, since the beginnings of compulsory education in Massachusetts in 1852. The Supreme Court precedent appears to favor educational choice, so long as states may set standards for educational accomplishment. Some of the most relevant Supreme Court case law on this is as follows: Runyon v. McCrary, 427 U.S. 160 (1976); Wisconsin v. Yoder, 406 U.S. 205 (1972); Pierce v. Society of Sisters, 268 U.S. 510 (1925); Meyer v. Nebraska, 262 U.S. 390 (1923).', 'question': 'In what year did Massachusetts first require children to be educated in schools?', 'answers': {'text': ['1852', '1852', '1852'], 'answer_start': [158, 158, 158]}}
Predicted Answer: 1852
Correct Answers: ['1852', '1852', '1852']
{'id': '57296de03f37b3190047839e', 'title': 'Chloroplast', 'context': "The chloroplast membranes sometimes protrude out into the cytoplasm, fo