<a href="https://colab.research.google.com/github/AliEbadi110/Natural-Language-Processing-Question-Answering-Sample-Projects/blob/main/NLP_Transformers_Question_Answering_SQuAD_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **NLP - Transformers - Question Answering - SQuAD Dataset**

In [None]:
!pip install datasets
!pip install transformers[torch]

In [None]:
import numpy as np
import torch
from tqdm.autonotebook import tqdm

from datasets import load_dataset, load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering
from transformers import TrainingArguments, Trainer

  from tqdm.autonotebook import tqdm


## 1. Loading Data

In [None]:
# Load the 'squad' dataset; the trailing expression displays the resulting
# DatasetDict (train and validation splits, see the output below).
raw_datasets = load_dataset('squad')
raw_datasets

Downloading builder script:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.67k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [None]:
# Inspect one raw training example (index 1): id, title, context, question, answers.
raw_datasets['train'][1]

{'id': '5733be284776f4190066117f',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'What is in front of the Notre Dame Main Building?',
 'answers': {'text': ['a copper statue of Christ'], 'answer_start': [188]}}

In [None]:
# First answer text of training example 1 ('answers' holds parallel 'text' / 'answer_start' lists).
raw_datasets['train'][1]['answers']['text'][0]

'a copper statue of Christ'

In [None]:
# Feature schema of the train split.
raw_datasets['train'].features

{'id': Value(dtype='string', id=None),
 'title': Value(dtype='string', id=None),
 'context': Value(dtype='string', id=None),
 'question': Value(dtype='string', id=None),
 'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None)}

## 2. Preprocessing

In [None]:
# Pretrained checkpoint name; used to load both the tokenizer and the QA model.
checkpoint = 'bert-base-cased'

In [None]:
# Tokenizer for `checkpoint`; the tokenize_func_* helpers below call it as a global.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
def find_answer_token_idx(ctx_start, ctx_end, ans_start_char, ans_end_char, offset):
  """Map an answer's character span to token indices within one tokenized window.

  Args:
    ctx_start: index of the first context token in the window.
    ctx_end: index of the last context token in the window (inclusive).
    ans_start_char: character offset (in the context) where the answer starts.
    ans_end_char: character offset (in the context) just past the answer's end.
    offset: per-token (start_char, end_char) pairs for the window.

  Returns:
    (start_idx, end_idx): the token that contains the answer's first character
    and the token that contains its last character. (0, 0) is returned when the
    answer is not fully inside the window's context.
  """
  # Answer (partly) outside this window's context: label it (0, 0).
  if offset[ctx_start][0] > ans_start_char or offset[ctx_end][1] < ans_end_char:
    return 0, 0

  # Last context token that begins at or before the answer start. Using
  # containment instead of exact equality keeps the label valid when the answer
  # begins in the middle of a token, and the search never leaves
  # [ctx_start, ctx_end], so special/padding offsets are never inspected.
  start_idx = ctx_start
  while start_idx < ctx_end and offset[start_idx + 1][0] <= ans_start_char:
    start_idx += 1

  # First context token (scanning from the right) that ends at or after the
  # answer end; same containment logic for answers ending mid-token.
  end_idx = ctx_end
  while end_idx > ctx_start and offset[end_idx - 1][1] >= ans_end_char:
    end_idx -= 1

  return start_idx, end_idx

In [None]:
# Window size passed to the tokenizer as `max_length` (inputs are also padded to it).
max_length = 384
# Passed to the tokenizer as `stride` together with return_overflowing_tokens=True
# (overlap between consecutive windows of a long context).
stride = 128

def tokenize_func_train(batch):
  """Tokenize a batch of training examples and attach answer token labels.

  Each (question, context) pair is encoded with truncation applied to the
  context only; overflowing tokens are returned as additional windows, so one
  example may produce several output rows. For every window the first listed
  answer is converted to token indices with `find_answer_token_idx`.

  Args:
    batch: dict of columns with 'question', 'context' and 'answers'.

  Returns:
    The tokenizer output with 'offset_mapping' and 'overflow_to_sample_mapping'
    removed and 'start_positions' / 'end_positions' added.
  """
  stripped_questions = [question.strip() for question in batch['question']]

  encoded = tokenizer(
    stripped_questions,
    batch['context'],
    truncation='only_second',
    max_length=max_length,
    stride=stride,
    padding='max_length',
    return_offsets_mapping=True,
    return_overflowing_tokens=True,
  )

  all_offsets = encoded.pop('offset_mapping')
  window_to_sample = encoded.pop('overflow_to_sample_mapping')
  gold_answers = batch['answers']

  token_spans = []
  for window, (offsets, sample_idx) in enumerate(zip(all_offsets, window_to_sample)):
    # Only the first listed answer is used for the training label.
    gold = gold_answers[sample_idx]
    char_begin = gold['answer_start'][0]
    char_finish = char_begin + len(gold['text'][0])

    # Positions whose sequence id is 1 belong to the context (second sequence).
    seq_ids = encoded.sequence_ids(window)
    first_ctx = seq_ids.index(1)
    last_ctx = len(seq_ids) - 1 - seq_ids[::-1].index(1)

    token_spans.append(
      find_answer_token_idx(first_ctx, last_ctx, char_begin, char_finish, offsets)
    )

  encoded['start_positions'] = [span[0] for span in token_spans]
  encoded['end_positions'] = [span[1] for span in token_spans]
  return encoded

In [None]:
# Tokenize the train split in batches. The raw columns are removed; the row
# count changes because the tokenizer returns overflowing windows as extra rows
# (compare the two lengths in the output below).
tokenized_train_dataset = raw_datasets['train'].map(
    tokenize_func_train,
    batched=True,
    remove_columns=raw_datasets['train'].column_names
    )
len(raw_datasets['train']), len(tokenized_train_dataset)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

(87599, 88729)

In [None]:
def tokenize_func_validation(batch):
  """Tokenize a batch of validation examples for later answer extraction.

  Uses the same tokenizer settings as training. Instead of answer labels, each
  window keeps the id of the example it came from and an offset mapping in
  which every position that is not part of the context is replaced by None.

  Args:
    batch: dict of columns with 'id', 'question' and 'context'.

  Returns:
    The tokenizer output with 'overflow_to_sample_mapping' removed, a masked
    'offset_mapping', and a new 'sample_id' entry.
  """
  stripped_questions = [question.strip() for question in batch['question']]

  encoded = tokenizer(
    stripped_questions,
    batch['context'],
    truncation='only_second',
    max_length=max_length,
    stride=stride,
    padding='max_length',
    return_offsets_mapping=True,
    return_overflowing_tokens=True,
  )

  window_to_sample = encoded.pop('overflow_to_sample_mapping')

  # Keep offsets only where the sequence id is 1 (the context); None elsewhere.
  encoded['offset_mapping'] = [
    [
      span if seq_id == 1 else None
      for span, seq_id in zip(window_offsets, encoded.sequence_ids(window))
    ]
    for window, window_offsets in enumerate(encoded['offset_mapping'])
  ]

  # Id of the original example behind each window.
  encoded['sample_id'] = [batch['id'][sample_idx] for sample_idx in window_to_sample]
  return encoded

In [None]:
# Tokenize the validation split in batches. The columns to drop are taken from
# the validation split itself (not from the train split), so this cell stays
# correct even if the two splits ever carry different columns.
tokenized_validation_dataset = raw_datasets['validation'].map(
    tokenize_func_validation,
    batched=True,
    remove_columns=raw_datasets['validation'].column_names,
)
len(raw_datasets['validation']), len(tokenized_validation_dataset)

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

(10570, 10822)

## 3. Train Model

In [None]:
# Load `checkpoint` with a question-answering head. As the warning below notes,
# the 'qa_outputs' weights are newly initialized and need training.
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training configuration: outputs go to 'trainer_dir', no evaluation runs
# during training, a checkpoint is saved every epoch, and a single epoch is run
# with fp16 enabled.
training_args = TrainingArguments('trainer_dir',
                                  evaluation_strategy='no',
                                  save_strategy='epoch',
                                  num_train_epochs=1,
                                  learning_rate=2e-5,
                                  weight_decay=0.01,
                                  fp16=True,
                                  )

In [None]:
# Build the Trainer. An eval_dataset is supplied, but training_args sets
# evaluation_strategy='no'; scoring is done later in this notebook through
# trainer.predict and compute_metrics.
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
    tokenizer=tokenizer,
)

In [None]:
# Training time scales with the epoch count: the author's estimate was about
# 2.5 hours for 3 epochs. With num_train_epochs=1 (as set in training_args) the
# recorded run below reports train_runtime of roughly 2708 s (~45 minutes).
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,2.6554
1000,1.5809
1500,1.4649
2000,1.357
2500,1.3038
3000,1.2631
3500,1.2375
4000,1.2079
4500,1.1929
5000,1.1426


TrainOutput(global_step=11092, training_loss=1.2352877767172867, metrics={'train_runtime': 2708.2038, 'train_samples_per_second': 32.763, 'train_steps_per_second': 4.096, 'total_flos': 1.7388449946321408e+16, 'train_loss': 1.2352877767172867, 'epoch': 1.0})

## 4. Evaluate

In [None]:
# SQuAD metric object; compute_metrics calls metric.compute on it.
metric = load_metric('squad')
# Number of highest-scoring start and end positions considered per window.
n_largest = 20
# Maximum length, in tokens, of a candidate answer span.
max_answer_length = 30

def compute_metrics(start_logits, end_logits, processed_dataset, orig_dataset):
  """Convert per-window start/end logits into answer strings and score them.

  Args:
    start_logits: indexable by window index; each row holds one score per token
      position and must support unary minus and `.argsort()`.
    end_logits: same layout as `start_logits`, for answer end positions.
    processed_dataset: tokenized windows, row-aligned with the logits, providing
      'sample_id' and 'offset_mapping' (None at non-context positions).
    orig_dataset: iterable of raw examples with 'id', 'context' and 'answers'.

  Returns:
    Whatever `metric.compute` returns for the predictions and references.
  """
  # Group window indices by the example they were cut from.
  sample_id2idxs = {}
  for i, id_ in enumerate(processed_dataset['sample_id']):
    sample_id2idxs.setdefault(id_, []).append(i)

  predicted_answers = []
  for sample in tqdm(orig_dataset):
    sample_id = sample['id']
    context = sample['context']
    best_score = float('-inf')
    # Fall back to an empty string rather than None when no candidate span
    # survives the filters below, so prediction_text is always a string.
    best_answer = ''
    for idx in sample_id2idxs[sample_id]:
      start_logit = start_logits[idx]
      end_logit = end_logits[idx]
      offsets = processed_dataset[idx]['offset_mapping']
      # Highest-scoring candidate positions first.
      top_starts = (-start_logit).argsort()[:n_largest]
      top_ends = (-end_logit).argsort()[:n_largest]
      for start_idx in top_starts:
        # Positions outside the context carry no offset and cannot start an answer.
        if offsets[start_idx] is None:
          continue
        for end_idx in top_ends:
          if offsets[end_idx] is None:
            continue
          # Reject reversed spans and spans longer than max_answer_length tokens.
          if end_idx < start_idx or end_idx - start_idx + 1 > max_answer_length:
            continue
          score = start_logit[start_idx] + end_logit[end_idx]
          if score > best_score:
            best_score = score
            first_ch = offsets[start_idx][0]
            last_ch = offsets[end_idx][1]
            best_answer = context[first_ch:last_ch]
    predicted_answers.append({'id': sample_id, 'prediction_text': best_answer})
  true_answers = [{'id': x['id'], 'answers': x['answers']} for x in orig_dataset]
  return metric.compute(predictions=predicted_answers, references=true_answers)

In [None]:
# Run the trained model over every tokenized validation window.
trainer_output = trainer.predict(tokenized_validation_dataset)

In [None]:
# Take the predictions by field name instead of tuple-unpacking into `_`, which
# would shadow IPython's built-in "last output" variable for the rest of the session.
predictions = trainer_output.predictions

In [None]:
# Unpack the predictions into start and end logits; compute_metrics indexes both by window.
start_logits, end_logits = predictions

In [None]:
# Score the predictions against the raw validation answers (result shown below).
compute_metrics(start_logits, end_logits, tokenized_validation_dataset, raw_datasets['validation'])

  0%|          | 0/10570 [00:00<?, ?it/s]

{'exact_match': 80.1608325449385, 'f1': 87.53992862261228}