<a href="https://colab.research.google.com/github/ArijaK/QuestionAnswering/blob/main/QA_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Question Answering Model**

The code below (data preparation and evaluation) is based on the Hugging Face *Question answering* examples, available [here (chapter)](https://huggingface.co/learn/nlp-course/chapter7/7?fw=pt) and [here (notebook)](https://github.com/huggingface/notebooks/blob/main/examples/question_answering.ipynb).



In [None]:
!pip install datasets
!pip install transformers[torch]

In [None]:
# Flags: every tunable setting of the notebook lives in this cell.
# --- Data preprocessing ---
# Name passed to `load_dataset` (and later to the metric loader).
DATASET = 'squad_v2'
# Checkpoint used for the tokenizer and, when USE_SAVED is False, for the model.
MODEL_CHECKPOINT = 'albert-base-v2'
# Maximum length of a feature (question and context).
MAX_LENGTH = 384
# Number of overlapping tokens between consecutive features of one long context.
STRIDE = 128
# --- Fine-tuning ---
# When True the Trainer fine-tunes the loaded model and saves the result.
TRAIN = False
# --- Evaluation ---
# How many top start / end logits are combined when searching for the best span.
N_BEST = 50
# Longest answer (in tokens) accepted in post-processing.
# Usually sentences do not exceed this length.
MAX_ANSWER_LENGTH = 40
# When True the model is loaded from MODEL_PATH instead of MODEL_CHECKPOINT.
USE_SAVED = True
# Path to fine-tuned model.
MODEL_PATH = 'drive/MyDrive/Colab Notebooks/Fine-tuned_models/albert-base-v2-squadv2'

## Data preparation

In [None]:
# Easy way to load the dataset: `load_dataset` resolves the name stored in DATASET.
from datasets import load_dataset
dataset = load_dataset(DATASET)
# Bare last expression so the notebook displays the splits and their sizes.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})

In [None]:
# Preprocessing.
# Load the tokenizer that belongs to the MODEL_CHECKPOINT checkpoint; it is used
# by both preprocessing functions and by `answer_question`.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

In [None]:
# Check if a fast tokenizer is implemented.
# The preprocessing below asks for offset mappings and `sequence_ids`, which
# (per the Hugging Face docs) are only provided by fast tokenizers.
assert tokenizer.is_fast

In [None]:
# Call the tokenizer on a simple example: a (question, context) pair is encoded
# as one sequence, then decoded again to show where the special tokens go.
inputs = tokenizer('Where can I buy cat food?', 'Cat food is sold in all pet stores. You can buy cat food online too.')
tokenizer.decode(inputs['input_ids'])

'[CLS] where can i buy cat food?[SEP] cat food is sold in all pet stores. you can buy cat food online too.[SEP]'

In [None]:
def preprocess_train_data(data):
  """Tokenize a batch of samples and label every feature with answer positions.

  Long contexts are split into several overlapping features (MAX_LENGTH tokens
  each, STRIDE tokens of overlap). Each feature receives the token indices of
  the answer start and end; when the sample has no answer, or the answer is not
  fully contained in the feature, both labels point at the [CLS] token.

  Args:
    data: batch with 'question', 'context' and 'answers' columns, where each
      answer holds the lists 'answer_start' and 'text'. The 'question' column
      is stripped in place.

  Returns:
    The tokenizer output extended with 'start_positions' and 'end_positions'
    (one entry per feature).
  """
  # Remove leading and trailing whitespaces.
  data['question'] = [q.strip() for q in data['question']]

  inputs = tokenizer(
      data['question'],
      data['context'],
      truncation='only_second',
      max_length=MAX_LENGTH,
      stride=STRIDE,
      return_overflowing_tokens=True,
      return_offsets_mapping=True,
      padding='max_length',
  )

  # Map from a feature to its corresponding dataset sample.
  sample_mapping = inputs.pop('overflow_to_sample_mapping')
  # Map from token to character position in the original context.
  offset_mapping = inputs.pop('offset_mapping')

  start_positions = []
  end_positions = []

  for i, offsets in enumerate(offset_mapping):
    input_ids = inputs['input_ids'][i]
    # For no answer.
    cls_index = input_ids.index(tokenizer.cls_token_id)

    answer = data['answers'][sample_mapping[i]]

    if len(answer['answer_start']) == 0:
      start_positions.append(cls_index)
      end_positions.append(cls_index)
      continue

    start_char = answer['answer_start'][0]
    end_char = start_char + len(answer['text'][0])

    # Find the first and last token of the current feature's context.
    sequence_ids = inputs.sequence_ids(i)
    context_start = 0
    while sequence_ids[context_start] != 1:
      context_start += 1

    context_end = len(input_ids) - 1
    while sequence_ids[context_end] != 1:
      context_end -= 1

    # If the answer is out of the current feature's context.
    if (offsets[context_start][0] > start_char or
        offsets[context_end][1] < end_char):
      start_positions.append(cls_index)
      end_positions.append(cls_index)
      continue

    # Find tokens that correspond to the start and end of the answer.
    # Both scans stay inside [context_start, context_end]: special and padding
    # tokens carry (0, 0) offsets, so an unbounded scan would walk straight
    # through them when the answer touches the edge of the context.
    token_start_index = context_start
    while (token_start_index <= context_end and
           offsets[token_start_index][0] <= start_char):
      token_start_index += 1
    start_positions.append(token_start_index - 1)

    token_end_index = context_end
    while (token_end_index >= context_start and
           offsets[token_end_index][1] >= end_char):
      token_end_index -= 1
    end_positions.append(token_end_index + 1)

  inputs['start_positions'] = start_positions
  inputs['end_positions'] = end_positions

  return inputs

In [None]:
# Check if function works as expected: decode the labelled token span of the
# first training sample and print it next to the reference answer text.
result = preprocess_train_data(dataset['train'][:1])
print(tokenizer.decode(result['input_ids'][0][result['start_positions'][0]: result['end_positions'][0]+1]))
print(dataset['train'][0]['answers']['text'][0])

in the late 1990s
in the late 1990s


In [None]:
# Apply the training preprocessing to the whole train split in batches. The
# original columns are removed so only the values returned by
# `preprocess_train_data` remain.
tokenized_dataset = dataset['train'].map(
    preprocess_train_data,
    batched=True,
    remove_columns=dataset['train'].column_names,
)
# Number of samples vs. number of features (long contexts yield several features).
len(dataset['train']), len(tokenized_dataset)

(130319, 131958)

In [None]:
# Preprocessing for evaluation.
def preprocess_validation_data(data):
  """Tokenize a validation batch and keep what post-processing needs.

  Every produced feature is tagged with the id of the sample it came from
  ('example_ids'), and the offsets of all tokens outside the context are
  replaced by None so the context part can be recognised later.

  Args:
    data: batch with 'id', 'question' and 'context' columns. The 'question'
      column is stripped in place.

  Returns:
    The tokenizer output with the extra 'example_ids' entry and the masked
    'offset_mapping'.
  """
  data['question'] = [question.strip() for question in data['question']]

  encoded = tokenizer(
      data['question'],
      data['context'],
      max_length=MAX_LENGTH,
      truncation='only_second',
      stride=STRIDE,
      padding='max_length',
      return_offsets_mapping=True,
      return_overflowing_tokens=True,
  )

  # Feature index -> index of the sample inside this batch.
  feature_to_sample = encoded.pop('overflow_to_sample_mapping')
  encoded['example_ids'] = [
      data['id'][sample_idx] for sample_idx in feature_to_sample
  ]

  for feature_idx in range(len(encoded['input_ids'])):
    # Sequence id 1 marks context tokens; everything else loses its offsets.
    segment_ids = encoded.sequence_ids(feature_idx)
    masked_offsets = []
    for segment_id, offset in zip(segment_ids, encoded['offset_mapping'][feature_idx]):
      masked_offsets.append(offset if segment_id == 1 else None)
    encoded['offset_mapping'][feature_idx] = masked_offsets

  return encoded

In [None]:
# Apply the validation preprocessing to the whole validation split in batches,
# dropping the original columns.
validation_dataset = dataset['validation'].map(
    preprocess_validation_data,
    batched=True,
    remove_columns=dataset['validation'].column_names,
)
# Number of samples vs. number of features.
len(dataset['validation']), len(validation_dataset)

(11873, 12171)

## Fine-tuning

Our fine-tuned models can be found in a shared Google Drive folder [here](https://drive.google.com/drive/folders/1LjE8UzVeHCNoYPd6U9t-OrV2EDBeqR8W?usp=sharing).

In [None]:
# Mount Google Drive (Colab only) so the saved model under MODEL_PATH is reachable.
# NOTE(review): MODEL_PATH is a relative 'drive/MyDrive/...' path while the mount
# point is '/content/drive' — this presumably relies on the working directory
# being '/content'; confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# `torch` must be imported before its first use here; the only other import of
# it sits in the "Results" section, which runs later on a fresh kernel.
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
from transformers import Trainer, TrainingArguments, AutoModelForQuestionAnswering

# Use already fine-tuned models instead of fine-tuning one.
if USE_SAVED:
  model = AutoModelForQuestionAnswering.from_pretrained(MODEL_PATH)
else:
  model = AutoModelForQuestionAnswering.from_pretrained(MODEL_CHECKPOINT)

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    # Mixed precision needs a GPU; stay in full precision when `device` fell
    # back to the CPU so the notebook still runs there.
    fp16=(device.type == 'cuda'),
)

# The same Trainer is used for fine-tuning (when TRAIN is set) and for
# producing the validation predictions.
trainer = Trainer(
    model=model.to(device),
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=validation_dataset,
)

In [None]:
# Fine-tune only when the TRAIN flag is set; otherwise the loaded weights are
# used as they are.
if TRAIN:
  trainer.train()

In [None]:
# After fine-tuning, save the model to '<MODEL_CHECKPOINT>-squadv2'.
# NOTE(review): this relative path is not the same location as MODEL_PATH —
# confirm whether the saved model is meant to be copied there by hand.
if TRAIN:
  trainer.save_model(MODEL_CHECKPOINT+'-squadv2')

## Evaluation

In [None]:
# Run the model over every validation feature. The `.predictions` attribute of
# the result is later unpacked as (start logits, end logits).
raw_predictions = trainer.predict(validation_dataset)

In [None]:
# Get back `example_ids` and `offset_mapping` column: the format is reset to
# expose every feature of the validation set, keeping the current format type.
# (Presumably the Trainer narrows the visible columns during prediction —
# TODO confirm.)
validation_dataset.set_format(type=validation_dataset.format['type'], columns=list(validation_dataset.features.keys()))

In [None]:
import collections
import numpy as np
from tqdm.auto import tqdm

def postprocess_predictions(raw_predictions, features, examples):
  """Turn start/end logits into one predicted answer string per example.

  For every example, the N_BEST highest start and end logits of each of its
  features are combined into candidate spans. Spans that leave the context,
  end before they start, or are longer than MAX_ANSWER_LENGTH tokens are
  dropped. The best remaining span is returned only if it scores higher than
  the example's "no answer" score, which is the lowest [CLS] start+end score
  among the example's features.

  Args:
    raw_predictions: pair (all_start_logits, all_end_logits), both indexed by
      feature.
    features: tokenized features; each item provides 'example_ids',
      'input_ids' and 'offset_mapping' (None for tokens outside the context).
    examples: original samples providing 'id' and 'context'.

  Returns:
    An OrderedDict mapping example id to the predicted answer text; an empty
    string means "no answer".
  """
  all_start_logits, all_end_logits = raw_predictions

  # Example id -> indices of the features created from that example.
  features_per_example = collections.defaultdict(list)
  for i, feature in enumerate(features):
    features_per_example[feature['example_ids']].append(i)

  predictions = collections.OrderedDict()
  for example in tqdm(examples):
    example_id = example['id']
    context = example['context']

    # Score of the impossible answer for the example.
    min_null_score = None
    answers = []

    for feature_index in features_per_example.get(example_id, []):
      start_logit = all_start_logits[feature_index]
      end_logit = all_end_logits[feature_index]
      # Fetch the feature once instead of once per field.
      feature = features[feature_index]
      offset_mapping = feature['offset_mapping']

      cls_index = feature['input_ids'].index(tokenizer.cls_token_id)
      feature_null_score = start_logit[cls_index] + end_logit[cls_index]

      # Keep the *smallest* null score: a feature that does not contain the
      # answer is expected to point at [CLS], so only the feature most
      # confident that an answer exists should set the threshold.
      if min_null_score is None or feature_null_score < min_null_score:
        min_null_score = feature_null_score

      start_indexes = np.argsort(start_logit)[-1: -N_BEST - 1 : -1].tolist()
      end_indexes = np.argsort(end_logit)[-1: -N_BEST - 1 : -1].tolist()
      for start_index in start_indexes:
        for end_index in end_indexes:
          # Skip positions outside the feature or outside the context.
          if (start_index >= len(offset_mapping)
              or end_index >= len(offset_mapping)
              or offset_mapping[start_index] is None
              or offset_mapping[end_index] is None
          ):
            continue
          # Skip reversed spans and spans that are too long.
          if end_index < start_index or end_index-start_index+1>MAX_ANSWER_LENGTH:
            continue

          start_char = offset_mapping[start_index][0]
          end_char = offset_mapping[end_index][1]
          answers.append({
              'text': context[start_char : end_char],
              'logit_score': start_logit[start_index] + end_logit[end_index],
          })

    if len(answers) > 0:
      best_answer = max(answers, key=lambda x: x['logit_score'])
    else:
      best_answer = {'text': '', 'logit_score': 0.0}

    # An example without any feature has no null score; it is reported as
    # unanswerable instead of failing on a comparison with None.
    if min_null_score is not None and best_answer['logit_score'] > min_null_score:
      predictions[example_id] = best_answer['text']
    else:
      predictions[example_id] = ''

  return predictions


In [None]:
# Convert the raw logits into one answer string per validation example,
# keyed by example id.
predictions = postprocess_predictions(raw_predictions.predictions, validation_dataset, dataset['validation'])

In [None]:
import json

# Save predictions as json file (a mapping from example id to answer text),
# written to "predictions.json" in the current working directory.
with open("predictions.json", "w") as json_file:
    json.dump(predictions, json_file)

In [None]:
# Score the predictions with the metric that matches DATASET.
# NOTE(review): `load_metric` appears to be deprecated/removed in newer
# `datasets` releases in favour of the separate `evaluate` package — confirm
# against the installed version.
from datasets import load_metric
metric = load_metric(DATASET, trust_remote_code=True)
# One record per example; the no-answer probability is always reported as 0.0.
formatted_predictions = [{'id': k, 'prediction_text': v, 'no_answer_probability': 0.0} for k, v in predictions.items()]
# Reference answers taken straight from the validation split.
theoretical_answers = [{'id': ex['id'], 'answers': ex['answers']} for ex in dataset['validation']]
# Bare last expression so the notebook displays the computed scores.
metric.compute(predictions=formatted_predictions, references=theoretical_answers)

## Results

In [None]:
import torch

In [None]:
def answer_question(question, context, model):
    """Answer a single question about a context with the given model.

    The question and context are encoded as one sequence (truncated to 512
    tokens). The span between the highest-scoring start token and the
    highest-scoring end token is returned, unless that span is empty/reversed
    or does not score higher than the "no answer" option (start and end both
    on the first token).

    Args:
        question: question text.
        context: text in which the answer is searched.
        model: question-answering model; expected to already live on `device`.

    Returns:
        The decoded answer span, or the string "No answer".
    """
    inputs = tokenizer(question, context, return_tensors='pt', truncation=True, max_length=512)

    # Pure inference: no gradients are needed, so do not track them.
    with torch.no_grad():
        outputs = model(**inputs.to(device))

    # Get the most likely start and end tokens
    start_scores = outputs.start_logits
    end_scores = outputs.end_logits
    start_index = torch.argmax(start_scores).item()
    # Exclusive end so it can be used directly as a slice bound.
    end_index = torch.argmax(end_scores).item() + 1

    # Get score for no answer case
    no_answer_score = start_scores[0][0].item() + end_scores[0][0].item()

    # Check if the predicted span is valid and is not no answer case
    max_start_score = start_scores[0][start_index].item()
    max_end_score = end_scores[0][end_index - 1].item()
    # A best end token located before the best start token gives no usable span.
    span_is_valid = end_index > start_index
    if span_is_valid and (max_start_score + max_end_score) > no_answer_score:
        answer_ids = inputs.input_ids[0][start_index:end_index].tolist()
        answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(answer_ids))
    else:
        answer = "No answer"

    return answer

In [None]:
# Some test cases for demonstration. Each entry holds a context, a question
# about it and the answer that is expected back.
# Note: the list is only defined here; the interactive loop that follows in
# this cell reads its input from the user and does not iterate over it.
test_cases = [
    {
        "context": "The Eiffel Tower is located in Paris.",
        "question": "Where is the Eiffel Tower located?",
        "expected_answer": "Paris"
    },
    {
        "context": "Albert Einstein developed the theory of relativity.",
        "question": "Who developed the theory of relativity?",
        "expected_answer": "Albert Einstein"
    },
    {
        "context": "The Declaration of Independence was signed in 1776.",
        "question": "When was the Declaration of Independence signed?",
        "expected_answer": "1776"
    },
    # Question unrelated to the context: expects the "No answer" fallback
    # string returned by `answer_question`.
    {
        "context": "Water boils at 100 degrees Celsius.",
        "question": "What is the capital of Australia?",
        "expected_answer": "No answer"
    },
    # NOTE(review): the expected answer below reads like a "when" answer to a
    # "where" question — confirm whether this is intentional.
    {
        "context": "The Great Wall of China is a series of fortifications that were built across the northern borders of China to protect and consolidate territories of Chinese states and empires against various nomadic groups of the steppe and their polities. Several walls were being built as early as the 7th century BC; these, later joined together and made bigger and stronger, are now collectively referred to as the Great Wall.",
        "question": "Where is the Great Wall located?",
        "expected_answer": "7th century BC"
    },
    {
        "context": "Despite being a well-known physicist, Isaac Newton also made significant contributions to mathematics, including the development of calculus.",
        "question": "What did Isaac Newton develop in mathematics?",
        "expected_answer": "calculus"
    },
    {
        "context": "Amazon was founded by Jeff Bezos in 1994. Initially started as an online bookstore, it has since expanded to a wide variety of products and services.",
        "question": "Who founded Amazon?",
        "expected_answer": "Jeff Bezos"
    },
    {
        "context": "Marie Curie was a physicist and chemist who conducted pioneering research on radioactivity. She was the first woman to win a Nobel Prize.",
        "question": "Who was the first woman to win a Nobel Prize?",
        "expected_answer": "Marie Curie"
    },
    {
        "context": "Paris is a major European city and a global center for art, fashion, gastronomy, and culture. It is the capital of France.",
        "question": "What is Paris known for?",
        "expected_answer": "art, fashion, gastronomy, and culture"
    },
    {
        "context": "Mount Everest is 8,848 meters tall.",
        "question": "How tall is Mount Everest?",
        "expected_answer": "8,848 meters"
    }
]

# Interactive demo: keep answering user-supplied questions until an empty
# context or question is entered (or the input stream ends / is interrupted),
# so the cell is able to finish.
print("Leave the context or the question empty to stop.")
while True:
    try:
        context = input("Enter a context: ")
        if not context.strip():
            break
        question = input("Enter a question: ")
        if not question.strip():
            break
    except (EOFError, KeyboardInterrupt):
        break

    answer = answer_question(question, context, model)
    print()
    print(f"Context: {context}")
    print(f"Question: {question}")
    print(f"Answer: {answer}")
    print()
    print()
