<a href="https://colab.research.google.com/github/ArijaK/QuestionAnswering/blob/main/QA_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Notebook-wide configuration constants.
# Data preprocessing.
# Dataset identifier handed to load_dataset.
DATASET = 'squad_v2'
# Checkpoint name handed to AutoTokenizer.from_pretrained.
MODEL_CHECKPOINT = 'deepset/roberta-base-squad2'
# Maximum length, in tokens, of one feature (question and context together).
# preprocess_data passes it as the tokenizer's max_length, and features are
# padded up to this same length.
MAX_LENGTH = 512
# Number of tokens shared by two consecutive features when a long context
# has to be split (passed as the tokenizer's stride).
STRIDE = 128

## Data preparation

In [2]:
!pip install datasets
!pip install transformers

Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.1 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2

In [3]:
# Load the dataset named by DATASET with the `datasets` library.
from datasets import load_dataset
dataset = load_dataset(DATASET)
# Bare last expression: display the loaded splits, their columns and sizes.
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})

In [4]:
# Preprocessing.
# Load the tokenizer published with MODEL_CHECKPOINT; every later cell that
# tokenizes or decodes text uses this `tokenizer` object.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [5]:
# Check if a fast tokenizer is implemented.
# preprocess_data asks the tokenizer for offset mappings and calls
# sequence_ids() on the result; both are features of the fast tokenizers,
# so stop here rather than fail later in the middle of preprocessing.
assert tokenizer.is_fast

In [6]:
# Call the tokenizer on a simple example.
# The first argument plays the role of the question, the second of the context.
inputs = tokenizer('Where can I buy cat food?', 'Cat food is sold in all pet stores. You can buy cat food online too.')
# Decode the ids back to text to see how the pair is joined by special tokens.
tokenizer.decode(inputs['input_ids'])

'<s>Where can I buy cat food?</s></s>Cat food is sold in all pet stores. You can buy cat food online too.</s>'

In [7]:
def preprocess_data(data):
  """Tokenize a batch of question/context pairs and label the answer spans.

  Every (question, context) pair is encoded as one or more features of
  MAX_LENGTH tokens; a context that does not fit is split into several
  features that overlap by STRIDE tokens. Each feature is labelled with the
  token indices at which the answer starts and ends. A feature is labelled
  with the index of the CLS token at both positions when its sample has no
  answer or when the answer is not fully contained in that feature.

  Relies on the notebook-level names `tokenizer`, `MAX_LENGTH` and `STRIDE`.

  Args:
    data: Batch in column form with the keys 'question' (list of str),
      'context' (list of str) and 'answers' (list of dicts that hold the
      parallel lists 'answer_start' and 'text'). Only the first answer of
      each sample is used. The batch itself is not modified.

  Returns:
    The tokenizer output, without 'overflow_to_sample_mapping' and
    'offset_mapping', extended by the lists 'start_positions' and
    'end_positions' (one entry per feature).
  """
  # Remove leading and trailing whitespaces. The stripped questions are kept
  # in a local list so that the caller's batch is left untouched.
  questions = [q.strip() for q in data['question']]

  inputs = tokenizer(
      questions,
      data['context'],
      truncation='only_second',
      max_length=MAX_LENGTH,
      stride=STRIDE,
      return_overflowing_tokens=True,
      return_offsets_mapping=True,
      padding='max_length',
  )

  # Map from a feature to its corresponding dataset sample.
  sample_mapping = inputs.pop('overflow_to_sample_mapping')
  # Map from token to character position in the original context.
  offset_mapping = inputs.pop('offset_mapping')

  start_positions = []
  end_positions = []

  for i, offsets in enumerate(offset_mapping):
    input_ids = inputs['input_ids'][i]
    # Label used for "no answer in this feature".
    cls_index = input_ids.index(tokenizer.cls_token_id)

    sequence_ids = inputs.sequence_ids(i)
    answer = data['answers'][sample_mapping[i]]

    # Unanswerable sample.
    if len(answer['answer_start']) == 0:
      start_positions.append(cls_index)
      end_positions.append(cls_index)
      continue

    start_char = answer['answer_start'][0]
    end_char = start_char + len(answer['text'][0])

    # First and last token of the context part of this feature
    # (sequence id 1; the question has id 0, special/padding tokens None).
    context_start = 0
    while sequence_ids[context_start] != 1:
      context_start += 1

    context_end = len(input_ids) - 1
    while sequence_ids[context_end] != 1:
      context_end -= 1

    # If the answer is out of the current feature's context.
    if (offsets[context_start][0] > start_char or
        offsets[context_end][1] < end_char):
      start_positions.append(cls_index)
      end_positions.append(cls_index)
      continue

    # Find tokens that correspond to the start and end of the answer.
    # Both scans stay inside [context_start, context_end]: the special and
    # padding tokens that follow the context have offset (0, 0), so an
    # unbounded scan would run through them whenever the answer begins in
    # the last context token and would label a padding position as start.
    token_start_index = context_start
    while (token_start_index <= context_end and
           offsets[token_start_index][0] <= start_char):
      token_start_index += 1
    start_positions.append(token_start_index - 1)

    token_end_index = context_end
    while (token_end_index >= context_start and
           offsets[token_end_index][1] >= end_char):
      token_end_index -= 1
    end_positions.append(token_end_index + 1)

  inputs['start_positions'] = start_positions
  inputs['end_positions'] = end_positions

  return inputs

In [17]:
# Check if function works as expected.
# Preprocess only the first training sample (a batch of size one).
result = preprocess_data(dataset['train'][:1])
# Decode the tokens between the labelled start and end position (inclusive) ...
print(tokenizer.decode(result['input_ids'][0][result['start_positions'][0]: result['end_positions'][0]+1]))
# ... and compare them by eye with the reference answer text. The decoded
# span may carry a leading space (presumably because the tokenizer folds the
# preceding space into the token - TODO confirm); the words should match.
print(dataset['train'][0]['answers']['text'][0])

 in the late 1990s
in the late 1990s


In [18]:
# Apply preprocess_data to the whole training split.
tokenized_dataset = dataset['train'].map(
    preprocess_data,
    # Hand the function batches of samples instead of single samples.
    batched=True,
    # Drop the original columns so only what preprocess_data returns is kept.
    remove_columns=dataset['train'].column_names,
)
# Number of samples vs. number of features: a long context is split into
# several features, so the second number can be larger than the first.
len(dataset['train']), len(tokenized_dataset)

Map:   0%|          | 0/130319 [00:00<?, ? examples/s]

(130319, 130550)