In [1]:
# from transformers import pipeline
# question_answerer = pipeline("question-answering", model='distilbert-base-uncased-distilled-squad')

# context = r"""
# Extractive Question Answering is the task of extracting an answer from a text given a question. An example  of a
# question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
# a model on a SQuAD task, you may leverage the examples/pytorch/question-answering/run_squad.py script.
# """

# result = question_answerer(question="What is a good example of a question answering dataset?",     context=context)
# print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

In [2]:
# from transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering
# import tensorflow as tf

# tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")
# model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad")

# question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"

# inputs = tokenizer(question, text, return_tensors="tf")
# outputs = model(**inputs)

# answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])
# answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])

# predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
# tokenizer.decode(predict_answer_tokens)

In [3]:
from transformers import DistilBertTokenizerFast, TFDistilBertForQuestionAnswering
import tensorflow as tf
import json

In [4]:
def load_squad(path):
    with open(path, 'r') as file:
        squad_dict = json.load(file)
    return squad_dict

In [5]:
# def prepare_train_features(examples, tokenizer, max_length=384, doc_stride=128):
#     tokenized_examples = tokenizer(
#         examples['question'],
#         examples['context'],
#         max_length=max_length,
#         truncation="only_second",
#         stride=doc_stride,
#         return_overflowing_tokens=True,
#         return_offsets_mapping=True,
#         padding="max_length",
#     )

#     sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
#     offset_mapping = tokenized_examples.pop("offset_mapping")

#     tokenized_examples["start_positions"] = []
#     tokenized_examples["end_positions"] = []
#     tokenized_examples["example_id"] = []

#     for i, offsets in enumerate(offset_mapping):
#         input_ids = tokenized_examples["input_ids"][i]
#         cls_index = input_ids.index(tokenizer.cls_token_id)

#         sequence_ids = tokenized_examples.sequence_ids(i)

#         sample_index = sample_mapping[i]
#         answers = examples["answers"][sample_index]
#         if len(answers["answer_start"]) == 0:
#             tokenized_examples["start_positions"].append(cls_index)
#             tokenized_examples["end_positions"].append(cls_index)
#         else:
#             start_char = answers["answer_start"][0]
#             end_char = start_char + len(answers["text"][0])

#             token_start_index = 0
#             while sequence_ids[token_start_index] != 1:
#                 token_start_index += 1

#             token_end_index = len(input_ids) - 1
#             while sequence_ids[token_end_index] != 1:
#                 token_end_index -= 1

#             if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
#                 tokenized_examples["start_positions"].append(cls_index)
#                 tokenized_examples["end_positions"].append(cls_index)
#             else:
#                 while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
#                     token_start_index += 1
#                 tokenized_examples["start_positions"].append(token_start_index - 1)

#                 while offsets[token_end_index][1] >= end_char:
#                     token_end_index -= 1
#                 tokenized_examples["end_positions"].append(token_end_index + 1)

#         tokenized_examples["example_id"].append(examples["id"][sample_index])

#     return tokenized_examples


In [6]:
def prepare_train_features(examples, tokenizer, max_length=384, doc_stride=128):
    tokenized_examples = tokenizer(
        examples['question'],
        examples['context'],
        max_length=max_length,
        truncation="only_second",
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")

    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []
    tokenized_examples["example_id"] = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        sequence_ids = tokenized_examples.sequence_ids(i)

        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1

            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1

            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)

                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

        tokenized_examples["example_id"].append(examples["id"][sample_index])

    return tokenized_examples


In [7]:
def extract_examples(squad_dict):
    contexts = []
    questions = []
    ids = []
    answers = []

    for group in squad_dict['data']:
        for paragraph in group['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                id = qa['id']
                contexts.append(context)
                questions.append(question)
                ids.append(id)

                if 'answers' in qa:
                    answer_starts = [answer['answer_start'] for answer in qa['answers']]
                    answer_texts = [answer['text'] for answer in qa['answers']]
                    answers.append({'answer_start': answer_starts, 'text': answer_texts})
                else:
                    answers.append({'answer_start': [], 'text': []})

    return {"context": contexts, "question": questions, "id": ids, "answers": answers}


In [8]:
# def extract_examples(squad_dict):
#     contexts = []
#     questions = []
#     ids = []
#     answers = []

#     for group in squad_dict['data']:
#         for paragraph in group['paragraphs']:
#             context = paragraph['context']
#             for qa in paragraph['qas']:
#                 question = qa['question']
#                 id = qa['id']
#                 contexts.append(context)
#                 questions.append(question)
#                 ids.append(id)

#                 if 'answers' in qa:
#                     answer_starts = [answer['answer_start'] for answer in qa['answers']]
#                     answer_texts = [answer['text'] for answer in qa['answers']]
#                     answers.append({'answer_start': answer_starts, 'text': answer_texts})
#                 else:
#                     answers.append({'answer_start': [], 'text': []})

#     return {"context": contexts, "question": questions, "id": ids, "answers": answers}


In [9]:
dev_path = '/content/dev-v1.1.json'
train_path = '/content/train-v1.1.json'

dev_squad_dict = load_squad(dev_path)
train_squad_dict = load_squad(train_path)

dev_examples = extract_examples(dev_squad_dict)
train_examples = extract_examples(train_squad_dict)

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

dev_features = prepare_train_features(dev_examples, tokenizer)
train_features = prepare_train_features(train_examples, tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [10]:
# def create_tf_dataset(features):
#     def gen():
#         for i in range(len(features["input_ids"])):
#             yield (
#                 {
#                     "input_ids": features["input_ids"][i],
#                     "attention_mask": features["attention_mask"][i],
#                     "token_type_ids": features["token_type_ids"][i]
#                 },
#                 {
#                     "start_positions": features["start_positions"][i],
#                     "end_positions": features["end_positions"][i]
#                 }
#             )

#     return tf.data.Dataset.from_generator(
#         gen,
#         ({
#             "input_ids": tf.int32,
#             "attention_mask": tf.int32,
#             "token_type_ids": tf.int32
#         },
#         {
#             "start_positions": tf.int32,
#             "end_positions": tf.int32
#         })
#     )

# train_dataset = create_tf_dataset(train_features)
# dev_dataset = create_tf_dataset(dev_features)

# train_dataset = train_dataset.shuffle(1000).batch(8)
# dev_dataset = dev_dataset.batch(8)


In [11]:
# def create_tf_dataset(features):
#     def gen():
#         for i in range(len(features["input_ids"])):
#             yield (
#                 {
#                     "input_ids": features["input_ids"][i],
#                     "attention_mask": features["attention_mask"][i]
#                 },
#                 {
#                     "start_positions": features["start_positions"][i],
#                     "end_positions": features["end_positions"][i]
#                 }
#             )

#     return tf.data.Dataset.from_generator(
#         gen,
#         ({
#             "input_ids": tf.int32,
#             "attention_mask": tf.int32
#         },
#         {
#             "start_positions": tf.int32,
#             "end_positions": tf.int32
#         })
#     )


In [12]:
# def create_tf_dataset(features):
#     def gen():
#         for i in range(len(features["input_ids"])):
#             yield (
#                 {
#                     "input_ids": features["input_ids"][i],
#                     "attention_mask": features["attention_mask"][i]
#                 },
#                 {
#                     "start_positions": features["start_positions"][i],
#                     "end_positions": features["end_positions"][i]
#                 }
#             )

#     return tf.data.Dataset.from_generator(
#         gen,
#         ({
#             "input_ids": tf.int32,
#             "attention_mask": tf.int32
#         },
#         {
#             "start_positions": tf.int32,
#             "end_positions": tf.int32
#         })
#     )

# train_dataset = create_tf_dataset(train_features)
# dev_dataset = create_tf_dataset(dev_features)

# train_dataset = train_dataset.shuffle(1000).batch(8)
# dev_dataset = dev_dataset.batch(8)

In [13]:
def create_tf_dataset(features):
    def gen():
        for i in range(len(features["input_ids"])):
            yield (
                {
                    "input_ids": features["input_ids"][i],
                    "attention_mask": features["attention_mask"][i]
                },
                {
                    "start_positions": features["start_positions"][i],
                    "end_positions": features["end_positions"][i]
                }
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({
            "input_ids": tf.int32,
            "attention_mask": tf.int32
        },
        {
            "start_positions": tf.int32,
            "end_positions": tf.int32
        })
    )


In [14]:
train_dataset = create_tf_dataset(train_features)
dev_dataset = create_tf_dataset(dev_features)

# train_dataset = train_dataset.shuffle(1000).batch(8)
# dev_dataset = dev_dataset.batch(8)
train_dataset = train_dataset.shuffle(1000).batch(8).prefetch(tf.data.experimental.AUTOTUNE)
dev_dataset = dev_dataset.batch(8).prefetch(tf.data.experimental.AUTOTUNE)


Instructions for updating:
Use output_signature instead


In [15]:
model = TFDistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForQuestionAnswering: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertForQuestionAnswering from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForQuestionAnswering from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForQuestionAnswering were not initialized from the PyTorch model and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it

In [16]:
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss)

In [18]:
import tensorflow as tf

# Debugging input shapes
print(f"input_ids shape: {input_ids.shape}")
print(f"attention_mask shape: {attention_mask.shape}")

# Assert shapes
tf.debugging.assert_rank(input_ids, 2, message="input_ids should have rank 2")
tf.debugging.assert_rank(attention_mask, 2, message="attention_mask should have rank 2")

# Model call with debugging
try:
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, training=True)
except ValueError as e:
    print(f"Error during model call: {e}")


NameError: name 'input_ids' is not defined

In [17]:
model.fit(train_dataset, epochs=3, validation_data=dev_dataset)

Epoch 1/3


Cause: for/else statement not yet supported


Cause: for/else statement not yet supported


ValueError: in user code:

    File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/training.py", line 1398, in train_function  *
        return step_function(self, iterator)
    File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/training.py", line 1370, in run_step  *
        outputs = model.train_step(data)
    File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_tf_utils.py", line 1672, in train_step  *
        y_pred = self(x, training=True)
    File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/training.py", line 558, in error_handler  *
        return fn(*args, **kwargs)
    File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/training.py", line 588, in __call__  *
        return super().__call__(*args, **kwargs)
    File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/training.py", line 558, in error_handler  *
        return fn(*args, **kwargs)
    File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/base_layer.py", line 1136, in __call__  *
        outputs = call_fn(inputs, *args, **kwargs)
    File "/tmp/__autograph_generated_file2odidxg8.py", line 162, in error_handler
        raise ag__.converted_call(ag__.ld(new_e).with_traceback, (ag__.ld(e).__traceback__,), None, fscope_1) from None
    File "/tmp/__autograph_generated_file2odidxg8.py", line 34, in error_handler
        retval__1 = ag__.converted_call(ag__.ld(fn), tuple(ag__.ld(args)), dict(**ag__.ld(kwargs)), fscope_1)
    File "/tmp/__autograph_generated_filear39ja04.py", line 31, in tf__call
        ndims = ag__.converted_call(ag__.ld(len), (ag__.ld(input_shape),), None, fscope)

    ValueError: Exception encountered when calling layer 'tf_distil_bert_for_question_answering' (type TFDistilBertForQuestionAnswering).
    
    in user code:
    
        File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_tf_utils.py", line 1078, in run_call_with_unpacked_inputs  *
            return func(self, **unpacked_inputs)
        File "/usr/local/lib/python3.10/dist-packages/transformers/models/distilbert/modeling_tf_distilbert.py", line 1091, in call  *
            distilbert_output = self.distilbert(
        File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/training.py", line 558, in error_handler  *
            return fn(*args, **kwargs)
        File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/base_layer.py", line 1136, in __call__  *
            outputs = call_fn(inputs, *args, **kwargs)
        File "/tmp/__autograph_generated_file2odidxg8.py", line 162, in error_handler
            raise ag__.converted_call(ag__.ld(new_e).with_traceback, (ag__.ld(e).__traceback__,), None, fscope_1) from None
        File "/tmp/__autograph_generated_file2odidxg8.py", line 34, in error_handler
            retval__1 = ag__.converted_call(ag__.ld(fn), tuple(ag__.ld(args)), dict(**ag__.ld(kwargs)), fscope_1)
        File "/tmp/__autograph_generated_filear39ja04.py", line 31, in tf__call
            ndims = ag__.converted_call(ag__.ld(len), (ag__.ld(input_shape),), None, fscope)
    
        ValueError: Exception encountered when calling layer 'distilbert' (type TFDistilBertMainLayer).
        
        in user code:
        
            File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_tf_utils.py", line 1078, in run_call_with_unpacked_inputs  *
                return func(self, **unpacked_inputs)
            File "/usr/local/lib/python3.10/dist-packages/transformers/models/distilbert/modeling_tf_distilbert.py", line 454, in call  *
                embedding_output = self.embeddings(input_ids, inputs_embeds=inputs_embeds)  # (bs, seq_length, dim)
            File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/training.py", line 558, in error_handler  *
                return fn(*args, **kwargs)
            File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/base_layer.py", line 1136, in __call__  *
                outputs = call_fn(inputs, *args, **kwargs)
            File "/tmp/__autograph_generated_file2odidxg8.py", line 162, in error_handler
                raise ag__.converted_call(ag__.ld(new_e).with_traceback, (ag__.ld(e).__traceback__,), None, fscope_1) from None
            File "/tmp/__autograph_generated_file2odidxg8.py", line 34, in error_handler
                retval__1 = ag__.converted_call(ag__.ld(fn), tuple(ag__.ld(args)), dict(**ag__.ld(kwargs)), fscope_1)
            File "/tmp/__autograph_generated_filear39ja04.py", line 31, in tf__call
                ndims = ag__.converted_call(ag__.ld(len), (ag__.ld(input_shape),), None, fscope)
        
            ValueError: Exception encountered when calling layer 'embeddings' (type TFEmbeddings).
            
            in user code:
            
                File "/usr/local/lib/python3.10/dist-packages/transformers/models/distilbert/modeling_tf_distilbert.py", line 119, in call  *
                    final_embeddings = self.LayerNorm(inputs=final_embeddings)
                File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/training.py", line 558, in error_handler  *
                    return fn(*args, **kwargs)
                File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/base_layer.py", line 1136, in __call__  *
                    outputs = call_fn(inputs, *args, **kwargs)
                File "/tmp/__autograph_generated_file2odidxg8.py", line 162, in error_handler  **
                    raise ag__.converted_call(ag__.ld(new_e).with_traceback, (ag__.ld(e).__traceback__,), None, fscope_1) from None
                File "/tmp/__autograph_generated_file2odidxg8.py", line 34, in error_handler
                    retval__1 = ag__.converted_call(ag__.ld(fn), tuple(ag__.ld(args)), dict(**ag__.ld(kwargs)), fscope_1)
                File "/tmp/__autograph_generated_filear39ja04.py", line 31, in tf__call  **
                    ndims = ag__.converted_call(ag__.ld(len), (ag__.ld(input_shape),), None, fscope)
            
                ValueError: Exception encountered when calling layer 'LayerNorm' (type LayerNormalization).
                
                Cannot take the length of shape with unknown rank.
                
                Call arguments received by layer 'LayerNorm' (type LayerNormalization):
                  • inputs=tf.Tensor(shape=<unknown>, dtype=float32)
            
            
            Call arguments received by layer 'embeddings' (type TFEmbeddings):
              • input_ids=tf.Tensor(shape=<unknown>, dtype=int32)
              • position_ids=None
              • inputs_embeds=None
              • training=True
        
        
        Call arguments received by layer 'distilbert' (type TFDistilBertMainLayer):
          • input_ids=tf.Tensor(shape=<unknown>, dtype=int32)
          • attention_mask=tf.Tensor(shape=<unknown>, dtype=int32)
          • head_mask=None
          • inputs_embeds=None
          • output_attentions=False
          • output_hidden_states=False
          • return_dict=True
          • training=True
    
    
    Call arguments received by layer 'tf_distil_bert_for_question_answering' (type TFDistilBertForQuestionAnswering):
      • input_ids={'input_ids': 'tf.Tensor(shape=<unknown>, dtype=int32)', 'attention_mask': 'tf.Tensor(shape=<unknown>, dtype=int32)'}
      • attention_mask=None
      • head_mask=None
      • inputs_embeds=None
      • output_attentions=None
      • output_hidden_states=None
      • return_dict=None
      • start_positions=None
      • end_positions=None
      • training=True


In [None]:
loss, accuracy = model.evaluate(dev_dataset)
print(f"Validation loss: {loss}")