In [1]:
import pandas as pd
from datasets import load_dataset

from transformers import BertTokenizerFast, TFBertModel

2021-12-25 11:44:03.707538: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-25 11:44:03.707582: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


# Loading Dataset

In [2]:
# Load the 'squad' question-answering dataset (served from the local Hugging Face
# cache when already downloaded). The later cells read its 'train' and
# 'validation' splits.
dataset = load_dataset('squad')

Reusing dataset squad (/home/aleksandar/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Materialise both splits as DataFrames. `from_dict` is a classmethod, so it is
# called on the class directly. The 'validation' split is used as the test set
# for the rest of the notebook.
train = pd.DataFrame.from_dict(dataset['train'])
test = pd.DataFrame.from_dict(dataset['validation'])

## Data Description 

Context - Paragraph in which the answer should be searched
Question - Question to be answered
Answer - The ground-truth answer text, together with the starting character position of that answer in the context.

Our task is to build a model which outputs the start and end positions of the answer in the context.

# Train-Test data preparation

Train data

In [4]:
# Pull the three training columns out of the DataFrame as plain Python lists,
# in the order: contexts, questions, answers.
train_contexts, train_questions, train_answers = (
    train[column].values.tolist()
    for column in ('context', 'question', 'answers')
)

Test data

In [5]:
# Pull the three test columns out of the DataFrame as plain Python lists,
# in the order: contexts, questions, answers.
test_contexts, test_questions, test_answers = (
    test[column].values.tolist()
    for column in ('context', 'question', 'answers')
)

For a given answer and context, the end index of the answer in the context must be determined. The utility function `calculate_end_index` is used for this.

In [6]:
def calculate_end_index(answers, contexts):
    """Flatten each answer in place and attach its end character index.

    Every answer dict is expected to hold list-valued ``'text'`` and
    ``'answer_start'`` entries; only the first element of each list is used.
    After the call each dict holds:

    * ``'text'``         - the first answer string,
    * ``'answer_start'`` - its start character offset in the context (int),
    * ``'answer_end'``   - the exclusive end offset, ``start + len(text)``.

    The recorded start offset may be off by one or two characters, so the
    text is searched at the recorded offset and at the offsets one and two
    characters earlier. If none of them matches, the recorded offset is kept
    so that both index keys are always plain integers.

    Answers whose ``'text'`` is already a string were processed by an earlier
    call and are skipped, which makes re-running the notebook cell safe.

    Args:
        answers: list of answer dicts, modified in place.
        contexts: list of context strings, aligned with ``answers``.

    Returns:
        None. ``answers`` is updated in place.
    """
    for answer, context in zip(answers, contexts):
        # Already flattened by a previous call: nothing left to do.
        if isinstance(answer['text'], str):
            continue

        gold_text = answer['text'][0]
        start_idx = answer['answer_start'][0]
        end_idx = start_idx + len(gold_text)

        answer['text'] = gold_text

        # Default to the recorded offsets; overwritten below on a match.
        answer['answer_start'] = start_idx
        answer['answer_end'] = end_idx

        for shift in (0, 1, 2):
            shifted_start = start_idx - shift
            # A negative start would wrap around to the end of the string.
            if shifted_start < 0:
                break
            shifted_end = end_idx - shift
            if context[shifted_start:shifted_end] == gold_text:
                answer['answer_start'] = shifted_start
                answer['answer_end'] = shifted_end
                break

In [7]:
# Process the answer dicts of both splits. The function updates the dicts inside
# the lists in place, so nothing is assigned here.
calculate_end_index(train_answers, train_contexts)
calculate_end_index(test_answers, test_contexts)

Observe the calculated end index of the answer for the first example of the training data.

In [8]:
# Spot-check: first training context followed by its processed answer dict.
print(train_contexts[0],train_answers[0])

Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary. {'text': 'Saint Bernadette Soubirous', 'answer_start': 515, 'answer_end': 541}


# Integration of BERT with Keras

Tokenizing questions and contexts

In [9]:
# Tokenizer for the 'bert-base-uncased' checkpoint. Later cells call
# `char_to_token` on its encodings.
# NOTE(review): that lookup presumably requires the *Fast* tokenizer class - confirm.
bert_tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [10]:
# Encode every (context, question) pair to exactly 100 tokens: longer pairs are
# truncated, shorter ones padded. `padding='max_length'` replaces the deprecated
# `pad_to_max_length=True`.
# NOTE(review): truncation can cut the answer span out of the context - confirm
# that 100 tokens is an acceptable budget.
train_encodings = bert_tokenizer(train_contexts, train_questions,
                                 max_length=100, truncation=True, padding='max_length')



In [11]:
train_input_ids = train_encodings['input_ids']
train_attention_masks = train_encodings['attention_mask']

# Convert each answer's character offsets into token indices of the encoded pair.
# 'answer_end' is an exclusive offset (start + len(text)), so the last character
# of the answer is at answer_end - 1; looking up answer_end itself would target
# the character that follows the answer.
# When char_to_token returns a falsy value (presumably None for characters lost
# to truncation - confirm), the position falls back to
# bert_tokenizer.model_max_length.
train_start_positions = [
    train_encodings.char_to_token(idx, answer['answer_start']) or bert_tokenizer.model_max_length
    for idx, answer in enumerate(train_answers)
]
train_end_positions = [
    train_encodings.char_to_token(idx, answer['answer_end'] - 1) or bert_tokenizer.model_max_length
    for idx, answer in enumerate(train_answers)
]

In [12]:
# Encode the test (context, question) pairs exactly like the training pairs:
# truncated or padded to 100 tokens. `padding='max_length'` replaces the
# deprecated `pad_to_max_length=True`.
test_encodings = bert_tokenizer(test_contexts, test_questions,
                                max_length=100, truncation=True, padding='max_length')

In [19]:
test_input_ids = test_encodings['input_ids']
test_attention_masks = test_encodings['attention_mask']

# Convert each answer's character offsets into token indices of the encoded pair.
# 'answer_end' is an exclusive offset (start + len(text)), so the last character
# of the answer is at answer_end - 1; looking up answer_end itself would target
# the character that follows the answer.
# When char_to_token returns a falsy value (presumably None for characters lost
# to truncation - confirm), the position falls back to
# bert_tokenizer.model_max_length.
test_start_positions = [
    test_encodings.char_to_token(idx, answer['answer_start']) or bert_tokenizer.model_max_length
    for idx, answer in enumerate(test_answers)
]
test_end_positions = [
    test_encodings.char_to_token(idx, answer['answer_end'] - 1) or bert_tokenizer.model_max_length
    for idx, answer in enumerate(test_answers)
]

Defining the model

In [14]:
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import mean_squared_error

import numpy as np

In [15]:
# Load the pretrained TensorFlow BERT encoder from the same 'bert-base-uncased'
# checkpoint that the tokenizer was created from.
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

2021-12-25 11:44:37.382888: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (100)
2021-12-25 11:44:37.382936: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (legion-y540): /proc/driver/nvidia/version does not exist
2021-12-25 11:44:37.383203: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from

In [16]:
# Keras entry points for the token ids and the attention mask. The length of 100
# matches the `max_length=100` used when tokenizing the train and test pairs.
input_ids = Input(shape=(100,), name="input_token", dtype='int32')
att_masks = Input(shape=(100,), name="masked_token", dtype='int32')

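In order to integrate the Keras model with the BERT model, these two input layers need to be connected to the BERT model. The BERT model returns several outputs; the first one is ignored here, and the second one (index 1, the pooled representation of the whole input sequence) is passed on to the Keras output layers.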

In [17]:
# Run BERT on the two Keras inputs and keep the output at index 1.
# NOTE(review): index 1 is presumably the pooled (one vector per example) output
# rather than the per-token hidden states - confirm against the transformers docs.
bert_in = bert_model(input_ids, attention_mask=att_masks)[1]

In [18]:
# Two independent single-unit heads on top of the BERT output, one per predicted
# position. ReLU keeps each predicted position non-negative.
start = Dense(1, activation='relu', name='start')(bert_in)
end = Dense(1, activation='relu', name='end')(bert_in)

In [20]:
# Assemble the end-to-end model: (token ids, attention mask) -> (start, end).
bert_qa_model = Model(inputs=[input_ids, att_masks], outputs=[start, end])

# `learning_rate` replaces the deprecated `lr` keyword of Adam.
# Both heads regress a position with a mean-squared-error loss, so the mean
# absolute error (in token positions) is reported; 'accuracy' is a
# classification metric and is not meaningful for these targets.
bert_qa_model.compile(optimizer=Adam(learning_rate=0.001),
                      loss=mean_squared_error, metrics=['mae'])

  super(Adam, self).__init__(name, **kwargs)


In [None]:
# Train for 5 epochs. The inputs follow the order of the model inputs
# (token ids, attention mask) and the targets follow the order of the model
# outputs (start, end).
bert_qa_model.fit(
    x=[np.array(train_input_ids), np.array(train_attention_masks)],
    y=[np.array(train_start_positions), np.array(train_end_positions)],
    epochs=5,
)

In [None]:
# Score the trained model on the held-out split, using the same input and
# target ordering as in training.
bert_qa_model.evaluate(
    x=[np.array(test_input_ids), np.array(test_attention_masks)],
    y=[np.array(test_start_positions), np.array(test_end_positions)],
)