In [1]:
import pandas as pd
from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering, DefaultDataCollator, create_optimizer
import datasets
import tensorflow as tf

In [2]:
def load_train_val_test_datasets(dataset_path='./../data/datasets/squad'):
    train = pd.read_csv(f'{dataset_path}/train.csv').dropna()
    val = pd.read_csv(f'{dataset_path}/dev.csv').dropna()
    test = pd.read_csv(f'{dataset_path}/test.csv').dropna()
    return train, val, test


def convert_dataframes_to_datasets(dataframes: list):
    return tuple(
        [datasets.Dataset.from_pandas(dataframe, preserve_index=False) for dataframe in
         dataframes])


df_train, df_val, df_test = load_train_val_test_datasets()

train_dataset, val_dataset, test_dataset = convert_dataframes_to_datasets([df_train, df_val, df_test])

  if _pandas_api.is_sparse(col):


In [3]:
train_dataset, val_dataset, test_dataset

(Dataset({
     features: ['index', 'id', 'context', 'question', 'answer_text', 'answer_start'],
     num_rows: 68716
 }),
 Dataset({
     features: ['index', 'id', 'context', 'question', 'answer_text', 'answer_start'],
     num_rows: 14724
 }),
 Dataset({
     features: ['index', 'id', 'context', 'question', 'answer_text', 'answer_start'],
     num_rows: 14725
 }))

In [4]:
model_checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [5]:
def tokenize_sample(sample, max_tokens=None, padding=False):
    question = sample['question'].strip()
    context = sample['context'].strip()

    return tokenizer(
        question,
        context,
        max_length=max_tokens,
        padding=padding
    )


tokenized_train_dataset = train_dataset.map(tokenize_sample)
tokenized_val_dataset = val_dataset.map(tokenize_sample)
tokenized_test_dataset = test_dataset.map(tokenize_sample)

print('Max number of tokens in tokenized train dataset: ', len(max(tokenized_train_dataset['input_ids'], key=len)))
print('Max number of tokens in tokenized val dataset: ', len(max(tokenized_val_dataset['input_ids'], key=len)))
print('Max number of tokens in tokenized test dataset: ', len(max(tokenized_test_dataset['input_ids'], key=len)))

Map:   0%|          | 0/68716 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (552 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/14724 [00:00<?, ? examples/s]

Map:   0%|          | 0/14725 [00:00<?, ? examples/s]

Max number of tokens in tokenized train dataset:  882
Max number of tokens in tokenized val dataset:  879
Max number of tokens in tokenized test dataset:  831


In [6]:
max_length = 384


def filter_samples_below_number_of_tokens(dataset, max_tokens: int):
    indices_to_remove = []

    # Find indices of samples where number of tokens exceeds max number of tokens
    for index, sample in enumerate(dataset):
        tokenized_sample = tokenize_sample(sample)
        if len(tokenized_sample['input_ids']) > max_tokens:
            indices_to_remove.append(index)

    # Keep only samples with number of tokens less or equal than max number of tokens
    dataset_indices = range(len(dataset))
    filtered_dataset = dataset.select(
        index for index in dataset_indices if index not in set(indices_to_remove)
    )

    return filtered_dataset


filtered_train_dataset = filter_samples_below_number_of_tokens(train_dataset, max_tokens=max_length)
filtered_val_dataset = filter_samples_below_number_of_tokens(val_dataset, max_tokens=max_length)
filtered_test_dataset = filter_samples_below_number_of_tokens(test_dataset, max_tokens=max_length)



In [7]:
filtered_train_dataset['question'][0]

'Along with the United Democratic Party, what party currently rules the Marshall Islands?'

In [8]:
start_char = filtered_train_dataset['answer_start'][0]
start_char

633

In [9]:
answer_text = filtered_train_dataset['answer_text'][0]
answer_text

'the AKA'

In [10]:
end_char = start_char + len(answer_text)
end_char

640

In [11]:
filtered_train_dataset['id'][0]

'56f961049b226e1400dd13eb'

In [29]:
print('Number of samples in tokenized train dataset before filtering: ', len(train_dataset))
print('Number of samples in tokenized val dataset before filtering: ', len(val_dataset))
print('Number of samples in tokenized test dataset before filtering: ', len(test_dataset))

print('\n---------------\n')

print('Number of samples in tokenized train dataset after filtering: ', len(filtered_train_dataset))
print('Number of samples in tokenized val dataset after filtering: ', len(filtered_val_dataset))
print('Number of samples in tokenized test dataset after filtering: ', len(filtered_test_dataset))

Number of samples in tokenized train dataset before filtering:  68716
Number of samples in tokenized val dataset before filtering:  14724
Number of samples in tokenized test dataset before filtering:  14725

---------------

Number of samples in tokenized train dataset after filtering:  67801
Number of samples in tokenized val dataset after filtering:  14541
Number of samples in tokenized test dataset after filtering:  14519


In [12]:
filtered_train_dataset

Dataset({
    features: ['index', 'id', 'context', 'question', 'answer_text', 'answer_start'],
    num_rows: 67801
})

In [13]:
def tokenize(examples):
    questions = [q.strip() for q in examples['question']]
    # contexts = [c.strip() for c in examples['context']]

    inputs = tokenizer(
        questions,
        examples['context'],
        max_length=max_length,
        padding='max_length'
    )

    return inputs

b = filtered_train_dataset.map(tokenize, batched=True)
b

Map:   0%|          | 0/67801 [00:00<?, ? examples/s]

Dataset({
    features: ['index', 'id', 'context', 'question', 'answer_text', 'answer_start', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 67801
})

In [24]:
# def tokenize(examples):
#     questions = [q.strip() for q in examples['question']]
#     # contexts = [c.strip() for c in examples['context']]

#     inputs = tokenizer(
#         questions,
#         examples['context'],
#         max_length=max_length,
#         padding='max_length'
#     )

#     return inputs


def preprocess_dataset(dataset):
    questions = [q.strip() for q in dataset['question']]
    # contexts = [c.strip() for c in examples['context']]

    inputs = tokenizer(
        questions,
        dataset['context'],
        max_length=max_length,
        padding='max_length',
        return_overflowing_tokens=True,
    )
    # sample_map = inputs.pop("overflow_to_sample_mapping")
    # for key, values in dataset.items():
    #     inputs[key] = [values[i] for i in sample_map]
    # dataset = dataset.map(tokenize, batched=True)

    answer_start_indices = dataset['answer_start']
    answer_texts = dataset['answer_text']

    start_positions = []
    end_positions = []

    for index, _ in enumerate(inputs):
        start_char = answer_start_indices[index]
        end_char = start_char + len(answer_texts[index])

        start_positions.append(start_char)
        end_positions.append(end_char)

    # inputs = inputs.add_column('start_positions', start_positions)
    # inputs = inputs.add_column('end_positions', end_positions)
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


tokenized_train_dataset = preprocess_dataset(filtered_train_dataset)
tokenized_val_dataset = preprocess_dataset(filtered_val_dataset)
tokenized_test_dataset = preprocess_dataset(filtered_test_dataset)

In [20]:
filtered_train_dataset

Dataset({
    features: ['index', 'id', 'context', 'question', 'answer_text', 'answer_start'],
    num_rows: 67801
})

In [25]:
tokenized_train_dataset = filtered_train_dataset.map(
    preprocess_dataset,
    batched=True,
    remove_columns=filtered_train_dataset.column_names,
)
len(filtered_train_dataset), len(train_dataset)

Map:   0%|          | 0/67801 [00:00<?, ? examples/s]

ArrowInvalid: Column 4 named start_positions expected length 1000 but got length 4

In [74]:
tokenizer

BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [None]:
print(f'All tokenized train dataset entries have {max_length} tokens: ',
      all([len(input_ids) == max_length for input_ids in tokenized_train_dataset['input_ids']]))
print(f'All tokenized val dataset entries have {max_length} tokens: ',
      all([len(input_ids) == max_length for input_ids in tokenized_val_dataset['input_ids']]))
print(f'All tokenized test dataset entries have {max_length} tokens: ',
      all([len(input_ids) == max_length for input_ids in tokenized_test_dataset['input_ids']]))

All tokenized train dataset entries have 384 tokens:  True
All tokenized val dataset entries have 384 tokens:  True
All tokenized test dataset entries have 384 tokens:  True


In [91]:
model = TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

All PyTorch model weights were used when initializing TFBertForQuestionAnswering.

Some weights or buffers of the TF 2.0 model TFBertForQuestionAnswering were not initialized from the PyTorch model and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
data_collator = DefaultDataCollator(return_tensors='tf')

In [34]:
tokenized_train_dataset['start_positions']

[633,
 494,
 22,
 53,
 585,
 287,
 121,
 391,
 565,
 141,
 396,
 629,
 49,
 7,
 453,
 108,
 35,
 560,
 130,
 181,
 178,
 26,
 393,
 749,
 413,
 35,
 167,
 0,
 63,
 29,
 808,
 486,
 89,
 92,
 0,
 173,
 342,
 821,
 170,
 83,
 367,
 441,
 533,
 427,
 197,
 124,
 42,
 168,
 192,
 455,
 372,
 20,
 404,
 141,
 979,
 410,
 87,
 579,
 1076,
 533,
 386,
 348,
 173,
 550,
 191,
 287,
 714,
 256,
 534,
 305,
 0,
 207,
 269,
 327,
 1287,
 463,
 36,
 129,
 302,
 41,
 19,
 619,
 1144,
 325,
 599,
 150,
 447,
 77,
 498,
 92,
 243,
 62,
 693,
 119,
 0,
 465,
 473,
 258,
 397,
 428,
 172,
 52,
 539,
 602,
 0,
 474,
 477,
 183,
 485,
 364,
 82,
 104,
 116,
 129,
 373,
 55,
 199,
 125,
 397,
 293,
 270,
 19,
 51,
 509,
 113,
 311,
 81,
 546,
 736,
 717,
 324,
 43,
 425,
 303,
 1222,
 782,
 685,
 377,
 67,
 712,
 86,
 36,
 163,
 44,
 10,
 178,
 242,
 729,
 870,
 229,
 97,
 235,
 921,
 842,
 114,
 158,
 7,
 475,
 605,
 226,
 776,
 310,
 417,
 416,
 110,
 117,
 342,
 583,
 116,
 535,
 127,
 484,
 249,
 84,


In [82]:
tokenized_train_dataset

Dataset({
    features: ['index', 'id', 'context', 'question', 'answer_text', 'answer_start', 'input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 67801
})

In [35]:
tf_train_dataset = model.prepare_tf_dataset(
    tokenized_train_dataset,
    collate_fn=data_collator,
    shuffle=True,
    batch_size=8,
)
tf_val_dataset = model.prepare_tf_dataset(
    tokenized_val_dataset,
    collate_fn=data_collator,
    shuffle=False,
    batch_size=8,
)

In [36]:
tf_train_dataset

<PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(8, 384), dtype=tf.int64, name=None), 'token_type_ids': TensorSpec(shape=(8, 384), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(8, 384), dtype=tf.int64, name=None)}, {'start_positions': TensorSpec(shape=(8,), dtype=tf.int64, name=None), 'end_positions': TensorSpec(shape=(8,), dtype=tf.int64, name=None)})>

In [37]:
# <_PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(16, 384), dtype=tf.int64, name=None), 'token_type_ids': TensorSpec(shape=(16, 384), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(16, 384), dtype=tf.int64, name=None)}, {'start_positions': TensorSpec(shape=(16,), dtype=tf.int64, name=None), 'end_positions': TensorSpec(shape=(16,), dtype=tf.int64, name=None)})>

In [None]:
tf_train_dataset = tokenized_train_dataset.to_tf_dataset(
    columns=['input_ids', 'token_type_ids', 'attention_mask'],
    label_cols=['start_positions', 'end_positions'],
    batch_size=8,
    shuffle=False
)

tf_val_dataset = tokenized_val_dataset.to_tf_dataset(
    columns=['input_ids', 'token_type_ids', 'attention_mask'],
    label_cols=['start_positions', 'end_positions'],
    batch_size=8,
    shuffle=False
)

In [None]:
tf_train_dataset

<PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 384), dtype=tf.int64, name=None), 'token_type_ids': TensorSpec(shape=(None, 384), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, 384), dtype=tf.int64, name=None)}, {'start_positions': TensorSpec(shape=(None,), dtype=tf.int64, name=None), 'end_positions': TensorSpec(shape=(None,), dtype=tf.int64, name=None)})>

In [96]:
num_train_epochs = 3
num_train_steps = len(tf_train_dataset) * num_train_epochs

optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
tf.keras.mixed_precision.set_global_policy("mixed_float16")

In [66]:
train_dataset["question"][0]


'Along with the United Democratic Party, what party currently rules the Marshall Islands?'

In [67]:
train_dataset["context"][0]


"Legislative power lies with the Nitijela. The upper house of Parliament, called the Council of Iroij, is an advisory body comprising twelve tribal chiefs. The executive branch consists of the President and the Presidential Cabinet, which consists of ten ministers appointed by the President with the approval of the Nitijela. The twenty-four electoral districts into which the country is divided correspond to the inhabited islands and atolls. There are currently four political parties in the Marshall Islands: Aelon̄ Kein Ad (AKA), United People's Party (UPP), Kien Eo Am (KEA) and United Democratic Party (UDP). Rule is shared by the AKA and the UDP. The following senators are in the legislative body:"

In [92]:
input_test = tokenizer(train_dataset["question"][0], train_dataset["context"][0], return_tensors="tf")
output = model(**input_test)

answer_start_index = int(tf.math.argmax(output.start_logits, axis=-1)[0])
answer_end_index = int(tf.math.argmax(output.end_logits, axis=-1)[0])

predict_answer_tokens = input_test.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens)

''

In [93]:
input_test = question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"

input_test = tokenizer(question, text, return_tensors="tf")
output = model(**input_test)

answer_start_index = int(tf.math.argmax(output.start_logits, axis=-1)[0])
answer_end_index = int(tf.math.argmax(output.end_logits, axis=-1)[0])

predict_answer_tokens = input_test.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens)

''

In [94]:
input_test

{'input_ids': <tf.Tensor: shape=(1, 16), dtype=int32, numpy=
array([[  101,  2627,  1108,  3104,  1124, 15703,   136,   102,  3104,
         1124, 15703,  1108,   170,  3505, 16797,   102]])>, 'token_type_ids': <tf.Tensor: shape=(1, 16), dtype=int32, numpy=array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]])>, 'attention_mask': <tf.Tensor: shape=(1, 16), dtype=int32, numpy=array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])>}

In [95]:
output

TFQuestionAnsweringModelOutput(loss=None, start_logits=<tf.Tensor: shape=(1, 16), dtype=float16, numpy=
array([[-0.3826 , -0.02153, -0.414  , -0.1685 ,  0.11456, -0.08124,
         0.0786 , -0.5254 , -0.00936,  0.328  ,  0.1414 , -0.273  ,
        -0.1273 , -0.1489 , -0.2788 , -0.5254 ]], dtype=float16)>, end_logits=<tf.Tensor: shape=(1, 16), dtype=float16, numpy=
array([[ 0.7124 , -0.0422 , -0.367  , -0.1194 ,  0.09766, -0.01522,
         0.1018 , -0.2605 , -0.09937, -0.1198 , -0.2312 , -0.1819 ,
        -0.3196 , -0.1366 ,  0.06235, -0.2605 ]], dtype=float16)>, hidden_states=None, attentions=None)

In [63]:
model.loss

'categorical_crossentropy'

In [75]:
model

<transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering at 0x1e93d01b460>

In [97]:
model.fit(tf_train_dataset, validation_data=tf_val_dataset, epochs=num_train_epochs)

Epoch 1/3

KeyboardInterrupt: 

In [69]:
model.summary()

Model: "tf_bert_for_question_answering"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  108891648 
_________________________________________________________________
qa_outputs (Dense)           multiple                  1538      
Total params: 108,893,186
Trainable params: 108,893,186
Non-trainable params: 0
_________________________________________________________________
