In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from huggingface_hub import notebook_login
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from transformers import (
    AutoTokenizer,
    DefaultDataCollator,
    TFAutoModelForQuestionAnswering,
    keras_callbacks,
    pipeline,
)

import constants
import model_utils

In [2]:
# Load the train/validation/test splits through the project helper
# (presumably pandas DataFrames, judging by the helper names — confirm in model_utils).
df_train, df_val, df_test = model_utils.load_train_val_test_datasets()

# Convert the three splits into dataset objects, keeping the train/val/test order.
train_dataset, val_dataset, test_dataset = model_utils.convert_dataframes_to_datasets([df_train, df_val, df_test])

  if _pandas_api.is_sparse(col):


In [3]:
# Name of the pretrained checkpoint; reused later when the QA model itself is loaded.
model_checkpoint = 'bert-base-uncased'
# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [4]:
def tokenize_sample(sample, max_tokens=None, padding=False):
    """Tokenize one question/context pair with the notebook-level `tokenizer`.

    Args:
        sample: mapping with string values under the 'question' and 'context' keys.
        max_tokens: forwarded to the tokenizer as `max_length` (None by default).
        padding: forwarded to the tokenizer as `padding` (False by default).

    Returns:
        Whatever the tokenizer returns for the (question, context) pair.
    """
    # Surrounding whitespace is removed from both texts before tokenization.
    text_pair = (sample['question'].strip(), sample['context'].strip())
    return tokenizer(*text_pair, max_length=max_tokens, padding=padding)

# Tokenize every split with tokenize_sample's defaults (no max length, no padding)
# so the raw sequence lengths can be inspected.
tokenized_train_dataset, tokenized_val_dataset, tokenized_test_dataset = (
    split.map(tokenize_sample) for split in (train_dataset, val_dataset, test_dataset)
)

# Report the longest tokenized sequence found in each split.
print('Max number of tokens in tokenized train dataset: ', max(map(len, tokenized_train_dataset['input_ids'])))
print('Max number of tokens in tokenized val dataset: ', max(map(len, tokenized_val_dataset['input_ids'])))
print('Max number of tokens in tokenized test dataset: ', max(map(len, tokenized_test_dataset['input_ids'])))

Map:   0%|          | 0/68716 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (520 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/14724 [00:00<?, ? examples/s]

Map:   0%|          | 0/14725 [00:00<?, ? examples/s]

Max number of tokens in tokenized train dataset:  870
Max number of tokens in tokenized val dataset:  866
Max number of tokens in tokenized test dataset:  817


In [5]:
# Maximum number of tokens allowed per tokenized question+context pair.
max_length = 384

def filter_samples_below_number_of_tokens(dataset, max_tokens: int):
    """Keep only the samples whose tokenized length does not exceed `max_tokens`.

    Every sample is tokenized with `tokenize_sample` (default arguments) and kept
    when its 'input_ids' contain at most `max_tokens` tokens.

    Args:
        dataset: iterable of samples that also provides `select(indices)`.
        max_tokens: largest allowed number of tokens per sample (inclusive).

    Returns:
        The result of `dataset.select(...)` called with the indices of the kept
        samples, in their original order.
    """
    # Collect the indices to keep in a single pass; the list is built once and passed
    # to `select` directly instead of re-creating a lookup set for every index.
    indices_to_keep = [
        index
        for index, sample in enumerate(dataset)
        if len(tokenize_sample(sample)['input_ids']) <= max_tokens
    ]

    return dataset.select(indices_to_keep)

# Drop over-long samples from every split, using the same token limit for all of them.
filtered_train_dataset, filtered_val_dataset, filtered_test_dataset = [
    filter_samples_below_number_of_tokens(split, max_tokens=max_length)
    for split in (train_dataset, val_dataset, test_dataset)
]



In [7]:
# Split sizes before the length filter was applied.
for description, split in (
    ('Number of samples in tokenized train dataset before filtering: ', train_dataset),
    ('Number of samples in tokenized val dataset before filtering: ', val_dataset),
    ('Number of samples in tokenized test dataset before filtering: ', test_dataset),
):
    print(description, len(split))

print('\n---------------\n')

# Split sizes after the length filter was applied.
for description, split in (
    ('Number of samples in tokenized train dataset after filtering: ', filtered_train_dataset),
    ('Number of samples in tokenized val dataset after filtering: ', filtered_val_dataset),
    ('Number of samples in tokenized test dataset after filtering: ', filtered_test_dataset),
):
    print(description, len(split))

Number of samples in tokenized train dataset before filtering:  68716
Number of samples in tokenized val dataset before filtering:  14724
Number of samples in tokenized test dataset before filtering:  14725

---------------

Number of samples in tokenized train dataset after filtering:  67964
Number of samples in tokenized val dataset after filtering:  14573
Number of samples in tokenized test dataset after filtering:  14552


In [8]:
def preprocess_dataset(dataset):
    """Tokenize a batch of question/context pairs and label the answer token span.

    Questions and contexts are stripped of surrounding whitespace, tokenized together
    with the notebook-level `tokenizer` (padded to `max_length`), and for every pair
    the token index where the answer starts and ends is derived from the tokenizer's
    character offsets.

    Args:
        dataset: batch mapping with the columns 'question', 'context',
            'answer_start' and 'answer_text', each a sequence of equal length.
            Assumes 'answer_start' is a character index into the raw (unstripped)
            context — TODO confirm against the dataset loader.

    Returns:
        The tokenizer output without 'offset_mapping', extended with the
        'start_positions' and 'end_positions' lists (one token index per pair).
    """
    questions = [q.strip() for q in dataset['question']]
    raw_contexts = dataset['context']
    contexts = [c.strip() for c in raw_contexts]
    # The tokenizer offsets refer to the stripped context, so the answer start has to be
    # shifted by the number of leading characters that `strip()` removed.
    leading_trims = [len(c) - len(c.lstrip()) for c in raw_contexts]

    inputs = tokenizer(
        questions,
        contexts,
        max_length=max_length,
        padding='max_length',
        return_offsets_mapping=True
    )

    # Offsets are only needed to compute the labels; they are not model inputs.
    offset_mapping = inputs.pop('offset_mapping')

    answer_start_indices = dataset['answer_start']
    answer_texts = dataset['answer_text']
    start_positions = []
    end_positions = []

    for index, offset in enumerate(offset_mapping):
        # Character span of the answer, expressed in stripped-context coordinates.
        start_char = answer_start_indices[index] - leading_trims[index]
        end_char = start_char + len(answer_texts[index])
        sequence_ids = inputs.sequence_ids(index)

        # Find the start and end token indices of the context (sequence id 1).
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        # Bounds check guards against a context that runs to the very last token.
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # Last context token whose start offset is not past the answer start.
        idx = context_start
        while idx <= context_end and offset[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        # First context token (scanning from the end) whose end offset reaches the answer end.
        idx = context_end
        while idx >= context_start and offset[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

    inputs['start_positions'] = start_positions
    inputs['end_positions'] = end_positions
    return inputs

In [9]:
def _preprocess_split(split):
    """Apply `preprocess_dataset` to `split` in batches, dropping the split's original columns."""
    return split.map(
        preprocess_dataset,
        batched=True,
        remove_columns=split.column_names,
    )


tokenized_train_dataset = _preprocess_split(filtered_train_dataset)
tokenized_val_dataset = _preprocess_split(filtered_val_dataset)
tokenized_test_dataset = _preprocess_split(filtered_test_dataset)

Map:   0%|          | 0/67964 [00:00<?, ? examples/s]

Map:   0%|          | 0/14573 [00:00<?, ? examples/s]

Map:   0%|          | 0/14552 [00:00<?, ? examples/s]

In [10]:
def _all_have_length(sequences, expected_length):
    """Return True when every sequence in `sequences` has exactly `expected_length` items."""
    return all(len(sequence) == expected_length for sequence in sequences)


# Sanity check: after padding, every entry should have exactly `max_length` tokens.
print(f'All tokenized train dataset entries have {max_length} tokens: ',
      _all_have_length(tokenized_train_dataset['input_ids'], max_length))
print(f'All tokenized val dataset entries have {max_length} tokens: ',
      _all_have_length(tokenized_val_dataset['input_ids'], max_length))
print(f'All tokenized test dataset entries have {max_length} tokens: ',
      _all_have_length(tokenized_test_dataset['input_ids'], max_length))

All tokenized train dataset entries have 384 tokens:  True
All tokenized val dataset entries have 384 tokens:  True
All tokenized test dataset entries have 384 tokens:  True


In [22]:
# Load the pretrained checkpoint as a TensorFlow question-answering model.
model = TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
# Default collator returning TensorFlow tensors.
# NOTE(review): `data_collator` is not referenced by any other visible cell — confirm whether it is still needed.
data_collator = DefaultDataCollator(return_tensors='tf')

All PyTorch model weights were used when initializing TFBertForQuestionAnswering.

Some weights or buffers of the TF 2.0 model TFBertForQuestionAnswering were not initialized from the PyTorch model and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
def _as_tf_dataset(split, shuffle):
    """Convert a tokenized split into a batched `tf.data.Dataset`.

    Args:
        split: tokenized dataset providing `to_tf_dataset`.
        shuffle: whether the samples are shuffled.

    Returns:
        The dataset produced by `split.to_tf_dataset` with the model inputs as
        features, the answer positions as labels and a batch size of 8.
    """
    return split.to_tf_dataset(
        columns=['input_ids', 'token_type_ids', 'attention_mask'],
        label_cols=['start_positions', 'end_positions'],
        batch_size=8,
        shuffle=shuffle,
    )


# Only the training split is shuffled, so batches are not fed in a fixed order;
# validation and test keep their order so predictions stay aligned with the samples.
tf_train_dataset = _as_tf_dataset(tokenized_train_dataset, shuffle=True)
tf_val_dataset = _as_tf_dataset(tokenized_val_dataset, shuffle=False)
tf_test_dataset = _as_tf_dataset(tokenized_test_dataset, shuffle=False)

In [24]:
# Run identification
training_number = 1
model_name = 'squad-bert'
full_model_name = f'{model_name}-{training_number}'
training_name = f'training_{training_number}'

# Batch size used by the evaluation cell further down
batch_size = 8

# Repository path on the Hugging Face Hub
hub_path = f'{constants.HUB_MODEL_LOCATION}/{full_model_name}'

# Checkpoint locations; the `{epoch:04d}` placeholder is left unformatted in the template
checkpoint_filename_template = 'cp-{epoch:04d}.ckpt'
checkpoints_dir = os.path.join(constants.TRAINING_CHECKPOINTS_PATH, model_name, training_name)
checkpoint_path = os.path.join(checkpoints_dir, checkpoint_filename_template)

# Directory for the training-curve figures
figures_dir = f'./../documentation/models/{model_name}/figures'

In [14]:
# Log in to the Hugging Face Hub (renders a widget in the notebook);
# presumably required by the PushToHubCallback created in a later cell.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [25]:
# Callbacks
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(checkpoint_path, verbose=1, save_weights_only=True)
early_stop_cb = tf.keras.callbacks.EarlyStopping(patience=1)
push_to_hub = keras_callbacks.PushToHubCallback(output_dir=full_model_name, tokenizer=tokenizer)
callbacks = [checkpoint_cb, early_stop_cb, push_to_hub]

e:\STUDIA\IPS\question-answering\extractive-qa\notebooks\squad-bert-1 is already a clone of https://huggingface.co/nlp-polish/squad-bert-1. Make sure you pull the latest changes with `repo.git_pull()`.


In [26]:
# Total number of optimizer steps: batches per epoch times the number of epochs.
num_train_epochs = 10
num_train_steps = num_train_epochs * len(tf_train_dataset)

# Learning rate decays from 2e-5 to 0.0 over the whole training run.
lr_scheduler = PolynomialDecay(
    initial_learning_rate=2e-5,
    end_learning_rate=0.0,
    decay_steps=num_train_steps,
)
optimizer = Adam(learning_rate=lr_scheduler)

# Train in mixed-precision float16
tf.keras.mixed_precision.set_global_policy("mixed_float16")

# Compile
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = ["accuracy"]
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics,
)

In [27]:
# Print the layer and parameter overview of the model.
model.summary()

Model: "tf_bert_for_question_answering_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108891648 
                                                                 
 qa_outputs (Dense)          multiple                  1538      
                                                                 
Total params: 108,893,186
Trainable params: 108,893,186
Non-trainable params: 0
_________________________________________________________________


In [28]:
# Fine-tune the model; `history` is used by the plotting and checkpoint-selection cells.
history = model.fit(
    tf_train_dataset,
    validation_data=tf_val_dataset,
    epochs=num_train_epochs,
    callbacks=callbacks,
)

Epoch 1/10
Epoch 1: saving model to ./../training_checkpoints\squad-bert\training_1\cp-0001.ckpt
Epoch 2/10
Epoch 2: saving model to ./../training_checkpoints\squad-bert\training_1\cp-0002.ckpt


In [29]:
def save_model(model, model_name, training_number, saved_models_dir, default_model_version):
    """Save `model` under `<saved_models_dir>/<model_name>_<training_number>/<default_model_version>`.

    Args:
        model: object exposing `save(path)`.
        model_name: base name of the model.
        training_number: identifier of the training run, appended after an underscore.
        saved_models_dir: root directory for saved models.
        default_model_version: name of the version sub-directory.
    """
    target_dir = os.path.join(
        saved_models_dir,
        "{}_{}".format(model_name, training_number),
        default_model_version,
    )
    model.save(target_dir)

In [30]:
# Export the trained model as version '1' under ./../trained_models.
save_model(
    model,
    model_name,
    training_number,
    saved_models_dir='./../trained_models',
    default_model_version='1',
)



INFO:tensorflow:Assets written to: ./../trained_models\squad-bert_1\1\assets


INFO:tensorflow:Assets written to: ./../trained_models\squad-bert_1\1\assets


In [31]:
# Reload the exported model. `save_model` writes to "<model_name>_<training_number>/<version>"
# (underscore separator), so the path is rebuilt from the same parts instead of a
# hand-typed directory name that used a hyphen and pointed elsewhere.
bert = tf.keras.models.load_model(
    os.path.join('./../trained_models', f'{model_name}_{training_number}', '1'),
    compile=False,
)

In [32]:
# Print the layer and parameter overview of the reloaded model.
bert.summary()

Model: "tf_bert_for_question_answering"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108891648 
                                                                 
 qa_outputs (Dense)          multiple                  1538      
                                                                 
Total params: 108,893,186
Trainable params: 108,893,186
Non-trainable params: 0
_________________________________________________________________


In [33]:
# Run inference on the test split.
# NOTE(review): this uses the in-memory `model`, not the reloaded `bert` — confirm that is intended.
preds = model.predict(tf_test_dataset)



In [35]:
# Display the raw prediction output.
preds

TFQuestionAnsweringModelOutput(loss=None, start_logits=array([[-8.6  , -7.773, -8.06 , ..., -8.82 , -8.82 , -8.83 ],
       [-8.766, -7.664, -8.164, ..., -8.96 , -8.96 , -8.96 ],
       [-8.61 , -7.58 , -7.695, ..., -8.76 , -8.74 , -8.734],
       ...,
       [-7.855, -6.19 , -7.844, ..., -8.86 , -8.87 , -8.86 ],
       [-8.625, -6.613, -8.21 , ..., -8.79 , -8.79 , -8.79 ],
       [-8.63 , -8.03 , -8.49 , ..., -8.88 , -8.88 , -8.875]],
      dtype=float16), end_logits=array([[-9.   , -8.58 , -8.79 , ..., -9.15 , -9.15 , -9.14 ],
       [-8.77 , -8.65 , -8.92 , ..., -9.2  , -9.2  , -9.2  ],
       [-8.91 , -8.38 , -9.055, ..., -9.17 , -9.18 , -9.19 ],
       ...,
       [-8.64 , -7.76 , -8.33 , ..., -9.22 , -9.22 , -9.22 ],
       [-9.086, -6.996, -8.78 , ..., -9.31 , -9.31 , -9.31 ],
       [-9.2  , -8.26 , -8.75 , ..., -9.266, -9.266, -9.27 ]],
      dtype=float16), hidden_states=None, attentions=None)

In [34]:
# Smoke-test the checkpoint published on the Hub through a question-answering pipeline
# (`pipeline` is imported in the notebook's import cell).
question_answerer = pipeline("question-answering", model="nlp-polish/squad-bert-1")

context = """
🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch and TensorFlow — with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
"""
question = "Which deep learning libraries back 🤗 Transformers?"
question_answerer(question=question, context=context)

Downloading (…)lve/main/config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading tf_model.h5:   0%|          | 0.00/436M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForQuestionAnswering.

All the layers of TFBertForQuestionAnswering were initialized from the model checkpoint at nlp-polish/squad-bert-1.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForQuestionAnswering for predictions without further training.


Downloading (…)okenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

{'score': 0.984375,
 'start': 78,
 'end': 105,
 'answer': 'Jax, PyTorch and TensorFlow'}

In [None]:
# Keyword arguments shared by both training-curve figures.
_shared_figure_kwargs = dict(x_label='Epoch', figure_dir_path=figures_dir)

# Accuracy per epoch (train vs. validation).
model_utils.plot_and_save_fig_from_history(
    history,
    attributes=['accuracy', 'val_accuracy'],
    title='Model accuracy',
    y_label='Accuracy',
    legend_descriptors=['Train', 'Val'],
    figure_filename=f"{training_name}_accuracy.png",
    **_shared_figure_kwargs,
)

# Loss per epoch (train vs. validation).
model_utils.plot_and_save_fig_from_history(
    history,
    attributes=['loss', 'val_loss'],
    title='Model loss',
    y_label='Loss',
    legend_descriptors=['Train', 'Val'],
    figure_filename=f"{training_name}_loss.png",
    **_shared_figure_kwargs,
)

In [None]:
# Ask the project helper for the best model among the saved checkpoints of this run.
best_model = model_utils.get_best_model_from_checkpoints(
    model,
    history,
    checkpoints_dir=checkpoints_dir,
    checkpoint_filename_template=checkpoint_filename_template,
)

In [None]:
# Export the selected model to the project's configured saved-model location and version.
model_utils.save_model(
    best_model,
    model_name=model_name,
    training_number=training_number,
    saved_models_dir=constants.SAVED_MODEL_LOCATION,
    default_model_version=constants.DEFAULT_MODEL_VERSION,
)

In [None]:
# Evaluation
best_model.evaluate(tf_test_dataset, batch_size=batch_size)

In [None]:
# NOTE(review): this passes `model`, while the evaluation cell uses `best_model` — confirm which one is intended.
# NOTE(review): "class" predictions look like a leftover from a classification notebook;
# verify that `get_class_preds` is meaningful for a question-answering model.
class_preds = model_utils.get_class_preds(model, tf_test_dataset)

In [None]:
# NOTE(review): `tokenized_test_dataset` was created with `remove_columns=<all original columns>`
# and `preprocess_dataset` only adds the tokenizer outputs plus 'start_positions'/'end_positions',
# so an 'emotions' column is not expected to exist here — this lookup will likely fail.
# It looks like a leftover from a classification notebook; replace with QA-appropriate
# ground truth and metrics before running.
precision, recall, f1 = model_utils.get_classification_evaluation_metrics(
    class_actual=tokenized_test_dataset['emotions'],
    class_preds=class_preds,
    average='micro'
)

# Print the three scores returned above.
print(f"Precision score: ", precision)
print(f"Recall score: ", recall)
print(f"F1 score: ", f1)

In [None]:
# NOTE(review): `raw_dataset` is not defined in any visible cell of this notebook, so this
# call is expected to raise NameError on a fresh run; the 'text_pl'/'emotions' columns
# also look like leftovers from a classification notebook — confirm or remove.
model_utils.print_incorrectly_predicted_texts(texts=raw_dataset['text_pl'],
                                              class_actual=raw_dataset['emotions'],
                                              class_preds=class_preds)