In [None]:
# Transformers installation
! pip install transformers datasets evaluate

In [1]:
# Parameters
model_checkpoint_name = "albert-base-v2"  # pretrained checkpoint loaded by the tokenizer and model cells
my_model_name = "albert-finetuned-squad"  # output_dir of the Hub push callback; also the name loaded for inference
num_train_epochs = 3  # intended number of training epochs
max_length = 384  # max_length passed to the tokenizer in both preprocessing functions
stride = 50  # NOTE(review): presumably the token overlap between windows of a split context; confirm the tokenizer calls use it

In [4]:
# Login to Huggingface
from huggingface_hub import notebook_login

# Presumably needed so the PushToHubCallback configured later can upload; confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Load SQuAD dataset
from datasets import load_dataset

# The split string requests only the first 10,000 rows of the "train" split.
squad = load_dataset("squad", split="train[:10000]")

In [6]:
# Train and test split
# A fixed seed keeps the 80/20 split, and therefore the reported metrics, reproducible.
# NOTE: `squad` is rebound from the loaded dataset to the split result, so re-run the
# load cell before re-running this one.
squad = squad.train_test_split(test_size=0.2, seed=42)

# Show one training example as a sanity check.
squad["train"][0]

{'id': '56cbf5106d243a140015ee18',
 'title': 'Frédéric_Chopin',
 'context': 'Two Polish friends in Paris were also to play important roles in Chopin\'s life there. His fellow student at the Warsaw Conservatory, Julian Fontana, had originally tried unsuccessfully to establish himself in England; Albert Grzymała, who in Paris became a wealthy financier and society figure, often acted as Chopin\'s adviser and "gradually began to fill the role of elder brother in [his] life." Fontana was to become, in the words of Michałowski and Samson, Chopin\'s "general factotum and copyist".',
 'question': 'Which friend of Frédéric failed to achieve success in England?',
 'answers': {'text': ['Julian Fontana'], 'answer_start': [133]}}

In [None]:
# Tokenizer
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_name)

# check if it is fast Tokenizer
# (the preprocessing functions request return_offsets_mapping=True, presumably a
# fast-tokenizer-only feature; confirm)
tokenizer.is_fast

In [8]:
# Preprocess dataset
def preprocess_function(examples):
    """Tokenize a batch of SQuAD examples and label each with answer token positions.

    Each question is stripped and paired with its context. Only the context is
    truncated (``truncation="only_second"``) and every row is padded to
    ``max_length``. The first listed answer of each example is converted from
    character offsets to token indices.

    Args:
        examples: Batch mapping with "question", "context" and "answers" columns;
            each answers entry holds parallel "text" and "answer_start" lists.

    Returns:
        The tokenizer output with "offset_mapping" removed and "start_positions" /
        "end_positions" added. Both labels are 0 when the answer does not lie
        completely inside the (possibly truncated) context.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        # Character span [start_char, end_char) of the first listed answer.
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context (tokens whose sequence id is 1)
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0).
        # Both edges of the answer must fall within the context's character range;
        # checking only for "entirely outside" lets a truncated answer through and
        # puts its end label on the token that follows the context.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            # Last context token that begins at or before start_char.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # First context token (scanning backwards) that ends at or after end_char.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
# Map and Tokenize Data
# batched=True hands preprocess_function a batch of examples per call; the original
# columns are removed from the result.
tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)

In [10]:
# Create a data collator
from transformers import DefaultDataCollator

# return_tensors="tf" requests TensorFlow tensors from the collator.
data_collator = DefaultDataCollator(return_tensors="tf")

In [11]:
# Set optimizers
from transformers import create_optimizer

batch_size = 16
# The schedule must span every epoch that training runs. A shorter schedule
# finishes its decay early and leaves the remaining epochs with no useful
# learning rate. Derive the epoch count from the parameters cell rather than
# hard-coding a different number here.
num_epochs = num_train_epochs
# Whole batches per epoch times the number of epochs.
total_train_steps = (len(tokenized_squad["train"]) // batch_size) * num_epochs
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=total_train_steps,
)

In [None]:
# Import pretrained model
from transformers import TFAutoModelForQuestionAnswering

# Loads the checkpoint named in the parameters cell through the TensorFlow
# question-answering auto class.
model = TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint_name)

In [13]:
# validation preprocess
def preprocess_validation_examples(examples):
    """Tokenize a batch of examples into evaluation features.

    A context longer than ``max_length`` is split into several features
    (``return_overflowing_tokens=True``). Consecutive windows overlap by
    ``stride`` tokens so that an answer sitting on a window boundary is still
    fully contained in at least one feature.

    Args:
        examples: Batch mapping with "id", "question" and "context" columns.

    Returns:
        The tokenizer output with "overflow_to_sample_mapping" removed, an
        "example_id" entry naming the source example of every feature, and an
        "offset_mapping" in which every non-context token is replaced by None.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Maps each produced feature back to the index of the example it came from.
    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        # Keep offsets only for context tokens (sequence id 1) so that
        # post-processing can discard spans that touch the question or padding.
        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [14]:
# Initialize dataset
# Batch with the same `batch_size` that was used to compute `total_train_steps`, so the
# learning-rate schedule length matches the number of optimizer steps actually taken.
tf_train_set = model.prepare_tf_dataset(
    tokenized_squad["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)

# Disabled earlier version, kept as a bare string literal; tf_validation_set is built
# from validation_dataset in a later cell instead. As the cell's last expression,
# this string is echoed as the cell output.
'''
tf_validation_set = model.prepare_tf_dataset(
    tokenized_squad["test"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)
'''

'\ntf_validation_set = model.prepare_tf_dataset(\n    tokenized_squad["test"],\n    shuffle=False,\n    batch_size=16,\n    collate_fn=data_collator,\n)\n'

In [None]:
# Build evaluation features from the test split. One example can yield several
# features (overflowing windows), so the two lengths displayed below may differ.
validation_dataset = squad["test"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=squad["test"].column_names,
)
len(squad["test"]), len(validation_dataset)

In [16]:
# Batched, unshuffled dataset over the validation features.
# NOTE(review): compute_metrics indexes the logits by feature position, so this
# presumably relies on shuffle=False preserving the order of validation_dataset; confirm.
tf_validation_set = model.prepare_tf_dataset(
    validation_dataset,
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

In [17]:
# compile model
import tensorflow as tf

# No loss is passed to compile().
# NOTE(review): presumably the model falls back to its built-in loss; confirm.
model.compile(optimizer=optimizer)

# Print the layer / parameter overview.
model.summary()

Model: "tf_albert_for_question_answering"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 albert (TFAlbertMainLayer)  multiple                  11092992  
                                                                 
 qa_outputs (Dense)          multiple                  1538      
                                                                 
Total params: 11094530 (42.32 MB)
Trainable params: 11094530 (42.32 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Huggingface Push callback
from transformers.keras_callbacks import PushToHubCallback

# output_dir reuses my_model_name; the tokenizer is handed to the callback as well,
# presumably so it is saved/uploaded together with the model (confirm).
callback = PushToHubCallback(
    output_dir=my_model_name,
    tokenizer=tokenizer,
)

In [19]:
# Setup timer
import time
# NOTE: train_time is assigned again at the start of the training cell, so the value
# set here only serves the printout below.
train_time = time.time()
print(train_time)

1694274391.0906177


In [20]:
# model training
# train_time first holds the start timestamp and is then overwritten with the
# elapsed wall-clock seconds.
train_time = time.time()
# The epoch count comes from the parameters cell instead of a hard-coded literal.
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=num_train_epochs, callbacks=[callback])
train_time = time.time() - train_time

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
#predict
# Run the model over the validation features; the result is later indexed by
# "start_logits" / "end_logits" when computing metrics.
predictions = model.predict(tf_validation_set)

In [None]:
import evaluate

# Load the "squad" metric; compute_metrics calls metric.compute on it.
metric = evaluate.load("squad")

In [23]:
# evaluate Function
from tqdm.auto import tqdm
import collections
import numpy as np

n_best = 20  # number of top-scoring start and end indexes considered per feature
max_answer_length = 30  # longest accepted answer span in tokens (end_index - start_index + 1)
predicted_answers = []  # NOTE(review): compute_metrics builds its own local list of the same name; this global looks unused, confirm

def compute_metrics(start_logits, end_logits, features, examples):
    """Convert start/end logits into text predictions and score them.

    For every example, the ``n_best`` highest start and end logits of each of its
    features are combined into candidate spans. Spans that touch a masked
    (``None``) offset, run backwards, or exceed ``max_answer_length`` tokens are
    discarded. The candidate with the highest summed logit supplies the
    prediction; an example with no valid candidate gets an empty string.

    Args:
        start_logits: Per-feature start logits, indexed by feature position.
        end_logits: Per-feature end logits, indexed by feature position.
        features: Evaluation features providing "example_id" and "offset_mapping".
        examples: Original examples providing "id", "context" and "answers".

    Returns:
        Whatever ``metric.compute`` returns for the predictions and references.
    """
    # Group feature positions by the example they were cut from.
    features_by_example = collections.defaultdict(list)
    for position, feat in enumerate(features):
        features_by_example[feat["example_id"]].append(position)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        candidates = []

        # Collect candidate spans from every feature of this example.
        for position in features_by_example[example_id]:
            start_scores = start_logits[position]
            end_scores = end_logits[position]
            offsets = features[position]["offset_mapping"]

            # Indexes of the n_best largest logits, best first.
            top_starts = np.argsort(start_scores)[::-1][:n_best].tolist()
            top_ends = np.argsort(end_scores)[::-1][:n_best].tolist()

            for start_index in top_starts:
                start_span = offsets[start_index]
                if start_span is None:
                    # Start token lies outside the context.
                    continue
                for end_index in top_ends:
                    end_span = offsets[end_index]
                    if end_span is None:
                        # End token lies outside the context.
                        continue
                    token_count = end_index - start_index + 1
                    if token_count < 1 or token_count > max_answer_length:
                        # Reversed span or too long.
                        continue
                    candidates.append(
                        {
                            "text": context[start_span[0] : end_span[1]],
                            "logit_score": start_scores[start_index] + end_scores[end_index],
                        }
                    )

        # Keep the highest-scoring candidate, or fall back to an empty answer.
        if candidates:
            winner = max(candidates, key=lambda cand: cand["logit_score"])
            best_text = winner["text"]
        else:
            best_text = ""
        predicted_answers.append({"id": example_id, "prediction_text": best_text})

    theoretical_answers = [
        {"id": item["id"], "answers": item["answers"]} for item in examples
    ]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

In [None]:
# Score the predicted logits against the reference answers of the test split.
compute_metrics(
    predictions["start_logits"],
    predictions["end_logits"],
    validation_dataset,
    squad["test"],
)

In [25]:
# Inference
question = "what is my name?"
context = "I'm from Sri lanka. My name is Buddhi."

from transformers import pipeline

# NOTE(review): model=my_model_name presumably resolves to the local output
# directory written during training; confirm.
question_answerer = pipeline("question-answering", model=my_model_name)
question_answerer(question=question, context=context)

All model checkpoint layers were used when initializing TFAlbertForQuestionAnswering.

All the layers of TFAlbertForQuestionAnswering were initialized from the model checkpoint at albert-finetuned-squad.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFAlbertForQuestionAnswering for predictions without further training.


{'score': 0.8118010759353638, 'start': 31, 'end': 38, 'answer': 'Buddhi.'}

In [None]:
# Manual Inference
from transformers import AutoTokenizer

# Reuses the `tokenizer` object created earlier; return_tensors="tf" requests
# TensorFlow tensors.
inputs = tokenizer(question, context, return_tensors="tf")

from transformers import TFAutoModelForQuestionAnswering

# NOTE: rebinds `model` to a checkpoint loaded from the hard-coded "buddhilive/"
# namespace plus my_model_name.
model = TFAutoModelForQuestionAnswering.from_pretrained("buddhilive/" + my_model_name)
outputs = model(**inputs)

# Position of the highest start / end logit for the first sequence in the batch.
# `tf` comes from the import in the compile cell.
answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])
answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])

# The slice includes the end token, hence the + 1.
predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens)

In [27]:
# Report the elapsed training time as H:MM:SS.
hours, leftover_seconds = divmod(train_time, 3600)
minutes, seconds = divmod(leftover_seconds, 60)
print("%d:%02d:%02d" % (hours, minutes, seconds))

0:33:49
