# <font color='darkorange'> **Fine-tuning monolingual base models for extractive Question Answering in Catalan, Basque and French** </font>

In this notebook we will fine-tune monolingual base models for extractive Question Answering in Catalan, Basque and French using Hugging Face transformers.

In [None]:
# Install Hugging Face transformers and datasets.
# NOTE(review): no versions are pinned. Later cells rely on `datasets.load_metric`,
# on script-based `load_dataset` and on the TensorFlow model classes of transformers;
# newer releases may no longer provide these, so consider pinning known-good versions.
!pip install transformers datasets huggingface_hub

In [None]:
import tensorflow as tf

# Let TensorFlow allocate GPU memory on demand instead of reserving it all up front.
# The setting is applied to every visible GPU (it has to be consistent across devices),
# and the loop is simply skipped when no GPU is available, so the cell also runs on CPU.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, True)

In [None]:
import transformers
# Before continuing, make sure the installed version of transformers is at least 4.16.
# NOTE(review): later cells call `model.prepare_tf_dataset`; confirm which minimum version provides it.
print(transformers.__version__)

In [None]:
language = "ca" # Language code used by the following cells to pick the model and dataset: "eu" (Basque), "ca" (Catalan) or "fr" (French). Any other value is handled like "fr".

In [None]:
# Checkpoint of the monolingual base model for the selected language.
# Any code other than "ca" or "eu" falls back to the French model.
model_checkpoint = {
  "ca": "projecte-aina/roberta-base-ca-v2",
  "eu": "ixa-ehu/berteus-base-cased",
}.get(language, "camembert-base")

## **1. Loading the dataset**

In [None]:
# Use datasets library to load the dataset and get the metric we need for evaluation.
from datasets import load_dataset, load_metric

In [None]:
# Connect to Google Drive to load the dataset if needed.
# NOTE(review): `google.colab` is presumably only importable inside Google Colab; skip this cell elsewhere.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from datasets import load_dataset, concatenate_datasets, DatasetDict
from sklearn.model_selection import train_test_split

# Directory holding the dataset files and the loading script for the selected language.
if language == "ca":
  path = "/data/ca_viquiquad" # Change if needed.
  script = path+"/ca_script.py"
elif language == "eu":
  path = "/data/eu_elkarhizketak" # Change if needed.
  script = path+"/eu_script.py"
else:
  path = "/data/fr_fquad" # Change if needed.
  script = path+"/fr_script.py"

# Train and dev files are used for every language; a test file only for Catalan and Basque.
dataset = {"train": path+"/train.json",
           "dev": path+"/dev.json"}
if language in ("ca", "eu"):
  dataset["test"] = path+"/test.json"

raw_dataset = load_dataset(script, data_files=dataset)

if "test" not in dataset:
  # No test file is provided for French, so we build our own splits from the train and dev files.
  # Combine all available QA pairs.
  dataset_concatenated = concatenate_datasets([raw_dataset['train'], raw_dataset['validation']])

  # Keep 80 % for training and hold out the remaining 20 %.
  train_testvalidation = dataset_concatenated.train_test_split(test_size=0.20, seed=42)
  # Split the held-out 20 % in half: 10 % of the total for validation and 10 % for test.
  test_validation = train_testvalidation['test'].train_test_split(test_size=0.5, seed=42)
  # Gather all the sections into a single DatasetDict.
  raw_dataset = DatasetDict({
      'train': train_testvalidation['train'],
      'test': test_validation['test'],
      'validation': test_validation['train']})

In [None]:
# Show dataset format (the bare expression makes the notebook display `raw_dataset`).
raw_dataset

In [None]:
# Show a QA sample: the first example of the training split.
print(raw_dataset["train"][0])

## **2. Preprocessing the dataset**



In [None]:
# Instantiate the tokenizer with the AutoTokenizer.from_pretrained method
#   to get the tokenizer corresponding to the model architecture and
#   to download the vocabulary used when pretraining it.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# The tokenizer will tokenize the inputs and convert the tokens to their corresponding IDs in the pretrained vocabulary.

In [None]:
max_length = 384  # Maximum length, in tokens, of a feature (question and context).
# Examples longer than max_length will be split into several input features.
# The context of each of these features will slightly overlap the context of the previous feature
# (in case the answer lies at the point where the context is split).
doc_stride = 128  # Allowed overlap, in tokens, between consecutive features when splitting is performed.

In [None]:
def prepare_train_features(examples):
    """Tokenize a batch of QA examples and label each feature with its answer span.

    Long contexts are split into several overlapping features (`max_length`,
    `doc_stride`). For every feature, `start_positions` / `end_positions` hold
    the token indices of the first answer, or (0, 0) when the answer is not
    fully contained in that feature or the example has no answer at all.

    Args:
        examples: batch with "question", "context" and "answers" columns, where
            each entry of "answers" is a sequence of items exposing
            "answer_start" and "text".

    Returns:
        The tokenizer output extended with "start_positions" and "end_positions".
    """
    # Tokenize examples with truncation and padding, but keep the overflows using a stride.
    tokenized_examples = tokenizer(examples["question"],
                                   examples["context"],
                                   truncation="only_second", # Only truncate context (not question).
                                   max_length=max_length,
                                   stride=doc_stride,
                                   return_overflowing_tokens=True,
                                   return_offsets_mapping=True, # Map to find start and end positions of the answers in the tokens.
                                   padding="max_length")
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") # Map from a feature to its corresponding example.
    offset_mapping = tokenized_examples.pop("offset_mapping") # Map from token to character position in the original context.

    # Label examples:
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        # Grab the sequence corresponding to that example (to know what is the context and what is the question):
        #   returns None for special tokens,
        #   and 0 or 1 depending on whether the corresponding token comes from the first sentence (question) or the second (context).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # Index of the example containing this span of text (as one example can give several spans).
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]

        # An example without any answer cannot be located in the context: label it (0, 0),
        # the same label used below for answers that fall outside the current span.
        if len(answers) == 0:
            tokenized_examples["start_positions"].append(0)
            tokenized_examples["end_positions"].append(0)
            continue

        # Only the first answer is used as the training target.
        start_char = answers[0]["answer_start"]
        end_char = start_char + len(answers[0]["text"])

        # Start token index of the current span in the text.
        token_start_index = 0
        while sequence_ids[token_start_index] != 1:
            token_start_index += 1

        # End token index of the current span in the text.
        token_end_index = len(input_ids) - 1
        while sequence_ids[token_end_index] != 1:
            token_end_index -= 1

        # Detect if the answer is out of the span. If so, the label is (0,0).
        if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
            tokenized_examples["start_positions"].append(0)
            tokenized_examples["end_positions"].append(0)
        # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
        else:
            # Advance past the last token starting at or before the answer start, then step back one.
            while (token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char):
                token_start_index += 1
            tokenized_examples["start_positions"].append(token_start_index - 1)
            # Retreat past the first token ending at or after the answer end, then step forward one.
            while offsets[token_end_index][1] >= end_char:
                token_end_index -= 1
            tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [None]:
# Apply function to every split of the dataset.
# The original columns are removed because splitting long contexts can yield
# several features per example, so the outputs no longer line up row by row.
tokenized_dataset = raw_dataset.map(prepare_train_features, 
                                    batched=True, 
                                    remove_columns=raw_dataset["train"].column_names)

## **3. Fine-tuning the model**

In [None]:
# Download pretrained model.
# Since our task is QA, we use the TFAutoModelForQuestionAnswering class.
from transformers import TFAutoModelForQuestionAnswering
# Like with the tokenizer, the from_pretrained method will download and cache the model.
# from_pt=True is passed so the TensorFlow model is loaded from a PyTorch checkpoint.
model = TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint, from_pt=True)

In [None]:
# Set hyperparameters.
batch_size = 16  # Used for the training, validation and test tf.data pipelines.
learning_rate = 5e-5 
num_train_epochs = 5

In [None]:
from transformers import create_optimizer
import tensorflow as tf
import gc

def training_model(training_example, tokenized_dataset, model, reset_to_pretrained=True):
    """Fine-tune `model` on a fraction of the training split and save the result.

    Args:
        training_example: fraction of the training split to use. Values below 1
            sub-sample the split (seed 42); any other value uses the whole split.
        tokenized_dataset: tokenized dataset with "train" and "validation" splits.
        model: the Keras question-answering model to fine-tune. It is modified
            in place and keeps the fine-tuned weights when the function returns.
        reset_to_pretrained: when True (default), the weights of `model` are
            first reset to those of `model_checkpoint`, so every run starts from
            the same pretrained model. Pass False to continue training from the
            current weights of `model`.

    The fine-tuned model is saved to "/results/{language}/{training_example}_model".
    """
    if reset_to_pretrained:
        # `fit` continues from the current weights, so without this reset each run
        # would start from the model fine-tuned in the previous call and the
        # results obtained with different fractions would not be comparable.
        pretrained_model = TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint, from_pt=True)
        model.set_weights(pretrained_model.get_weights())
        del pretrained_model

    if training_example < 1:
        num_examples = len(tokenized_dataset["train"])
        # Keep at least one example so the split below cannot be asked for an empty training set.
        train_size = max(1, int(training_example * num_examples))
        test_size = num_examples - train_size
        downsampled_dataset = tokenized_dataset["train"].train_test_split(train_size=train_size, test_size=test_size, seed=42)
        train_set = model.prepare_tf_dataset(downsampled_dataset["train"],
                                             shuffle=True,
                                             batch_size=batch_size)
    else:
        train_set = model.prepare_tf_dataset(tokenized_dataset["train"],
                                             shuffle=True,
                                             batch_size=batch_size)

    dev_set = model.prepare_tf_dataset(tokenized_dataset["validation"],
                                       shuffle=False,
                                       batch_size=batch_size)

    # Create optimizer and specify loss function. # AdamW optimizer.
    total_train_steps = len(train_set)*num_train_epochs # Compute total number of training steps.
    optimizer, schedule = create_optimizer(init_lr=learning_rate, num_warmup_steps=0, num_train_steps=total_train_steps)

    # Compile the model. No loss is passed to `compile`.
    model.compile(optimizer=optimizer,
                  jit_compile=True,
                  metrics=["accuracy"])

    # Fine-tune the model.
    if training_example == 1:
        print(f"Training with a {int(training_example*100)}% of the examples.")
    else:
        print(f"Training with {int(training_example*100)}% of the examples.")
    model.fit(train_set, validation_data=dev_set, epochs=num_train_epochs)

    model.save_pretrained(f"/results/{language}/{training_example}_model") # Change if needed.
    tf.keras.backend.clear_session()

    # Clean GPU memory
    # Eliminate variables that we are going to define again to avoid overloading the GPU memory
    del train_set, dev_set, optimizer, schedule
    gc.collect()

In [None]:
# Fine-tune with 1 %, 5 % and 10 % of the training examples; each call saves its own model.
training_examples = [0.01, 0.05, 0.1]
for training_ex in training_examples:
  training_model(training_ex, tokenized_dataset, model)

In [None]:
# Fine-tune with 25 %, 50 % and 75 % of the training examples; each call saves its own model.
training_examples = [0.25, 0.5, 0.75]
for training_ex in training_examples:
  training_model(training_ex, tokenized_dataset, model)

In [None]:
# Fine-tune with the whole training split (a value of 1 disables sub-sampling).
training_ex = 1
training_model(training_ex, tokenized_dataset, model)

## **4. Evaluation**

#### **4.1. Preprocessing the test set**

Candidate answers are ranked by the score obtained by adding their start and end logits. The best indices in the start and end logits are selected (as many as given by the hyperparameter `n_best_size`). Every resulting (start, end) pair is then checked for validity, and the valid candidates are sorted by score to keep the best one. 

To check if a given span is inside the context (and not the question) and to get back the text inside, in the test features we keep (1) the ID of the example that generated the feature (as one example can generate several features), and (2) the offset mapping (to map from token indices to character positions in the context). For this reason, the test set is preprocessed with the function `prepare_test_features`.


In [None]:
def prepare_test_features(examples):
    """Tokenize a batch of QA examples for inference.

    Besides the tokenizer output, every feature gets an "example_id" (the id
    of the example it was cut from) and an "offset_mapping" in which the
    entries of tokens outside the context are replaced by None.
    """
    # Same tokenization as for training: truncate only the context, pad to
    # max_length, and keep the overflowing parts as extra features with a stride.
    encoded = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    # For each feature, the index of the example it originates from.
    feature_to_example = encoded.pop("overflow_to_sample_mapping")

    encoded["example_id"] = []
    context_sequence = 1  # sequence_ids value of the tokens taken from the context.
    num_features = len(encoded["input_ids"])

    for feature_idx in range(num_features):
        seq_ids = encoded.sequence_ids(feature_idx)

        # Remember which example produced this feature.
        source_example = feature_to_example[feature_idx]
        encoded["example_id"].append(examples["id"][source_example])

        # Blank out the offsets of every token that is not part of the context,
        # so that a None offset later identifies a non-context position.
        masked_offsets = []
        for token_idx, span in enumerate(encoded["offset_mapping"][feature_idx]):
            if seq_ids[token_idx] == context_sequence:
                masked_offsets.append(span)
            else:
                masked_offsets.append(None)
        encoded["offset_mapping"][feature_idx] = masked_offsets

    return encoded

In [None]:
# Apply function to test set. 
# The original columns are dropped; the link back to each example is kept in "example_id".
test_features = raw_dataset["test"].map(prepare_test_features,
                                        batched=True,
                                        remove_columns=raw_dataset["test"].column_names)

In [None]:
# Convert test set into a tf.data.Dataset.
# shuffle=False because the post-processing step pairs predictions with `test_features` by position.
test_set = model.prepare_tf_dataset(test_features,
                                    shuffle=False,
                                    batch_size=batch_size)

#### **4.2. Making predictions and processing them**

In [None]:
# Predictions for all features; the result is later indexed with "start_logits" and "end_logits".
raw_predictions = model.predict(test_set)

In [None]:
import numpy as np
import collections
from tqdm.auto import tqdm

def postprocess_qa_predictions(examples,
                               features,
                               all_start_logits,
                               all_end_logits,
                               n_best_size=20, # Best indices in start and end logits.
                               max_answer_length=30, # Eliminate longer answers.
                               ):
    """Turn per-feature start/end logits into one predicted answer per example.

    Args:
        examples: the original examples, with "id" and "context" fields.
        features: the tokenized features, with "example_id", "offset_mapping"
            and "input_ids" fields, in the same order as the logits.
        all_start_logits: start logits, indexed by feature position.
        all_end_logits: end logits, indexed by feature position.
        n_best_size: number of top start and top end indices combined per feature.
        max_answer_length: maximum answer length, in tokens.

    Returns:
        An OrderedDict mapping each example id to the predicted answer text, or
        to "" when the null (CLS) score is at least as high as the best span score.
    """
    # Build a map example to its corresponding features.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    predictions = collections.OrderedDict()

    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    # Loop over all examples.
    for example_index, example in enumerate(tqdm(examples)):
        feature_indices = features_per_example[example_index] # Indices of the features associated to the current example.
        min_null_score = None
        valid_answers = []
        context = example["context"]

        # Looping through all the features associated to the current example.
        for feature_index in feature_indices:
            # Grab the predictions of the model for this feature.
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # To be able to map the positions in our logits to spans of text in the original context.
            offset_mapping = features[feature_index]["offset_mapping"]

            # Update minimum null prediction: keep the LOWEST null score over the
            # features of the example, so that "no answer" is only predicted when
            # every feature scores the null span above the best candidate.
            cls_index = features[feature_index]["input_ids"].index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or min_null_score > feature_null_score:
                min_null_score = feature_null_score

            # Go through all possibilities for the `n_best_size` greater start and end logits.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Out-of-scope answers are not considered,
                    #   either because the indices are out of bounds
                    #   or because they correspond to part of the input_ids that are not in the context.
                    if (start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or not offset_mapping[start_index]
                        or not offset_mapping[end_index]):
                        continue
                    # Answers with a length that is either < 0 or > max_answer_length are not considered.
                    if (end_index < start_index
                        or end_index - start_index + 1 > max_answer_length):
                        continue
                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {"score": start_logits[start_index] + end_logits[end_index],
                         "text": context[start_char:end_char]})

        if valid_answers:
            # Highest-scoring candidate (the first one in case of ties).
            best_answer = max(valid_answers, key=lambda x: x["score"])
        else:
            # Fake prediction to avoid failure if there is not a single non-null prediction.
            best_answer = {"text": "", "score": 0.0}

        # Select final answer: the best one or the null answer.
        # An example without any feature has no null score; it gets the empty answer.
        if min_null_score is not None and best_answer["score"] > min_null_score:
            answer = best_answer["text"]
        else:
            answer = ""
        predictions[example["id"]] = answer

    return predictions

In [None]:
# Apply post-processing function to raw predictions. 
# The result maps each test example id to its predicted answer text.
final_predictions = postprocess_qa_predictions(raw_dataset["test"],
                                               test_features,
                                               raw_predictions["start_logits"],
                                               raw_predictions["end_logits"])

#### **4.3. Computing the metrics**

In [None]:
# Load metric from the datasets library.
# NOTE(review): `load_metric` may be deprecated or missing in recent `datasets` releases — confirm against the installed version.
metric = load_metric("squad_v2")

In [None]:
# Format predictions and labels as a list of dictionaries.
# Every prediction is given a fixed no_answer_probability of 0.0.
formatted_predictions = [{"id": k, "prediction_text": v, "no_answer_probability": 0.0}
                         for k, v in final_predictions.items()]
references = [{"id": ex["id"], "answers": ex["answers"]} 
              for ex in raw_dataset["test"]]

In [None]:
# Compute the metric for the model currently held in `model` (bare expression, so the result is displayed).
metric.compute(predictions=formatted_predictions, 
               references=references)

#### **4.4 Computing the metrics for all models**

In [None]:
from transformers import AutoModel
import pandas as pd

list_pd = []
training_examples = [0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 1]

# The gold answers do not depend on the evaluated model, so they are built only once.
references = [{"id": ex["id"], "answers": ex["answers"]}
              for ex in raw_dataset["test"]]

# Evaluate the model saved for each fraction of training examples.
for training_example in training_examples:
  # Load the saved weights into `model` in place (the return value of load_weights is not a model).
  model.load_weights(f"/results/{language}/{training_example}_model/tf_model.h5") # Change if needed.
  raw_predictions = model.predict(test_set)
  final_predictions = postprocess_qa_predictions(raw_dataset["test"],
                                                 test_features,
                                                 raw_predictions["start_logits"],
                                                 raw_predictions["end_logits"])
  formatted_predictions = [{"id": k, "prediction_text": v, "no_answer_probability": 0.0}
                           for k, v in final_predictions.items()]
  results = metric.compute(predictions=formatted_predictions,
                           references=references)
  # One single-column frame per fraction: rows are metric names, the column is the fraction.
  df = pd.DataFrame.from_dict(results, orient='index', columns=[f'{training_example}'])
  list_pd.append(df)

In [None]:
# Save the results.
# Put the per-fraction columns side by side: one row per metric, one column per fraction.
merged_df = pd.concat(list_pd, axis=1)
csv_path = f"/results/{language}/{language}_results.csv" # Change if needed.
merged_df.to_csv(csv_path, index=True)
display(merged_df)