In [1]:
# cell-1  

import os
os.environ["CUDA_VISIBLE_DEVICES"]="1" 

import numpy as np
import tensorflow as tf
import pandas as pd
import pyarabic.araby as araby
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy



pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 1000)


print('done')

2023-10-25 19:16:28.932576: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-25 19:16:28.955659: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


done


In [2]:
from datasets import load_dataset

# raw_datasets = load_dataset("arcd")
# raw_datasets = load_dataset("squad")
# raw_datasets = load_dataset("xtreme", 'XQuAD.ar')
# raw_datasets = load_dataset("xtreme", 'MLQA.ar.ar')
raw_datasets = load_dataset("xtreme", 'XQuAD.ar', split='validation')

raw_datasets

Found cached dataset xtreme (/home/ffq/.cache/huggingface/datasets/xtreme/XQuAD.ar/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4)


Dataset({
    features: ['id', 'context', 'question', 'answers'],
    num_rows: 1190
})

In [3]:

#only select records with Arabic language (specified in the id field)
# raw_datasets = raw_datasets.filter(lambda example: example['id'].startswith('ara'))
raw_datasets = raw_datasets.train_test_split(test_size=0.2, seed=42)
raw_datasets

Loading cached split indices for dataset at /home/ffq/.cache/huggingface/datasets/xtreme/XQuAD.ar/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-4b14d912f96bb64a.arrow and /home/ffq/.cache/huggingface/datasets/xtreme/XQuAD.ar/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-67c125626f38cdb0.arrow


DatasetDict({
    train: Dataset({
        features: ['id', 'context', 'question', 'answers'],
        num_rows: 952
    })
    test: Dataset({
        features: ['id', 'context', 'question', 'answers'],
        num_rows: 238
    })
})

In [4]:
#check if the 'answer' col contains more than 1 answer

raw_datasets["train"].filter(lambda x: len(x["answers"]["text"]) != 1)

Loading cached processed dataset at /home/ffq/.cache/huggingface/datasets/xtreme/XQuAD.ar/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-d30df25de8357d3e.arrow


Dataset({
    features: ['id', 'context', 'question', 'answers'],
    num_rows: 0
})

In [5]:
#cell-3
#loading the tokenizer and the model

from transformers import AutoTokenizer, DefaultDataCollator, AutoModelForQuestionAnswering

# url = 'bert-base-uncased'
# url = 'UBC-NLP/MARBERTv2'
# url = 'faisalq/bert-base-arabic-wordpiece'
url = 'faisalq/bert-base-arabic-senpiece'
# url = 'faisalq/bert-base-arabic-bbpe'

tokenizer = AutoTokenizer.from_pretrained(url)
model = AutoModelForQuestionAnswering.from_pretrained(url).to('cuda')                                                                                                      

# dc = DefaultDataCollator()

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at faisalq/bert-base-arabic-senpiece and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
max_length = 128
stride = 50


def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs




In [7]:
train_dataset = raw_datasets["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)
len(raw_datasets["train"]), len(train_dataset)

Loading cached processed dataset at /home/ffq/.cache/huggingface/datasets/xtreme/XQuAD.ar/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-56cd451ab9c58c75.arrow


(952, 2451)

In [8]:
validation_dataset = raw_datasets["test"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["test"].column_names,
)
len(raw_datasets["test"]), len(validation_dataset)

Loading cached processed dataset at /home/ffq/.cache/huggingface/datasets/xtreme/XQuAD.ar/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-95e45aa406955e04.arrow


(238, 620)

In [9]:
import evaluate
import collections

metric = evaluate.load("squad")

In [10]:
from tqdm.auto import tqdm
n_best = 20
max_answer_length = 50


def compute_metrics(start_logits, end_logits, features, examples):
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

In [11]:
from transformers import Trainer, TrainingArguments


epochs = 40
save_steps = 10000 #save checkpoint every 10000 steps
batch_size = 256

training_args = TrainingArguments(
    output_dir = 'bert_wb5/',
    overwrite_output_dir=True,
    num_train_epochs = epochs,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    save_steps = save_steps,
    save_total_limit = 2, 
    fp16=True,
    learning_rate = 5e-5,  # 5e-5 is the default
    # weight_decay=0.01,
    logging_steps = 10, #50_000
    evaluation_strategy = 'epoch',
    # evaluate_during_training = True,
    eval_steps = 10
    
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,    
    tokenizer=tokenizer
)


# trainer.train(resume_from_checkpoint=True)
trainer.train()

You're using a AlbertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,3.3024,No log
2,2.4646,No log
3,2.214,No log
4,1.93,No log
5,1.6544,No log
6,1.3683,No log
7,1.1555,No log
8,0.9381,No log
9,0.7418,No log
10,0.6253,No log


TrainOutput(global_step=400, training_loss=0.5108076468110084, metrics={'train_runtime': 128.1868, 'train_samples_per_second': 764.822, 'train_steps_per_second': 3.12, 'total_flos': 6404383507599360.0, 'train_loss': 0.5108076468110084, 'epoch': 40.0})

In [12]:
predictions, _, _ = trainer.predict(validation_dataset)
start_logits, end_logits = predictions
compute_metrics(start_logits, end_logits, validation_dataset, raw_datasets["test"])

  0%|          | 0/238 [00:00<?, ?it/s]

{'exact_match': 19.327731092436974, 'f1': 28.23141635289973}

In [13]:
# {'exact_match': 11.344537815126051, 'f1': 17.084801015252573} 10e

In [14]:
# {'exact_match': 18.48739495798319, 'f1': 26.329945160401206} 20e

In [15]:
# {'exact_match': 15.966386554621849, 'f1': 26.44316786458218} 30e