In [1]:
# cell-1  

import os
os.environ["CUDA_VISIBLE_DEVICES"]="1" 

import numpy as np
import tensorflow as tf
import pandas as pd
import pyarabic.araby as araby
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy



pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 1000)


print('done')

2023-10-24 08:17:48.070992: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-24 08:17:48.094515: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


done


In [2]:
from datasets import load_dataset

raw_datasets = load_dataset("arcd")
# raw_datasets = load_dataset("squad")
# raw_datasets = load_dataset("xtreme", 'XQuAD.ar')
# raw_datasets = load_dataset("xtreme", 'MLQA.ar.ar')

raw_datasets

Found cached dataset arcd (/home/ffq/.cache/huggingface/datasets/arcd/plain_text/1.0.0/6a55997214ab7c60a599e029849f8a3b7864c9d402c89242c14caafc1bde6692)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 693
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 702
    })
})

In [3]:
#check if the 'answer' col contains more than 1 answer

raw_datasets["train"].filter(lambda x: len(x["answers"]["text"]) != 1)

Loading cached processed dataset at /home/ffq/.cache/huggingface/datasets/arcd/plain_text/1.0.0/6a55997214ab7c60a599e029849f8a3b7864c9d402c89242c14caafc1bde6692/cache-d919d6ea2e18d236.arrow


Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 0
})

In [4]:
#cell-3
#loading the tokenizer and the model

from transformers import AutoTokenizer, DefaultDataCollator, AutoModelForQuestionAnswering


# url = 'faisalq/bert-base-arabic-wordpiece'
url = 'faisalq/bert-base-arabic-senpiece'
# url = 'faisalq/bert-base-arabic-bbpe'

tokenizer = AutoTokenizer.from_pretrained(url)
model = AutoModelForQuestionAnswering.from_pretrained(url).to('cuda')                                                                                                      

# dc = DefaultDataCollator()

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at faisalq/bert-base-arabic-senpiece and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
max_length = 128
stride = 50


def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs




In [6]:
train_dataset = raw_datasets["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)
len(raw_datasets["train"]), len(train_dataset)

Loading cached processed dataset at /home/ffq/.cache/huggingface/datasets/arcd/plain_text/1.0.0/6a55997214ab7c60a599e029849f8a3b7864c9d402c89242c14caafc1bde6692/cache-88c060f5d94dcb89.arrow


(693, 1306)

In [7]:
validation_dataset = raw_datasets["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
)
len(raw_datasets["validation"]), len(validation_dataset)

Loading cached processed dataset at /home/ffq/.cache/huggingface/datasets/arcd/plain_text/1.0.0/6a55997214ab7c60a599e029849f8a3b7864c9d402c89242c14caafc1bde6692/cache-abf45fa60c13204a.arrow


(702, 1418)

In [8]:
import evaluate
import collections

metric = evaluate.load("squad")

In [9]:
from tqdm.auto import tqdm
n_best = 20
max_answer_length = 50


def compute_metrics(start_logits, end_logits, features, examples):
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

In [10]:
from transformers import Trainer, TrainingArguments


epochs = 120
save_steps = 10000 #save checkpoint every 10000 steps
batch_size = 256

training_args = TrainingArguments(
    output_dir = 'bert_bbpe5/',
    overwrite_output_dir=True,
    num_train_epochs = epochs,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    save_steps = save_steps,
    save_total_limit = 2, 
    fp16=True,
    learning_rate = 5e-5,  # 5e-5 is the default
    # weight_decay=0.01,
    logging_steps = 10, #50_000
    evaluation_strategy = 'epoch',
    # evaluate_during_training = True,
    eval_steps = 10
    
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,    
    tokenizer=tokenizer
)


# trainer.train(resume_from_checkpoint=True)
trainer.train()

You're using a AlbertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,No log
2,3.562100,No log
3,3.562100,No log
4,2.684800,No log
5,2.310000,No log
6,2.310000,No log
7,1.858600,No log
8,1.858600,No log
9,1.388200,No log
10,0.991500,No log


TrainOutput(global_step=720, training_loss=0.24284978954561262, metrics={'train_runtime': 250.8656, 'train_samples_per_second': 624.717, 'train_steps_per_second': 2.87, 'total_flos': 1.023760692891648e+16, 'train_loss': 0.24284978954561262, 'epoch': 120.0})

In [11]:
predictions, _, _ = trainer.predict(validation_dataset)
start_logits, end_logits = predictions
compute_metrics(start_logits, end_logits, validation_dataset, raw_datasets["validation"])

  0%|          | 0/702 [00:00<?, ?it/s]

{'exact_match': 20.797720797720796, 'f1': 47.664214491147604}

In [12]:
# {'exact_match': 20.512820512820515, 'f1': 46.58983856203568} 80e

In [13]:
# {'exact_match': 18.803418803418804, 'f1': 45.95516584266742} 100e

In [14]:
# {'exact_match': 19.373219373219374, 'f1': 46.5498056417331} 120e

In [15]:
# {'exact_match': 20.655270655270655, 'f1': 47.182808442234005} 140e