In [1]:
import pandas as pd
import tqdm
import json
import random
import torch
import os
from collections import Counter 
import datasets
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import StepLR
from torch.optim import Adam
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
from transformers import AutoModelForQuestionAnswering, AutoTokenizer,Trainer,TrainingArguments
from sklearn.metrics import f1_score
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Tokenization and training hyperparameters shared by the cells below.
max_length = 256 # The maximum length of a feature (question and context)
doc_stride = 64 # The authorized overlap between two parts of the context when it has to be split.
lr = 3e-5 # Learning rate handed to TrainingArguments inside train_QA.

epoch = 30 # Number of training epochs (num_train_epochs in train_QA).
batch_size = 4 # Per-device batch size, used for both training and evaluation in train_QA.
# Location passed to from_pretrained() for both the tokenizer and the QA model.
# NOTE(review): absolute, machine-specific path. The recorded output of train_QA prints a
# different value for `model`, so this cell was presumably edited after that run — confirm.
model = "C:/Users/LEGION/OneDrive - University Of Jordan/Dalalat/QA/QA_Model/"

In [3]:
def f1_score(prediction_tokensIds, ground_truth_tokensIds):
    """Token-level F1 between a predicted and a gold token sequence.

    The overlap is counted as a multiset intersection, so a token that occurs
    twice in one sequence and once in the other contributes one match.

    Note: defining this function rebinds the name ``f1_score`` that the import
    cell also pulls in from ``sklearn.metrics``; after this cell runs, the name
    refers to this implementation.

    Args:
        prediction_tokensIds: sequence of hashable tokens for the prediction.
        ground_truth_tokensIds: sequence of hashable tokens for the gold answer.

    Returns:
        float: the harmonic mean of precision and recall, or ``0.0`` when the
        two sequences share no token (which includes either one being empty).
    """
    common = Counter(prediction_tokensIds) & Counter(ground_truth_tokensIds)
    num_same = sum(common.values())
    if num_same == 0:
        # Return a float here as well so every code path yields the same type.
        return 0.0
    # num_same > 0 guarantees both sequences are non-empty, so no division by zero.
    precision = num_same / len(prediction_tokensIds)
    recall = num_same / len(ground_truth_tokensIds)
    return (2 * precision * recall) / (precision + recall)
def pAP_score(mScores, ranks, gold_spans_set):
    """Partial average precision (pAP) over a ranked list of predictions.

    Walks the (matching score, rank) pairs in order. Every prediction with a
    non-zero matching score adds its score to a running total of partial hits,
    and the running total divided by that prediction's rank is added to the
    sum. The sum is finally divided by the number of gold spans.

    Args:
        mScores: matching score of each prediction, aligned with ``ranks``.
        ranks: rank of each prediction (used as a divisor).
        gold_spans_set: collection of gold spans; only its length is used.

    Returns:
        The accumulated precision divided by ``len(gold_spans_set)``.
    """
    precision_sum = 0.0
    hits_so_far = 0.0
    # Predictions whose matching score is zero contribute nothing.
    scored_pairs = (
        (match_score, position)
        for match_score, position in zip(mScores, ranks)
        if match_score != 0
    )
    for match_score, position in scored_pairs:
        hits_so_far = hits_so_far + match_score
        precision_sum += hits_so_far / position
    gold_count = len(gold_spans_set)
    return precision_sum / gold_count  # pAP

In [3]:
# Use the first CUDA GPU when torch reports one as available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Tokenizer and QA model are both loaded from the same `model` location set in the
# configuration cell; the model is moved to `device` right after loading.
ar_tokenizer = AutoTokenizer.from_pretrained(model)
ar_model = AutoModelForQuestionAnswering.from_pretrained(model).to(device)

In [20]:
def prepare_train_features(examples):
    """Tokenize a batch of question/context pairs and attach answer-span labels.

    Each (question, context) pair is tokenized with the module-level
    ``ar_tokenizer``. Only the second sequence (the context) is truncated, and
    overflowing tokens are returned, so one example can produce several
    features that overlap by ``doc_stride`` tokens. No padding is requested
    in this call.

    For every produced feature a ``start_positions`` / ``end_positions`` pair
    of token indices is appended. The CLS token index is used for both when
    the example has no answer, or when the answer does not lie completely
    inside the context part of that feature. Only the first answer
    (``answer_start[0]`` / ``text[0]``) of an example is used.

    Args:
        examples: batched mapping that is indexed with ``"question"``,
            ``"context"`` and ``"answers"``; each ``answers`` entry is indexed
            with ``"answer_start"`` and ``"text"``.

    Returns:
        The tokenizer output with ``overflow_to_sample_mapping`` and
        ``offset_mapping`` removed and ``start_positions`` /
        ``end_positions`` added.
    """
    global ar_tokenizer
    tokenized_examples = ar_tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True)
    
    # Maps each produced feature back to the index of the example it came from.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # Per-token (start_char, end_char) offsets, needed to convert character spans to token indices.
    offset_mapping = tokenized_examples.pop("offset_mapping")
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []
    
    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(ar_tokenizer.cls_token_id)
        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)
        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text (first answer only).
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])
            # Start token index of the current span in the text:
            # advance to the first token whose sequence id is 1 (the second sequence, i.e. the context).
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1
            # End token index of the current span in the text:
            # walk back from the last token to the last one whose sequence id is 1.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1
            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                # The loop overshoots by one token, hence the -1.
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                # Same overshoot in the other direction, hence the +1.
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [15]:
def create_dataset(train_passage_question_objects):
    """Flatten passage/question records into a ``datasets.Dataset``.

    One row is produced per (record, answer) pair, so a record with several
    answers yields several rows and a record whose ``"answers"`` list is empty
    yields none. Each row carries ``id``, ``context``, ``question`` and an
    ``answers`` mapping with single-element ``answer_start`` / ``text`` lists.

    Args:
        train_passage_question_objects: iterable of mappings indexed with
            ``"pq_id"``, ``"passage"``, ``"question"`` and ``"answers"``;
            every answer is indexed with ``"start_char"`` and ``"text"``.

    Returns:
        The dataset built from the collected rows.
    """
    rows = []
    for entry in train_passage_question_objects:
        for answer in entry["answers"]:
            # Single-element lists: the layout prepare_train_features indexes with [0].
            answer_field = {'answer_start': [answer["start_char"]], 'text': [answer["text"]]}
            rows.append({
                "id": entry["pq_id"],
                "context": entry["passage"],
                "question": entry["question"],
                "answers": answer_field,
            })

    frame = pd.DataFrame(rows)
    return datasets.Dataset.from_dict(frame)

In [10]:
def load_jsonl(input_path) -> list:
    """
    Read list of objects from a JSON lines file.

    Blank or whitespace-only lines are skipped, so a stray empty line (for
    example at the end of the file) no longer aborts the load with a JSON
    decoding error.

    Args:
        input_path: path of the UTF-8 encoded JSON lines file.

    Returns:
        list: one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            # strip() removes the line terminator and surrounding whitespace. The previous
            # rstrip('\n|\r') also treated '|' as a character to strip, which was unintended.
            line = line.strip()
            if not line:
                continue
            data.append(json.loads(line))
    print('Loaded {} records from {}'.format(len(data), input_path))
    return data

def dump_jsonl(data, output_path, append=False):
    """
    Serialize each item of ``data`` as one JSON line in ``output_path``.

    Args:
        data: sized iterable of JSON-serializable objects.
        output_path: destination file, written as UTF-8.
        append: when truthy the file is opened in ``'a+'`` mode and the lines
            are added at the end; otherwise the file is overwritten.

    Non-ASCII characters are written as-is (``ensure_ascii=False``). A summary
    line with the record count is printed once writing has finished.
    """
    file_mode = {True: 'a+', False: 'w'}[bool(append)]
    with open(output_path, file_mode, encoding='utf-8') as sink:
        for record in data:
            sink.write(json.dumps(record, ensure_ascii=False) + '\n')
    print('Wrote {} records to {}'.format(len(data), output_path))

class Answer():
    """A single answer span: its text and the character offset where it starts."""

    def __init__(self,dictionary) -> None:
        # Both keys are required; a missing one raises KeyError.
        self.text, self.start_char = dictionary["text"], dictionary["start_char"]

    def to_dict(self) -> dict:
        """Return the answer as a plain ``{"text", "start_char"}`` dictionary."""
        return dict(text=self.text, start_char=self.start_char)

class PassageQuestion():
    """One passage/question record together with its list of ``Answer`` objects."""

    def __init__(self,dictionary) -> None:
        # Copy the scalar fields straight from the source mapping; a missing key raises KeyError.
        for field_name in ("pq_id", "passage", "surah", "verses", "question"):
            setattr(self, field_name, dictionary[field_name])
        # Every raw answer mapping is wrapped in an Answer instance.
        self.answers = [Answer(raw_answer) for raw_answer in dictionary["answers"]]

    def to_dict(self) -> dict:
        """Return the record as a plain dictionary, with answers converted via ``Answer.to_dict``."""
        as_dict = {
            field_name: getattr(self, field_name)
            for field_name in ("pq_id", "passage", "surah", "verses", "question")
        }
        as_dict["answers"] = [answer.to_dict() for answer in self.answers]
        return as_dict

def read_JSONL_file(file_path) -> list:
    """Load a JSON lines file and wrap every record in a ``PassageQuestion``.

    The ``pq_id`` of each record is printed as it is wrapped, followed by a
    final summary line with the number of collected objects.

    Args:
        file_path: path handed to ``load_jsonl``.

    Returns:
        list: the ``PassageQuestion`` objects, in file order.
    """
    def _wrap_and_report(raw_record):
        # Build the object first, then report its id, mirroring the per-record log line.
        wrapped = PassageQuestion(raw_record)
        print (f"pq_id: {wrapped.pq_id}")
        return wrapped

    passage_questions = [_wrap_and_report(raw_record) for raw_record in load_jsonl(file_path)]

    print(f"Collected {len(passage_questions)} Object from {file_path}")
    return passage_questions

def write_to_JSONL_file(passage_question_objects,output_path) -> None:
    """Convert each object with its ``to_dict()`` method and write the result via ``dump_jsonl``.

    Args:
        passage_question_objects: iterable of objects exposing ``to_dict()``.
        output_path: destination path handed to ``dump_jsonl``.
    """
    serialized_records = [pq_object.to_dict() for pq_object in passage_question_objects]
    dump_jsonl(serialized_records, output_path)


In [8]:
def train_QA(
    train_set_file="../QA/TaskB Data/QQA23_TaskB_qrcd_v1.2_train_preprocessed.jsonl",
    dev_set_file="../QA/TaskB Data/QQA23_TaskB_qrcd_v1.2_dev_preprocessed.jsonl",
    model_path="/kaggle/working/",
):
    """Fine-tune the module-level QA model on the train split and evaluate on the dev split.

    The function relies on notebook globals defined in earlier cells:
    ``device`` and ``model`` (printed for reference), ``ar_model`` and
    ``ar_tokenizer`` (the model being trained and its tokenizer), and the
    hyperparameters ``lr``, ``batch_size`` and ``epoch``. Data is read with
    ``load_jsonl``, flattened with ``create_dataset`` and tokenized with
    ``prepare_train_features``.

    Args:
        train_set_file: JSON lines file with the training records.
        dev_set_file: JSON lines file with the development records.
        model_path: directory passed to ``trainer.save_model`` after training.

    All parameters default to the values that used to be hard-coded in the
    body, so calling ``train_QA()`` with no arguments behaves as before.

    Returns:
        tuple: ``(ar_tokenizer, ar_model)``, the same global objects, with the
        model trained in place by the Trainer.
    """
    print(device)
    print(model)

    # load_jsonl returns plain dicts, which is the form create_dataset indexes into.
    train_passage_question_objects  = load_jsonl(train_set_file)
    dev_passage_question_objects = load_jsonl(dev_set_file)

    train_dataset = create_dataset(train_passage_question_objects)
    dev_dataset = create_dataset(dev_passage_question_objects)
    my_dataset_dict = datasets.DatasetDict({"train":train_dataset, "dev" : dev_dataset})
    # Drop the raw columns: after tokenization one example may expand into several features,
    # so the original columns no longer line up with the produced rows.
    tokenized_ds = my_dataset_dict.map(prepare_train_features, batched=True, remove_columns=my_dataset_dict["train"].column_names)

    args = TrainingArguments(
        "result",  # output directory for checkpoints written during training
        evaluation_strategy = "steps",
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epoch,
        weight_decay=0.0001,
        save_strategy = "steps",
        save_steps=1500,
        )

    trainer = Trainer(
        model=ar_model,
        args=args,
        train_dataset=tokenized_ds['train'],
        eval_dataset=tokenized_ds['dev'],
        tokenizer=ar_tokenizer)

    # start training
    trainer.train()

    # Persist the final model separately from the intermediate checkpoints.
    trainer.save_model(model_path)

    return ar_tokenizer, ar_model

In [9]:
# Run the fine-tuning. The returned (tokenizer, model) tuple is not assigned,
# so it is echoed as this cell's output.
train_QA()

cuda:0
ZeyadAhmed/AraElectra-Arabic-SQuADv2-QA
Loaded 992 records from /kaggle/input/taskb-data/QQA23_TaskB_qrcd_v1.2_train_preprocessed.jsonl
Loaded 163 records from /kaggle/input/taskb-data/TaskB Data/QQA23_TaskB_qrcd_v1.2_dev_preprocessed.jsonl


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
500,1.6126,2.452021
1000,0.8349,3.48151
1500,0.6467,3.977406
2000,0.5351,4.208948
2500,0.4807,4.633619
3000,0.4375,4.846795
3500,0.4219,4.844057
4000,0.3631,5.582001
4500,0.3914,5.571537
5000,0.3242,5.555631


(ElectraTokenizerFast(name_or_path='ZeyadAhmed/AraElectra-Arabic-SQuADv2-QA', vocab_size=64000, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
 	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 },
 ElectraForQuestionAnswering(
   (electra): ElectraModel(
     (embeddings): ElectraEmbeddings(
      

In [41]:
# Reload the training records at notebook level so the following cells can inspect them
# (inside train_QA they only exist as local variables).
train_set_file = "../QA/TaskB Data/QQA23_TaskB_qrcd_v1.2_train_preprocessed.jsonl"


train_passage_question_objects  = load_jsonl(train_set_file)
train_dataset = create_dataset(train_passage_question_objects)

Loaded 992 records from ../QA/TaskB Data/QQA23_TaskB_qrcd_v1.2_train_preprocessed.jsonl


In [58]:
# Show the first raw training record (a plain dict as returned by load_jsonl).
train_passage_question_objects[0]

{'pq_id': '2:1-5_372',
 'passage': 'الم . ذلك الكتاب لا ريب فيه هدى للمتقين . الذين يؤمنون بالغيب ويقيمون الصلاة ومما رزقناهم ينفقون . والذين يؤمنون بما أنزل إليك وما أنزل من قبلك وبالآخرة هم يوقنون . أولئك على هدى من ربهم وأولئك هم المفلحون .',
 'surah': 2,
 'verses': '1-5',
 'question': 'ما الدلائل على أن القرآن ليس من تأليف سيدنا محمد ( ص ) ؟',
 'answers': [{'text': 'الذين يؤمنون بما أنزل إليك وما أنزل من قبلك',
   'start_char': 100}]}

In [60]:
# Encode the first record's question/passage pair as a single padded example of
# PyTorch tensors, for a manual forward pass below.
# NOTE(review): the literal 256 repeats the value of `max_length` from the configuration cell.
a=ar_tokenizer.encode_plus(
    train_passage_question_objects[0]['question'],
    train_passage_question_objects[0]['passage'],
    max_length=256,
    padding='max_length',
    return_tensors="pt",
    add_special_tokens=True,
    return_attention_mask=True
)

In [64]:
# Move the encoded tensors to the GPU; the result is displayed but not assigned back to `a`.
# NOTE(review): presumably .to() updates `a` in place, since the next cell feeds `a` to the
# model — confirm. Also hard-codes 'cuda' instead of reusing the `device` defined earlier.
a.to('cuda')

{'input_ids': tensor([[    2,   394, 34855,   323,   331,  4544,  1117,   306,  8315, 30404,
           582,    14,   126,    15,   105,     3,   301,    20,   563,  2886,
           391, 16072,   903, 13576,  6897,  3945,    20,   860, 28174,  4779,
           739, 27418,   319,  7938, 13626,  9754, 27147, 27207,   319,    20,
          6767, 28174,  1199, 34767, 20007,  1177, 34767,   306,   600,   209,
          2609,  4475,   197,  1891, 55118,  4817,    20, 10331,   323, 13576,
           306, 14756,   201, 53990,  1891,  2327,   182,  7653,    20,     3,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [66]:
# Forward pass of the QA model on the single encoded example.
preds=ar_model(**a)

In [68]:
# NOTE(review): this line fails — the recorded cell output is
# "AttributeError: 'QuestionAnsweringModelOutput' object has no attribute 'label_ids'".
# `preds` is the raw model output from the previous cell, which does not carry labels;
# fix or remove this cell so the notebook runs top to bottom.
squad_labels = preds.label_ids

AttributeError: 'QuestionAnsweringModelOutput' object has no attribute 'label_ids'