###  Data Preparation (Importing, Reformatting, Preprocessing)

In [1]:
#from google.colab import drive
#drive.mount('/content/drive/')
import torch
import numpy as np
#Prefer the GPU when CUDA is available; otherwise everything runs on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
#!pip install matplotlib
#!pip install sentencepiece 
#!pip install protobuf 

In [3]:
import json
import re
#We will use the json library instead of pandas to import our dataset.
#Importing the dataset this way gives us more freedom and versatility
#when processing and modifying its elements, and the source files are JSON Lines
#files whose structure differs from that of CSV files.


In [4]:
#This function is responsible of traversing through the entire data set
#It will compile lists of passages, questions, and answers.
#So we can easily use them.

import json

def json_to_dict_converter(path):
    """Load a JSON Lines file into a list of parsed records.

    Every non-blank line of the file is parsed on its own with ``json.loads``.

    Args:
        path: Location of the ``.jsonl`` file to read.

    Returns:
        A list holding one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []

    #Open the file as UTF-8 explicitly so non-ASCII text (such as Arabic) is decoded
    #the same way regardless of the platform's default encoding
    with open(path, encoding="utf-8") as f:
        #Stream the file line by line instead of materialising every line first
        for line in f:
            line = line.strip()
            #Blank lines (e.g. an empty line at the end of the file) hold no record
            #and would make json.loads raise, so they are skipped
            if not line:
                continue
            records.append(json.loads(line))

    return records

def dict_element_extractor(dict_object, key):
    """Collect the value stored under ``key`` in every record of ``dict_object``.

    Args:
        dict_object: Sequence of dictionaries (one per dataset row).
        key: The key to look up in each dictionary.

    Returns:
        A list with one value per record, in the same order as the input.

    Raises:
        KeyError: If a record does not contain ``key``.
    """
    return [record[key] for record in dict_object]

def multi_answer_split(passages, questions, answers):
    """Flatten rows that have several answers into one row per answer.

    For every answer in ``answers[i]`` the passage and question at index ``i``
    are repeated, so the three returned lists stay aligned.

    Args:
        passages: One passage per original row.
        questions: One question per original row.
        answers: One collection of answers per original row.

    Returns:
        A tuple ``(passages, questions, answers)`` of three equally long lists.
    """
    #Build (passage, question, answer) triples first, then split them into columns
    triples = [
        (passages[row], questions[row], single_answer)
        for row, answer_group in enumerate(answers)
        for single_answer in answer_group
    ]
    flat_passages = [triple[0] for triple in triples]
    flat_questions = [triple[1] for triple in triples]
    flat_answers = [triple[2] for triple in triples]
    return flat_passages, flat_questions, flat_answers

def extract_json_contents(path):
    """Read a dataset file and return aligned passages, questions and answers.

    Args:
        path: Location of the JSON Lines dataset file.

    Returns:
        The tuple produced by ``multi_answer_split``: three lists with one
        entry per individual answer.
    """
    #Parse the file into one dictionary per row
    records = json_to_dict_converter(path)
    #Pull out the three columns in the order multi_answer_split expects them
    columns = [dict_element_extractor(records, field) for field in ("passage", "question", "answers")]
    #Rows holding several answers are expanded into one row per answer
    return multi_answer_split(*columns)


In [5]:
#Importing, reformatting, and splitting the datasets into lists
#Each call returns three aligned lists (passages, questions, answers) with one entry per individual answer
train_passages, train_questions, train_answers = extract_json_contents("datasets/qrcd_v1.1_train.jsonl")
val_passages, val_questions, val_answers = extract_json_contents("datasets/qrcd_v1.1_dev.jsonl")
#The test split is not loaded here; uncomment the line below to load it as well
#test_passages, test_questions, test_answers = extract_json_contents("datasets/qrcd_v1.1_test_gold.jsonl")


In [6]:
#At this point all lists should have the same length,
#because multi_answer_split appends one passage and one question for every answer
print(len(train_passages))
print(len(train_questions))
print(len(train_answers))

861
861
861


In [7]:
#Same length check for the validation split
print(len(val_passages))
print(len(val_questions))
print(len(val_answers))

128
128
128


In [8]:
#This function normalizes Arabic text by removing unnecessary diacritics that would negatively affect the model's performance
#We only need to remove the ones that do not affect the meaning of the words.
def normalize_arabic(text):
    """Return ``text`` with the Arabic diacritic marks U+064B to U+0652 removed.

    Args:
        text: The string to normalize.

    Returns:
        A copy of ``text`` without any character in the range U+064B-U+0652.
    """
    #The range covers the tanween marks, fatha, damma, kasra, shadda and sukun
    diacritics = re.compile(r'[\u064B-\u0652]')
    return diacritics.sub('', text)
    
#This function should remove all of the unneeded digits and punctuations from whatever text we send to it
def remove_digits_and_punctuations(text):
    """Delete the ASCII digits 0-9 and a fixed set of punctuation marks from ``text``.

    Characters that are not in ``my_punct`` (for example '.', '-' and '?')
    are left untouched.

    Args:
        text: The string to clean.

    Returns:
        ``text`` without the digits and the listed punctuation characters.
    """
    my_punct = ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',',
                '/', ':', ';', '<', '=', '>', '@', '[', '\\', ']', '^', '_',
                '`', '{', '|', '}', '~', '»', '«', '“', '”']
    #Escape the characters so they are all taken literally inside the character class
    punct_class = "[" + re.escape("".join(my_punct)) + "]"
    #First pass drops the digits, second pass drops the punctuation
    without_digits = re.compile("[0-9]").sub("", text)
    return re.compile(punct_class).sub("", without_digits)

In [9]:
#Now we need to add end indices to the answers in the text, so we have both the beginning and end positions of our extracted answers
def add_end_indices(answers, contexts):
    """Add an exclusive ``end_char`` to every answer, repairing ``start_char`` if needed.

    Each answer dictionary is modified in place. The recorded ``start_char`` is
    tried first; if the answer text is not found there, starts shifted by -5 to
    +5 characters are tried, and finally the whole context is searched (the
    first occurrence is used).

    Args:
        answers: Dictionaries with at least the keys ``'text'`` and ``'start_char'``.
        contexts: The passage belonging to each answer, in the same order.

    Returns:
        None. ``'start_char'`` and ``'end_char'`` are written into each answer.

    Raises:
        ValueError: If an answer's text does not occur anywhere in its context.
    """
    #Loop through each answer and its corresponding context
    for answer, context in zip(answers, contexts):
        #The exact answer text (golden text) and the start position recorded for it
        golden_text = answer['text']
        answer_length = len(golden_text)
        start_index = answer['start_char']

        #Candidate starts in priority order: the recorded start, then shifts of -5..+5
        candidate_starts = [start_index] + [start_index + offset for offset in range(-5, 6)]

        for candidate in candidate_starts:
            #A negative start would make the slice count from the END of the context,
            #which could "match" text near the end and store negative indices
            if candidate < 0:
                continue
            if context[candidate:candidate + answer_length] == golden_text:
                break
        else:
            #No nearby match: fall back to the first occurrence anywhere in the context
            candidate = context.find(golden_text)
            if candidate == -1:
                #The golden text is not in the context at all (should not happen if the data is consistent)
                raise ValueError(f"Answer text '{golden_text}' not found in context.")

        #end_char is exclusive: context[start_char:end_char] == golden_text
        answer['start_char'] = candidate
        answer['end_char'] = candidate + answer_length

In [10]:
#This function helps modifying the text of the answers. We can now use the above preprocessing functions without affecting the structure of
#answers dictionaries.
def modify_text_in_answers(answers_list, text_modification_fn):
    """Apply ``text_modification_fn`` to the ``'text'`` entry of every answer.

    The input dictionaries are left untouched: each one is shallow-copied and
    only the copy is changed. Dictionaries without a ``'text'`` key are copied
    as they are.

    Args:
        answers_list: The answer dictionaries to process.
        text_modification_fn: Function taking a string and returning the new string.

    Returns:
        A new list containing the modified copies, in the original order.
    """
    def _with_modified_text(answer):
        #Work on a copy so the caller's dictionary keeps its original text
        clone = answer.copy()
        if 'text' in clone:
            clone['text'] = text_modification_fn(clone['text'])
        return clone

    return [_with_modified_text(answer) for answer in answers_list]

#This function applies all of the preprocessing to our datasets
def preprocess_dataset(passages_list, questions_list, answers_list):
    """Clean passages, questions and answer texts, then locate the answers again.

    Every text is first passed through ``normalize_arabic`` and then through
    ``remove_digits_and_punctuations``. The answers are processed on copies, so
    ``answers_list`` itself is not modified.

    Args:
        passages_list: The passages, one per answer.
        questions_list: The questions, one per answer.
        answers_list: The answer dictionaries, one per passage.

    Returns:
        A tuple ``(passages, questions, answers)`` with the preprocessed data.

    Raises:
        ValueError: Propagated from ``add_end_indices`` when a cleaned answer
            text cannot be found in its cleaned passage.
    """
    def clean(text):
        #Diacritics are stripped first, digits and punctuation second
        return remove_digits_and_punctuations(normalize_arabic(text))

    passages_list_prepro = [clean(passage) for passage in passages_list]
    questions_list_prepro = [clean(question) for question in questions_list]
    answers_list_prepro = modify_text_in_answers(answers_list, clean)

    #Removing characters moves the answers inside the passages, so their
    #start/end indices are recomputed against the cleaned passages
    add_end_indices(answers_list_prepro, passages_list_prepro)

    return passages_list_prepro, questions_list_prepro, answers_list_prepro


In [11]:
#Applying all of the preprocessing
#preprocess_dataset works on copies of the answer dictionaries, so the raw answers are not changed by these calls
train_passages_prepro, train_questions_prepro, train_answers_prepro = preprocess_dataset(train_passages, train_questions, train_answers)
val_passages_prepro, val_questions_prepro, val_answers_prepro = preprocess_dataset(val_passages, val_questions, val_answers)

#Adding end_char positions for the non-preprocessed datasets; we will need them to compare model performance on the preprocessed and vanilla sets
#add_end_indices updates the raw answer dictionaries in place
add_end_indices(train_answers, train_passages)
add_end_indices(val_answers, val_passages)

In [12]:
#This function prints our dataset lists. head_size represents the first n items in each list
def print_set(passages_list, questions_list, answers_list, head_size = 1):
    """Print the first ``head_size`` passage/question/answer triples.

    Args:
        passages_list: The passages to show.
        questions_list: The questions to show.
        answers_list: The answers to show.
        head_size: How many leading entries of each list are printed.

    Raises:
        IndexError: If ``head_size`` is larger than one of the lists.
    """
    divider = "----------------------------------------------------"
    for position in range(head_size):
        print("Passage ", position,": ", passages_list[position])
        print(" Question ", position,": ", questions_list[position])
        print("Answer ", position,": ", answers_list[position])
        #Visual separator between consecutive entries
        print(divider)

In [13]:
#Let's compare the vanilla and the preprocessed sets now
#Only the first entry of each set is printed (print_set's default head_size is 1)
print("Before Preprocessing:")
print_set(train_passages, train_questions, train_answers)
print("After Preprocessing:")
print_set(train_passages_prepro, train_questions_prepro, train_answers_prepro)


Before Preprocessing:
Passage  0 :  ومن الناس من يقول آمنا بالله وباليوم الآخر وما هم بمؤمنين. يخادعون الله والذين آمنوا وما يخدعون إلا أنفسهم وما يشعرون. في قلوبهم مرض فزادهم الله مرضا ولهم عذاب أليم بما كانوا يكذبون. وإذا قيل لهم لا تفسدوا في الأرض قالوا إنما نحن مصلحون. ألا إنهم هم المفسدون ولكن لا يشعرون. وإذا قيل لهم آمنوا كما آمن الناس قالوا أنؤمن كما آمن السفهاء ألا إنهم هم السفهاء ولكن لا يعلمون. وإذا لقوا الذين آمنوا قالوا آمنا وإذا خلوا إلى شياطينهم قالوا إنا معكم إنما نحن مستهزئون. الله يستهزئ بهم ويمدهم في طغيانهم يعمهون. أولئك الذين اشتروا الضلالة بالهدى فما ربحت تجارتهم وما كانوا مهتدين.
 Question  0 :  لماذا سيُحاسب ويُعذب الضال يوم القيامة ان كان ""من يضلل الله فما له من هاد"" كما ورد من قوله تعالى في آية 23 و آية 36 من سورة الزمر؟
Answer  0 :  {'text': 'أولئك الذين اشتروا الضلالة بالهدى', 'start_char': 504, 'end_char': 537}
----------------------------------------------------
After Preprocessing:
Passage  0 :  ومن الناس من يقول آمنا بالله وباليوم الآخر وما هم بمؤمنين. 

###  Dataset Analysis

In [14]:
#Check If each list has any form of Empty Text or nulls, used for debugging errors with the dataset
def check_if_nulls_exist(*lists):
    """Report every ``None`` found in the given lists of strings or dictionaries.

    Each element is inspected on its own: a ``None`` element is reported
    directly, a dictionary is searched for ``None`` values, and strings are
    accepted as they are. Any other element type marks the list as containing
    unsupported types. The findings are printed; nothing is returned.

    Args:
        *lists: Any number of lists whose elements are strings or dictionaries.
    """
    #Tuples of (list_index, element_index, key); key is None for a bare None element
    none_indices = []

    #Iterate through each list
    for list_index, current_list in enumerate(lists):
        if not current_list:  #Handle empty lists
            print(f"List {list_index} is empty.")
            continue

        has_unsupported = False
        for element_index, item in enumerate(current_list):
            if item is None:
                #Checked per element, so a None is found no matter where it sits in the list
                none_indices.append((list_index, element_index, None))
            elif isinstance(item, dict):
                for key, value in item.items():
                    if value is None:
                        none_indices.append((list_index, element_index, key))
            elif not isinstance(item, str):
                has_unsupported = True

        if has_unsupported:
            print(f"List {list_index} contains unsupported element types.")

    #Print results
    if none_indices:
        print("NoneType values found:")
        for list_index, element_index, key in none_indices:
            if key is None:
                print(f"List {list_index}, Element {element_index}: NoneType value found.")
            else:
                print(f"List {list_index}, Dictionary {element_index}, Key '{key}':", lists[list_index][element_index])
    else:
        print("No NoneType values found in the lists.")


In [15]:
#Check for NoneTypes in the training set
check_if_nulls_exist(train_passages_prepro, train_questions_prepro, train_answers_prepro)
#Check for NoneTypes in the validation set
check_if_nulls_exist(val_passages_prepro, val_questions_prepro, val_answers_prepro)

No NoneType values found in the lists.
No NoneType values found in the lists.


In [16]:
#This function fixes the input list by converting a list of dictionaries to a list of strings if necessary.
def dict_list_fixer(mylist):
    """Return the texts of ``mylist`` as a list of strings.

    The decision is based on the first element only: if it is a dictionary,
    the ``'text'`` value of every element is returned (``None`` where the key
    is missing); otherwise the list is returned unchanged.

    Args:
        mylist: A list of answer dictionaries or a list of strings.

    Returns:
        A list of the ``'text'`` values, or ``mylist`` itself when it is empty
        or does not start with a dictionary.
    """
    #An empty list has no first element to inspect; there is nothing to convert
    if len(mylist) > 0 and isinstance(mylist[0], dict):
        return [item.get('text') for item in mylist]
    #Not a list of dictionaries (assumed to be a list of strings): return it unchanged
    return mylist

In [17]:
#Get the length statistics of the questions, passages, and answers (minimum, maximum, mean, and median, for both sets)
def text_length_statistics(text_list):
    """Compute length statistics, in characters, for a list of texts.

    Args:
        text_list: A list of strings, or a list of dictionaries that
            ``dict_list_fixer`` converts to their ``'text'`` values.

    Returns:
        A tuple ``(minimum, maximum, mean, median)`` of the text lengths.
    """
    #Dictionaries are reduced to their text first, then every text is measured
    lengths = [len(text) for text in dict_list_fixer(text_list)]
    return np.min(lengths), np.max(lengths), np.mean(lengths), np.median(lengths)

#Prints out stats
def print_text_length_statistics(text_list):
    """Print the minimum, maximum, mean and median length of the given texts.

    Args:
        text_list: The texts (or answer dictionaries) passed on to
            ``text_length_statistics``.
    """
    stats = text_length_statistics(text_list)
    shortest, longest, average, middle = stats
    print(f"Minimum Length: {shortest}")
    print(f"Maximum Length: {longest}")
    print(f"Mean Length: {average}")
    print(f"Median Length: {middle}")

In [18]:
#Printing statistics for the datasets
#All lengths are measured in characters (len() of each text), not in tokens
print("-----Train Passages Stats-----")
print_text_length_statistics(train_passages_prepro)
print("---- Train Questions Stats----")
print_text_length_statistics(train_questions_prepro)
print("----- Train Answers Stats-----")
print_text_length_statistics(train_answers_prepro)
print("------------------------------\n")
print("----Validation Passages Stats----")
print_text_length_statistics(val_passages_prepro)
print("----Validation Questions Stats----")
print_text_length_statistics(val_questions_prepro)
print("----Validation Answers Stats----")
print_text_length_statistics(val_answers_prepro)
print("------------------------------")

-----Train Passages Stats-----
Minimum Length: 116
Maximum Length: 1537
Mean Length: 429.00929152148666
Median Length: 387.0
---- Train Questions Stats----
Minimum Length: 11
Maximum Length: 122
Mean Length: 35.0
Median Length: 35.0
----- Train Answers Stats-----
Minimum Length: 3
Maximum Length: 1136
Mean Length: 43.062717770034844
Median Length: 27.0
------------------------------

----Validation Passages Stats----
Minimum Length: 131
Maximum Length: 863
Mean Length: 441.3671875
Median Length: 439.0
----Validation Questions Stats----
Minimum Length: 14
Maximum Length: 53
Mean Length: 27.890625
Median Length: 25.0
----Validation Answers Stats----
Minimum Length: 5
Maximum Length: 319
Mean Length: 54.78125
Median Length: 43.0
------------------------------


In [19]:
import matplotlib.pyplot as plt
#Plot histogram of text lengths
def plot_text_length_histogram(text_list, title="Text Length Histogram"):
    """Show a 20-bin histogram of the character lengths of the given texts.

    Args:
        text_list: A list of strings or of answer dictionaries (reduced to
            their texts by ``dict_list_fixer``).
        title: Title drawn above the plot.
    """
    char_counts = [len(entry) for entry in dict_list_fixer(text_list)]
    plt.figure(figsize=(10, 6))
    plt.hist(char_counts, bins=20, edgecolor='black')
    #Axis labels, title and grid are all set before the figure is displayed
    plt.xlabel('Length (characters)')
    plt.ylabel('Frequency')
    plt.title(title)
    plt.grid(True)
    plt.show()

KeyboardInterrupt: 

In [None]:
#Length histograms for the preprocessed training passages, questions and answers
plot_text_length_histogram(train_passages_prepro, "Train Passages Length Histogram")
plot_text_length_histogram(train_questions_prepro, "Train Questions Length Histogram")
plot_text_length_histogram(train_answers_prepro, "Train Answers Length Histogram")

In [None]:
#Length histograms for the preprocessed validation passages, questions and answers
plot_text_length_histogram(val_passages_prepro, "Validation Passages Length Histogram")
plot_text_length_histogram(val_questions_prepro, "Validation Questions Length Histogram")
plot_text_length_histogram(val_answers_prepro, "Validation Answers Length Histogram")

In [None]:
#----Possible Wordcloud Visualization Here (Not Required)------------

###  Text Tokenization/Encoding

In [None]:
from transformers import AutoConfig
#This function will help us determine the model type, which will come in handy when selecting the appropriate tokenizer/model
def get_model_type(model_path):
    """Return the ``model_type`` stored in the configuration of a model.

    Args:
        model_path: Model identifier or path handed to
            ``AutoConfig.from_pretrained``.

    Returns:
        The ``model_type`` attribute of the loaded configuration.
    """
    #Only the configuration is loaded here, not the model weights
    return AutoConfig.from_pretrained(model_path).model_type

In [None]:
#Importing our tokenizer methods
from transformers import AutoTokenizer, BertTokenizerFast, GPT2TokenizerFast, T5TokenizerFast, PreTrainedTokenizerFast

#This function returns the appropriate tokenizer given the model's path
def get_model_tokenizer(model_path):
    """Load the fast tokenizer that matches the model found at ``model_path``.

    Args:
        model_path: Model identifier or path of the pretrained model.

    Returns:
        A tokenizer loaded with ``model_max_length=512``.
    """
    #Model types with a dedicated fast tokenizer; anything else is resolved by AutoTokenizer
    dedicated_tokenizers = {
        'bert': BertTokenizerFast,
        't5': T5TokenizerFast,
        'gpt2': GPT2TokenizerFast,
    }
    tokenizer_class = dedicated_tokenizers.get(get_model_type(model_path), AutoTokenizer)
    return tokenizer_class.from_pretrained(model_path, model_max_length=512)

In [None]:
def get_encodings(tokenizer, passages, questions):
    """Tokenize passage/question pairs with truncation and padding enabled.

    Args:
        tokenizer: Callable tokenizer; it receives the passages as first and
            the questions as second positional argument.
        passages: The passages to encode.
        questions: The questions to encode, aligned with ``passages``.

    Returns:
        Whatever the tokenizer call returns.
    """
    tokenizer_options = {'truncation': True, 'padding': True}
    encoded_pairs = tokenizer(passages, questions, **tokenizer_options)
    return encoded_pairs

In [None]:
#We will define and import AraBERT V2 as our tokenizer for testing purposes
model_path = 'aubmindlab/bert-base-arabertv2'
tokenizer = get_model_tokenizer(model_path)

#Now we need to create our encodings using the tokenizer we just initialized
#What this will do is to actually merge those two strings together. So what we will have is our passage/context then a [SEP] token and then the question tokens
#And this will be fed to AraBERT during training
#Note that the passage is passed first and the question second (see get_encodings)
train_prepro_encodings = get_encodings(tokenizer,train_passages_prepro, train_questions_prepro)
val_prepro_encodings = get_encodings(tokenizer,val_passages_prepro, val_questions_prepro)


In [None]:
#Now that our data has been converted to encoding objects, let's check them
print(train_prepro_encodings.keys())
print("input_ids: " , train_prepro_encodings['input_ids'][0])
#Decoding the ids back to text shows how the passage and the question were joined
print("input_tokens: " , tokenizer.decode(train_prepro_encodings['input_ids'][0]))
print("attention_mask: ", train_prepro_encodings['attention_mask'][0])
#Number of encoded examples (displayed as the cell's result)
len(train_prepro_encodings['input_ids'])

In [None]:
#Next, we need to add start and end token positions to our encodings, because we don't have them in there yet
def add_token_positions(encodings, answers, max_length=None):
    """Store the start and end token position of every answer in ``encodings``.

    The character positions of each answer are converted with
    ``encodings.char_to_token``. A position that cannot be mapped to a token
    (``char_to_token`` returned ``None``) is replaced by ``max_length``, so
    neither list ever contains ``None``.

    Args:
        encodings: Tokenizer output offering ``char_to_token(batch_index, char_index)``
            and ``update``.
        answers: Answer dictionaries with ``'start_char'`` and ``'end_char'``;
            ``answers[i]`` belongs to example ``i`` of ``encodings``.
        max_length: Value used for positions that have no token. When left as
            ``None`` the ``model_max_length`` of the notebook-level ``tokenizer``
            is used, as before.

    Returns:
        None. ``'start_positions'`` and ``'end_positions'`` are added to ``encodings``.
    """
    def _missing_position():
        #Resolved lazily so the notebook-level tokenizer is only needed when a position is missing
        return tokenizer.model_max_length if max_length is None else max_length

    start_positions = []
    end_positions = []

    for i, answer in enumerate(answers):
        start_token = encodings.char_to_token(i, answer['start_char'])

        #'end_char' is exclusive (start + length of the text), so the last answer
        #character sits at end_char - 1; looking up end_char itself can return the
        #token AFTER the answer
        end_token = encodings.char_to_token(i, answer['end_char'] - 1)
        if end_token is None:
            #Last character has no token of its own: fall back to the position right after it
            end_token = encodings.char_to_token(i, answer['end_char'])

        #Never store None; it could not be converted to a tensor later on
        start_positions.append(_missing_position() if start_token is None else start_token)
        end_positions.append(_missing_position() if end_token is None else end_token)

    #Update the encodings with the calculated start and end positions
    encodings.update({
        'start_positions': start_positions,
        'end_positions': end_positions
    })


In [None]:
#Add token positions to the training and validation encodings
#The encodings are updated in place with 'start_positions' and 'end_positions'
add_token_positions(train_prepro_encodings, train_answers_prepro)
add_token_positions(val_prepro_encodings, val_answers_prepro)


In [None]:
#The encodings should now also list 'start_positions' and 'end_positions' among their keys
print(train_prepro_encodings.keys())
print("start_position: " , train_prepro_encodings['start_positions'][0])
print("end_position: ", train_prepro_encodings['end_positions'][0])

In [None]:
#This function Check for NoneType values in a tokenizer encodings object
def check_for_none_encodings(encodings):
    """Print every position in ``encodings`` whose value is ``None``.

    Only the first ``n`` entries of each key are inspected, where ``n`` is the
    length of the shortest entry list, so no index can go out of range.

    Args:
        encodings: Mapping from key to a list of per-example values.

    Raises:
        ValueError: If ``encodings`` has no keys at all.
    """
    #Length of the shortest list; no list is indexed beyond it
    shortest = min(len(column) for _, column in encodings.items())

    missing = [
        (row, name)
        for name in encodings
        for row in range(shortest)
        if encodings[name][row] is None
    ]

    if not missing:
        print("No NoneType values found in encodings.")
        return

    print("NoneType values found in encodings:")
    for row, name in missing:
        print(f"Index {row}, Key '{name}': {encodings[name][row]}")

In [None]:
#Now let's check that our encodings are clean and have no NoneType values, to make sure that they don't clash with the training & evaluation procedures
check_for_none_encodings(train_prepro_encodings)
check_for_none_encodings(val_prepro_encodings)

In [None]:
#Okay, so our data is in the right format at the moment. Now let's create a PyTorch dataset object using it!
import torch

#We then define the dataset using a class
class QuranDataset(torch.utils.data.Dataset):
    """PyTorch dataset that serves tokenizer encodings one example at a time."""

    def __init__(self, encodings):
        """Keep a reference to ``encodings`` (a mapping from key to per-example values)."""
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, i.e. the number of ``'input_ids'`` entries."""
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        """Return example ``idx`` as a dictionary with one tensor per encoding key."""
        sample = {}
        for name, column in self.encodings.items():
            #Each value of the selected example is converted to a tensor
            sample[name] = torch.tensor(column[idx])
        return sample


In [None]:
#So we apply this to our encodings to create dataset objects
train_prepro_dataset = QuranDataset(train_prepro_encodings)
val_prepro_dataset = QuranDataset(val_prepro_encodings)

In [None]:
#This function checks for NoneType values in a PyTorch dataset object, We need to do that in order to make sure that our dataset implementation is valid
#This will save us alot of headache during model training errors debugging
def check_for_none_pytorch_dataset(dataset):
    """Print every item/key of ``dataset`` whose value is ``None``.

    Args:
        dataset: Object supporting ``len()`` and integer indexing, where every
            item is a dictionary.
    """
    #Each item is fetched once and all of its values are inspected
    flagged = [
        (position, field)
        for position in range(len(dataset))
        for field, entry in dataset[position].items()
        if entry is None
    ]

    if not flagged:
        print("No NoneType values found in PyTorch dataset.")
        return

    print("NoneType values found in PyTorch dataset:")
    for position, field in flagged:
        #The whole item is printed to show the context of the missing value
        print(f"Index {position}, Key '{field}':", dataset[position])


In [None]:
#Check if the dataset objects contain any NoneType values
check_for_none_pytorch_dataset(train_prepro_dataset)
check_for_none_pytorch_dataset(val_prepro_dataset)

In [None]:
#Finally, We integrate the entire process into a single handy & easy to use function
#This function converts input data into a PyTorch dataset format suitable for model training or evaluation
def dataset_to_pytorch_format(passages_list, questions_list, answers_list, model_path='aubmindlab/bert-base-arabertv2'):
    """Tokenize a dataset and wrap it in a ``QuranDataset``.

    Args:
        passages_list: The passages, one per answer.
        questions_list: The questions, one per answer.
        answers_list: Answer dictionaries holding ``'start_char'`` and ``'end_char'``.
        model_path: Model identifier or path used to select the tokenizer.

    Returns:
        A tuple ``(dataset, tokenizer)``: the ``QuranDataset`` built from the
        encodings and the tokenizer that produced them.
    """
    model_tokenizer = get_model_tokenizer(model_path)
    #For 'gpt2' models a '[PAD]' pad token is registered before encoding with padding
    needs_pad_token = get_model_type(model_path) == 'gpt2'
    if needs_pad_token:
        model_tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    #Encode the passage/question pairs, then attach the answer token positions
    encodings = get_encodings(model_tokenizer, passages_list, questions_list)
    #NOTE(review): add_token_positions is not given this tokenizer; confirm it uses a matching max length
    add_token_positions(encodings, answers_list)

    return QuranDataset(encodings), model_tokenizer


###  Fine-Tuning

In [None]:
#Initializing the model to be used. We will use a pretrained model known as AraBERT
from transformers import BertForQuestionAnswering, T5ForQuestionAnswering, GPT2ForQuestionAnswering, AutoModelForQuestionAnswering

def get_model(model_path):
    """Load the pretrained question-answering model stored at ``model_path``.

    Args:
        model_path: Model identifier or path of the pretrained model.

    Returns:
        The model returned by the selected class's ``from_pretrained``.
    """
    #Model types with a dedicated QA class; anything else is resolved by AutoModelForQuestionAnswering
    dedicated_models = {
        'bert': BertForQuestionAnswering,
        't5': T5ForQuestionAnswering,
        'gpt2': GPT2ForQuestionAnswering,
    }
    model_class = dedicated_models.get(get_model_type(model_path), AutoModelForQuestionAnswering)
    return model_class.from_pretrained(model_path)

In [None]:
#Now we have to load all of that into a dataloader object
from torch.utils.data import DataLoader
#We use Adam optimizer with weight decay to reduce the chances of over-fitting.
from transformers import AdamW
#This is the progress bar library
from tqdm import tqdm

In [None]:
def training_loop(model, optimizer, device, train_loader, epochs):
    """Train ``model`` on ``train_loader`` for ``epochs`` passes over the data.

    Args:
        model: Model called with the input ids, attention mask and the
            start/end positions; the first element of its output is used as the loss.
        optimizer: Optimizer updating the model parameters.
        device: Device the tensors of each batch are moved to.
        train_loader: Iterable of batches with the keys ``'input_ids'``,
            ``'attention_mask'``, ``'start_positions'`` and ``'end_positions'``.
        epochs: Number of passes over ``train_loader``.

    Returns:
        The same ``model`` object after training.
    """
    batch_keys = ('input_ids', 'attention_mask', 'start_positions', 'end_positions')

    for epoch in range(epochs):
        #Wrap the loader in a progress bar for this epoch
        batches = tqdm(train_loader)

        for batch in batches:
            #Clear the gradients left over from the previous step
            optimizer.zero_grad()

            #Move the inputs and the labels to the target device (CPU/GPU)
            on_device = {name: batch[name].to(device) for name in batch_keys}

            #Forward pass; passing the positions makes the model return a loss
            outputs = model(on_device['input_ids'],
                            attention_mask=on_device['attention_mask'],
                            start_positions=on_device['start_positions'],
                            end_positions=on_device['end_positions'])

            #The loss is taken from the first element of the output
            loss = outputs[0]

            #Backward pass, then one optimizer step
            loss.backward()
            optimizer.step()

            #Show the current epoch number and loss on the progress bar
            batches.set_description(f'Epoch {epoch}')
            batches.set_postfix(loss=loss.item())

    return model

In [None]:
def train_model(model_path, train_dataset, batch_size=2, learning_rate=2e-5, epochs=3):
    """Load a pretrained model and fine-tune it on ``train_dataset``.

    Args:
        model_path: Model identifier or path passed to ``get_model``.
        train_dataset: Dataset handed to the ``DataLoader``.
        batch_size: Number of examples per batch.
        learning_rate: Learning rate of the AdamW optimizer.
        epochs: Number of passes over the training data.

    Returns:
        The trained model.
    """
    model = get_model(model_path)
    #Release cached GPU memory before the model is moved to the device
    torch.cuda.empty_cache()
    #The module-level ``device`` decides where training runs
    model.to(device)
    #Switch the model to training mode
    model.train()

    #Batches are drawn in shuffled order
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    training_loop(model, optimizer, device, train_loader, epochs=epochs)
    #Return the model after it has been trained
    return model

#### Grid Search Custom implementation

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import AdamW
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import GridSearchCV
from tqdm import tqdm

#Custom class inheriting from BaseEstimator and ClassifierMixin for integration with scikit-learn
class CustomTransformer(BaseEstimator, ClassifierMixin):
    """Scikit-learn style estimator that fine-tunes a question-answering model.

    The constructor arguments are the hyperparameters a grid search can vary.
    NOTE: score() is a placeholder that always returns 1.0, so a search over
    this estimator cannot rank hyperparameter combinations yet.
    """

    def __init__(self, model_path, batch_size=2, learning_rate=2e-5, epochs=3):
        #Hyperparameters, stored under the same names as the constructor arguments
        self.model_path = model_path
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.epochs = epochs
        #Filled in by fit()
        self.model = None

    def fit(self, X, y=None):
        """Load a fresh model from model_path and train it on X; returns self."""
        train_dataset = self.prepare_dataset(X, y)

        network = get_model(self.model_path)
        self.model = network

        #Free cached GPU memory, move the model to the global device, enable train mode
        torch.cuda.empty_cache()
        network.to(device)
        network.train()

        optimizer = AdamW(network.parameters(), lr=self.learning_rate)
        train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)

        self.training_loop(network, optimizer, device, train_loader, self.epochs)
        return self

    def score(self, X, y=None):
        """Placeholder evaluation: returns the constant 1.0 regardless of X and y."""
        return 1.0

    def prepare_dataset(self, X, y):
        """Placeholder dataset preparation: returns X unchanged and ignores y."""
        return X

    def training_loop(self, model, optimizer, device, train_loader, epochs):
        """Run `epochs` passes over train_loader, stepping the optimizer per batch.

        Each batch must provide 'input_ids', 'attention_mask', 'start_positions'
        and 'end_positions'. Returns the model that was passed in.
        """
        label_fields = ('attention_mask', 'start_positions', 'end_positions')
        for epoch in range(epochs):
            progress_bar = tqdm(train_loader)
            for batch in progress_bar:
                #Clear gradients left over from the previous step
                optimizer.zero_grad()

                #Transfer every tensor of the batch to the target device
                input_ids = batch['input_ids'].to(device)
                extra_inputs = {name: batch[name].to(device) for name in label_fields}

                #The loss is the first element of the model output
                outputs = model(input_ids, **extra_inputs)
                loss = outputs[0]

                loss.backward()
                optimizer.step()

                #Show the epoch number and the latest loss on the progress bar
                progress_bar.set_description(f'Epoch {epoch}')
                progress_bar.set_postfix(loss=loss.item())
        return model

#### Model Evaluation & Accuracy Metrics Functions

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

#Function to calculate various metrics
def calculate_metrics(start_preds, end_preds, start_positions, end_positions):
    """Score predicted answer-span boundaries against the reference boundaries.

    Parameters:
        start_preds, end_preds: tensors of predicted start/end token positions.
        start_positions, end_positions: tensors of reference start/end positions.
        All four are moved to the CPU, converted to numpy and flattened to 1-D.

    Returns:
        (accuracy, precision, recall, f1, mrr, prr) where accuracy, precision,
        recall and f1 are the mean of the start-position and end-position
        scores (precision/recall/f1 are macro-averaged over positions), and
        mrr / prr are both the exact-match rate of the start positions.
    """
    def to_flat_array(tensor):
        #Tensor -> CPU -> numpy -> 1-D array (a single value becomes a 1-element array)
        return np.atleast_1d(tensor.cpu().numpy().flatten())

    def boundary_scores(y_true, y_pred):
        #Accuracy plus macro precision/recall/F1 for one boundary (start or end)
        return (
            accuracy_score(y_true, y_pred),
            precision_score(y_true, y_pred, average='macro'),
            recall_score(y_true, y_pred, average='macro'),
            f1_score(y_true, y_pred, average='macro'),
        )

    start_preds = to_flat_array(start_preds)
    end_preds = to_flat_array(end_preds)
    start_positions = to_flat_array(start_positions)
    end_positions = to_flat_array(end_positions)

    start_scores = boundary_scores(start_positions, start_preds)
    end_scores = boundary_scores(end_positions, end_preds)

    #Average the start and end scores metric by metric
    accuracy, precision, recall, f1 = (
        (start_score + end_score) / 2
        for start_score, end_score in zip(start_scores, end_scores)
    )

    #Only the top-1 prediction is available per sample, so the reciprocal rank is
    #1 when the predicted start equals the reference start and 0 otherwise; the
    #precision-at-rank value is defined the same way. The comparison is done on
    #whole arrays: the previous per-sample np.where() on a scalar comparison is
    #rejected by recent NumPy releases (nonzero on 0-d input).
    exact_start = (start_preds == start_positions).astype(float)
    mrr = np.mean(exact_start)
    prr = np.mean(exact_start)

    #Return all calculated metrics
    return accuracy, precision, recall, f1, mrr, prr


In [None]:
from torch.utils.data import DataLoader

def evaluate(model, val_dataset, batch_size, device):
    """Run the model over val_dataset and report loss plus span metrics.

    Parameters:
        model: model called with input_ids, attention_mask, start_positions and
            end_positions; its output is indexed as [0]=loss, [1]=start logits,
            [2]=end logits.
        val_dataset: dataset whose batches provide the four keys above.
        batch_size: number of samples per validation batch.
        device: device the model and the batches are moved to.

    Returns:
        (avg_loss, accuracy, precision, recall, f1, mrr, prr), where avg_loss is
        the mean per-batch loss and the rest comes from calculate_metrics.
    """
    validation_loader = DataLoader(val_dataset, batch_size=batch_size)

    #Evaluation mode, on the requested device
    model.eval()
    model.to(device)

    running_loss = 0
    predicted_starts, predicted_ends = [], []
    reference_starts, reference_ends = [], []

    #No gradients are needed while evaluating
    with torch.no_grad():
        for batch in validation_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            outputs = model(input_ids,
                            attention_mask=attention_mask,
                            start_positions=start_positions,
                            end_positions=end_positions)

            #Accumulate the batch loss
            running_loss += outputs[0].item()

            #The highest-scoring position along dim 1 is the prediction
            predicted_starts.append(torch.argmax(outputs[1], dim=1))
            predicted_ends.append(torch.argmax(outputs[2], dim=1))
            reference_starts.append(start_positions)
            reference_ends.append(end_positions)

    #Mean loss per validation batch
    avg_loss = running_loss / len(validation_loader)

    #Join the per-batch tensors and score them in one go
    accuracy, precision, recall, f1, mrr, prr = calculate_metrics(
        torch.cat(predicted_starts),
        torch.cat(predicted_ends),
        torch.cat(reference_starts),
        torch.cat(reference_ends),
    )

    return avg_loss, accuracy, precision, recall, f1, mrr, prr


### AraBERT V2 Training, Evaluation, & Hyperparameter optimization

#### Model Training Without Preprocessing Applied

In [None]:

import os
#NOTE(review): CUDA_LAUNCH_BLOCKING=1 is presumably set to get synchronous CUDA error
#reporting while debugging - confirm it is still wanted, it is expected to slow training
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

#Hub identifier of the pre-trained checkpoint to fine-tune
model_path = 'aubmindlab/bert-base-arabertv2'
#First, let's compare the non-preprocessed (vanilla) dataset vs the preprocessed dataset on AraBERT V2
train_dataset_vanilla, tokenizer = dataset_to_pytorch_format(train_passages,train_questions, train_answers, model_path)
#Positional arguments are batch_size=2, learning_rate=2e-5, epochs=5 (epochs overrides train_model's default of 3)
trained_model_vanilla = train_model(model_path, train_dataset_vanilla, 2, 2e-5, 5)
#From here on model_path is the local output directory, not the hub identifier
model_path = 'model/araBert-quranQA-v2-vanilla'
#Save the fine-tuned weights and the tokenizer side by side so the directory can be reloaded later
trained_model_vanilla.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

#### Model Training With Preprocessing Applied

In [None]:
#Hub identifier of the pre-trained checkpoint to fine-tune
model_path = 'aubmindlab/bert-base-arabertv2'

#Same pipeline as the vanilla run, but fed with the preprocessed passages, questions and answers
train_dataset_prepro, tokenizer = dataset_to_pytorch_format(train_passages_prepro,train_questions_prepro, train_answers_prepro, model_path)
#Positional arguments are batch_size=2, learning_rate=2e-5, epochs=5 (same settings as the vanilla run)
trained_model_prepro = train_model(model_path, train_dataset_prepro, 2, 2e-5, 5)
#From here on model_path is the local output directory, not the hub identifier
model_path = 'model/araBert-quranQA-v2-preprocessed'
#Save the fine-tuned weights and the tokenizer side by side so the directory can be reloaded later
trained_model_prepro.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

#### Evaluating both models on their respective evaluation sets to see their performances

In [None]:
#Reload the model fine-tuned on the vanilla dataset from its output directory
model_path = 'model/araBert-quranQA-v2-vanilla'
model = BertForQuestionAnswering.from_pretrained(model_path)
#NOTE: this tokenizer is not referenced again in this cell; the dataset is built from model_path below
tokenizer = BertTokenizerFast.from_pretrained(model_path)

#Build the validation dataset; the second return value (temp) is not used in this cell
val_dataset_vanilla, temp = dataset_to_pytorch_format(val_passages, val_questions, val_answers, model_path)

#Evaluate the model on the validation set
val_loss, val_accuracy, val_precision, val_recall, val_f1, val_mrr, val_prr = evaluate(model, val_dataset_vanilla, batch_size=2, device=device)
#Report every metric returned by evaluate
print("--------------Arabert V2 Trained on Vanilla QRCD Dataset --------------")
print(f'Validation Loss: {val_loss}')
print(f'Validation Accuracy: {val_accuracy}')
print(f'Validation Precision: {val_precision}')
print(f'Validation Recall: {val_recall}')
print(f'Validation F1 Score: {val_f1}')
print(f'Validation MRR: {val_mrr}')
print(f'Validation pRR: {val_prr}')

In [None]:
#Reload the model fine-tuned on the preprocessed dataset from its output directory
model_path = 'model/araBert-quranQA-v2-preprocessed/'
model = BertForQuestionAnswering.from_pretrained(model_path)
#NOTE: this tokenizer is not referenced again in this cell; the dataset is built from model_path below
tokenizer = BertTokenizerFast.from_pretrained(model_path)

#Build the preprocessed validation dataset; the second return value (temp) is not used in this cell
val_dataset_prepro, temp = dataset_to_pytorch_format(val_passages_prepro, val_questions_prepro, val_answers_prepro, model_path)

#Evaluate the model on the validation set
val_prepro_loss, val_prepro_accuracy, val_prepro_precision, val_prepro_recall, val_prepro_f1, val_prepro_mrr, val_prepro_prr = evaluate(model, val_dataset_prepro, batch_size=2, device=device)
#Report every metric returned by evaluate
print("--------------Arabert V2 Trained on Preprocessed QRCD Dataset --------------")
print(f'Validation Loss: {val_prepro_loss}')
print(f'Validation Accuracy: {val_prepro_accuracy}')
print(f'Validation Precision: {val_prepro_precision}')
print(f'Validation Recall: {val_prepro_recall}')
print(f'Validation F1 Score: {val_prepro_f1}')
print(f'Validation MRR: {val_prepro_mrr}')
print(f'Validation pRR: {val_prepro_prr}')

### MARBERT

In [None]:
#Hub identifier of the pre-trained MARBERT checkpoint to fine-tune
model_path = 'UBC-NLP/MARBERT'

#Release cached GPU memory left over from earlier cells before loading another model
torch.cuda.empty_cache()
#MARBERT is trained on the vanilla (non-preprocessed) training data
train_dataset, tokenizer = dataset_to_pytorch_format(train_passages,train_questions, train_answers, model_path)
#Positional arguments are batch_size=2, learning_rate=2e-5, epochs=5
model = train_model(model_path, train_dataset, 2, 2e-5, 5)
#From here on model_path is the local output directory, not the hub identifier
model_path = 'model/MARBERT-quranQA'



def make_contiguous(state_dict):
    """Replace every non-contiguous tensor in state_dict with a contiguous copy.

    The mapping is modified in place (only values of existing keys are
    reassigned) and the same mapping object is returned.
    """
    for name, tensor in state_dict.items():
        if tensor.is_contiguous():
            continue
        state_dict[name] = tensor.contiguous()
    return state_dict

# `model` is the MARBERT model returned by train_model in the statements above
# Output directory for the fine-tuned MARBERT model
model_path = 'model/MARBERT-quranQA'

# Make all tensors in the state_dict contiguous
# NOTE(review): presumably required because saving rejects non-contiguous tensors - confirm
state_dict = model.state_dict()
state_dict = make_contiguous(state_dict)

# Save the model using the modified (contiguous) state_dict instead of the model's own
model.save_pretrained(model_path, state_dict=state_dict)

# Save the tokenizer into the same directory so both can be reloaded from model_path
tokenizer.save_pretrained(model_path)

In [None]:
#Reload the fine-tuned MARBERT model from its output directory
model_path = 'model/MARBERT-quranQA'
model = BertForQuestionAnswering.from_pretrained(model_path)
#NOTE: this tokenizer is not referenced again in this cell; the dataset is built from model_path below
tokenizer = BertTokenizerFast.from_pretrained(model_path)

#Build the validation dataset; the second return value (temp) is not used in this cell
val_dataset, temp = dataset_to_pytorch_format(val_passages, val_questions, val_answers, model_path)

#Evaluate the model on the validation set
val_loss, val_accuracy, val_precision, val_recall, val_f1, val_mrr, val_prr = evaluate(model, val_dataset, batch_size=2, device=device)
#Report every metric returned by evaluate (these names overwrite the AraBERT vanilla results)
print("--------------MARBERT Trained on Vanilla QRCD Dataset --------------")
print(f'Validation Loss: {val_loss}')
print(f'Validation Accuracy: {val_accuracy}')
print(f'Validation Precision: {val_precision}')
print(f'Validation Recall: {val_recall}')
print(f'Validation F1 Score: {val_f1}')
print(f'Validation MRR: {val_mrr}')
print(f'Validation pRR: {val_prr}')

#### Testing Grid Search on AraBert V2

In [None]:
#Define the parameter grid for Grid Search
#This dictionary specifies the hyperparameters to be tested during the grid search
#'batch_size' will be tested with values 2, 4, and 6
#'learning_rate' will be tested with values 2e-5, 1e-4, and 2e-4
#'epochs' will be tested with values 2, 5, and 10
#That is 3 x 3 x 3 = 27 combinations, each of which fine-tunes a fresh model
torch.cuda.empty_cache()
param_grid = {
    'batch_size': [2, 4, 6],
    'learning_rate': [2e-5, 1e-4, 2e-4],
    'epochs': [2, 5, 10]
}

#Define the path to the pre-trained model
model_path = 'aubmindlab/bert-base-arabertv2'

#Convert the training dataset (passages, questions, answers) into PyTorch format
#This function also returns the tokenizer (unused here)
train_dataset, tokenizer = dataset_to_pytorch_format(train_passages, train_questions, train_answers, model_path)

#Initialize GridSearchCV with the custom transformer model and the parameter grid
#GridSearchCV will perform an exhaustive search over the specified parameter grid
#It evaluates the model using cross-validation to find the best combination of hyperparameters
#NOTE(review): CustomTransformer.score currently returns the constant 1.0, so every
#combination receives the same score and the "best" parameters are not meaningful yet
grid_search = GridSearchCV(CustomTransformer(model_path), param_grid)

#Fit GridSearchCV to the training dataset
#This process will train the model with all combinations of hyperparameters from the parameter grid
#NOTE(review): assumes train_dataset can be split/indexed by scikit-learn's cross-validation - TODO confirm
grid_search.fit(train_dataset)

#Print the best hyperparameters found during the grid search
print("Best parameters found: ", grid_search.best_params_)

#Print the best score achieved with the best hyperparameters
print("Best score: ", grid_search.best_score_)


#### Testing Our Pipeline on another model

In [None]:
import os
#NOTE(review): CUDA_LAUNCH_BLOCKING=1 is presumably set to get synchronous CUDA error
#reporting while debugging - confirm it is still wanted
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
#There is not enough VRAM to train on the full training set, so the (smaller) validation set is
#used as training data here, only to check that the pipeline also runs with another model
torch.cuda.empty_cache()
model_path = 'UBC-NLP/AraT5-base'
val_dataset_vanilla, tokenizer = dataset_to_pytorch_format(val_passages, val_questions, val_answers, model_path)
#Positional arguments are batch_size=2, learning_rate=2e-5, epochs=5
#NOTE: this overwrites the earlier trained_model_vanilla and val_dataset_vanilla variables
trained_model_vanilla = train_model(model_path, val_dataset_vanilla, 2, 2e-5, 5)

### Predicting An Answer Using a QRCD Trained Model

In [None]:
#Let's predict an answer using a context, question pair of our own
#Load the fine-tuned model and tokenizer
#NOTE(review): this directory is not one of those written by the training cells shown in this
#part of the notebook (v2-vanilla, v2-preprocessed, MARBERT-quranQA) - verify that it exists
model_path = 'model/araBert-quranQA-v0.5/'
tokenizer = BertTokenizerFast.from_pretrained(model_path)
model = BertForQuestionAnswering.from_pretrained(model_path)

#Ensure the model is in evaluation mode
model.eval()

#Define the device (GPU when available, otherwise CPU) and move the model onto it
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

#Function to predict the answer from context and question
def predict_answer(context, question):
    """Extract the answer to `question` from `context` with the global model.

    Uses the module-level `tokenizer`, `model` and `device`.

    Parameters:
        context: passage text the answer is taken from.
        question: question text.

    Returns:
        The decoded text of the tokens between the highest-scoring start and
        end positions (inclusive), or "" when the predicted end precedes the
        predicted start.
    """
    #The model cannot process more tokens than it has position embeddings for, so cap the
    #input length and let only the context (second sequence) be truncated, never the question
    max_length = getattr(model.config, 'max_position_embeddings', 512)
    inputs = tokenizer.encode_plus(
        question,
        context,
        return_tensors='pt',
        truncation='only_second',
        max_length=max_length,
    )

    #Move inputs to the specified device
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    #Forward pass without gradient tracking
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    #The highest-scoring positions are the predicted span boundaries
    start_idx = torch.argmax(outputs['start_logits'], dim=1).item()
    end_idx = torch.argmax(outputs['end_logits'], dim=1).item()

    #An end before the start is not a valid span: report "no answer" explicitly
    if end_idx < start_idx:
        return ""

    #Decode the predicted answer from the input ids (end position is inclusive)
    answer_ids = input_ids[0][start_idx:end_idx+1]
    return tokenizer.decode(answer_ids, skip_special_tokens=True)

#Example usage
#The two prompts are in Arabic: the first asks for the Quranic passage, the second for the question
context = input("ضع النص القرأني هنا:")
question = input("ضع سؤالك هنا:")
#predict_answer takes the context first and the question second
predicted_answer = predict_answer(context, question)
print(f"Predicted Answer: {predicted_answer}")


### Miscellaneous

In [None]:
#Graphing the result
#Extract results
results = grid_search.cv_results_

#cv_results_ holds one entry per hyperparameter combination (27 for the 3x3x3 grid), so the
#x values must be each combination's own learning rate; plotting against the 3 values of
#param_grid['learning_rate'] would not match the length of the score arrays
learning_rates = [params['learning_rate'] for params in results['params']]
mean_scores = results['mean_test_score']
std_scores = results['std_test_score']

#One point per combination: mean cross-validation score with its standard deviation as error bar
plt.figure(figsize=(10, 6))
plt.errorbar(learning_rates, mean_scores, yerr=std_scores, fmt='o', capsize=5)
plt.xlabel('Learning Rate')
plt.ylabel('Mean Test Score')
plt.title('Grid Search Results')
#The learning rates span an order of magnitude, so use a logarithmic x axis
plt.xscale('log')
plt.show()

In [None]:
#Word Embeddings Extraction
import torch
from transformers import BertTokenizerFast, BertModel

#Load pre-trained model and tokenizer
#NOTE: `model`, `tokenizer` and `device` are notebook globals, so this replaces the
#question-answering model and tokenizer loaded by earlier cells
model_name = 'model/araBert-quranQA-v2-vanilla'  #You can change this to 'aubmindlab/bert-base-arabertv2' or any other model
tokenizer = BertTokenizerFast.from_pretrained(model_name)
#NOTE(review): BertModel is loaded from a question-answering checkpoint directory; presumably
#only the encoder weights are used here - confirm
model = BertModel.from_pretrained(model_name)

#Ensure the model is in evaluation mode and moved to the appropriate device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.eval()

def generate_embeddings(texts, tokenizer, model, device):
    """Return one embedding array per text, taken from the first token position.

    Parameters:
        texts: iterable of strings, encoded one at a time.
        tokenizer: callable returning a mapping of tensors for a text.
        model: callable whose output exposes `last_hidden_state`.
        device: device the encoded tensors are moved to before the forward pass.

    Returns:
        A list with, for every text, the numpy array of
        `last_hidden_state[:, 0, :]` (the [CLS] position for BERT-like models).
    """
    def embed(text):
        #Tokenize a single text, truncated to at most 512 tokens
        encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        on_device = {name: tensor.to(device) for name, tensor in encoded.items()}

        #Forward pass without gradient tracking; keep only the first token's vector
        with torch.no_grad():
            hidden_states = model(**on_device).last_hidden_state
            return hidden_states[:, 0, :].cpu().numpy()

    return [embed(text) for text in texts]

#Generate embeddings for each list
passage_embeddings = generate_embeddings(train_passages, tokenizer, model, device)
question_embeddings = generate_embeddings(train_questions, tokenizer, model, device)
#Each entry of train_answers is indexed with 'text', so only the answer text is embedded
answer_embeddings = generate_embeddings([answer['text'] for answer in train_answers], tokenizer, model, device)

#Print the shape of the embeddings: the number of embedded texts, then the shape of the first array
#(indexing [0] assumes each list is non-empty)
print(f"Passage Embeddings Shape: {len(passage_embeddings)}, {passage_embeddings[0].shape}")
print(f"Question Embeddings Shape: {len(question_embeddings)}, {question_embeddings[0].shape}")
print(f"Answer Embeddings Shape: {len(answer_embeddings)}, {answer_embeddings[0].shape}")


### Graphical User Interface

In [1]:
import torch
from transformers import BertTokenizerFast, BertForQuestionAnswering

#Most recently loaded (tokenizer, model) pair, keyed by its model path. Only one entry is
#kept so repeated questions to the same model do not reload it from disk, while switching
#models does not keep the previous one in memory.
_QA_MODEL_CACHE = {}


def _load_qa_model(model_path, device):
    """Return (tokenizer, model) for model_path, loading them only when the path changes."""
    cached = _QA_MODEL_CACHE.get(model_path)
    if cached is None:
        tokenizer = BertTokenizerFast.from_pretrained(model_path)
        model = BertForQuestionAnswering.from_pretrained(model_path)
        #Ensure the model is in evaluation mode and on the target device
        model.eval()
        model.to(device)
        #Drop the previously cached model before remembering the new one
        _QA_MODEL_CACHE.clear()
        cached = (tokenizer, model)
        _QA_MODEL_CACHE[model_path] = cached
    return cached


def get_answer_from_model(model_path, passage, question):
    """Answer `question` from `passage` with the model stored at model_path.

    Parameters:
        model_path: directory (or identifier) holding the tokenizer and the
            question-answering model.
        passage: text the answer is extracted from.
        question: question text.

    Returns:
        The decoded text of the tokens between the highest-scoring start and
        end positions (inclusive), or "" when the predicted end precedes the
        predicted start.
    """
    #Define the device
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    tokenizer, model = _load_qa_model(model_path, device)

    #The model cannot process more tokens than it has position embeddings for, so cap the
    #input length and let only the passage (second sequence) be truncated, never the question
    max_length = getattr(model.config, 'max_position_embeddings', 512)
    inputs = tokenizer.encode_plus(
        question,
        passage,
        return_tensors='pt',
        truncation='only_second',
        max_length=max_length,
    )

    #Move inputs to the specified device
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    #Forward pass without gradient tracking
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    #The highest-scoring positions are the predicted span boundaries
    start_idx = torch.argmax(outputs['start_logits'], dim=1).item()
    end_idx = torch.argmax(outputs['end_logits'], dim=1).item()

    #An end before the start is not a valid span: report "no answer" explicitly
    if end_idx < start_idx:
        return ""

    #Decode the predicted answer from the input ids (end position is inclusive)
    answer_ids = input_ids[0][start_idx:end_idx+1]
    return tokenizer.decode(answer_ids, skip_special_tokens=True)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from tkinter import Tk, Canvas, Entry, Button, PhotoImage, StringVar, OptionMenu, Text, Scrollbar, Frame

#Directory of the model currently selected in the GUI; assigned by open_popup and option_selected
model_name = ""

def open_popup(passage, question):
    """Close the main window and open the answer window for one passage/question pair.

    The answer window offers a model dropdown, a read-only text area with a
    scrollbar, and a button that runs get_answer_from_model on the given
    passage and question and shows the result. Closing the window reopens
    the main window.
    """
    global popup, text_widget, model_name

    #The main window is replaced by the answer window
    window.destroy()

    popup = Tk()
    popup.title("Popup Window")
    popup.geometry("640x480")

    #Model selector, starting on the first entry; option_selected receives the chosen label
    model_labels = ["AraBERT V0.2", "AraBERT V2 (Buggy)", "MARBERT"]
    choice_var = StringVar(popup)
    choice_var.set("AraBERT V0.2")
    OptionMenu(popup, choice_var, *model_labels, command=option_selected).pack(pady=10)

    #Model directory matching the default dropdown entry
    model_name = 'model/araBert-quranQA-v0.2'

    #Container for the answer text and its scrollbar
    output_frame = Frame(popup)
    output_frame.pack(pady=10, padx=10, fill='both', expand=True)

    #Read-only, word-wrapped text area for the answer
    text_widget = Text(output_frame, wrap='word', bg="#D9D9D9", fg="#000716", bd=0, height=15)
    text_widget.config(state='disabled')

    #Link the vertical scrollbar and the text area to each other
    vertical_bar = Scrollbar(output_frame, command=text_widget.yview)
    text_widget.config(yscrollcommand=vertical_bar.set)

    text_widget.pack(side='left', fill='both', expand=True)
    vertical_bar.pack(side='right', fill='y')

    def show_answer():
        #model_name is read when the button is pressed, so the current dropdown choice is used
        display_text(popup, get_answer_from_model(model_name, passage, question))

    Button(popup, text="Display Text", command=show_answer).pack(pady=10)

    def return_to_main():
        #Closing the answer window brings the main window back
        popup.destroy()
        view_main_window()

    popup.protocol("WM_DELETE_WINDOW", return_to_main)

def option_selected(option):
    """Point the global model_name at the directory of the chosen dropdown label.

    Labels that are not listed leave model_name untouched.
    """
    global model_name
    directories = {
        "AraBERT V0.2": 'model/araBert-quranQA-v0.2',
        "AraBERT V2 (Buggy)": 'model/araBert-quranQA-v2-vanilla',
        "MARBERT": 'model/MARBERT-quranQA',
    }
    if option in directories:
        model_name = directories[option]

def display_text(popup, text):
    """Replace the content of the global text_widget with `text`.

    The widget is made editable for the update and set back to read-only
    afterwards. The `popup` argument is accepted but not used.
    """
    output = text_widget

    #Unlock the widget, wipe everything from the first character to the end
    output.config(state='normal')
    output.delete('1.0', 'end')

    #Write the new text at the start and lock the widget again
    output.insert('1.0', text)
    output.config(state='disabled')


In [3]:

from pathlib import Path
import os

# from tkinter import *
# Explicit imports to satisfy Flake8


# Root directory: the folder containing this file when __file__ is defined,
# otherwise (e.g. in a notebook) the current working directory
ROOT_DIR = Path(__file__).parent if '__file__' in globals() else Path(os.getcwd())

# GUI image assets live in a fixed folder below the root directory
ASSETS_PATH = ROOT_DIR.joinpath(Path("assets/frame0"))

def relative_to_assets(path: str) -> Path:
    """Return `path` resolved against the ASSETS_PATH directory."""
    asset = Path(path)
    return ASSETS_PATH.joinpath(asset)

def view_main_window():
    """Build the main input window and run its event loop.

    The window holds a multi-line field for the passage, a single-line field
    for the question and an image button that hands both texts to open_popup.
    The call returns only when window.mainloop() returns.
    """
    global window

    background = "#FFFFFF"
    label_font = ("AnonymousPro Regular", 20 * -1)
    field_style = {"bd": 0, "bg": "#D9D9D9", "fg": "#000716", "highlightthickness": 0}

    window = Tk()
    window.geometry("632x455")
    window.configure(bg=background)

    #Full-size canvas carrying the background, the images and the captions
    canvas = Canvas(window, bg=background, height=455, width=632, bd=0,
                    highlightthickness=0, relief="ridge")
    canvas.place(x=0, y=0)
    canvas.create_rectangle(0.0, 0.0, 632.0, 455.0, fill=background, outline="")

    #Every PhotoImage stays referenced here for as long as the window is running
    images = []

    def load_image(filename):
        picture = PhotoImage(file=relative_to_assets(filename))
        images.append(picture)
        return picture

    def draw_image(x, y, filename):
        canvas.create_image(x, y, image=load_image(filename))

    def draw_caption(x, y, caption):
        canvas.create_text(x, y, anchor="nw", text=caption, fill="#000000", font=label_font)

    #Header image and title
    draw_image(316.0, 36.0, "image_1.png")
    draw_caption(0.0, 20.0, "نظام سؤال وجواب في كتاب اللَّه الكريم")

    #Multi-line passage field on top of its background image
    draw_image(316.0, 223.5, "entry_1.png")
    entry_1 = Text(window, **field_style)
    entry_1.place(x=43.0, y=127.0, width=546.0, height=191.0)

    #Captions for the passage field and the question field
    draw_caption(33.0, 93.0, "قم بادخال النص القرآني هنا")
    draw_caption(33.0, 333.0, "قم بادخال السؤال هنا")

    #Single-line question field on top of its background image
    draw_image(316.0, 379.5, "entry_2.png")
    entry_2 = Entry(window, **field_style)
    entry_2.place(x=43.0, y=367.0, width=546.0, height=23.0)

    def submit():
        #"end-1c" leaves out the newline the Text widget appends to its content
        open_popup(entry_1.get("1.0", "end-1c"), entry_2.get())

    button_1 = Button(window, image=load_image("button_1.png"), borderwidth=0,
                      highlightthickness=0, command=submit, relief="flat")
    button_1.place(x=282.0, y=401.0, width=68.0, height=40.0)

    window.resizable(False, False)
    window.mainloop()

In [4]:
#Launch the GUI; view_main_window ends in window.mainloop(), so this call blocks while the GUI is running
view_main_window()

  attn_output = torch.nn.functional.scaled_dot_product_attention(
