# Basic Setup

In [1]:
from transformers import pipeline
from transformers import RobertaTokenizerFast
# Load the tokenizer saved in ./tokenizer.
roberta_tokenizer = RobertaTokenizerFast.from_pretrained('./tokenizer')

# Characters treated as letters; they are also added to the tokenizer below.
# NOTE(review): 'ئ' appears twice in this list — confirm whether the second entry was meant to be another letter.
arabic_letters = ['أ','ة','إ','ؤ','آ','ا','ب','ت', 'ث','ج','ح','خ','د','ذ','ر','ز','س','ش','ص','ض','ط','ظ','ع','غ','ف','ق','ك','ل','م','ن','ه','و','ي','ئ','ئ','ء']
# Diacritic marks; the last three entries appear to be two-character combinations (a mark plus shadda) — TODO confirm.
arabic_diac = ["َ","ً","ِ","ٍ","ُ","ٌ","ْ","َّ","ِّ","ُّ"]

# Adding the tokens by hand
# NOTE(review): the order of these calls presumably fixes the ids of the added tokens, which would
# have to match the vocabulary ./model/ was trained with — confirm before reordering or editing the lists.
roberta_tokenizer.add_tokens(arabic_diac)
roberta_tokenizer.add_tokens(" ")
roberta_tokenizer.add_tokens(arabic_letters)

# Creating the fill_mask pipeline
# When you call it like this fill_mask(انا ا<mask>) it fills the mask
fill_mask = pipeline(
    "fill-mask",
    model="./model/",
    tokenizer=roberta_tokenizer
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## The `diacritize` method already removes the diacritics from any sentence; the explicit removal below is only there to show what it does
`passes` is the number of times the sentence goes through the model. We found that 3 passes is usually the sweet spot for this data.

In [2]:
from utils import diacritize,_remove_diac

# Reference sentence carrying its gold diacritics.
original = 'وَأَجَازَ أَكْثَرُهُمْ الفَرْقَ'

# Strip the marks explicitly so the bare input can be inspected.
undiacritized = _remove_diac(
    original,
    isDataset=False,
)['text']
print(undiacritized)

# Restore the marks with the fill-mask pipeline, using three passes.
prediction = diacritize(
    undiacritized,
    fill_mask,
    passes=3,
    isDataset=False,
)
print(prediction)

وأجاز أكثرهم الفرق
وَأَجَاَزَ أَكْثَرَهُمْ اَلْفَرْقَ


In [3]:
# Keep, in order, only those characters of the reference sentence that are listed as diacritic marks.
diac = list(filter(
    lambda ch: ch in ["َ", "ً", "ِ", "ٍ", "ُ", "ٌ", "ْ", "َّ", "ِّ", "ُّ", "ّ"],
    original,
))

In [4]:
# Display the diacritic marks that were collected from `original`.
diac

['َ', 'َ', 'َ', 'َ', 'َ', 'ْ', 'َ', 'ُ', 'ُ', 'ْ', 'َ', 'ْ', 'َ']

In [5]:
from pathlib import Path
# Collect the test files as plain strings. Sorting makes the order independent of the
# filesystem's listing order, so any index-based subset taken later is the same on every run.
paths_test = sorted(str(x) for x in Path("./data/tashkeela_test/").glob("*.txt"))

In [6]:
from datasets import load_dataset

# Build the evaluation set from the test files collected in `paths_test`.
eval_dataset = load_dataset(
    'text',
    data_files=paths_test,
    split='train',
)

Using custom data configuration default-78fcaec21dbfc73e
Reusing dataset text (/home/peter/.cache/huggingface/datasets/text/default-78fcaec21dbfc73e/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)


In [12]:
def get_diacs(sentence, letters=None, diacritics=None):
    """Return, for each letter of `sentence`, the diacritic that directly follows it.

    Args:
        sentence: Text to scan (a string, or an iterable of single characters).
        letters: Characters that count as letters. Defaults to the notebook-level
            `arabic_letters`.
        diacritics: Diacritic strings to recognise; entries may be longer than one
            character (e.g. a mark combined with shadda). Defaults to the
            notebook-level `arabic_diac`.

    Returns:
        A list with one entry per letter found, in order of appearance: the matched
        diacritic string, or '' when the letter is not directly followed by one.
        Characters that are not in `letters` produce no entry.
    """
    if letters is None:
        letters = arabic_letters
    if diacritics is None:
        diacritics = arabic_diac

    text = sentence if isinstance(sentence, str) else ''.join(sentence)
    letter_set = set(letters)
    mark_set = {mark for mark in diacritics if mark}
    # Try the longest marks first so a multi-character entry wins over its
    # one-character prefix.
    widths = sorted({len(mark) for mark in mark_set}, reverse=True)

    diacs = []
    for i, char in enumerate(text):
        if char not in letter_set:
            continue
        found = ''
        for width in widths:
            # Slicing never raises, so a letter at the very end of the text
            # simply gets '' instead of an IndexError.
            candidate = text[i + 1:i + 1 + width]
            if candidate in mark_set:
                found = candidate
                break
        diacs.append(found)
    return diacs

In [53]:
def diac_error_rate(original, passes=3):
    """Score the predicted diacritics of one sentence against its reference marks.

    Despite the name, the third returned value is the fraction of reference
    marks that were predicted correctly (an accuracy), not an error rate.

    Args:
        original: Reference sentence including its diacritics. It is passed to
            `diacritize` as-is (the notebook states that `diacritize` strips the
            marks itself — TODO confirm).
        passes: Number of passes handed to `diacritize`. Defaults to 3, the value
            that was previously hard-coded.

    Returns:
        A tuple `(correct, scored, accuracy)` where `scored` is the number of
        letters that carry a diacritic in the reference, `correct` is how many of
        those the prediction matched, and `accuracy` is `correct / scored`
        (0.0 when nothing could be scored).
    """
    reference = get_diacs(original)
    prediction = diacritize(original, fill_mask, passes=passes, isDataset=False)
    predicted = get_diacs(prediction)

    correct = 0
    scored = 0
    for idx, expected in enumerate(reference):
        if expected == '':
            # Letters without a reference mark are not part of the score.
            continue
        scored += 1
        # A prediction with fewer letters than the reference counts the missing
        # positions as wrong instead of raising IndexError.
        if idx < len(predicted) and predicted[idx] == expected:
            correct += 1

    # Guard against sentences that carry no diacritics at all.
    accuracy = correct / scored if scored else 0.0
    return correct, scored, accuracy

In [None]:
from tqdm.auto import tqdm
iterations = 250
# Only sentences shorter than this many characters are scored.
max_chars = 150

# Fetch the text column once instead of rebuilding it on every loop step.
texts = eval_dataset['text']
# Never index past the end of a dataset that holds fewer than `iterations` rows.
progress_bar = tqdm(range(min(iterations, len(texts))))

max_summation = 0
max_length = 0
max_acc = 0
for i in progress_bar:
    sentence = texts[i]
    if len(sentence) >= max_chars:
        # Skip long sentences entirely; previously the totals were still updated
        # with the values left over from the last scored sentence.
        continue
    summ, l, acc = diac_error_rate(sentence)
    max_summation += summ
    max_length += l
    max_acc += acc

  0%|          | 0/250 [00:00<?, ?it/s]