In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from nltk.tokenize import word_tokenize
import nltk
import pyarabic.araby as araby
from pyarabic.araby import strip_tashkeel
import qalsadi.lemmatizer 
import qalsadi.analex as qa
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
from farasa.pos import FarasaPOSTagger 
from farasa.ner import FarasaNamedEntityRecognizer 
from farasa.diacratizer import FarasaDiacritizer 
from farasa.segmenter import FarasaSegmenter 
from farasa.stemmer import FarasaStemmer

import keras
from  diacritization_evaluation import util




In [2]:
words = []
sentences = []
sentences_with_tashkeel = []
with open('./Dataset/training/train_words_stripped.txt', 'r', encoding='utf-8') as output_file:
    for word in output_file:
        words.append(word.strip())

with open('./Dataset/training/train_stripped.txt', 'r', encoding='utf-8') as output_file:
    for sentence in output_file:
        sentences.append(sentence.strip())

with open('./Dataset/training/train_cleaned.txt', 'r', encoding='utf-8') as output_file:
    for sentence in output_file:
        sentences_with_tashkeel.append(sentence.strip())

dev_words = []
dev_sentences = []
dev_sentences_with_tashkeel = []

with open('./Dataset/val/val_words_stripped.txt', 'r', encoding='utf-8') as output_file:
    for word in output_file:
        dev_words.append(word.strip())

with open('./Dataset/val/val_stripped.txt', 'r', encoding='utf-8') as output_file:
    for sentence in output_file:
        dev_sentences.append(sentence.strip())

with open('./Dataset/val/val_cleaned.txt', 'r', encoding='utf-8') as output_file:
    for sentence in output_file:
        dev_sentences_with_tashkeel.append(sentence.strip())

print(words[0:10])
print(sentences[0])
print(sentences_with_tashkeel[0])


['قوله', 'أو', 'قطع', 'الأول', 'يده', 'إلخ', 'قال', 'الزركشي', 'ابن', 'عرفة']
قوله أو قطع الأول يده إلخ قال الزركشي
قَوْلُهُ أَوْ قَطَعَ الْأَوَّلُ يَدَهُ إلَخْ قَالَ الزَّرْكَشِيُّ


In [3]:
print(sentences_with_tashkeel[0])

قَوْلُهُ أَوْ قَطَعَ الْأَوَّلُ يَدَهُ إلَخْ قَالَ الزَّرْكَشِيُّ


Creating a word-based tokenizer

In [4]:
# Create a tokenizer
words_tokenizer = Tokenizer()

# Fit the tokenizer on the list of words (treat each word as a separate "sentence")
words_tokenizer.fit_on_texts(sentences)

# Get the word index
word_index = words_tokenizer.word_index

# Tokenize the words
word_sequences = words_tokenizer.texts_to_sequences(sentences)

dev_word_sequences = words_tokenizer.texts_to_sequences(dev_sentences)

Testing the tokenizer on a sample sentence

In [5]:
# Create a sentence tokenizer
print((sentences[0:2])) #This way works and the one used below as well
print(word_index["قوله"])
print(words_tokenizer.texts_to_sequences([sentences[0]]))

['قوله أو قطع الأول يده إلخ قال الزركشي', 'ابن عرفة قوله بلفظ يقتضيه كإنكار غير حديث بالإسلام وجوب ما علم وجوبه من الدين ضرورة كإلقاء مصحف بقذر وشد زنار ابن عرفة قول ابن شاس أو بفعل يتضمنه هو كلبس الزنار وإلقاء المصحف في صريح النجاسة والسجود للصنم ونحو ذلك وسحر محمد قول مالك وأصحابه أن الساحر كافر بالله تعالى قال مالك هو كالزنديق إذا عمل السحر بنفسه قتل ولم يستتب .']
7
[[7, 3, 276, 90, 190, 46, 13, 907]]


Creating a character-based with tashkeel tokenizer

In [6]:
sentences_new = []
with open('./Dataset/training/train_replace.txt', 'r', encoding='utf-8') as output_file:
    for sentence in output_file:
        sentences_new.append(sentence.strip())

dev_sentences_replaced = []
with open('./Dataset/val/val_replaced.txt', 'r', encoding='utf-8') as output_file:
    for sentence in output_file:
        dev_sentences_replaced.append(sentence.strip())

char_tokenizer_with_tashkeel = Tokenizer(char_level=True, oov_token='UNK')
char_tokenizer_with_tashkeel.fit_on_texts(sentences_new)
char_index_with_tashkeel = char_tokenizer_with_tashkeel.word_index
char_sequences_with_tashkeel = char_tokenizer_with_tashkeel.texts_to_sequences(sentences_new)
dev_char_sequences_with_tashkeel = char_tokenizer_with_tashkeel.texts_to_sequences(dev_sentences_replaced)
print(char_sequences_with_tashkeel[0:5])
print(char_index_with_tashkeel)

[[20, 2, 11, 5, 6, 8, 13, 8, 3, 18, 2, 11, 5, 3, 20, 2, 40, 2, 16, 2, 3, 7, 6, 5, 18, 2, 11, 17, 6, 8, 3, 10, 2, 22, 2, 13, 8, 3, 28, 6, 2, 33, 5, 3, 20, 2, 7, 6, 2, 3, 7, 6, 41, 17, 15, 5, 23, 2, 36, 4, 10, 42], [7, 14, 5, 12, 8, 3, 16, 2, 15, 2, 19, 2, 26, 2, 3, 20, 2, 11, 5, 6, 8, 13, 8, 3, 14, 4, 6, 2, 19, 5, 47, 31, 3, 10, 2, 20, 5, 21, 2, 37, 4, 10, 13, 3, 23, 2, 28, 4, 12, 5, 23, 2, 7, 15, 4, 3, 43, 2, 10, 5, 15, 4, 3, 24, 2, 22, 4, 10, 34, 31, 3, 14, 4, 7, 6, 5, 28, 4, 25, 5, 6, 2, 7, 9, 4, 3, 11, 8, 27, 8, 11, 14, 2, 3, 9, 2, 7, 3, 16, 8, 6, 4, 9, 2, 3, 11, 8, 27, 8, 11, 14, 8, 13, 8, 3, 9, 4, 12, 5, 3, 7, 6, 22, 35, 10, 12, 4, 3, 37, 2, 15, 8, 11, 15, 2, 26, 38, 3, 23, 2, 28, 4, 6, 5, 20, 2, 7, 45, 4, 3, 9, 8, 30, 5, 24, 2, 19, 31, 3, 14, 4, 20, 2, 29, 2, 15, 31, 3, 11, 2, 36, 2, 22, 35, 3, 41, 8, 12, 17, 7, 15, 31, 3, 7, 14, 5, 12, 8, 3, 16, 2, 15, 2, 19, 2, 26, 2, 3, 20, 2, 11, 5, 6, 8, 3, 7, 14, 5, 12, 4, 3, 36, 2, 7, 25, 31, 3, 18, 2, 11, 5, 3, 14, 4, 19, 4, 16, 5, 6, 31,

Creating a character-based without tashkeel tokenizer

In [7]:
char_tokenizer_without_tashkeel = Tokenizer(char_level=True)
char_tokenizer_without_tashkeel.fit_on_texts(sentences)
char_index_without_tashkeel = char_tokenizer_without_tashkeel.word_index


In [8]:
char_sequences_without_tashkeel = char_tokenizer_without_tashkeel.texts_to_sequences(sentences)
dev_char_sequences_without_tashkeel = char_tokenizer_without_tashkeel.texts_to_sequences(dev_sentences)

In [9]:
print(char_sequences_without_tashkeel[0:1])
print(len(char_index_without_tashkeel.keys()))
print(char_index_without_tashkeel)
print(char_index_with_tashkeel)

[[14, 6, 2, 8, 1, 12, 6, 1, 14, 30, 11, 1, 3, 2, 12, 6, 2, 1, 5, 16, 8, 1, 22, 2, 26, 1, 14, 3, 2, 1, 3, 2, 31, 10, 17, 28, 5]]
38
{' ': 1, 'ل': 2, 'ا': 3, 'م': 4, 'ي': 5, 'و': 6, 'ن': 7, 'ه': 8, 'ب': 9, 'ر': 10, 'ع': 11, 'أ': 12, 'ف': 13, 'ق': 14, 'ت': 15, 'د': 16, 'ك': 17, 'ح': 18, 'س': 19, 'ة': 20, 'ج': 21, 'إ': 22, 'ذ': 23, 'ص': 24, 'ى': 25, 'خ': 26, 'ث': 27, 'ش': 28, 'ض': 29, 'ط': 30, 'ز': 31, 'غ': 32, '.': 33, 'ء': 34, 'ئ': 35, 'ظ': 36, 'آ': 37, 'ؤ': 38}
{'UNK': 1, 'َ': 2, ' ': 3, 'ِ': 4, 'ْ': 5, 'ل': 6, 'ا': 7, 'ُ': 8, 'م': 9, 'ي': 10, 'و': 11, 'ن': 12, 'ه': 13, 'ب': 14, 'ر': 15, 'ع': 16, '٤': 17, 'أ': 18, 'ف': 19, 'ق': 20, 'ت': 21, 'د': 22, 'ك': 23, 'ح': 24, 'س': 25, 'ة': 26, 'ج': 27, 'إ': 28, 'ذ': 29, 'ص': 30, 'ٍ': 31, 'ى': 32, 'خ': 33, 'ث': 34, '٦': 35, 'ش': 36, 'ض': 37, 'ً': 38, 'ٌ': 39, 'ط': 40, 'ز': 41, '٥': 42, 'غ': 43, '.': 44, 'ء': 45, 'ئ': 46, 'ظ': 47, 'آ': 48, 'ؤ': 49, 'ّ': 50, '٣': 51, '٢': 52, '١': 53}


Adding tests for the character-based tokenizer

In [10]:
print(char_index_with_tashkeel)
print(char_tokenizer_with_tashkeel.texts_to_sequences(sentences_new[12].split(" ")))

{'UNK': 1, 'َ': 2, ' ': 3, 'ِ': 4, 'ْ': 5, 'ل': 6, 'ا': 7, 'ُ': 8, 'م': 9, 'ي': 10, 'و': 11, 'ن': 12, 'ه': 13, 'ب': 14, 'ر': 15, 'ع': 16, '٤': 17, 'أ': 18, 'ف': 19, 'ق': 20, 'ت': 21, 'د': 22, 'ك': 23, 'ح': 24, 'س': 25, 'ة': 26, 'ج': 27, 'إ': 28, 'ذ': 29, 'ص': 30, 'ٍ': 31, 'ى': 32, 'خ': 33, 'ث': 34, '٦': 35, 'ش': 36, 'ض': 37, 'ً': 38, 'ٌ': 39, 'ط': 40, 'ز': 41, '٥': 42, 'غ': 43, '.': 44, 'ء': 45, 'ئ': 46, 'ظ': 47, 'آ': 48, 'ؤ': 49, 'ّ': 50, '٣': 51, '٢': 52, '١': 53}
[[11, 2, 20, 2, 7, 6, 2], [36, 2, 10, 5, 33, 8], [7, 6, 5, 28, 4, 25, 5, 6, 2, 7, 9, 4], [18, 2, 10, 5, 37, 38, 7], [11, 2, 20, 2, 22, 5], [25, 8, 46, 4, 6, 5, 21], [16, 2, 12, 5], [12, 2, 47, 2, 15, 4], [13, 2, 29, 4, 13, 4], [7, 6, 5, 9, 2, 25, 5, 18, 2, 6, 2, 26, 4], [11, 2, 13, 8, 11, 2], [15, 2, 27, 8, 6, 39], [21, 2, 16, 2, 15, 17, 37, 2], [6, 4, 7, 9, 5, 15, 2, 18, 2, 26, 4], [43, 2, 10, 5, 15, 4, 13, 4], [19, 2, 41, 2, 12, 2, 32], [14, 4, 13, 2, 7], [34, 8, 9, 17], [21, 2, 7, 14, 2], [9, 4, 12, 5], [29, 2, 6, 4, 23,

In [11]:
# New text data for testing
new_texts = ["مرحبا كيف حالك", "السلام عليكم"]

# Tokenize the new text data at the character level
sequences_new = char_tokenizer_with_tashkeel.texts_to_sequences(new_texts)

# Print the results
for text, sequence in zip(new_texts, sequences_new):
    print(f"Original Text: {text}")
    print(f"Tokenized Sequence: {sequence}\n")

Original Text: مرحبا كيف حالك
Tokenized Sequence: [9, 15, 24, 14, 7, 3, 23, 10, 19, 3, 24, 7, 6, 23]

Original Text: السلام عليكم
Tokenized Sequence: [7, 6, 25, 6, 7, 9, 3, 16, 6, 10, 23, 9]



Addding Padding to the tokenizers

In [12]:

# Assuming word_sequences and char_sequences are the output of the tokenizers
word_sequences = words_tokenizer.texts_to_sequences(sentences)
char_sequences = char_tokenizer_with_tashkeel.texts_to_sequences(sentences_new)


In [13]:

# Add padding
word_sequences_padded = pad_sequences(word_sequences, padding='post')
char_sequences_with_tashkeel_padded = pad_sequences(char_sequences, padding='post')
char_sequences_without_tashkeel_padded = pad_sequences(char_sequences_without_tashkeel, padding='post')

dev_word_sequences_padded = pad_sequences(dev_word_sequences, padding='post')
dev_char_sequences_with_tashkeel_padded = pad_sequences(dev_char_sequences_with_tashkeel, padding='post')
dev_char_sequences_without_tashkeel_padded = pad_sequences(dev_char_sequences_without_tashkeel, padding='post')

In [14]:
print(len(char_sequences[5]))
print(len(char_sequences_with_tashkeel_padded[5]))

114
11740


In [15]:
# Save the tokenized sequences
with open('./pickles/word_sequences.pkl', 'wb') as file:
    pickle.dump(word_sequences_padded, file)

with open('./pickles/char_sequences_with_tashkeel.pkl', 'wb') as file:
    pickle.dump(char_sequences_with_tashkeel_padded, file)

with open('./pickles/char_sequences_without_tashkeel.pkl', 'wb') as file:
    pickle.dump(char_sequences_without_tashkeel_padded, file)

with open('./pickles/val_word_sequences.pkl', 'wb') as file:
    pickle.dump(dev_word_sequences_padded, file)

with open('./pickles/val_char_sequences_with_tashkeel.pkl', 'wb') as file:
    pickle.dump(dev_char_sequences_with_tashkeel_padded, file)

with open('./pickles/val_char_sequences_without_tashkeel.pkl', 'wb') as file:
    pickle.dump(dev_char_sequences_without_tashkeel_padded, file)

TESTING labels


In [16]:
# sentences_with_tashkeel
tashkeel_list = []
dev_tashkeel_list = []
for sentence in sentences_with_tashkeel:
    text, txt_list, harakat_list = util.extract_haraqat(sentence)   
    for i in range(len(harakat_list)):
        if len(harakat_list[i]) == 2:
            if '\u0651\u064B' in harakat_list[i]:
                harakat_list[i] = '١'
            if '\u0651\u064C' in harakat_list[i]:
                harakat_list[i] = '٢'
            if '\u0651\u064D' in harakat_list[i]:
                harakat_list[i] = '٣'
            if '\u0651\u064E' in harakat_list[i]:
                harakat_list[i] = '٤'
            if '\u0651\u064F' in harakat_list[i]:
                harakat_list[i] = '٥'
            if '\u0651\u0650' in harakat_list[i]:
                harakat_list[i] = '٦'

    tashkeel_list.append(harakat_list)

for sentence in dev_sentences_with_tashkeel:
    text, txt_list, harakat_list = util.extract_haraqat(sentence)   
    for i in range(len(harakat_list)):
        if len(harakat_list[i]) == 2:
            if '\u0651\u064B' in harakat_list[i]:
                harakat_list[i] = '١'
            if '\u0651\u064C' in harakat_list[i]:
                harakat_list[i] = '٢'
            if '\u0651\u064D' in harakat_list[i]:
                harakat_list[i] = '٣'
            if '\u0651\u064E' in harakat_list[i]:
                harakat_list[i] = '٤'
            if '\u0651\u064F' in harakat_list[i]:
                harakat_list[i] = '٥'
            if '\u0651\u0650' in harakat_list[i]:
                harakat_list[i] = '٦'

    dev_tashkeel_list.append(harakat_list)


In [17]:
tashkeel_sequences = char_tokenizer_with_tashkeel.texts_to_sequences(tashkeel_list)
tashkeel_sequences_padded = pad_sequences(tashkeel_sequences, padding='post')

with open('tashkeel_sequences.pkl', 'wb') as file:
    pickle.dump(tashkeel_sequences_padded, file)

dev_tashkeel_sequences = char_tokenizer_with_tashkeel.texts_to_sequences(dev_tashkeel_list)
dev_tashkeel_sequences_padded = pad_sequences(dev_tashkeel_sequences, padding='post')

with open('val_tashkeel_sequences.pkl', 'wb') as file:
    pickle.dump(dev_tashkeel_sequences_padded, file)

In [18]:
print(tashkeel_list[0])

['َ', 'ْ', 'ُ', 'ُ', '', 'َ', 'ْ', '', 'َ', 'َ', 'َ', '', '', 'ْ', 'َ', '٤', 'ُ', '', 'َ', 'َ', 'ُ', '', '', 'َ', 'ْ', '', 'َ', '', 'َ', '', '', '', '٤', 'ْ', 'َ', 'ِ', '٥']


In [19]:

print(len(tashkeel_sequences_padded[0]))
print(len(tashkeel_sequences_padded[0]))

7183
7183


In [20]:
print(len(tashkeel_sequences[1]))
print(len(char_sequences_without_tashkeel[1]))

print(char_tokenizer_with_tashkeel.sequences_to_texts([tashkeel_sequences[0]]))
print(char_tokenizer_with_tashkeel.sequences_to_texts([char_sequences_without_tashkeel[0]]))

324
324
['َ ْ ُ ُ UNK َ ْ UNK َ َ َ UNK UNK ْ َ ٤ ُ UNK َ َ ُ UNK UNK َ ْ UNK َ UNK َ UNK UNK UNK ٤ ْ َ ِ ٥']
['ب ل َ ُ UNK ن ل UNK ب ص و UNK   َ ن ل َ UNK ْ ع ُ UNK د َ ة UNK ب   َ UNK   َ ٍ ي ٤ إ ْ']


# Tokenizing Tashkeel only

In [21]:
tashkeel_tokenizer = Tokenizer(char_level=True, oov_token='UNK')
tashkeel_tokenizer.fit_on_texts(tashkeel_list)
tashkeel_index = tashkeel_tokenizer.word_index
tashkeel_list_sequences = tashkeel_tokenizer.texts_to_sequences(tashkeel_list)
dev_tashkeel_list_sequences = tashkeel_tokenizer.texts_to_sequences(dev_tashkeel_list)

tashkeel_list_sequences_padded = pad_sequences(tashkeel_list_sequences, padding='post')
dev_tashkeel_list_sequences_padded = pad_sequences(dev_tashkeel_list_sequences, padding='post')

In [22]:
# print(tashkeel_list_sequences[0:3])
# print(tashkeel_index)
# print(len(tashkeel_list_sequences_padded[0]))

for i in range(len(dev_tashkeel_sequences)):
    if len(dev_tashkeel_sequences[i]) != len(dev_char_sequences_without_tashkeel[i]):
        print(i)
# print(len(dev_tashkeel_list[2]))
# print(len(dev_sentences[2]))

# print(len(dev_tashkeel_sequences[2]))
# print(char_index_without_tashkeel)
# print(len(dev_char_sequences_without_tashkeel[2]))

In [23]:
with open('./pickles/tashkeel_sequences.pkl', 'wb') as file:
    pickle.dump(tashkeel_list_sequences_padded, file)

with open('./pickles/val_tashkeel_sequences.pkl', 'wb') as file:
    pickle.dump(dev_tashkeel_list_sequences_padded, file)

In [24]:
print(tashkeel_list_sequences[0])

[3, 5, 6, 6, 2, 3, 5, 2, 3, 3, 3, 2, 2, 5, 3, 7, 6, 2, 3, 3, 6, 2, 2, 3, 5, 2, 3, 2, 3, 2, 2, 2, 7, 5, 3, 4, 12]


# Tokenizing Diacritics list

In [25]:
with open('./pickles/sentence_diacritics_appearance.pickle', 'rb') as file:
    sentence_diacritics_appearance = pickle.load(file)

In [26]:
with open('./pickles/val_sentence_diacritics_appearance.pickle', 'rb') as file:
    val_sentence_diacritics_appearance = pickle.load(file)

In [27]:
sentence_diacritics_appearance_tokenizer = Tokenizer(oov_token='UNK')
sentence_diacritics_appearance_tokenizer.fit_on_texts(sentence_diacritics_appearance)
sentence_diacritics_appearance_word_index = sentence_diacritics_appearance_tokenizer.word_index
sentence_diacritics_appearance_sequences = sentence_diacritics_appearance_tokenizer.texts_to_sequences(sentence_diacritics_appearance)

#padding
sentence_diacritics_appearance_sequences_padded = pad_sequences(sentence_diacritics_appearance_sequences, padding='post')
print(sentence_diacritics_appearance_word_index)

{'UNK': 1, '111111111111111': 2, '000000000000001': 3, '101010101000001': 4, '111111111010111': 5, '111111111011101': 6, '111111100000001': 7, '111111100000000': 8, '000011000000001': 9, '101111101011111': 10, '111111101010101': 11, '111111000000000': 12, '001000000000001': 13}


In [31]:
val_sentence_diacritics_appearance_sequences = Tokenizer(oov_token='UNK')
val_sentence_diacritics_appearance_sequences.fit_on_texts(val_sentence_diacritics_appearance)
val_sentence_diacritics_appearance_word_index = val_sentence_diacritics_appearance_sequences.word_index
val_sentence_diacritics_appearance_sequences = sentence_diacritics_appearance_tokenizer.texts_to_sequences(val_sentence_diacritics_appearance)

#padding
val_sentence_diacritics_appearance_sequences_padded = pad_sequences(val_sentence_diacritics_appearance_sequences, padding='post')

In [32]:
with open('./pickles/sentence_diacritics_appearance_sequences.pickle', 'wb') as file:
    pickle.dump(sentence_diacritics_appearance_sequences_padded, file)

with open('./pickles/val_sentence_diacritics_appearance_sequences.pickle', 'wb') as file:
    pickle.dump(val_sentence_diacritics_appearance_sequences_padded, file)

: 

In [30]:
print(len(sentence_diacritics_appearance_sequences[0]))

37
