In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from nltk.tokenize import word_tokenize
import nltk
import pyarabic.araby as araby
from pyarabic.araby import strip_tashkeel
import qalsadi.lemmatizer 
import qalsadi.analex as qa
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
from farasa.pos import FarasaPOSTagger 
from farasa.ner import FarasaNamedEntityRecognizer 
from farasa.diacratizer import FarasaDiacritizer 
from farasa.segmenter import FarasaSegmenter 
from farasa.stemmer import FarasaStemmer

import keras
from  diacritization_evaluation import util

In [106]:
!pip install diacritization-evaluation

Collecting diacritization-evaluation
  Downloading diacritization_evaluation-0.5-py3-none-any.whl (7.2 kB)
Installing collected packages: diacritization-evaluation
Successfully installed diacritization-evaluation-0.5


In [55]:
words = []
sentences = []
sentences_with_tashkeel = []
with open('./Dataset/WordsWithoutTashkeel.txt', 'r', encoding='utf-8') as output_file:
    for word in output_file:
        words.append(word.strip())

with open('./Dataset/SentencesWithoutTashkeel.txt', 'r', encoding='utf-8') as output_file:
    for sentence in output_file:
        sentences.append(sentence.strip())

with open('./Dataset/sentences.txt', 'r', encoding='utf-8') as output_file:
    for sentence in output_file:
        sentences_with_tashkeel.append(sentence.strip())

print(words[0:10])
print(sentences[0])
print(sentences_with_tashkeel[0])


['قوله', 'أو', 'قطع', 'الأول', 'يده', 'إلخ', 'قال', 'الزركشي', 'ابن', 'عرفة']
قوله أو قطع الأول يده إلخ قال الزركشي ابن عرفة قوله بلفظ يقتضيه كإنكار غير حديث بالإسلام وجوب ما علم وجوبه من الدين ضرورة كإلقاء مصحف بقذر وشد زنار ابن عرفة قول ابن شاس أو بفعل يتضمنه هو كلبس الزنار وإلقاء المصحف في صريح النجاسة والسجود للصنم ونحو ذلك وسحر محمد قول مالك وأصحابه أن الساحر كافر بالله تعالى قال مالك هو كالزنديق إذا عمل السحر بنفسه قتل ولم يستتب
قَوْلُهُ أَوْ قَطَعَ الْأَوَّلُ يَدَهُ إلَخْ قَالَ الزَّرْكَشِيُّ ابْنُ عَرَفَةَ قَوْلُهُ بِلَفْظٍ يَقْتَضِيه كَإِنْكَارِ غَيْرِ حَدِيثٍ بِالْإِسْلَامِ وُجُوبَ مَا عُلِمَ وُجُوبُهُ مِنْ الدِّينِ ضَرُورَةً كَإِلْقَاءِ مُصْحَفٍ بِقَذَرٍ وَشَدِّ زُنَّارٍ ابْنُ عَرَفَةَ قَوْلُ ابْنِ شَاسٍ أَوْ بِفِعْلٍ يَتَضَمَّنُهُ هُوَ كَلُبْسِ الزُّنَّارِ وَإِلْقَاءِ الْمُصْحَفِ فِي صَرِيحِ النَّجَاسَةِ وَالسُّجُودِ لِلصَّنَمِ وَنَحْوِ ذَلِكَ وَسِحْرٍ مُحَمَّدٌ قَوْلُ مَالِكٍ وَأَصْحَابِهِ أَنَّ السَّاحِرَ كَافِرٌ بِاَللَّهِ تَعَالَى قَالَ مَالِكٌ هُوَ كَالزِّنْدِيقِ إذَا 

In [56]:
print(sentences_with_tashkeel[0])

قَوْلُهُ أَوْ قَطَعَ الْأَوَّلُ يَدَهُ إلَخْ قَالَ الزَّرْكَشِيُّ ابْنُ عَرَفَةَ قَوْلُهُ بِلَفْظٍ يَقْتَضِيه كَإِنْكَارِ غَيْرِ حَدِيثٍ بِالْإِسْلَامِ وُجُوبَ مَا عُلِمَ وُجُوبُهُ مِنْ الدِّينِ ضَرُورَةً كَإِلْقَاءِ مُصْحَفٍ بِقَذَرٍ وَشَدِّ زُنَّارٍ ابْنُ عَرَفَةَ قَوْلُ ابْنِ شَاسٍ أَوْ بِفِعْلٍ يَتَضَمَّنُهُ هُوَ كَلُبْسِ الزُّنَّارِ وَإِلْقَاءِ الْمُصْحَفِ فِي صَرِيحِ النَّجَاسَةِ وَالسُّجُودِ لِلصَّنَمِ وَنَحْوِ ذَلِكَ وَسِحْرٍ مُحَمَّدٌ قَوْلُ مَالِكٍ وَأَصْحَابِهِ أَنَّ السَّاحِرَ كَافِرٌ بِاَللَّهِ تَعَالَى قَالَ مَالِكٌ هُوَ كَالزِّنْدِيقِ إذَا عَمِلَ السِّحْرَ بِنَفْسِهِ قُتِلَ وَلَمْ يُسْتَتَبْ


Creating a word-based tokenizer

In [57]:
# Create a tokenizer
words_tokenizer = Tokenizer()

# Fit the tokenizer on the list of words (treat each word as a separate "sentence")
words_tokenizer.fit_on_texts(sentences)

# Get the word index
word_index = words_tokenizer.word_index

# Tokenize the words
word_sequences = words_tokenizer.texts_to_sequences(sentences)

Testing the tokenizer on a sample sentence

In [58]:
# Create a sentence tokenizer
print((sentences[0].split(" "))) #This way works and the one used below as well
print(word_index["قوله"])
print(words_tokenizer.texts_to_sequences([sentences[0]]))

['قوله', 'أو', 'قطع', 'الأول', 'يده', 'إلخ', 'قال', 'الزركشي', 'ابن', 'عرفة', 'قوله', 'بلفظ', 'يقتضيه', 'كإنكار', 'غير', 'حديث', 'بالإسلام', 'وجوب', 'ما', 'علم', 'وجوبه', 'من', 'الدين', 'ضرورة', 'كإلقاء', 'مصحف', 'بقذر', 'وشد', 'زنار', 'ابن', 'عرفة', 'قول', 'ابن', 'شاس', 'أو', 'بفعل', 'يتضمنه', 'هو', 'كلبس', 'الزنار', 'وإلقاء', 'المصحف', 'في', 'صريح', 'النجاسة', 'والسجود', 'للصنم', 'ونحو', 'ذلك', 'وسحر', 'محمد', 'قول', 'مالك', 'وأصحابه', 'أن', 'الساحر', 'كافر', 'بالله', 'تعالى', 'قال', 'مالك', 'هو', 'كالزنديق', 'إذا', 'عمل', 'السحر', 'بنفسه', 'قتل', 'ولم', 'يستتب']
8
[[8, 4, 273, 91, 189, 47, 14, 901, 37, 458, 8, 1184, 3641, 41299, 39, 249, 4154, 475, 12, 163, 1738, 3, 158, 1175, 27872, 6099, 41300, 5827, 41301, 37, 458, 63, 37, 2652, 4, 1362, 24279, 43, 17883, 57670, 41302, 2961, 2, 1430, 2148, 9577, 57671, 1189, 18, 21681, 86, 63, 94, 3359, 6, 16422, 1190, 1185, 108, 14, 94, 43, 27873, 28, 520, 8288, 415, 264, 51, 57672]]


Creating a character-based tokenizer

In [59]:
char_tokenizer = Tokenizer(char_level=True)
char_tokenizer.fit_on_texts(sentences_with_tashkeel)
char_index = char_tokenizer.word_index
char_sequences = char_tokenizer.texts_to_sequences(sentences)
print(char_sequences[0:5])
print(char_index.get("."))


[[19, 10, 5, 12, 2, 17, 10, 2, 19, 39, 16, 2, 6, 5, 17, 10, 5, 2, 9, 21, 12, 2, 27, 5, 33, 2, 19, 6, 5, 2, 6, 5, 40, 15, 22, 36, 9, 2, 6, 14, 11, 2, 16, 15, 18, 25, 2, 19, 10, 5, 12, 2, 14, 5, 18, 44, 2, 9, 19, 20, 38, 9, 12, 2, 22, 27, 11, 22, 6, 15, 2, 41, 9, 15, 2, 23, 21, 9, 34, 2, 14, 6, 5, 27, 24, 5, 6, 8, 2, 10, 26, 10, 14, 2, 8, 6, 2, 16, 5, 8, 2, 10, 26, 10, 14, 12, 2, 8, 11, 2, 6, 5, 21, 9, 11, 2, 38, 15, 10, 15, 25, 2, 22, 27, 5, 19, 6, 42, 2, 8, 30, 23, 18, 2, 14, 19, 29, 15, 2, 10, 36, 21, 2, 40, 11, 6, 15, 2, 6, 14, 11, 2, 16, 15, 18, 25, 2, 19, 10, 5, 2, 6, 14, 11, 2, 36, 6, 24, 2, 17, 10, 2, 14, 18, 16, 5, 2, 9, 20, 38, 8, 11, 12, 2, 12, 10, 2, 22, 5, 14, 24, 2, 6, 5, 40, 11, 6, 15, 2, 10, 27, 5, 19, 6, 42, 2, 6, 5, 8, 30, 23, 18, 2, 18, 9, 2, 30, 15, 9, 23, 2, 6, 5, 11, 26, 6, 24, 25, 2, 10, 6, 5, 24, 26, 10, 21, 2, 5, 5, 30, 11, 8, 2, 10, 11, 23, 10, 2, 29, 5, 22, 2, 10, 24, 23, 15, 2, 8, 23, 8, 21, 2, 19, 10, 5, 2, 8, 6, 5, 22, 2, 10, 17, 30, 23, 6, 14, 12, 2, 17, 11

Adding tests for the character-based tokenizer

In [60]:
print(char_index["ط"])
print(char_tokenizer.texts_to_sequences(sentences[0].split(" ")))


39
[[19, 10, 5, 12], [17, 10], [19, 39, 16], [6, 5, 17, 10, 5], [9, 21, 12], [27, 5, 33], [19, 6, 5], [6, 5, 40, 15, 22, 36, 9], [6, 14, 11], [16, 15, 18, 25], [19, 10, 5, 12], [14, 5, 18, 44], [9, 19, 20, 38, 9, 12], [22, 27, 11, 22, 6, 15], [41, 9, 15], [23, 21, 9, 34], [14, 6, 5, 27, 24, 5, 6, 8], [10, 26, 10, 14], [8, 6], [16, 5, 8], [10, 26, 10, 14, 12], [8, 11], [6, 5, 21, 9, 11], [38, 15, 10, 15, 25], [22, 27, 5, 19, 6, 42], [8, 30, 23, 18], [14, 19, 29, 15], [10, 36, 21], [40, 11, 6, 15], [6, 14, 11], [16, 15, 18, 25], [19, 10, 5], [6, 14, 11], [36, 6, 24], [17, 10], [14, 18, 16, 5], [9, 20, 38, 8, 11, 12], [12, 10], [22, 5, 14, 24], [6, 5, 40, 11, 6, 15], [10, 27, 5, 19, 6, 42], [6, 5, 8, 30, 23, 18], [18, 9], [30, 15, 9, 23], [6, 5, 11, 26, 6, 24, 25], [10, 6, 5, 24, 26, 10, 21], [5, 5, 30, 11, 8], [10, 11, 23, 10], [29, 5, 22], [10, 24, 23, 15], [8, 23, 8, 21], [19, 10, 5], [8, 6, 5, 22], [10, 17, 30, 23, 6, 14, 12], [17, 11], [6, 5, 24, 6, 23, 15], [22, 6, 18, 15], [14, 6, 

In [61]:
# New text data for testing
new_texts = ["مرحبا كيف حالك", "السلام عليكم"]

# Tokenize the new text data at the character level
sequences_new = char_tokenizer.texts_to_sequences(new_texts)

# Print the results
for text, sequence in zip(new_texts, sequences_new):
    print(f"Original Text: {text}")
    print(f"Tokenized Sequence: {sequence}\n")

Original Text: مرحبا كيف حالك
Tokenized Sequence: [8, 15, 23, 14, 6, 2, 22, 9, 18, 2, 23, 6, 5, 22]

Original Text: السلام عليكم
Tokenized Sequence: [6, 5, 24, 5, 6, 8, 2, 16, 5, 9, 22, 8]



Addding Padding to the tokenizers

In [62]:

# Assuming word_sequences and char_sequences are the output of the tokenizers
word_sequences = words_tokenizer.texts_to_sequences(sentences)
char_sequences = char_tokenizer.texts_to_sequences(sentences_with_tashkeel)

# Add padding
word_sequences_padded = pad_sequences(word_sequences, padding='post')
char_sequences_padded = pad_sequences(char_sequences, padding='post')

In [63]:
sequence_lengths = [len(seq) for seq in word_sequences_padded]
print(len(char_sequences[5]))
print(len(char_sequences_padded[5]))

165
4320


In [64]:
# Save the tokenized sequences
with open('word_sequences.pkl', 'wb') as file:
    pickle.dump(word_sequences_padded, file)

with open('char_sequences.pkl', 'wb') as file:
    pickle.dump(char_sequences_padded, file)

In [65]:
#Testing if pickle works
with open('word_sequences.pkl', 'rb') as file:
    sequences_padded = pickle.load(file)

print(sequences_padded[0])
print(word_sequences_padded[0])

[    8     4   273    91   189    47    14   901    37   458     8  1184
  3641 41299    39   249  4154   475    12   163  1738     3   158  1175
 27872  6099 41300  5827 41301    37   458    63    37  2652     4  1362
 24279    43 17883 57670 41302  2961     2  1430  2148  9577 57671  1189
    18 21681    86    63    94  3359     6 16422  1190  1185   108    14
    94    43 27873    28   520  8288   415   264    51 57672     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0   

In [66]:
print(sequences_padded[3])
print(word_sequences_padded[3])

[   32     7  1380  1110    88    21   100     6  2543   180     3    39
  7575   798     6   179   147    18     2 16423     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0   

In [142]:
with open('char_sequences.pkl', 'rb') as file:
    char_sequences = pickle.load(file)

print(char_index.get("ُ"))
w = char_sequences[1]
print(char_tokenizer.sequences_to_texts(["19 2 3 4 5 6 7 8"]))

7
['']


In [None]:
print(char_index)

TESTING

In [75]:
word = "فِي"

utf8_encoding_numbers = [ord(char) for char in word]
print("UTF-8 Encoding Numbers:", utf8_encoding_numbers)

UTF-8 Encoding Numbers: [1608, 1614, 1588, 1614, 1583, 1617, 1616]


In [104]:
letters, harakat = araby.separate("قَوْلُهُ أَوْ قَطَعَ الْأَوَّلُت")
print(letters)
print(len(harakat))

قوله أو قطع الأوّلت
َُُْـَْـَََــََُْْـ


In [115]:


text, txt_list, haraqat_list = util.extract_haraqat("قَوْلُهُ أَوْ قَطَعَ الْأَوَّلُ") 

print(text)
print(txt_list)
haraqat_list
for i in haraqat_list:
    print(len(i))   


قَوْلُهُ أَوْ قَطَعَ الْأَوَّلُ
['ق', 'و', 'ل', 'ه', ' ', 'أ', 'و', ' ', 'ق', 'ط', 'ع', ' ', 'ا', 'ل', 'أ', 'و', 'ل']
1
1
1
1
0
1
1
0
1
1
1
0
0
1
1
2
1
