In [2]:
import re

In [3]:
OTHER = ' '
DIACRITICS = [OTHER, "َ", "ً", "ُ", "ٌ", "ِ", "ٍ", "ْ", "ّ", "َّ", "ًّ", "ُّ", "ٌّ", "ِّ", "ٍّ"]
VOWEL_REGEX = re.compile('|'.join(DIACRITICS))
SENTENCE_WINDOW = 1100

MAIN_DIACRITICS = ["ْ", "ّ", "ٌ", "ٍ", "ِ", "ً", "َ", "ُ"]
PUNCTUATIONS = [".", "،", ":", "؛", "؟"]
SPLITTING_PATTERN = re.compile(r"[\.،:؛!؟]")

In [4]:
def readFile(path):
	sentences = []
	with open(path, 'r', encoding='utf-8') as file:
		for line in file:
			sentences.append(line.strip())

	return sentences

In [5]:
'''
Getting Basic Arabic Letters based on their unicodes
'''
basic_arabic_start1 = 0x0621
basic_arabic_end1 = 0x063A
basic_arabic_start2 = 0x0641
basic_arabic_end2 = 0x064A

basic_arabic_letters = [chr(code) for code in range(basic_arabic_start1, basic_arabic_end1 + 1)] + \
                       [chr(code) for code in range(basic_arabic_start2, basic_arabic_end2 + 1)]

VALID_ARABIC_CHARS = basic_arabic_letters + MAIN_DIACRITICS + [' '] + PUNCTUATIONS
VALID_ARABIC_CHARS_WITHOUT_PUNCTUATION = basic_arabic_letters + MAIN_DIACRITICS + [' ']

In [6]:
char_to_index = {char: i + 1 for i, char in enumerate(basic_arabic_letters)}
diacritic_to_index = {diacritic: i for i, diacritic in enumerate(DIACRITICS)}

In [7]:
WHITESPACES_PATTERN = re.compile("\s+")
def combine_whitespaces(text):
    return re.sub(WHITESPACES_PATTERN, " ",text).strip()

In [8]:
def get_valid_arabic_text(text):
    text = list(filter(lambda char: char in VALID_ARABIC_CHARS,text))
    return combine_whitespaces(''.join(text))

In [9]:
def get_valid_arabic_text_without_punctuation(text):
    text = list(filter(lambda char: char in VALID_ARABIC_CHARS_WITHOUT_PUNCTUATION,text))
    return combine_whitespaces(''.join(text))

In [10]:
def get_sentences_window(sentence):
    #Fel bdaya ehna 3ayzeen n2smha 600 char 600 char
    startIdx = 0
    sentences = []
    sentence = get_valid_arabic_text_without_punctuation(sentence)
    while(startIdx < len(sentence)):
        finalIdx = startIdx + SENTENCE_WINDOW if startIdx + SENTENCE_WINDOW < len(sentence) else -1
        pre_modified_sentence = sentence[startIdx:finalIdx]
        startIdx+=SENTENCE_WINDOW

        while(len(pre_modified_sentence) > 0 and pre_modified_sentence[-1]!=' '):
            pre_modified_sentence = pre_modified_sentence[0:-2]
            startIdx-=1
        sentences.append(pre_modified_sentence)
    return sentences


In [11]:
def get_splitted_sentences(sentence):
    sentence = get_valid_arabic_text(sentence)
    #return [item.strip() for item in re.split(SPLITTING_PATTERN, sentence)]
    return get_sentences_window(sentence)

In [12]:
def separate_words_and_diacritics(sentence):
    sentences = get_splitted_sentences(sentence)
    
    final_chars = []
    final_diacritics = []

    for sentence in sentences:
        output_chars = []
        output_diacritics = []
        for word in sentence.split():
            letters = []
            diacritics = []
            prev_char = word[0]
            for idx, char in enumerate(word[1:]):
                try:
                    next_char = word[idx + 1 + 1]
                except IndexError:
                    next_char = ''
                if char in DIACRITICS:
                    if prev_char not in DIACRITICS:
                        letters.append(prev_char)
                        if next_char == '' or next_char not in DIACRITICS:
                            diacritics.append(char)
                        elif next_char in DIACRITICS:
                            diacritics.append(char + next_char)
                else:
                    if prev_char not in DIACRITICS:
                        letters.append(prev_char)
                        diacritics.append(OTHER)
                    if next_char == '':
                        letters.append(char)
                        diacritics.append(OTHER)
                prev_char = char

            if len(letters):
                output_chars.append(letters)
                output_diacritics.append(diacritics)

        final_chars.append([char for word in output_chars for char in word])
        final_diacritics.append([diacritic for word in output_diacritics for diacritic in word])

    final_chars = [item for item in final_chars if len(item)]
    final_diacritics = [item for item in final_diacritics if len(item)]
    return final_chars, final_diacritics

In [15]:
# sentence = ("( قَوْلُهُ لِعَدَمِ مَا تَتَعَلَّقُ إلَخْ ) أَيْ الْوَصِيَّةُ ( قَوْلُهُ مَا مَرَّ ) أَيْ قُبَيْلَ قَوْلِ الْمَتْنِ لَغَتْ وَلَوْ اقْتَصَرَ عَلَى أَوْصَيْت لَهُ بِشَاةٍ أَوْ أَعْطُوهُ شَاةً وَلَا غَنَمَ لَهُ عِنْدَ الْمَوْتِ هَلْ تَبْطُلُ الْوَصِيَّةُ أَوْ يُشْتَرَى لَهُ شَاةٌ وَيُؤْخَذُ مِنْ قَوْلِهِ الْآتِي كَمَا لَوْ لَمْ يَقُلْ مِنْ مَالِي وَلَا مِنْ غَنَمِي أَنَّهَا لَا تَبْطُلُ ، وَعِبَارَةُ الْكَنْزِ وَلَوْ لَمْ يَقُلْ مِنْ مَالِي وَلَا مِنْ غَنَمِي لَمْ يَتَعَيَّنْ غَنَمُهُ إنْ كَانَتْ انْتَهَتْ ا ه سم ( قَوْلُهُ فَيُعْطَى وَاحِدَةً مِنْهَا إلَخْ ) كَمَا لَوْ كَانَتْ مَوْجُودَةً عِنْدَ الْوَصِيَّةِ وَالْمَوْتِ ، وَلَا يَجُوزُ أَنْ يُعْطَى وَاحِدَةً مِنْ غَيْرِ غَنَمِهِ فِي الصُّورَتَيْنِ وَإِنْ تَرَاضَيَا ؛ لِأَنَّهُ صُلْحٌ عَلَى مَجْهُولٍ مُغْنِي وَنِهَايَةٌ قَالَ ع ش قَوْلُهُ وَاحِدَةً مِنْهَا أَيْ كَامِلَةً ، وَلَا يَجُوزُ أَنْ يُعْطَى نِصْفَيْنِ مِنْ شَاتَيْنِ ؛ لِأَنَّهُ لَا يُسَمَّى شَاةً وَقَوْلُهُ وَلَا يَجُوزُ أَنْ يُعْطَى وَاحِدَةً مِنْ غَيْرِ غَنَمِهِ وَيَنْبَغِي أَنْ يُقَالَ مِثْلُ ذَلِكَ فِي الْأَرِقَّاءِ ا ه .")
# sentence = "قَوْلُهُ : ( أَوْ قَطَعَ الْأَوَّلُ يَدَهُ إلَخْ ) قَالَ الزَّرْكَشِيُّ( 14 / 123 )"
sentence = "ابْنُ عَرَفَةَ : قَوْلُهُ : بِلَفْظٍ يَقْتَضِيه كَإِنْكَارِ غَيْرِ حَدِيثٍ بِالْإِسْلَامِ وُجُوبَ مَا عُلِمَ وُجُوبُهُ مِنْ الدِّينِ ضَرُورَةً ( كَإِلْقَاءِ مُصْحَفٍ بِقَذَرٍ وَشَدِّ زُنَّارٍ ) ابْنُ عَرَفَةَ : قَوْلُ ابْنِ شَاسٍ : أَوْ بِفِعْلٍ يَتَضَمَّنُهُ هُوَ كَلُبْسِ الزُّنَّارِ وَإِلْقَاءِ الْمُصْحَفِ فِي صَرِيحِ النَّجَاسَةِ وَالسُّجُودِ لِلصَّنَمِ وَنَحْوِ ذَلِكَ ( وَسِحْرٍ ) مُحَمَّدٌ : قَوْلُ مَالِكٍ وَأَصْحَابِهِ أَنَّ السَّاحِرَ كَافِرٌ بِاَللَّهِ تَعَالَى قَالَ مَالِكٌ : هُوَ كَالزِّنْدِيقِ إذَا عَمِلَ السِّحْرَ بِنَفْسِهِ قُتِلَ وَلَمْ يُسْتَتَبْ ."
char, dia = separate_words_and_diacritics(sentence)
# sentences = get_splitted_sentences(sentence)

# for splitted_sentence in sentences:
#     print(splitted_sentence)
#     print(len(splitted_sentence))
#     print("************")
for i in range(len(char)):
	print(len(char[i]))
	print(char[i])
	print("-"*10)

253
['ا', 'ب', 'ن', 'ع', 'ر', 'ف', 'ة', 'ق', 'و', 'ل', 'ه', 'ب', 'ل', 'ف', 'ظ', 'ي', 'ق', 'ت', 'ض', 'ي', 'ه', 'ك', 'إ', 'ن', 'ك', 'ا', 'ر', 'غ', 'ي', 'ر', 'ح', 'د', 'ي', 'ث', 'ب', 'ا', 'ل', 'إ', 'س', 'ل', 'ا', 'م', 'و', 'ج', 'و', 'ب', 'م', 'ا', 'ع', 'ل', 'م', 'و', 'ج', 'و', 'ب', 'ه', 'م', 'ن', 'ا', 'ل', 'د', 'ي', 'ن', 'ض', 'ر', 'و', 'ر', 'ة', 'ك', 'إ', 'ل', 'ق', 'ا', 'ء', 'م', 'ص', 'ح', 'ف', 'ب', 'ق', 'ذ', 'ر', 'و', 'ش', 'د', 'ز', 'ن', 'ا', 'ر', 'ا', 'ب', 'ن', 'ع', 'ر', 'ف', 'ة', 'ق', 'و', 'ل', 'ا', 'ب', 'ن', 'ش', 'ا', 'س', 'أ', 'و', 'ب', 'ف', 'ع', 'ل', 'ي', 'ت', 'ض', 'م', 'ن', 'ه', 'ه', 'و', 'ك', 'ل', 'ب', 'س', 'ا', 'ل', 'ز', 'ن', 'ا', 'ر', 'و', 'إ', 'ل', 'ق', 'ا', 'ء', 'ا', 'ل', 'م', 'ص', 'ح', 'ف', 'ف', 'ي', 'ص', 'ر', 'ي', 'ح', 'ا', 'ل', 'ن', 'ج', 'ا', 'س', 'ة', 'و', 'ا', 'ل', 'س', 'ج', 'و', 'د', 'ل', 'ل', 'ص', 'ن', 'م', 'و', 'ن', 'ح', 'و', 'ذ', 'ل', 'ك', 'و', 'س', 'ح', 'ر', 'م', 'ح', 'م', 'د', 'ق', 'و', 'ل', 'م', 'ا', 'ل', 'ك', 'و', 'أ', 'ص', 'ح', 'ا', 'ب', 'ه', 'أ', 'ن', 'ا', 'ل', 