In [123]:
import re
import pickle

In [169]:
OTHER = ' '
DIACRITICS = [OTHER, "َ", "ً", "ُ", "ٌ", "ِ", "ٍ", "ْ", "ّ", "َّ", "ًّ", "ُّ", "ٌّ", "ِّ", "ٍّ"]
VOWEL_REGEX = re.compile('|'.join(DIACRITICS))
SENTENCE_WINDOW = 600
WINDOW_SIZE_BEFORE = 2
WINDOW_SIZE_AFTER = 3

MAIN_DIACRITICS = None
with open("./utils/diacritics.pickle","rb") as file:
    MAIN_DIACRITICS = list(pickle.load(file))

PUNCTUATIONS = [".", "،", ":", "؛", "؟"]
SPLITTING_PATTERN = re.compile(r"[\.،:؛!؟]")

In [125]:
def readFile(path):
	sentences = []
	with open(path, 'r', encoding='utf-8') as file:
		for line in file:
			sentences.append(line.strip())

	return sentences

In [186]:
'''
Getting Basic Arabic Letters based on their unicodes
'''
basic_arabic_start1 = 0x0621
basic_arabic_end1 = 0x063A
basic_arabic_start2 = 0x0641
basic_arabic_end2 = 0x064A

basic_arabic_letters = None
with open("./utils/arabic_letters.pickle","rb") as file:
    basic_arabic_letters = list(pickle.load(file))

VALID_ARABIC_CHARS = basic_arabic_letters + MAIN_DIACRITICS  + PUNCTUATIONS +[' ']
VALID_ARABIC_CHARS_WITHOUT_PUNCTUATION = basic_arabic_letters + MAIN_DIACRITICS +[' ']

In [127]:
char_to_index = {char: i + 1 for i, char in enumerate(basic_arabic_letters)}
diacritic_to_index = {diacritic: i for i, diacritic in enumerate(DIACRITICS)}
index_to_diacritic = {index: diacritic for diacritic, index in diacritic_to_index.items()}

In [128]:
WHITESPACES_PATTERN = re.compile("\s+")
def combine_whitespaces(text):
    return re.sub(WHITESPACES_PATTERN, " ",text)

In [129]:
def get_valid_arabic_letters(text):
    text = list(filter(lambda char: char in basic_arabic_letters,text))
    return combine_whitespaces(''.join(text))

In [130]:
def get_valid_arabic_text(text):
    text = list(filter(lambda char: char in VALID_ARABIC_CHARS,text))
    return combine_whitespaces(''.join(text))

In [131]:
def get_valid_arabic_text_without_punctuation(text):
    text = list(filter(lambda char: char in VALID_ARABIC_CHARS_WITHOUT_PUNCTUATION,text))
    return combine_whitespaces(''.join(text))

def separate_words_to_char(sentence):
    sentence = get_valid_arabic_text_without_punctuation(sentence)

    letters = []
    for word in sentence.split():
        for char in word:
            letters.append(char)

    return letters

In [170]:
def get_sentences_window(sentence):
    #Fel bdaya ehna 3ayzeen n2smha 600 char 600 char
    startIdx = 0
    sentences = []
    sentence = get_valid_arabic_text_without_punctuation(sentence)

    while(startIdx < len(sentence) and startIdx != -1):

        finalIdx = startIdx + SENTENCE_WINDOW if startIdx + SENTENCE_WINDOW < len(sentence) else -1

        #print("startIdx: ",startIdx,"lastIdx: ",finalIdx)

        pre_modified_sentence = sentence[startIdx:finalIdx]
        

        startIdx = finalIdx
        
        sentences.append(pre_modified_sentence)

    return sentences

In [144]:
def get_all_windows(sentence):
    windows =[]
    sentence = get_valid_arabic_text_without_punctuation(sentence)
    sentence = sentence.split()
    for i in range(len(sentence)):
        start_idx = max(0,i-WINDOW_SIZE_BEFORE)
        last_index = min(len(sentence),i+WINDOW_SIZE_AFTER+1)
        windows.append(sentence[start_idx:last_index])
    return windows

In [163]:
def get_splitted_sentences(sentence):
    sentence = get_valid_arabic_text(sentence)
    #return [item.strip() for item in re.split(SPLITTING_PATTERN, sentence)]
    windows = get_sentences_window(sentence)
    return windows

In [146]:
def separate_words_and_diacritics(sentence):
    sentences = get_splitted_sentences(sentence)
    
    final_chars = []
    final_diacritics = []

    for sentence in sentences:
        output_chars = []
        output_diacritics = []
        for word in sentence.split():
            letters = []
            diacritics = []
            prev_char = word[0]
            for idx, char in enumerate(word[1:]):
                try:
                    next_char = word[idx + 1 + 1]
                except IndexError:
                    next_char = ''
                if char in DIACRITICS:
                    if prev_char not in DIACRITICS:
                        letters.append(prev_char)
                        if next_char == '' or next_char not in DIACRITICS:
                            diacritics.append(char)
                        elif next_char in DIACRITICS:
                            diacritics.append(char + next_char)
                else:
                    if prev_char not in DIACRITICS:
                        letters.append(prev_char)
                        diacritics.append(OTHER)
                    if next_char == '':
                        letters.append(char)
                        diacritics.append(OTHER)
                prev_char = char

            if len(letters):
                output_chars.append(letters)
                output_diacritics.append(diacritics)

        final_chars.append([char for word in output_chars for char in word])
        final_diacritics.append([diacritic for word in output_diacritics for diacritic in word])

    final_chars = [item for item in final_chars if len(item)]
    final_diacritics = [item for item in final_diacritics if len(item)]
    return final_chars, final_diacritics

In [185]:
sentence = ("وَ ) أَنْ يَجِدَ ( مَعَ ) بُعْدِ مَحَلِّ جِهَادٍ ( مَسَافَةِ قَصْرِ ) فَأَكْثَرَ مِنْ بَلَدِهِ ( مَا يَحْمِلُهُ ) لِقَوْلِهِ تَعَالَى : { وَلَا عَلَى الَّذِينَ إذَا مَا أَتَوْكَ لِتَحْمِلَهُمْ قُلْتَ لَا أَجِدُ مَا أَحْمِلُكُمْ عَلَيْهِ } الْآيَةَ .")

#char, dia = separate_words_and_diacritics(sentence)
sentences = get_splitted_sentences(sentence)
for i,_ in enumerate(sentences):
    print(sentences[i])

# windows = get_all_windows(sentence)

# for window in windows :
#     print(window)
#     print("________________")

وَأَنْيَجِدَمَعَبُعْدِمَحَلِّجِهَادٍمَسَافَةِقَصْرِفَأَكْثَرَمِنْبَلَدِهِمَايَحْمِلُهُلِقَوْلِهِتَعَالَىوَلَاعَلَىالَّذِينَإذَامَاأَتَوْكَلِتَحْمِلَهُمْقُلْتَلَاأَجِدُمَاأَحْمِلُكُمْعَلَيْهِالْآيَة


In [190]:
training_letters = 0
preprocessed_letters = 0

TEST_PATH = "../dataset/val.txt"


#calculating training letters

test_corpus = readFile(TEST_PATH)

X_test = []
total_len = 0
cnt = 0

print(test_corpus[10])
print(get_splitted_sentences(test_corpus[10]))

looping_cnt = 0
for sentence in test_corpus[10:11]:
    char_list, _ = separate_words_and_diacritics(sentence.strip())
    char_list_second = get_valid_arabic_letters(sentence.strip())

    for i in range(len(char_list)):
        cnt+=len(char_list[i])



'''
real_cnt = 0
for sentence in test_corpus[10:11]:
    char_list = get_valid_arabic_letters(sentence.strip())
    real_cnt+=len(char_list)

print(real_cnt)
'''
print(cnt)


( قَوْلُهُ وَبِشَجَرَةٍ مَا يَدْخُلُ إلَخْ ) عَطْفٌ عَلَى قَوْلِهِ بِدَابَّةٍ نَحْوِ حَمْلٍ إلَخْ ا ه سم ( قَوْلُهُ وَيَجِبُ بَقَاؤُهُ إلَخْ ) أَيْ بِخِلَافِ الثَّمَرَةِ الْمُؤَبَّرَةِ وَقْتَ الْوَصِيَّةِ وَالْحَادِثَةِ بَعْدَهَا قَبْلَ مَوْتِ الْمُوصِي فَإِنَّهَا لِلْوَارِثِ ا ه ع ش ( قَوْلُهُ بَقَاؤُهُ ) عِبَارَةُ النِّهَايَةِ إبْقَاؤُهُ مِنْ الْأَفْعَالِ وَهِيَ أَحْسَنُ ( قَوْلُهُ وَنَظِيرُ إلَخْ ) مُبْتَدَأٌ خَبَرُهُ قَوْلُهُ مَا لَوْ أَوْصَى إلَخْ ( قَوْلُهُ اعْتِبَارِ الْوَصِيَّةِ ) أَيْ وَقْتِهَا ( قَوْلُهُ وَهِيَ ) أَيْ الْوَصِيَّةُ مُبْتَدَأٌ وَقَوْلُهُ بِمَا تَحْمِلُهُ أَيْ كُلٌّ مِنْ الدَّابَّةِ وَالشَّجَرَةِ مُتَعَلِّقٌ بِهِ ، وَقَوْلُهُ لِكُلِّ حَمْلٍ أَيْ شَامِلٍ لَهُ خَبَرُهُ عِبَارَةُ الْمُغْنِي وَإِذَا أَوْصَى بِمَا يَحْدُثُ هَذَا الْعَامَ أَوْ كُلَّ عَامٍ عُمِلَ بِهِ ، وَإِنْ أَطْلَقَ فَقَالَ أَوْصَيْت بِمَا يَحْدُثُ فَهَلْ يَعُمُّ كُلَّ سَنَةٍ أَوْ يَخْتَصُّ بِالسَّنَةِ الْأُولَى قَالَ ابْنُ الرِّفْعَةِ الظَّاهِرُ الْعُمُومُ ، وَسَكَتَ عَلَيْهِ السُّبْكِيُّ وَهُوَ ظَ

In [171]:
sentence = ("قَوْلُهُ : ( وَبَحَثَ الرَّافِعِيُّ صِحَّتَهَا ) وَإِنْ قَصَدَ تَمْلِيكَ الْمَسْجِدِ وَهُوَ الْمُعْتَمَدُ ، وَعُلِمَ مِنْ تَعْلِيلِهِ بِأَنَّ( 9 / 482 )")
print(len(sentence))
#char, dia = separate_words_and_diacritics(sentence)
sentences = get_splitted_sentences(sentence)
print(sentences)

# windows = get_all_windows(sentence)

# for window in windows :
#     print(window)
#     print("________________")

for i,splitted_sentence in enumerate(sentences):
    print(splitted_sentence)
    print(len(splitted_sentence))
    print("************")



152
['قَوْلُهُوَبَحَثَالرَّافِعِيُّصِحَّتَهَاوَإِنْقَصَدَتَمْلِيكَالْمَسْجِدِوَهُوَالْمُعْتَمَدُوَعُلِمَمِنْتَعْلِيلِهِبِأَنّ']
قَوْلُهُوَبَحَثَالرَّافِعِيُّصِحَّتَهَاوَإِنْقَصَدَتَمْلِيكَالْمَسْجِدِوَهُوَالْمُعْتَمَدُوَعُلِمَمِنْتَعْلِيلِهِبِأَنّ
119
************
