In [2]:
import re
import pickle

In [22]:
# Label recorded for a letter that carries no diacritic.
OTHER = ' '
# Diacritic labels. Index 0 is the "no diacritic" label (OTHER), followed by
# single-character marks and then two-character entries.
# NOTE(review): the two-character entries look like each mark combined with
# shadda -- confirm the code-point order matches what the labelling code emits.
DIACRITICS = [OTHER, "َ", "ً", "ُ", "ٌ", "ِ", "ٍ", "ْ", "ّ", "َّ", "ًّ", "ُّ", "ٌّ", "ِّ", "ٍّ"]
# Alternation of every DIACRITICS entry, in list order.
# NOTE(review): regex alternation is tried left to right, so the single-character
# entries (and the space from OTHER) are tried before the two-character entries;
# a combined mark is therefore matched as its first character only. This pattern
# is not used in the visible code -- confirm the intended behavior before using it.
VOWEL_REGEX = re.compile('|'.join(DIACRITICS))
# Target chunk length, in characters, used by get_sentences_window.
SENTENCE_WINDOW = 800
# Number of words kept before / after the center word by get_all_windows.
WINDOW_SIZE_BEFORE = 2
WINDOW_SIZE_AFTER = 3

# Diacritic characters loaded from disk and used to build the valid-character lists.
# NOTE(review): pickle.load can execute arbitrary code; only load this file from
# a trusted location.
MAIN_DIACRITICS = None
with open("./utils/diacritics.pickle","rb") as file:
    MAIN_DIACRITICS = list(pickle.load(file))

# Punctuation characters kept by get_valid_arabic_text.
# NOTE(review): SPLITTING_PATTERN also matches "!", which is absent from
# PUNCTUATIONS; the pattern is only referenced from commented-out code here.
PUNCTUATIONS = [".", "،", ":", "؛", "؟"]
SPLITTING_PATTERN = re.compile(r"[\.،:؛!؟]")

In [23]:
def readFile(path):
    """Read a UTF-8 text file and return its lines as a list.

    Each line has leading and trailing whitespace (including the newline)
    stripped; blank lines are kept as empty strings.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        return [raw_line.strip() for raw_line in handle]

In [24]:
'''
Getting Basic Arabic Letters based on their unicodes
'''
# NOTE(review): the four code-point bounds below are not referenced anywhere in
# the visible code; the letter list is actually loaded from a pickle file.
basic_arabic_start1 = 0x0621
basic_arabic_end1 = 0x063A
basic_arabic_start2 = 0x0641
basic_arabic_end2 = 0x064A

# Letters loaded from disk; used for filtering text and for char_to_index.
# NOTE(review): pickle.load can execute arbitrary code; only load this file from
# a trusted location.
basic_arabic_letters = None
with open("./utils/arabic_letters.pickle","rb") as file:
    basic_arabic_letters = list(pickle.load(file))

# Characters that survive filtering: letters, diacritics and a space, with or
# without the punctuation marks. These are lists, so membership tests are linear.
VALID_ARABIC_CHARS = basic_arabic_letters + MAIN_DIACRITICS  + PUNCTUATIONS +[' ']
VALID_ARABIC_CHARS_WITHOUT_PUNCTUATION = basic_arabic_letters + MAIN_DIACRITICS +[' ']

In [25]:
# Letter -> 1-based position in basic_arabic_letters (index 0 is left unassigned).
char_to_index = {
    letter: position
    for position, letter in enumerate(basic_arabic_letters, start=1)
}
# Diacritic label -> 0-based position in DIACRITICS, and the inverse lookup.
diacritic_to_index = {label: position for position, label in enumerate(DIACRITICS)}
index_to_diacritic = {position: label for label, position in diacritic_to_index.items()}

In [26]:
# Raw string: "\s" in a normal string literal is an invalid escape sequence and
# triggers a DeprecationWarning / SyntaxWarning on recent Python versions.
WHITESPACES_PATTERN = re.compile(r"\s+")
def combine_whitespaces(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is collapsed too, not removed.
    """
    return WHITESPACES_PATTERN.sub(" ", text)

In [27]:
def get_valid_arabic_letters(text):
    """Keep only the characters of ``text`` found in ``basic_arabic_letters``.

    The surviving characters are joined back into a string and passed through
    ``combine_whitespaces``.
    """
    kept = ''.join(symbol for symbol in text if symbol in basic_arabic_letters)
    return combine_whitespaces(kept)

In [28]:
def get_valid_arabic_text(text):
    """Keep only the characters of ``text`` found in ``VALID_ARABIC_CHARS``.

    That list holds the letters, diacritics, punctuation marks and the space.
    Whitespace runs in the result are collapsed by ``combine_whitespaces``.
    """
    kept = ''.join(symbol for symbol in text if symbol in VALID_ARABIC_CHARS)
    return combine_whitespaces(kept)

In [29]:
def get_valid_arabic_text_without_punctuation(text):
    """Keep only letters, diacritics and spaces from ``text``.

    Characters are tested against ``VALID_ARABIC_CHARS_WITHOUT_PUNCTUATION``;
    whitespace runs in the result are collapsed by ``combine_whitespaces``.
    """
    kept = ''.join(
        symbol for symbol in text
        if symbol in VALID_ARABIC_CHARS_WITHOUT_PUNCTUATION
    )
    return combine_whitespaces(kept)

def separate_words_to_char(sentence):
    """Return every non-space character of the filtered sentence as a flat list.

    The sentence is first reduced to letters, diacritics and spaces, then split
    on whitespace; the characters of each word are emitted in order.
    """
    cleaned = get_valid_arabic_text_without_punctuation(sentence)
    return [symbol for word in cleaned.split() for symbol in word]

In [30]:
def get_sentences_window(sentence):
    """Split a sentence into chunks of roughly ``SENTENCE_WINDOW`` characters.

    The sentence is first reduced to letters, diacritics and spaces. Each chunk
    ends just before the last space found at or before the window limit, so
    words are never cut in half; every chunk after the first therefore starts
    with that space. Concatenating the chunks reproduces the filtered sentence.

    If a window contains no usable space (a single token longer than the
    window), the chunk is extended forward to the next space, or to the end of
    the text, instead of looping forever on an empty chunk.

    Returns:
        list[str]: the chunks in order; empty when the filtered sentence is empty.
    """
    # Originally: "first we want to split it into fixed-size character chunks".
    sentence = get_valid_arabic_text_without_punctuation(sentence)
    total = len(sentence)
    chunks = []
    start = 0

    while start < total:
        limit = start + SENTENCE_WINDOW
        if limit >= total:
            # The remainder fits in one window: take it all and stop.
            chunks.append(sentence[start:])
            break

        # Kept from the original: step back off a diacritic sitting on the limit.
        if sentence[limit] in MAIN_DIACRITICS:
            limit -= 1

        # Last space strictly after ``start`` and no later than ``limit``.
        # Excluding ``start`` itself is what guarantees forward progress.
        cut = sentence.rfind(' ', start + 1, limit + 1)
        if cut == -1:
            # Oversized token: extend to the next space rather than splitting it.
            cut = sentence.find(' ', limit + 1)
            if cut == -1:
                chunks.append(sentence[start:])
                break

        chunks.append(sentence[start:cut])
        start = cut

    return chunks

In [31]:
def get_all_windows(sentence):
    """Build one context window of words around every word of the sentence.

    The sentence is filtered to letters, diacritics and spaces and split into
    words. For the word at position ``i`` the window holds up to
    ``WINDOW_SIZE_BEFORE`` preceding words, the word itself and up to
    ``WINDOW_SIZE_AFTER`` following words, clipped at the sentence boundaries.

    Returns:
        list[list[str]]: one window per word, in word order.
    """
    words = get_valid_arabic_text_without_punctuation(sentence).split()
    word_count = len(words)
    return [
        words[max(0, center - WINDOW_SIZE_BEFORE):min(word_count, center + WINDOW_SIZE_AFTER + 1)]
        for center in range(word_count)
    ]

In [32]:
def get_splitted_sentences(sentence):
    """Filter a sentence to valid Arabic text and cut it into window-sized chunks.

    No splitting on punctuation is performed here; chunking is delegated
    entirely to ``get_sentences_window``, which filters the text again and
    drops the punctuation kept by ``get_valid_arabic_text``.
    """
    return get_sentences_window(get_valid_arabic_text(sentence))

In [33]:
def separate_words_and_diacritics(sentence):
    """Split a diacritized sentence into parallel letter and diacritic lists.

    The sentence is chunked with ``get_splitted_sentences``. Within each chunk
    every word is scanned character by character: a letter followed by one
    diacritic gets that diacritic as its label, a letter followed by two
    diacritics gets their concatenation, and a letter with none gets ``OTHER``.
    A one-character word is always recorded as a letter labelled ``OTHER``.

    Returns:
        tuple[list[list[str]], list[list[str]]]: ``(chars, diacritics)`` with one
        flat list per non-empty chunk; the two lists are aligned item by item.
    """
    all_letters = []
    all_labels = []

    for chunk in get_splitted_sentences(sentence):
        chunk_letters = []
        chunk_labels = []

        for word in chunk.split():
            if len(word) == 1:
                chunk_letters.append(word[0])
                chunk_labels.append(OTHER)
                continue

            word_length = len(word)
            previous = word[0]
            for position in range(1, word_length):
                current = word[position]
                # Empty string marks "no following character".
                following = word[position + 1] if position + 1 < word_length else ''
                previous_is_letter = previous not in DIACRITICS

                if current in DIACRITICS:
                    # Only the first diacritic after a letter emits an entry;
                    # a second one was already folded into the combined label.
                    if previous_is_letter:
                        chunk_letters.append(previous)
                        if following != '' and following in DIACRITICS:
                            chunk_labels.append(current + following)
                        else:
                            chunk_labels.append(current)
                else:
                    # The previous letter had no diacritic of its own.
                    if previous_is_letter:
                        chunk_letters.append(previous)
                        chunk_labels.append(OTHER)
                    # A trailing letter has nothing after it to label it.
                    if following == '':
                        chunk_letters.append(current)
                        chunk_labels.append(OTHER)

                previous = current

        all_letters.append(chunk_letters)
        all_labels.append(chunk_labels)

    # Drop chunks that produced no characters at all.
    non_empty_letters = [entry for entry in all_letters if len(entry)]
    non_empty_labels = [entry for entry in all_labels if len(entry)]
    return non_empty_letters, non_empty_labels

### Sanity check: letter counts before and after preprocessing

In [38]:
def check_data_sizes(path="../dataset/train.txt"):
    """Check that preprocessing keeps every letter of the corpus.

    For each line of the file at ``path`` two counts are accumulated:

    * the number of characters returned by ``separate_words_and_diacritics``
      (summed over all chunks), and
    * the number of characters returned by ``get_valid_arabic_letters``.

    The reference count is printed first, then the preprocessed count; equal
    numbers mean no letter was lost or duplicated by chunking.

    Args:
        path: corpus file to check; defaults to the training set.
    """
    corpus = readFile(path)

    real_cnt = 0
    cnt = 0
    for sentence in corpus:
        stripped = sentence.strip()
        char_list, _ = separate_words_and_diacritics(stripped)
        cnt += sum(len(chunk) for chunk in char_list)
        real_cnt += len(get_valid_arabic_letters(stripped))

    print(real_cnt)
    print(cnt)

In [39]:
# check_data_sizes()

8351478
8351478
