### Feature Extraction

In [2]:
#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from pyarabic.araby import strip_tashkeel

from nltk.tokenize import word_tokenize
import nltk
import qalsadi.lemmatizer 
import qalsadi.analex as qa

from farasa.pos import FarasaPOSTagger 
from farasa.ner import FarasaNamedEntityRecognizer 
from farasa.diacratizer import FarasaDiacritizer 
from farasa.segmenter import FarasaSegmenter 
from farasa.stemmer import FarasaStemmer
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences




## Segment words feature

In [3]:
def segment_words(words: list) -> list:
    """Tag every character of each word with its position inside the word.

    Each word is turned into a tag string of the same length as the word:
    'S' for a single-character word, otherwise 'B' for the first character,
    'E' for the last one and 'I' for every character in between.

    Args:
        words: list of word strings.

    Returns:
        A list with one tag string per input word, in the same order.
        An empty word yields an empty tag string, so tags and characters
        always stay aligned one-to-one.
    """
    res = []
    for word in words:
        length = len(word)
        if length <= 1:
            # '' -> '' and a single character -> 'S'
            res.append('S' * length)
        else:
            res.append('B' + 'I' * (length - 2) + 'E')
    return res



In [None]:
def segment_sentences(sentence: str) -> list:
    """Tag the characters of every whitespace-separated token of a sentence.

    The punctuation marks '،' and '.' are copied through unchanged. Every
    other character is tagged by its position among the non-punctuation
    characters of its token: 'S' when it is the only one, otherwise 'B' for
    the first, 'E' for the last and 'I' for the ones in between. Tagging
    relative to the non-punctuation characters keeps the B/E markers on the
    real word boundaries even when punctuation is attached to the word
    (e.g. "abc." -> "BIE.").

    Args:
        sentence: the sentence to tag; it is split on whitespace.

    Returns:
        A list with one tag string per token, each as long as its token.
    """
    punctuation = ('،', '.')
    res = []
    for word in sentence.split():
        # Positions of the characters that actually belong to the word.
        letter_positions = [i for i, ch in enumerate(word) if ch not in punctuation]
        first = letter_positions[0] if letter_positions else None
        last = letter_positions[-1] if letter_positions else None

        tags = []
        for i, ch in enumerate(word):
            if ch in punctuation:
                tags.append(ch)
            elif first == last:
                tags.append('S')
            elif i == first:
                tags.append('B')
            elif i == last:
                tags.append('E')
            else:
                tags.append('I')
        res.append(''.join(tags))
    return res

In [4]:
# Read the stripped training-words file; readlines() gives one entry per line.
with open('./Dataset/training/train_words_stripped.txt', 'r', encoding='utf-8') as file:
    train_data = file.readlines()

# Remove the trailing newline and any surrounding whitespace from every entry.
train_words_stripped = [raw_line.strip() for raw_line in train_data]


In [5]:
# Build the positional tag string (B/I/E/S) for every training word.
segmented_train_words = segment_words(train_words_stripped)


In [6]:
# Preview the tags of the first ten words, followed by the words themselves.
print(segmented_train_words[:10])
# Reversed so the English tags and the Arabic words line up (printing purposes only).
print(train_words_stripped[9::-1])
# Show the first two words, each directly followed by its tag string.
for idx in (0, 1):
    print(train_words_stripped[idx])
    print(segmented_train_words[idx])



['BIIE', 'BE', 'BIE', 'BIIIE', 'BIE', 'BIE', 'BIE', 'BIIIIIE', 'BIE', 'BIIE']
['عرفة', 'ابن', 'الزركشي', 'قال', 'إلخ', 'يده', 'الأول', 'قطع', 'أو', 'قوله']
قوله
BIIE
أو
BE


# Diacritics
# {  ْ   , ّ   ,  ً   ,  َ   ,    ُ   ,  ِ    ,  ٍ   , ٌ    }

# Golden
## { َ  : 0, ً : 1, ُ : 2, ٌ : 3, ِ  : 4, ٍ  : 5, ْ : 6, ّ  : 7, ّ َ  : 8, ّ ً : 9, ّ ُ : 10, ّ ٌ : 11, ّ ِ  : 12,  ّ ٍ : 13, '': 14}

In [7]:

def _load_pickle(path: str):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE(review): unpickling can execute arbitrary code; only load pickle
    # files that were produced by this project.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


diacritics = _load_pickle('./pickles/diacritics.pickle')
diacritic2id = _load_pickle('./pickles/diacritic2id.pickle')
arabic_letters = _load_pickle('./pickles/arabic_letters.pickle')

# Print the three lookup structures, then the number of letters.
for loaded in (diacritics, diacritic2id, arabic_letters):
    print(loaded)

print(len(arabic_letters))


{'َ', 'ُ', 'ً', 'ِ', 'ٍ', 'ّ', 'ْ', 'ٌ'}
{'َ': 0, 'ً': 1, 'ُ': 2, 'ٌ': 3, 'ِ': 4, 'ٍ': 5, 'ْ': 6, 'ّ': 7, 'َّ': 8, 'ًّ': 9, 'ُّ': 10, 'ٌّ': 11, 'ِّ': 12, 'ٍّ': 13, '': 14}
{'ل', 'ض', 'ي', 'ا', 'ح', 'ش', 'أ', 'ز', 'غ', 'ظ', 'ء', 'ب', 'ؤ', 'و', 'خ', 'ر', 'إ', 'م', 'ذ', 'ى', 'ن', 'ئ', 'ق', 'ص', 'ت', 'ط', 'ج', 'ك', 'ث', 'ف', 'ع', 'آ', 'س', 'د', 'ة', 'ه'}
36


In [8]:
#get_letter_diacritics_appearance builds a dictionary that maps every letter to binary flags,
#one flag per diacritic class, set to 1 if that diacritic appears right after the letter
#there are 36 letters (28 letters + 8 special characters) and 15 diacritic classes (ids 0 to 14)
#the function returns a dictionary that maps each letter to a string of 15 binary digits
#utf-8 encoding is used for the letters
#for double diacritics we check for Arabic numerals
#١ is shadda + tanween fatha
#٢ is shadda + tanween damma
#٣ is shadda + tanween kasra
#٤ is shadda + fatha
#٥ is shadda + damma
#٦ is shadda + kasra

#diacritic2id has 15 keys with values from 0 to 14: the diacritics + "" (none)
#arabic_letters is a set of the 36 letters

# Read the "replaced" training-words file, one entry per line.
with open('./Dataset/training/train_words_replaced.txt', 'r', encoding='utf-8') as file:
    train_replace = file.readlines()

# Strip the newline / surrounding whitespace from every entry.
list_of_words = [raw_word.strip() for raw_word in train_replace]

def get_letter_diacritics_appearance(list_of_words: list, letters=None, marks=None, mark_ids=None) -> dict:
    """Record, for every letter, which diacritic classes were seen right after it.

    Each letter gets 15 binary flags (ids 0 to 14). A flag is set to 1 when,
    somewhere in ``list_of_words``, the character directly following the
    letter is:
      * a single diacritic from ``marks`` -> flag ``mark_ids[diacritic]``
      * an Arabic numeral standing for a shadda combination
        (١ -> 9, ٢ -> 11, ٣ -> 13, ٤ -> 8, ٥ -> 10, ٦ -> 12)
      * anything else, or nothing at all because the letter is the last
        character of the word -> flag 14 (no diacritic)

    Args:
        list_of_words: words to scan.
        letters: collection of characters treated as letters; defaults to the
            notebook-level ``arabic_letters``.
        marks: collection of single diacritic characters; defaults to the
            notebook-level ``diacritics``.
        mark_ids: mapping from a single diacritic to its flag index; defaults
            to the notebook-level ``diacritic2id``.

    Returns:
        A dict mapping every letter in ``letters`` to a 15-character string
        of '0'/'1' flags.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    letters = arabic_letters if letters is None else letters
    marks = diacritics if marks is None else marks
    mark_ids = diacritic2id if mark_ids is None else mark_ids

    vector_size = 15
    no_diacritic_id = 14
    # Numerals that stand for shadda + another diacritic.
    numeral_ids = {'١': 9, '٢': 11, '٣': 13, '٤': 8, '٥': 10, '٦': 12}

    flags = {letter: [0] * vector_size for letter in letters}

    for word in list_of_words:
        for position, char in enumerate(word):
            if char not in letters:
                continue
            if position + 1 == len(word):
                # Word-final letter with nothing after it: it carries no diacritic.
                flags[char][no_diacritic_id] = 1
                continue
            follower = word[position + 1]
            if follower in marks:
                flags[char][mark_ids[follower]] = 1
            elif follower in numeral_ids:
                flags[char][numeral_ids[follower]] = 1
            else:
                flags[char][no_diacritic_id] = 1

    return {letter: ''.join(map(str, bits)) for letter, bits in flags.items()}
    


# Golden
## { َ  : 0, ً : 1, ُ : 2, ٌ : 3, ِ  : 4, ٍ  : 5, ْ : 6, ّ  : 7, ّ َ  : 8, ّ ً : 9, ّ ُ : 10, ّ ٌ : 11, ّ ِ  : 12,  ّ ٍ : 13, '': 14}

In [9]:

# Compute the per-letter diacritic flags from the training words.
dictionary = get_letter_diacritics_appearance(list_of_words)

# Cache the result on disk.
with open('./pickles/letter_diacritics_appearance.pickle', 'wb') as file:
    pickle.dump(dictionary, file)


# Show the mapping and how many letters it covers.
print(dictionary)
print(len(dictionary))

{'ل': '111111111111111', 'ض': '111111111111111', 'ي': '111111111111111', 'ا': '101010101000001', 'ح': '111111111010111', 'ش': '111111111111111', 'أ': '111111100000001', 'ز': '111111111111111', 'غ': '111111101010101', 'ظ': '111111111111111', 'ء': '111111000000000', 'ب': '111111111111111', 'ؤ': '111111100000000', 'و': '111111111111111', 'خ': '111111111010111', 'ر': '111111111111111', 'إ': '000011000000001', 'م': '111111111111111', 'ذ': '111111111111111', 'ى': '101111101011111', 'ن': '111111111111111', 'ئ': '111111100000001', 'ق': '111111111111111', 'ص': '111111111111111', 'ت': '111111111111111', 'ط': '111111111111111', 'ج': '111111111111111', 'ك': '111111111111111', 'ث': '111111111111111', 'ف': '111111111111111', 'ع': '111111111010111', 'آ': '001000000000001', 'س': '111111111111111', 'د': '111111111111111', 'ة': '111111100000000', 'ه': '111111111011101'}
36


In [10]:
# for each sentence in the stripped sentences, and for each character in that sentence, we look up the character's diacritic appearance flags
# the flags of one sentence are joined into a single space-separated string, so we end up with a list of strings (one per sentence)

# Read the stripped training sentences, one per line.
with open('./Dataset/training/train_stripped.txt', 'r', encoding='utf-8') as file:
    train_sentences_replace = file.readlines()

# Strip the newline / surrounding whitespace from every sentence.
list_of_sentences = [raw_sentence.strip() for raw_sentence in train_sentences_replace]

print(list_of_sentences[0])


قوله أو قطع الأول يده إلخ قال الزركشي


In [12]:
# Read the stripped validation sentences, one per line.
with open('./Dataset/val/val_stripped.txt', 'r', encoding='utf-8') as file:
    val_sentences_replace = file.readlines()

# Strip the newline / surrounding whitespace from every sentence.
val_list_of_sentences = [raw_sentence.strip() for raw_sentence in val_sentences_replace]

In [13]:
def get_sentence_diacritics_appearance(list_of_sentences: list, letter_flags=None, letters=None) -> list:
    """Encode every sentence as a space-separated string of per-character flags.

    Each character of a sentence contributes one 15-digit flag string: the
    entry of ``letter_flags`` when the character is in ``letters``, otherwise
    fourteen '0's followed by a '1' (used for spaces and any other
    non-letter character).

    Args:
        list_of_sentences: sentences to encode.
        letter_flags: mapping from a letter to its flag string; defaults to
            the notebook-level ``dictionary``.
        letters: collection of characters treated as letters; defaults to the
            notebook-level ``arabic_letters``.

    Returns:
        A list with one encoded string per sentence, in the same order. An
        empty sentence yields an empty string.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    letter_flags = dictionary if letter_flags is None else letter_flags
    letters = arabic_letters if letters is None else letters

    non_letter_flags = '0' * 14 + '1'
    return [
        ' '.join(
            letter_flags[char] if char in letters else non_letter_flags
            for char in sentence
        )
        for sentence in list_of_sentences
    ]

In [14]:
# Encode every training sentence and cache the result as a pickle.
sentence_diacritics_appearance = get_sentence_diacritics_appearance(list_of_sentences)
with open('./pickles/sentence_diacritics_appearance.pickle', 'wb') as file:
    pickle.dump(sentence_diacritics_appearance, file)

In [15]:
# Encode every validation sentence the same way and cache the result as a pickle.
val_sentence_diacritics_appearance = get_sentence_diacritics_appearance(val_list_of_sentences)
with open('./pickles/val_sentence_diacritics_appearance.pickle', 'wb') as file:
    pickle.dump(val_sentence_diacritics_appearance, file)

In [16]:
# Preview the encoded form of the first two training sentences.
print(sentence_diacritics_appearance[0:2])

['111111111111111 111111111111111 111111111111111 111111111011101 000000000000001 111111100000001 111111111111111 000000000000001 111111111111111 111111111111111 111111111010111 000000000000001 101010101000001 111111111111111 111111100000001 111111111111111 111111111111111 000000000000001 111111111111111 111111111111111 111111111011101 000000000000001 000011000000001 111111111111111 111111111010111 000000000000001 111111111111111 101010101000001 111111111111111 000000000000001 101010101000001 111111111111111 111111111111111 111111111111111 111111111111111 111111111111111 111111111111111', '101010101000001 111111111111111 111111111111111 000000000000001 111111111010111 111111111111111 111111111111111 111111100000000 000000000000001 111111111111111 111111111111111 111111111111111 111111111011101 000000000000001 111111111111111 111111111111111 111111111111111 111111111111111 000000000000001 111111111111111 111111111111111 111111111111111 111111111111111 111111111111111 111111111011101 000

: 