### Feature Extraction

In [9]:
#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from pyarabic.araby import strip_tashkeel

from nltk.tokenize import word_tokenize
import nltk
import qalsadi.lemmatizer 
import qalsadi.analex as qa

from farasa.pos import FarasaPOSTagger 
from farasa.ner import FarasaNamedEntityRecognizer 
from farasa.diacratizer import FarasaDiacritizer 
from farasa.segmenter import FarasaSegmenter 
from farasa.stemmer import FarasaStemmer
import pickle


## Segment words feature

In [2]:
def segment_words(words: list) -> list:
    """Build a positional tag string for every word, one tag per character.

    Tags used:
        'S' - the only character of a single-character word
        'B' - first character of a longer word
        'I' - any interior character
        'E' - last character of a longer word

    Args:
        words: list of word strings.

    Returns:
        A list with one tag string per input word, in the same order. Each
        tag string has exactly as many characters as its word; an empty word
        therefore maps to an empty tag string (it used to produce 'BE').
    """
    res = []
    for word in words:
        length = len(word)
        if length == 0:
            # Keep the output aligned with the input instead of inventing tags
            # for characters that do not exist.
            res.append('')
        elif length == 1:
            res.append('S')
        else:
            res.append('B' + 'I' * (length - 2) + 'E')
    return res


In [3]:
# Read every line of the stripped-words training file.
with open('./Dataset/training/train_words_stripped.txt', 'r', encoding='utf-8') as file:
    train_data = file.readlines()

# Remove surrounding whitespace (including the trailing newline) from each line.
train_words_stripped = [line.strip() for line in train_data]


In [5]:
# Tag every training word with its per-character position string (S / B-I-E).
segmented_train_words = segment_words(train_words_stripped)


['BIIE', 'BE', 'BIE', 'BIIIE', 'BIE', 'BIE', 'BIE', 'BIIIIIE', 'BIE', 'BIIE']
['عرفة', 'ابن', 'الزركشي', 'قال', 'إلخ', 'يده', 'الأول', 'قطع', 'أو', 'قوله']
قوله
BIIE


In [7]:
# Preview the first ten tag strings.
print(segmented_train_words[:10])
# Same ten words in reverse order so that, with right-to-left rendering,
# the Arabic words line up with the tags above (display purposes only).
print(train_words_stripped[9::-1])
# Spot-check the first two word / tag pairs.
for sample_idx in range(2):
    print(train_words_stripped[sample_idx])
    print(segmented_train_words[sample_idx])



['BIIE', 'BE', 'BIE', 'BIIIE', 'BIE', 'BIE', 'BIE', 'BIIIIIE', 'BIE', 'BIIE']
['عرفة', 'ابن', 'الزركشي', 'قال', 'إلخ', 'يده', 'الأول', 'قطع', 'أو', 'قوله']
قوله
BIIE
أو
BE


# Diacritics
# {  ْ   , ّ   ,  ً   ,  َ   ,    ُ   ,  ِ    ,  ٍ   , ٌ    }

# Golden
## { َ  : 0, ً : 1, ُ : 2, ٌ : 3, ِ  : 4, ٍ  : 5, ْ : 6, ّ  : 7, ّ َ  : 8, ّ ً : 9, ّ ُ : 10, ّ ٌ : 11, ّ ِ  : 12,  ّ ٍ : 13, '': 14}

In [14]:

def _load_pickle(path):
    """Deserialize and return the object stored in the pickle file at `path`."""
    # NOTE: unpickling can execute arbitrary code; only load trusted project files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Lookup tables prepared ahead of time and stored under ./pickles.
diacritics = _load_pickle('./pickles/diacritics.pickle')
diacritic2id = _load_pickle('./pickles/diacritic2id.pickle')
arabic_letters = _load_pickle('./pickles/arabic_letters.pickle')

# Show each table, then the number of letters.
for lookup in (diacritics, diacritic2id, arabic_letters):
    print(lookup)

print(len(arabic_letters))


{'ْ', 'ّ', 'ً', 'َ', 'ُ', 'ِ', 'ٍ', 'ٌ'}
{'َ': 0, 'ً': 1, 'ُ': 2, 'ٌ': 3, 'ِ': 4, 'ٍ': 5, 'ْ': 6, 'ّ': 7, 'َّ': 8, 'ًّ': 9, 'ُّ': 10, 'ٌّ': 11, 'ِّ': 12, 'ٍّ': 13, '': 14}
{'ت', 'ث', 'ك', 'ح', 'ب', 'ن', 'غ', 'م', 'ص', 'إ', 'ع', 'س', 'آ', 'ف', 'خ', 'ة', 'ض', 'أ', 'ق', 'ظ', 'ج', 'ى', 'ئ', 'ؤ', 'ء', 'ذ', 'ز', 'ش', 'ا', 'ط', 'د', 'ل', 'و', 'ه', 'ر', 'ي'}
36


In [15]:
# This function builds a dictionary that maps each letter to binary flags marking which diacritic classes appear after it.
# There are 36 letters (28 letters + 8 special characters) and 15 diacritic classes (14 diacritics + "no diacritic").
# The function returns a dictionary from each letter to a list of 15 binary values.
# UTF-8 encoding is used for the letters.
# For double diacritics we check for the Arabic numerals listed below:
#١ is shadda + tanween fatha
#٢ is shadda + tanween damma
#٣ is shadda + tanween kasra
#٤ is shadda + fatha
#٥ is shadda + damma
#٦ is shadda + kasra

# diacritic2id has 15 keys (the 14 diacritics plus "" for none) with ids from 0 to 14
# arabic_letters is a set of 36 letters

# Read every line of the "replaced" training file.
with open('./Dataset/training/train_words_replaced.txt', 'r', encoding='utf-8') as file:
    train_replace = file.readlines()

# One whitespace-trimmed entry per line of the file.
list_of_words = [sentence.strip() for sentence in train_replace]

# Arabic numerals that stand for a shadda + second-mark pair, mapped to the
# class id of that combined diacritic (same ids the original branches used).
_SHADDA_DIGIT_IDS = {
    '١': 9,   # shadda + tanween fatha
    '٢': 11,  # shadda + tanween damma
    '٣': 13,  # shadda + tanween kasra
    '٤': 8,   # shadda + fatha
    '٥': 10,  # shadda + damma
    '٦': 12,  # shadda + kasra
}
_NUM_DIACRITIC_CLASSES = 15  # ids 0..14
_NO_DIACRITIC_ID = 14        # id of '' ("no diacritic")


def _diacritic_class_after(word: str, i: int) -> int:
    """Return the diacritic class id attached to the letter at ``word[i]``."""
    nxt = word[i + 1] if i + 1 < len(word) else ''
    # A replacement digit directly after the letter encodes a shadda pair.
    if nxt in _SHADDA_DIGIT_IDS:
        return _SHADDA_DIGIT_IDS[nxt]
    # End of text or a non-diacritic character: the letter carries no mark.
    if not nxt or nxt not in diacritics:
        return _NO_DIACRITIC_ID
    following = word[i + 2] if i + 2 < len(word) else ''
    if following and following in diacritics:
        # Two raw marks in a row: use the combined class when diacritic2id
        # knows the pair (either ordering), otherwise fall back to the first mark.
        for pair in (nxt + following, following + nxt):
            if pair in diacritic2id:
                return diacritic2id[pair]
    return diacritic2id[nxt]


def get_letter_diacritics_appearance(list_of_words: list) -> dict:
    """Record, per letter, which diacritic classes were seen right after it.

    Relies on the module-level ``arabic_letters``, ``diacritics`` and
    ``diacritic2id`` objects loaded earlier in the notebook.

    Args:
        list_of_words: strings to scan (words or whole lines). Characters
            not in ``arabic_letters`` are skipped.

    Returns:
        A dict mapping each letter that occurs in the input to a list of 15
        binary flags; flag ``k`` is 1 when diacritic class ``k`` appeared
        after that letter at least once. Slot 14 means the letter appeared
        with no diacritic.
    """
    dictionary = {}
    for word in list_of_words:
        for i, char in enumerate(word):
            if char not in arabic_letters:
                continue
            flags = dictionary.setdefault(char, [0] * _NUM_DIACRITIC_CLASSES)
            flags[_diacritic_class_after(word, i)] = 1
    return dictionary


                

             
        

    
    
    

['قَوْلُهُ أَوْ قَطَعَ الْأَو٤لُ يَدَهُ إلَخْ قَالَ الز٤رْكَشِي٥', 'ابْنُ عَرَفَةَ قَوْلُهُ بِلَفْظٍ يَقْتَضِيه كَإِنْكَارِ غَيْرِ حَدِيثٍ بِالْإِسْلَامِ وُجُوبَ مَا عُلِمَ وُجُوبُهُ مِنْ الد٦ينِ ضَرُورَةً كَإِلْقَاءِ مُصْحَفٍ بِقَذَرٍ وَشَد٦ زُن٤ارٍ ابْنُ عَرَفَةَ قَوْلُ ابْنِ شَاسٍ أَوْ بِفِعْلٍ يَتَضَم٤نُهُ هُوَ كَلُبْسِ الز٥ن٤ارِ وَإِلْقَاءِ الْمُصْحَفِ فِي صَرِيحِ الن٤جَاسَةِ وَالس٥جُودِ لِلص٤نَمِ وَنَحْوِ ذَلِكَ وَسِحْرٍ مُحَم٤دٌ قَوْلُ مَالِكٍ وَأَصْحَابِهِ أَن٤ الس٤احِرَ كَافِرٌ بِاَلل٤هِ تَعَالَى قَالَ مَالِكٌ هُوَ كَالز٦نْدِيقِ إذَا عَمِلَ الس٦حْرَ بِنَفْسِهِ قُتِلَ وَلَمْ يُسْتَتَبْ .', 'قَوْلُهُ لِعَدَمِ مَا تَتَعَل٤قُ إلَخْ أَيْ الْوَصِي٤ةُ قَوْلُهُ مَا مَر٤ أَيْ قُبَيْلَ قَوْلِ الْمَتْنِ لَغَتْ وَلَوْ اقْتَصَرَ عَلَى أَوْصَيْت لَهُ بِشَاةٍ أَوْ أَعْطُوهُ شَاةً وَلَا غَنَمَ لَهُ عِنْدَ الْمَوْتِ هَلْ تَبْطُلُ الْوَصِي٤ةُ أَوْ يُشْتَرَى لَهُ شَاةٌ وَيُؤْخَذُ مِنْ قَوْلِهِ الْآتِي كَمَا لَوْ لَمْ يَقُلْ مِنْ مَالِي وَلَا مِنْ غَنَمِي أَن٤هَا لَا تَبْطُلُ ، وَعِبَارَةُ الْكَن