In [108]:
import os
from enum import Enum
import re
import numpy as np
from pyarabic.araby import separate, tokenize, is_arabicrange, strip_tashkeel, strip_tatweel
import nltk
from nltk.tokenize import sent_tokenize
import tensorflow as tf
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential,load_model
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from gensim.models import Word2Vec

### reading and writing files

In [109]:
class FileReader:
    """Read and write UTF-8 text files inside the local ``dataset`` folder."""

    # Folder (relative to the current working directory) that holds every file
    # handled by this class.
    DATASET_DIR = "dataset"

    # write the data inside the file with file_name
    def write_file(self, file_name, data):
        """Write ``data`` to ``dataset/<file_name>``, replacing any existing file.

        Args:
            file_name: Name of the file inside the dataset folder.
            data: Text to write; it is encoded as UTF-8.
        """
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test and is a no-op when the folder is already there.
        os.makedirs(self.DATASET_DIR, exist_ok=True)
        # Combine folder and file path
        file_path = os.path.join(self.DATASET_DIR, file_name)

        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(data)

    # open the file with file_name, extract the file data, and return it
    def open_file(self, file_name):
        """Return the full UTF-8 decoded content of ``dataset/<file_name>``.

        Args:
            file_name: Name of the file inside the dataset folder.

        Returns:
            The file content as a single string.

        Raises:
            FileNotFoundError: If the file does not exist.
        """
        file_path = os.path.join(self.DATASET_DIR, file_name)
        # The context manager closes the handle as soon as the content is read,
        # instead of leaving it open until garbage collection.
        with open(file_path, 'r', encoding="utf-8") as file:
            return file.read()


### data cleaning and processing utilities 


In [119]:
class ArabicDiacritics(Enum):
    """Diacritic marks, alone and combined with a leading shadda.

    Each value is the literal code-point sequence of the mark(s): one
    character for a single mark, two characters (shadda first) for the
    combined forms.
    """

    # Shadda (U+0651) followed by a second mark.
    SHADDA_FATHATAN = "\u0651\u064b"
    SHADDA_DAMMATAN = "\u0651\u064c"
    SHADDA_KASRATAN = "\u0651\u064d"
    SHADDA_FATHA = "\u0651\u064e"
    SHADDA_DAMMA = "\u0651\u064f"
    SHADDA_KASRA = "\u0651\u0650"
    SHADDA_SUKUN = "\u0651\u0652"

    # Single marks.
    SHADDA = "\u0651"
    FATHATAN = "\u064b"
    DAMMATAN = "\u064c"
    KASRATAN = "\u064d"
    FATHA = "\u064e"
    DAMMA = "\u064f"
    KASRA = "\u0650"
    SUKUN = "\u0652"

class ArabicCharacters(Enum):
    """Base characters, one member per code point, in code-point order.

    Covers U+0621 through U+063A, then U+0640 (tatweel) through U+064A.
    """

    # U+0621 .. U+063A
    HAMZA = "\u0621"
    ALEF_MADDA = "\u0622"
    ALEF_HAMZA_ABOVE = "\u0623"
    WAW_HAMZA = "\u0624"
    ALEF_HAMZA_BELOW = "\u0625"
    YEH_HAMZA = "\u0626"
    ALEF = "\u0627"
    BEH = "\u0628"
    TEH_MARBUTA = "\u0629"
    TEH = "\u062a"
    THEH = "\u062b"
    JEEM = "\u062c"
    HAH = "\u062d"
    KHAH = "\u062e"
    DAL = "\u062f"
    THAL = "\u0630"
    REH = "\u0631"
    ZAIN = "\u0632"
    SEEN = "\u0633"
    SHEEN = "\u0634"
    SAD = "\u0635"
    DAD = "\u0636"
    TAH = "\u0637"
    ZAH = "\u0638"
    AIN = "\u0639"
    GHAIN = "\u063a"

    # U+0640 .. U+064A
    TATWEEL = "\u0640"
    FEH = "\u0641"
    QAF = "\u0642"
    KAF = "\u0643"
    LAM = "\u0644"
    MEEM = "\u0645"
    NOON = "\u0646"
    HEH = "\u0647"
    WAW = "\u0648"
    ALEF_MAKSURA = "\u0649"
    YEH = "\u064a"

class Preprocessor:
    """Cleaning helpers for Arabic text.

    The three ``clean_data`` / ``remove_diacritics`` / ``remove_tarkeem``
    methods write their result to a file through a ``FileReader`` and return
    nothing; ``separate_diacritics`` works purely in memory.
    """

    # Characters deleted by remove_tarkeem (every entry is a single character).
    _ARABIC_PUNCTUATION = ['،', '٪', '؛', '؟', 'ـ']
    _ENGLISH_PUNCTUATION = [',', '.', '%', ':', ';', '?', '!', '-', '_', "'", '"', '(', ')', '[', ']', '{', '}']
    # Deletion table for str.translate, built once for the whole class.
    _PUNCTUATION_TABLE = str.maketrans("", "", "".join(_ARABIC_PUNCTUATION + _ENGLISH_PUNCTUATION))

    def __init__(self):
        # Keep the writer on the instance so the methods below do not depend on
        # a module-level variable having been created before they are called.
        self._file_reader = FileReader()

    # clean the arabic text from any non arabic characters and store the clean data inside a new output file
    def clean_data(self, data, output_file):
        """Keep only the tokens accepted by ``is_arabicrange`` and save them.

        Args:
            data: Text to clean.
            output_file: Name of the file the cleaned text is written to.
        """
        tokens = tokenize(data, conditions=is_arabicrange)
        cleaned_data = u" ".join(tokens)
        self._file_reader.write_file(output_file, cleaned_data)

    # remove diacritics from arabic text and store the new data inside a new file
    def remove_diacritics(self, data, output_file):
        """Run ``strip_tashkeel`` over ``data`` and save the result.

        Args:
            data: Text to process.
            output_file: Name of the file the stripped text is written to.
        """
        data_without_diacritics = strip_tashkeel(data)
        self._file_reader.write_file(output_file, data_without_diacritics)

    # remove all punctuation characters and store the result inside the output file
    def remove_tarkeem(self, data, output_file):
        """Delete the listed Arabic and English punctuation characters and save the result.

        Args:
            data: Text to process.
            output_file: Name of the file the result is written to.
        """
        # One linear pass instead of growing a string character by character.
        data_without_tarkeem = data.translate(self._PUNCTUATION_TABLE)
        self._file_reader.write_file(output_file, data_without_tarkeem)

    def separate_diacritics(self, arabic_text):
        """Split ``arabic_text`` into bare characters and their diacritics.

        Every character listed in ``ArabicCharacters`` and every space is kept
        in the returned text and contributes exactly one entry to the returned
        list: the run of diacritic characters that directly follows it, or
        ``''`` when there is none.

        Any other character matched by the pattern (a digit, a Latin letter, a
        diacritic that does not follow a letter or space, ...) is removed from
        the text and contributes no entry, so list entries stay paired with the
        kept letters and spaces. Line breaks are not matched by the pattern:
        they stay in the text and have no entry in the list.

        Args:
            arabic_text: Text that may contain diacritics.

        Returns:
            A ``(text, diacritics)`` tuple: the text without diacritics and the
            list of diacritic strings described above.
        """
        diacritics_list = []

        # internal function to replace diacritics with empty strings for the letters
        def diacritics_replacement(match):
            base_character = match.group(1)
            if base_character is None:
                # The catch-all "(.)" branch matched: drop the character and do
                # not record anything for it.
                return ""
            diacritics_list.append(match.group(2))
            return base_character

        # Define a pattern to match Arabic diacritics and shadda using the enum values
        letters = "".join(re.escape(character.value) for character in ArabicCharacters)
        marks = "".join(re.escape(diacritic.value) for diacritic in ArabicDiacritics)
        diacritics_pattern = re.compile("([" + letters + " ])" + "([" + marks + "]*)|(.)")

        # Remove diacritics and shadda using the pattern and store them in the list
        result_text = diacritics_pattern.sub(diacritics_replacement, arabic_text)

        return result_text, diacritics_list
    

# Demo: split a diacritized sample into its bare text and the extracted marks.
P1 = Preprocessor()
arabic_text = "الذِّمِّيِّ أَنْ يَحْتَسِبَ عَلَى الْمُسْلِم"
result, diacritics = P1.separate_diacritics(arabic_text)
print(result)
print(diacritics)

# Build the value -> member-name lookup once, then label every extracted entry;
# anything that is not exactly one of the enum values is reported as
# "No Diacritic".
diacritic_names = {member.value: member.name for member in ArabicDiacritics}
for diacritic_value in diacritics:
    label = diacritic_names.get(diacritic_value, "No Diacritic")
    print(f"Extracted Diacritic: {label}")

# File pipeline: each stage reads the previous stage's output from the dataset
# folder and writes its own file there.
f = FileReader()

# 1) strip punctuation from the raw training text
data1 = f.open_file("train.txt")
P1.remove_tarkeem(data1, "no_tarkeem.txt")

# 2) keep only Arabic-range tokens
data2 = f.open_file("no_tarkeem.txt")
P1.clean_data(data2, "clean_data.txt")

# 3) remove the diacritics
data3 = f.open_file("clean_data.txt")
P1.remove_diacritics(data3, "no_diacritics.txt")

الذمي أن يحتسب على المسلم
['', '', 'ِّ', 'ِّ', 'ِّ', '', 'َ', 'ْ', '', 'َ', 'ْ', 'َ', 'ِ', 'َ', '', 'َ', 'َ', '', '', '', 'ْ', 'ُ', 'ْ', 'ِ', '']
Extracted Diacritic: No Diacritic
Extracted Diacritic: No Diacritic
Extracted Diacritic: SHADDA_KASRA
Extracted Diacritic: SHADDA_KASRA
Extracted Diacritic: SHADDA_KASRA
Extracted Diacritic: No Diacritic
Extracted Diacritic: FATHA
Extracted Diacritic: SUKUN
Extracted Diacritic: No Diacritic
Extracted Diacritic: FATHA
Extracted Diacritic: SUKUN
Extracted Diacritic: FATHA
Extracted Diacritic: KASRA
Extracted Diacritic: FATHA
Extracted Diacritic: No Diacritic
Extracted Diacritic: FATHA
Extracted Diacritic: FATHA
Extracted Diacritic: No Diacritic
Extracted Diacritic: No Diacritic
Extracted Diacritic: No Diacritic
Extracted Diacritic: SUKUN
Extracted Diacritic: DAMMA
Extracted Diacritic: SUKUN
Extracted Diacritic: KASRA
Extracted Diacritic: No Diacritic
{(':', b':'), ("'", b"'"), (')', b')'), ('ن', b'\xd9\x86'), ('ى', b'\xd9\x89'), ('؟', b'\xd8\x9