In [50]:
import os
from enum import Enum
import re
import numpy as np
from pyarabic.araby import separate, tokenize, is_arabicrange, strip_tashkeel
import nltk
from nltk.tokenize import sent_tokenize
import tensorflow as tf
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential,load_model
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from gensim.models import Word2Vec


### reading and writing files

In [None]:
class FileReader:
    """Read and write UTF-8 text files inside a single data directory.

    Args:
        base_dir: Directory that holds the files. Defaults to ``"dataset"``
            (relative to the current working directory).
    """

    def __init__(self, base_dir="dataset"):
        self.base_dir = base_dir

    def write_file(self, file_name, data):
        """Write ``data`` to ``file_name`` in the data directory.

        The directory is created if it does not exist yet; an existing file
        with the same name is overwritten.

        Args:
            file_name: Name of the file, relative to the data directory.
            data: Text to write.
        """
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(self.base_dir, exist_ok=True)
        file_path = os.path.join(self.base_dir, file_name)

        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(data)

    def open_file(self, file_name):
        """Return the full text content of ``file_name`` in the data directory.

        Args:
            file_name: Name of the file, relative to the data directory.

        Returns:
            The file content decoded as UTF-8.

        Raises:
            FileNotFoundError: If the file does not exist.
        """
        file_path = os.path.join(self.base_dir, file_name)
        # Context manager guarantees the handle is closed after reading.
        with open(file_path, 'r', encoding="utf-8") as file:
            return file.read()


### data cleaning and processing utilities 


In [51]:
class ArabicDiacritics(Enum):
    """Arabic diacritic marks, each mapped to its single Unicode code point."""

    # Tanween (doubled short vowels): U+064B..U+064D
    FATHATAN = "\N{ARABIC FATHATAN}"
    DAMMATAN = "\N{ARABIC DAMMATAN}"
    KASRATAN = "\N{ARABIC KASRATAN}"
    # Short vowels: U+064E..U+0650
    FATHA = "\N{ARABIC FATHA}"
    DAMMA = "\N{ARABIC DAMMA}"
    KASRA = "\N{ARABIC KASRA}"
    # Gemination and absence-of-vowel marks: U+0651, U+0652
    SHADDA = "\N{ARABIC SHADDA}"
    SUKUN = "\N{ARABIC SUKUN}"

class Preprocessor:
    """Cleaning utilities for Arabic text.

    The ``clean_data``, ``remove_diacritics`` and ``remove_tarkeem`` methods
    write their result to a file through a ``FileReader`` owned by the
    instance; ``separate_diacritics`` is a pure function of its input.
    """

    # Characters dropped by remove_tarkeem.
    _ARABIC_PUNCTUATION = ['،', '٪', '؛', '؟', 'ـ']
    _ENGLISH_PUNCTUATION = [',', '.', '%', ':', ';', '?', '!', '-', '_', "'", '"', '(', ')', '[', ']', '{', '}']
    # Deletion table built once so the removal is a single C-level pass.
    _PUNCTUATION_TABLE = str.maketrans("", "", "".join(_ARABIC_PUNCTUATION + _ENGLISH_PUNCTUATION))

    def __init__(self):
        # Keep the writer on the instance; binding it to a local here would
        # leave the methods dependent on a module-level variable existing.
        self._file_io = FileReader()

    def clean_data(self, data, output_file):
        """Keep only the tokens accepted by ``is_arabicrange`` and save them.

        The surviving tokens are joined with single spaces and written to
        ``output_file``.

        Args:
            data: Raw input text.
            output_file: Name of the file to write the cleaned text to.
        """
        tokens = tokenize(data, conditions=is_arabicrange)
        cleaned_data = u" ".join(tokens)
        self._file_io.write_file(output_file, cleaned_data)

    def remove_diacritics(self, data, output_file):
        """Strip tashkeel from ``data`` (via ``strip_tashkeel``) and save it.

        Args:
            data: Arabic text, possibly diacritized.
            output_file: Name of the file to write the stripped text to.
        """
        data_without_diacritics = strip_tashkeel(data)
        self._file_io.write_file(output_file, data_without_diacritics)

    def remove_tarkeem(self, data, output_file):
        """Remove Arabic and English punctuation from ``data`` and save it.

        Args:
            data: Input text.
            output_file: Name of the file to write the result to.
        """
        data_without_tarkeem = data.translate(self._PUNCTUATION_TABLE)
        self._file_io.write_file(output_file, data_without_tarkeem)

    def separate_diacritics(self, arabic_text):
        """Split ``arabic_text`` into bare letters and its diacritics.

        Args:
            arabic_text: Text that may contain the marks listed in
                ``ArabicDiacritics``.

        Returns:
            A ``(text, diacritics)`` tuple: ``text`` is the input with every
            ``ArabicDiacritics`` character removed, and ``diacritics`` is the
            list of removed characters in order of appearance.
        """
        # Alternation over every enum value; each alternative is one character.
        diacritics_pattern = re.compile(
            "|".join(re.escape(diacritic.value) for diacritic in ArabicDiacritics)
        )

        # The pattern has no groups, so findall yields the matched characters.
        diacritics_list = diacritics_pattern.findall(arabic_text)
        result_text = diacritics_pattern.sub('', arabic_text)

        return result_text, diacritics_list
    

# --- Demo: split a diacritized sentence into letters and marks -------------
P1 = Preprocessor()
arabic_text = "الذِّمِّيِّ أَنْ يَحْتَسِبَ عَلَى الْمُسْلِم"
result, diacritics = P1.separate_diacritics(arabic_text)
print(result)
print(diacritics)

# Every extracted character is an ArabicDiacritics value, so a lookup by
# value finds the member directly and its name can be reported.
for diacritic_value in diacritics:
    print(f"Extracted Diacritic: {ArabicDiacritics(diacritic_value).name}")

# --- Pipeline: raw training text -> Arabic-only text -> undiacritized text --
f = FileReader()
data = f.open_file("train.txt")
P1.clean_data(data, "clean_data.txt")

# Re-read the cleaned file and strip its diacritics into a second file.
data = f.open_file("clean_data.txt")
P1.remove_diacritics(data, "no_diacritics.txt")


الذمي أن يحتسب على المسلم
['ّ', 'ِ', 'ّ', 'ِ', 'ّ', 'ِ', 'َ', 'ْ', 'َ', 'ْ', 'َ', 'ِ', 'َ', 'َ', 'َ', 'ْ', 'ُ', 'ْ', 'ِ']
Extracted Diacritic: SHADDA
Extracted Diacritic: KASRA
Extracted Diacritic: SHADDA
Extracted Diacritic: KASRA
Extracted Diacritic: SHADDA
Extracted Diacritic: KASRA
Extracted Diacritic: FATHA
Extracted Diacritic: SUKUN
Extracted Diacritic: FATHA
Extracted Diacritic: SUKUN
Extracted Diacritic: FATHA
Extracted Diacritic: KASRA
Extracted Diacritic: FATHA
Extracted Diacritic: FATHA
Extracted Diacritic: FATHA
Extracted Diacritic: SUKUN
Extracted Diacritic: DAMMA
Extracted Diacritic: SUKUN
Extracted Diacritic: KASRA
