In [1]:
import os
from enum import Enum
import re
import numpy as np
from pyarabic.araby import separate, tokenize, is_arabicrange, strip_tashkeel, strip_tatweel
import nltk
from nltk.tokenize import sent_tokenize
import tensorflow as tf
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential,load_model, Model
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, CategoryEncoding, Bidirectional, Input, Dropout, TimeDistributed
from keras.initializers import glorot_normal
from gensim.models import Word2Vec

from chars_enums import *
from file_reader import FileReader
from preprocessor import Preprocessor

2023-12-31 17:25:44.708910: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-31 17:25:44.708968: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-31 17:25:44.710988: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-31 17:25:44.717898: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
P1 = Preprocessor()
arabic_text = "الذِّمِّيِّ أَنْ يَحْتَسِبَ عَلَى الْمُسْلِم"
result, diacritics = P1.separate_diacritics(arabic_text)
print(result)
print(diacritics)

for diacritic_value in diacritics:
    if diacritic_value in [diacritic.value.decode("utf-8") for diacritic in ArabicDiacritics]:
        for diacritic in ArabicDiacritics:
            if diacritic.value.decode("utf-8") == diacritic_value:
                print(f"Extracted Diacritic: {diacritic.name}")
    else:
        print(f"Extracted Diacritic: No Diacritic")

الذمي أن يحتسب على المسلم
['', '', 'ِّ', 'ِّ', 'ِّ', '', 'َ', 'ْ', '', 'َ', 'ْ', 'َ', 'ِ', 'َ', '', 'َ', 'َ', '', '', '', 'ْ', 'ُ', 'ْ', 'ِ', '']
Extracted Diacritic: No Diacritic
Extracted Diacritic: No Diacritic
Extracted Diacritic: ARABIC_SHADDA_KASRA
Extracted Diacritic: ARABIC_SHADDA_KASRA
Extracted Diacritic: ARABIC_SHADDA_KASRA
Extracted Diacritic: No Diacritic
Extracted Diacritic: ARABIC_FATHA
Extracted Diacritic: ARABIC_SUKUN
Extracted Diacritic: No Diacritic
Extracted Diacritic: ARABIC_FATHA
Extracted Diacritic: ARABIC_SUKUN
Extracted Diacritic: ARABIC_FATHA
Extracted Diacritic: ARABIC_KASRA
Extracted Diacritic: ARABIC_FATHA
Extracted Diacritic: No Diacritic
Extracted Diacritic: ARABIC_FATHA
Extracted Diacritic: ARABIC_FATHA
Extracted Diacritic: No Diacritic
Extracted Diacritic: No Diacritic
Extracted Diacritic: No Diacritic
Extracted Diacritic: ARABIC_SUKUN
Extracted Diacritic: ARABIC_DAMMA
Extracted Diacritic: ARABIC_SUKUN
Extracted Diacritic: ARABIC_KASRA
Extracted Diacrit

### Output one hot encoding


In [3]:
def to_one_hot(ashkal, size):
    one_hot = []
    for diacritic in ashkal:
        coded = [0] * size
        if diacritic.encode('utf-8') in ArabicDiacritics_Mapping:
            coded[ArabicDiacritics_Mapping[diacritic.encode('utf-8')]] = 1
        one_hot.append(coded)
        
    return one_hot

### Model Structure

In [4]:
def create_model():
   arabic_chars = 36
   num_of_ashkaaal = 15
   max_word_length = 15

   SelectedLSTM = LSTM

   inputs = Input(shape=(max_word_length,))

   embeddings = Embedding(input_dim=arabic_chars, output_dim=36)(inputs)

   blstm1 = Bidirectional(SelectedLSTM(units=256, return_sequences=True))(embeddings)
   dropout1 = Dropout(0.5)(blstm1)

   blstm2 = Bidirectional(SelectedLSTM(units=256, return_sequences=True))(dropout1)
   dropout2 = Dropout(0.5)(blstm2)

   dense1 = TimeDistributed(Dense(units=512, activation='relu'))(dropout2)

   dense2 = TimeDistributed(Dense(units=512, activation='relu'))(dense1)

   output = TimeDistributed(Dense(units=num_of_ashkaaal, activation='softmax'))(dense2)

   model = Model(inputs, output)

   model.compile(loss='categorical_crossentropy', optimizer='adam')

   return model


### Model Training

In [5]:
class TrainModel:
    def __init__(self,X_train, y_train, epochs, batch_size):
        self.X_train = X_train
        self.y_train = y_train
        self.epochs = epochs
        self.batch_size = batch_size

    def train(self):
        model = create_model()
        predict = model.fit(self.X_train, self.y_train, epochs=self.epochs, batch_size=self.batch_size)
        model.summary()
        return model



### Prepare data for training

In [6]:
file_reader = FileReader()

#data = file_reader.open_file("train.txt")

process = Preprocessor()

#process.clean_data(data, "only_arabic.txt")

#only_arabic = file_reader.open_file("only_arabic.txt")

#process.remove_tarkeem(only_arabic, "no_tarkeem.txt")

no_tarkeem = file_reader.open_file("no_tarkeem.txt")

tokens = tokenize(no_tarkeem)

letters_tokens = []
diacritics_tokens = []
for token in tokens:
  letters, diacritic = process.separate_diacritics(token)
  letters_tokens.append(letters)
  diacritics_tokens.append(diacritic)


print(len(letters_tokens))
print(len(diacritics_tokens))


2102068
2102068


In [7]:
print(ArabicCharacters_Mapping)
print(ArabicDiacritics_Mapping)

output_hot_encoded = []
for ashkaal in diacritics_tokens[0:2]:
    coded = to_one_hot(ashkaal, 15)
    output_hot_encoded.append(coded)
    print(coded)
print(output_hot_encoded)

{b'\xd8\xa1': 0, b'\xd8\xa2': 1, b'\xd8\xa3': 2, b'\xd8\xa4': 3, b'\xd8\xa5': 4, b'\xd8\xa6': 5, b'\xd8\xa7': 6, b'\xd8\xa8': 7, b'\xd8\xa9': 8, b'\xd8\xaa': 9, b'\xd8\xab': 10, b'\xd8\xac': 11, b'\xd8\xad': 12, b'\xd8\xae': 13, b'\xd8\xaf': 14, b'\xd8\xb0': 15, b'\xd8\xb1': 16, b'\xd8\xb2': 17, b'\xd8\xb3': 18, b'\xd8\xb4': 19, b'\xd8\xb5': 20, b'\xd8\xb6': 21, b'\xd8\xb7': 22, b'\xd8\xb8': 23, b'\xd8\xb9': 24, b'\xd8\xba': 25, b'\xd9\x81': 26, b'\xd9\x82': 27, b'\xd9\x83': 28, b'\xd9\x84': 29, b'\xd9\x85': 30, b'\xd9\x86': 31, b'\xd9\x87': 32, b'\xd9\x88': 33, b'\xd9\x89': 34, b'\xd9\x8a': 35}
{b'\xd9\x91\xd9\x8b': 0, b'\xd9\x91\xd9\x8c': 1, b'\xd9\x91\xd9\x8d': 2, b'\xd9\x91\xd9\x8e': 3, b'\xd9\x91\xd9\x8f': 4, b'\xd9\x91\xd9\x90': 5, b'\xd9\x91\xd9\x92': 6, b'\xd9\x8b': 7, b'\xd9\x8c': 8, b'\xd9\x8d': 9, b'\xd9\x8e': 10, b'\xd9\x8f': 11, b'\xd9\x90': 12, b'\xd9\x91': 13, b'\xd9\x92': 14}
[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], 

### prepare X_train

In [8]:
sequences = []
for word in letters_tokens[:100000]:
  newWord = []
  for letter in word:
    newWord.append(ArabicCharacters_Mapping[letter.encode('utf-8')])
  sequences.append(newWord)

padded_input = pad_sequences(sequences, maxlen=15, padding='post', truncating='post', value=-1)
print(padded_input)

[[27 33 29 ... -1 -1 -1]
 [ 2 33 -1 ... -1 -1 -1]
 [27 22 24 ... -1 -1 -1]
 ...
 [16 12 30 ... -1 -1 -1]
 [ 6 29 29 ... -1 -1 -1]
 [ 9 24  6 ... -1 -1 -1]]


### prepare y_train

In [9]:
output_hot_encoded = []
for ashkaal in diacritics_tokens[:100000]:
    coded = to_one_hot(ashkaal, 15)
    output_hot_encoded.append(coded)

padded_output = pad_sequences(output_hot_encoded, maxlen=15, padding='post', truncating='post', value=[0] * 15)
print(padded_output)

[[[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 1]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 1]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 ...

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 1 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]]


In [10]:
print(padded_input[:10],type(padded_input))
print(padded_output[:10],type(padded_output))

[[27 33 29 32 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
 [ 2 33 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
 [27 22 24 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
 [ 6 29  2 33 29 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
 [35 14 32 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
 [ 4 29 13 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
 [27  6 29 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
 [ 6 29 17 16 28 19 35 -1 -1 -1 -1 -1 -1 -1 -1]
 [ 6  7 31 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
 [24 16 26  8 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]] <class 'numpy.ndarray'>
[[[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 1]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 1]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 ...

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]

In [11]:

X_train = padded_input[:10000]

y_train = padded_output[:10000]



epochs = 50

batch_size = 100

train_model = TrainModel(X_train, y_train, epochs, batch_size)

trained_model = train_model.train()


2023-12-31 17:27:40.104257: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-12-31 17:27:40.134035: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-12-31 17:27:40.134693: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-12-31 17:27:40.140180: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-12-31 17:27:40.141005: I external/local_xla/xla/stream_executor

Epoch 1/50


2023-12-31 17:27:46.508636: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8907
2023-12-31 17:27:49.477329: I external/local_xla/xla/service/service.cc:168] XLA service 0x7fc7f6645c10 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-12-31 17:27:49.477376: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce GTX 1660 Ti, Compute Capability 7.5
2023-12-31 17:27:49.483711: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1704036469.768265   41582 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 15)]              0         
                                                                 
 embedding (Embedding)       (None, 15, 36)            1296      
                                                                 
 bidirect

In [15]:
letters = "ذهب علي الى الشاطيء ثم لعب الكرة"
tokens = tokenize(letters)
sequences = []
for word in tokens:
  newWord = []
  for letter in word:
    newWord.append(ArabicCharacters_Mapping[letter.encode('utf-8')])
  sequences.append(newWord)

padded_input = pad_sequences(sequences, maxlen=15, padding='post', truncating='post', value=-1)

diacritics = trained_model.predict(padded_input)

results = ""

for i in range(0, len(tokens)):
    letter_list = tokens[i]
    diacritic_list = diacritics[i]
    for j in range(0, len(letter_list)):
        results += letter_list[j]
        index = np.argmax(diacritic_list[j])
        results += ArabicDiacritics_RevMapping[index].decode('utf-8')
    results += " "

print(results)

ذَهْبٍ عَلَيَّ اَلَّىْ اُلْشَّاْطِيِّءُ ثُمَّ لَعِبٍ اَلْكِرَّةِ 
