In [1]:
!pip install pyarabic
!pip install keras_preprocessing
import os
from enum import Enum
import re
import numpy as np
from pyarabic.araby import separate, tokenize, is_arabicrange, strip_tashkeel, strip_tatweel
import nltk
from nltk.tokenize import sent_tokenize
import tensorflow as tf
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential,load_model, Model
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, CategoryEncoding, Bidirectional, Input, Dropout, TimeDistributed
from keras.initializers import glorot_normal
from gensim.models import Word2Vec

import sys
sys.path.append('/content/drive/MyDrive/NLP_Project/')

from chars_enums import *
from file_reader import FileReader
from preprocessor import Preprocessor



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
P1 = Preprocessor()
arabic_text = "الذِّمِّيِّ أَنْ يَحْتَسِبَ عَلَى الْمُسْلِم"
result, diacritics = P1.separate_diacritics(arabic_text)
print(result)
print(diacritics)

for diacritic_value in diacritics:
    if diacritic_value in [diacritic.value.decode("utf-8") for diacritic in ArabicDiacritics]:
        for diacritic in ArabicDiacritics:
            if diacritic.value.decode("utf-8") == diacritic_value:
                print(f"Extracted Diacritic: {diacritic.name}")
    else:
        print(f"Extracted Diacritic: No Diacritic")

الذمي أن يحتسب على المسلم
['', '', 'ِّ', 'ِّ', 'ِّ', '', 'َ', 'ْ', '', 'َ', 'ْ', 'َ', 'ِ', 'َ', '', 'َ', 'َ', '', '', '', 'ْ', 'ُ', 'ْ', 'ِ', '']
Extracted Diacritic: No Diacritic
Extracted Diacritic: No Diacritic
Extracted Diacritic: ARABIC_SHADDA_KASRA
Extracted Diacritic: ARABIC_SHADDA_KASRA
Extracted Diacritic: ARABIC_SHADDA_KASRA
Extracted Diacritic: No Diacritic
Extracted Diacritic: ARABIC_FATHA
Extracted Diacritic: ARABIC_SUKUN
Extracted Diacritic: No Diacritic
Extracted Diacritic: ARABIC_FATHA
Extracted Diacritic: ARABIC_SUKUN
Extracted Diacritic: ARABIC_FATHA
Extracted Diacritic: ARABIC_KASRA
Extracted Diacritic: ARABIC_FATHA
Extracted Diacritic: No Diacritic
Extracted Diacritic: ARABIC_FATHA
Extracted Diacritic: ARABIC_FATHA
Extracted Diacritic: No Diacritic
Extracted Diacritic: No Diacritic
Extracted Diacritic: No Diacritic
Extracted Diacritic: ARABIC_SUKUN
Extracted Diacritic: ARABIC_DAMMA
Extracted Diacritic: ARABIC_SUKUN
Extracted Diacritic: ARABIC_KASRA
Extracted Diacrit

### Output one hot encoding


In [58]:
def to_one_hot(shakl, size):
    one_hot = np.zeros([size])
    for i, diacritic in enumerate(ArabicDiacritics):
        if shakl == diacritic.value.decode("utf-8"):
            one_hot[i] = 1
    return one_hot

### Model Structure

In [9]:
def create_model():
   arabic_chars = 36
   num_of_ashkaaal = 15
   max_word_length = 15

   inputs = Input(shape=(max_word_length,))

   embeddings = Embedding(input_dim=arabic_chars, output_dim=36)(inputs)

   blstm1 = Bidirectional(LSTM(units=256, return_sequences=True))(embeddings)
   dropout1 = Dropout(0.5)(blstm1)

   blstm2 = Bidirectional(LSTM(units=256, return_sequences=True))(dropout1)
   dropout2 = Dropout(0.5)(blstm2)

   dense1 = TimeDistributed(Dense(units=512, activation='relu'))(dropout2)

   dense2 = TimeDistributed(Dense(units=512, activation='relu'))(dense1)

   output = TimeDistributed(Dense(units=num_of_ashkaaal, activation='softmax'))(dense2)

   model = Model(inputs, output)

   model.compile(loss='categorical_crossentropy', optimizer='adam')

   return model


### Model Training

In [10]:
class TrainModel:
    def __init__(self,X_train, y_train, epochs, batch_size):
        self.X_train = X_train
        self.y_train = y_train
        self.epochs = epochs
        self.batch_size = batch_size

    def train(self):
        model = create_model()
        predict = model.fit(self.X_train, self.y_train, epochs=self.epochs, batch_size=self.batch_size)
        model.summary()
        return predict



### Prepare data for training

In [17]:
file_reader = FileReader()

#data = file_reader.open_file("train.txt")

process = Preprocessor()

#process.clean_data(data, "only_arabic.txt")

#only_arabic = file_reader.open_file("only_arabic.txt")

#process.remove_tarkeem(only_arabic, "no_tarkeem.txt")

no_tarkeem = file_reader.open_file("no_tarkeem.txt")

tokens = tokenize(no_tarkeem)

letters_tokens = []
diacritics_tokens = []
for token in tokens:
  letters, diacritic = process.separate_diacritics(token)
  letters_tokens.append(letters)
  diacritics_tokens.append(diacritic)


print(len(letters_tokens))
print(len(diacritics_tokens))


2102068
2102068


### prepare X_train

In [29]:
sequences = []
for word in letters_tokens:
  newWord = []
  for letter in word:
      for i,c in enumerate(ArabicCharacters):
          if (letter == c.value.decode("utf-8")):
              newWord.append(i)
  sequences.append(newWord)

padded_input = pad_sequences(sequences, maxlen=15, padding='post', truncating='post', value=-1)
print(padded_input)

[[27 33 29 ... -1 -1 -1]
 [ 2 33 -1 ... -1 -1 -1]
 [27 22 24 ... -1 -1 -1]
 ...
 [35  3 18 ... -1 -1 -1]
 [27  7 29 ... -1 -1 -1]
 [30 33  9 ... -1 -1 -1]]


### prepare y_train

In [59]:
output_hot_encoded = []
for ashkaal in diacritics_tokens[:100000]:
    coded = np.array([])
    for shakl in ashkaal:
        coded = np.append(coded, to_one_hot(shakl, 15))
    output_hot_encoded.append(coded)

# Convert the outer list to a NumPy array
output_hot_encoded = np.array(output_hot_encoded)



  output_hot_encoded = np.array(output_hot_encoded)


In [60]:
print(padded_input[:10],type(padded_input))
print(output_hot_encoded[:10],type(output_hot_encoded))

[[27 33 29 32 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
 [ 2 33 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
 [27 22 24 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
 [ 6 29  2 33 29 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
 [35 14 32 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
 [ 4 29 13 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
 [27  6 29 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
 [ 6 29 17 16 28 19 35 -1 -1 -1 -1 -1 -1 -1 -1]
 [ 6  7 31 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
 [24 16 26  8 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]] <class 'numpy.ndarray'>
[array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 0.])
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0

In [53]:

X_train = padded_input[:1000]

y_train = np.array(output_hot_encoded[:1000])



epochs = 50

batch_size = 100

train_model = TrainModel(X_train, y_train, epochs, batch_size)

trained_model = train_model.train()


ValueError: ignored