In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# !pip install pyarabic
# !pip install keras_preprocessing
import os
import random
from enum import Enum
import re
import numpy as np
from pyarabic.araby import separate, tokenize, is_arabicrange, strip_tashkeel, strip_tatweel
import nltk
from nltk.tokenize import sent_tokenize
import tensorflow as tf
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential,load_model, Model
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, CategoryEncoding, Bidirectional, Input, Dropout, TimeDistributed
from keras.initializers import glorot_normal
from keras.losses import SparseCategoricalCrossentropy
from keras.metrics import SparseCategoricalAccuracy, F1Score
from gensim.models import Word2Vec


# import sys
# sys.path.append('/content/drive/MyDrive/NLP_Project/')

from chars_enums import *
from file_reader import FileReader
from preprocessor import Preprocessor

### Model Structure

In [None]:
def create_model(arabic_chars=37, num_of_ashkaaal=16, max_word_length=15,
                 embedding_dim=37, lstm_units=256):
  """Build and compile the character-level BiLSTM tagging model.

  Architecture: Input -> Embedding -> Bidirectional LSTM -> per-timestep
  Dense softmax. The model emits one class distribution for every position
  of the fixed-length input sequence.

  Args:
    arabic_chars: Size of the input vocabulary given to the Embedding layer;
      it must be greater than the largest id fed to the model (padding id
      included).
    num_of_ashkaaal: Number of output classes predicted at each position.
    max_word_length: Length of every input sequence.
    embedding_dim: Width of the learned character embedding.
    lstm_units: Hidden units of the LSTM in each direction.

  Returns:
    A compiled Keras `Model` using the Adam optimizer, sparse categorical
    cross-entropy loss and sparse categorical accuracy as its metric.
  """
  with tf.device('/device:GPU:0'):
    inputs = Input(shape=(max_word_length,))

    embeddings = Embedding(input_dim=arabic_chars, output_dim=embedding_dim)(inputs)

    # return_sequences=True keeps one output vector per input position so the
    # TimeDistributed head can label every character.
    blstm1 = Bidirectional(LSTM(units=lstm_units, return_sequences=True))(embeddings)

    output = TimeDistributed(Dense(units=num_of_ashkaaal, activation='softmax'))(blstm1)

    model = Model(inputs, output)

    model.compile(optimizer='adam', loss=SparseCategoricalCrossentropy(), metrics=[SparseCategoricalAccuracy()])

    return model


### Model Training

In [None]:
class UseModel:
    """Bundle a dataset with the settings used to fit or score a model on it.

    Args:
        X_train: Input id sequences passed to the model as features.
        y_train: Target label sequences passed to the model as labels.
        epochs: Number of epochs used by `train`.
        batch_size: Batch size used by `train`.
        pad_value: Input id that marks padding; positions holding this id get
            zero weight during training.
    """

    def __init__(self, X_train, y_train, epochs, batch_size, pad_value=36):
        self.X_train = X_train
        self.y_train = y_train
        self.epochs = epochs
        self.batch_size = batch_size
        self.pad_value = pad_value

    def train(self, model):
        """Fit `model` on the stored data, ignoring padded positions in the loss.

        Returns:
            Whatever `model.fit` returns (the training history).
        """
        # Build the weights from the data held by this instance, not from a
        # notebook-level variable: 1 for real characters, 0 for padding.
        padding_mask = np.not_equal(self.X_train, self.pad_value).astype(np.float32)
        history = model.fit(
            self.X_train,
            self.y_train,
            epochs=self.epochs,
            batch_size=self.batch_size,
            shuffle=True,
            sample_weight=padding_mask,
        )
        return history

    def evaluate(self, model):
        """Score `model` on the stored data, print its summary and the metric.

        Returns:
            The list returned by `model.evaluate` (loss first, then metrics).
        """
        # NOTE(review): no sample_weight is passed here, so padded positions
        # count toward the reported metric, unlike in train().
        results = model.evaluate(self.X_train, self.y_train)
        model.summary()
        # results[0] is the loss; results[1] is the first compiled metric.
        print("Evaluation Results:", results[1]*100)
        return results



## Data preparation utilities


In [None]:
def to_one_hot(ashkal, size):
    """One-hot encode each diacritic in `ashkal` as a list of length `size`.

    A diacritic whose UTF-8 bytes are a key of `ArabicDiacritics_Mapping` gets
    a 1 at the mapped index; any other diacritic yields an all-zero row.

    Args:
        ashkal: Iterable of diacritic strings.
        size: Length of every encoded row.

    Returns:
        A list with one `size`-long list of 0/1 integers per diacritic.
    """
    def encode(diacritic):
        key = diacritic.encode('utf-8')
        row = [0] * size
        if key in ArabicDiacritics_Mapping:
            row[ArabicDiacritics_Mapping[key]] = 1
        return row

    return [encode(diacritic) for diacritic in ashkal]

def prepare_data(file_name):
  """Run the preprocessing pipeline for one dataset split.

  Reads `<file_name>.txt` through `FileReader`, passes the text through
  `Preprocessor.clean_data` and `Preprocessor.remove_tarkeem` (presumably
  punctuation removal; confirm against Preprocessor), then hands the result
  to `Preprocessor.tokenize_data` together with the two pickle file names
  `<file_name>_words.pickle` and `<file_name>_diacritics.pickle`.

  Args:
    file_name: Base name of the split, without extension (e.g. "train").
  """
  reader = FileReader()
  preprocessor = Preprocessor()

  raw_text = reader.open_file(file_name+".txt")
  without_tarkeem = preprocessor.remove_tarkeem(preprocessor.clean_data(raw_text))

  preprocessor.tokenize_data(
      without_tarkeem,
      file_name + "_words.pickle",
      file_name +"_diacritics.pickle",
  )


def pad_input(lett, max_len=15, pad_value=36):
  """Encode words as character-id sequences and pad them to a fixed length.

  Every letter is looked up in `ArabicCharacters_Mapping` by its UTF-8
  bytes; a letter missing from the mapping raises `KeyError`.

  Args:
    lett: Iterable of words, each an iterable of single-character strings.
    max_len: Length of every output row; longer words are cut at the end,
      shorter ones are padded at the end.
    pad_value: Id written into the padded positions.

  Returns:
    The array produced by `pad_sequences`, one row per word.
  """
  # Encode the argument itself; do not read a notebook-level variable.
  sequences = [
      [ArabicCharacters_Mapping[letter.encode('utf-8')] for letter in word]
      for word in lett
  ]

  return pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post', value=pad_value)


def pad_output(diacritics_tokens):
  """Encode per-word diacritics as label-id sequences padded to length 15.

  Every diacritic is looked up in `ArabicDiacritics_Mapping` by its UTF-8
  bytes. Sequences are padded and truncated at the end; padded positions
  hold the value 15.

  Args:
    diacritics_tokens: Iterable of words, each an iterable of diacritic
      strings.

  Returns:
    The array produced by `pad_sequences`, one row per word.
  """
  def encode_word(word_diacritics):
    return [ArabicDiacritics_Mapping[mark.encode('utf-8')] for mark in word_diacritics]

  label_ids = list(map(encode_word, diacritics_tokens))
  return pad_sequences(label_ids, maxlen=15, padding='post', truncating='post', value=15)



## Prepare training data

In [None]:
# One-off preprocessing step, left disabled: prepare_data("train") passes the
# same two pickle file names that are read below to Preprocessor.tokenize_data.
#prepare_data("train")
process = Preprocessor()
# Load the tokenized training split: one entry per word in each list.
letters_tokens, diacritics_tokens = process.read_tokenized_data("train_words.pickle", "train_diacritics.pickle")
# Sanity check: the two counts are expected to match (one diacritics entry per word).
print(len(letters_tokens))
print(len(diacritics_tokens))

# Fixed-length id arrays consumed by the training cell.
padded_input = pad_input(letters_tokens)
padded_output = pad_output(diacritics_tokens)

## Training the Model

In [None]:
import gc
# Run a garbage-collection pass before the model is built and trained.
gc.collect()
# create_model() opens the same device scope internally as well.
with tf.device('/device:GPU:0'):
    # Features and labels come from the most recently executed data-preparation cell.
    X_train = padded_input

    y_train = padded_output

    # Training hyper-parameters forwarded to model.fit by UseModel.train.
    epochs = 8

    batch_size = 1000

    # `model` is kept at notebook level; the evaluation and inference cells reuse it.
    model = create_model()
    train_model = UseModel(X_train, y_train, epochs, batch_size)

    train_model.train(model)


In [None]:
#del model

## Prepare validation data

In [None]:
# One-off preprocessing step, left disabled: prepare_data("val") passes the
# same two pickle file names that are read below to Preprocessor.tokenize_data.
#prepare_data("val")
process = Preprocessor()
# NOTE(review): this rebinds letters_tokens, diacritics_tokens, padded_input and
# padded_output, which the training-data cell also assigns; after this cell runs
# those names refer to the validation split.
letters_tokens, diacritics_tokens = process.read_tokenized_data("val_words.pickle", "val_diacritics.pickle")
# Sanity check: the two counts are expected to match (one diacritics entry per word).
print(len(letters_tokens))
print(len(diacritics_tokens))

# Fixed-length id arrays consumed by the evaluation cell.
padded_input = pad_input(letters_tokens)
padded_output = pad_output(diacritics_tokens)

In [None]:
with tf.device('/device:GPU:0'):
    # Keep the validation arrays under their own names so the training
    # arrays and hyper-parameters held by the kernel are not overwritten.
    X_val = padded_input

    y_val = padded_output

    # UseModel's constructor requires epochs and batch_size, but evaluate()
    # does not read them; they are passed inline instead of rebinding the
    # notebook-level training settings.
    evaluate_model = UseModel(X_val, y_val, epochs=10, batch_size=1000)

    evaluate_model.evaluate(model)

In [None]:
def diacritize(text, model, max_word_length=15, pad_value=36):
  """Return `text` with a predicted diacritic appended after each Arabic letter.

  The text is split with `tokenize`. Tokens accepted by `is_arabicrange` are
  encoded through `ArabicCharacters_Mapping`, padded to `max_word_length`
  and labelled by `model`; every other token is copied unchanged. Each token
  is followed by a single space, including the last one.

  Args:
    text: Input string to diacritize.
    model: Object whose `predict` returns per-position class scores for a
      batch of padded id sequences.
    max_word_length: Sequence length the model was built for.
    pad_value: Id used to pad short words.

  Returns:
    The diacritized string.

  Raises:
    KeyError: If an Arabic token contains a character that is missing from
      `ArabicCharacters_Mapping`.
  """
  tokens = tokenize(text)
  arabic_positions = [i for i, token in enumerate(tokens) if is_arabicrange(token)]

  labels_by_position = {}
  if arabic_positions:
    encoded = [
        [ArabicCharacters_Mapping[letter.encode('utf-8')] for letter in tokens[i]]
        for i in arabic_positions
    ]
    batch = pad_sequences(encoded, maxlen=max_word_length, padding='post', truncating='post', value=pad_value)
    # One predict call for all words instead of one call per word.
    class_ids = np.argmax(model.predict(batch), axis=-1)
    labels_by_position = dict(zip(arabic_positions, class_ids))

  pieces = []
  for i, token in enumerate(tokens):
    if i in labels_by_position:
      word_labels = labels_by_position[i]
      for j, letter in enumerate(token):
        pieces.append(letter)
        # Letters beyond max_word_length were truncated before prediction,
        # so there is no label for them; emit the bare letter.
        if j < len(word_labels):
          pieces.append(ArabicDiacritics_RevMapping[word_labels[j]].decode('utf-8'))
    else:
      pieces.append(token)
    pieces.append(" ")

  return "".join(pieces)


letters = " فالخيل والليل والبيداء تعرفني والسيف والرمح والقرطاس والقلم"
results = diacritize(letters, model)

print(results)