In [None]:
from keras.models import Model, load_model
from keras_preprocessing.sequence import pad_sequences
from pyarabic.araby import tokenize, is_arabicrange
from gensim.models import Word2Vec
import numpy as np
import pickle

from file_reader import FileReader
from preprocessor import Preprocessor
from chars_enums import *

In [None]:
# Project preprocessor; later cells call its read_tokenized_data and
# remove_tarkeem methods.
process = Preprocessor()

# Lookup consulted when writing the submission CSVs: it is indexed with the
# decoded diacritic string to obtain the label written to the file.
# The file is opened in a `with` block so the handle is closed as soon as the
# pickle has been read instead of being left to the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open("dataset/diacritic2id.pickle", "rb") as diacritic_file:
  diacritics_dict = pickle.load(diacritic_file)

# Word embeddings used by the cells that feed neighbouring-word context to
# the modified model.
word2vec_model = Word2Vec.load("word2vec_model/my_word2vec.model")

In [None]:
def pad_input(letters_tokens):
  """Encode words letter by letter and pad them to a fixed width.

  Every letter is UTF-8 encoded and looked up in ``ArabicCharacters_Mapping``.
  The resulting id sequences are padded or truncated at the end to 15
  entries, with 36 used as the padding id.

  Args:
    letters_tokens: iterable of words, each one an iterable of letter strings.

  Returns:
    The array returned by ``pad_sequences``: one row of 15 ids per word.

  Raises:
    Whatever ``ArabicCharacters_Mapping`` raises for an unknown letter
    (a ``KeyError`` if it is a plain dict) is propagated unchanged.
  """
  # A comprehension keeps the temporaries scoped to the expression, so an
  # empty ``letters_tokens`` is simply forwarded to pad_sequences instead of
  # failing on a per-word local that was never bound.
  sequences = [
    [ArabicCharacters_Mapping[letter.encode('utf-8')] for letter in word]
    for word in letters_tokens
  ]
  return pad_sequences(sequences, maxlen=15, padding='post', truncating='post', value=36)

def pad_output(diacritics_tokens):
  """Encode per-word diacritic sequences and pad them to a fixed width.

  Every diacritic is UTF-8 encoded and looked up in
  ``ArabicDiacritics_Mapping``. The resulting id sequences are padded or
  truncated at the end to 15 entries, with 15 used as the padding id.

  Args:
    diacritics_tokens: iterable of words, each one an iterable of diacritic
      strings.

  Returns:
    The array returned by ``pad_sequences``: one row of 15 ids per word.

  Raises:
    Whatever ``ArabicDiacritics_Mapping`` raises for an unknown diacritic
    (a ``KeyError`` if it is a plain dict) is propagated unchanged.
  """
  # A comprehension keeps the temporaries scoped to the expression, so an
  # empty ``diacritics_tokens`` is simply forwarded to pad_sequences instead
  # of failing on a per-word local that was never bound.
  output_encoded = [
    [ArabicDiacritics_Mapping[shakl.encode('utf-8')] for shakl in ashkaal]
    for ashkaal in diacritics_tokens
  ]
  return pad_sequences(output_encoded, maxlen=15, padding='post', truncating='post', value=15)

In [None]:
# Evaluate the saved baseline model (letter ids only) on the validation files.
model_path = "Models/Model.h5"
model = load_model(model_path)
model.summary()

# Tokenized words and their diacritics, as returned by the project preprocessor.
letters_tokens, diacritics_tokens = process.read_tokenized_data(
  "val_words.pickle",
  "val_diacritics.pickle",
)

# Fixed-width (15) id matrices for the model input and the expected output.
padded_input = pad_input(letters_tokens)
padded_output = pad_output(diacritics_tokens)

# The names say "train", but these hold the validation arrays loaded above.
X_train, y_train = padded_input, padded_output

model.evaluate(X_train, y_train)

In [None]:
# Evaluate the saved modified model on the validation files. It is fed three
# arrays per word: the embedding of the following word (L), the padded letter
# ids of the word itself (W) and the embedding of the preceding word (R).
model_path = "Models/Modified_Model.h5"
model = load_model(model_path)
model.summary()

letters_tokens, diacritics_tokens = process.read_tokenized_data("val_words.pickle", "val_diacritics.pickle")

padded_input = pad_input(letters_tokens)
padded_output = pad_output(diacritics_tokens)

X_train_L = []
X_train_W = []
X_train_R = []

for i in range(len(padded_input)):
  # Following word: fall back to the ' ' vector past the end of the data or
  # when the word has no embedding.
  has_next = i+1 < len(letters_tokens) and letters_tokens[i+1] in word2vec_model.wv
  X_train_L_new = [word2vec_model.wv[letters_tokens[i+1]] if has_next else word2vec_model.wv[' ']]

  X_train_W_new = padded_input[i]

  # Preceding word: the bound must be `i-1 >= 0`; with `> 0` the first word
  # (index 0) was never used as context for the second one.
  # NOTE(review): if training built its inputs with the stricter bound, confirm
  # the training pipeline is aligned with this.
  has_prev = i-1 >= 0 and letters_tokens[i-1] in word2vec_model.wv
  X_train_R_new = [word2vec_model.wv[letters_tokens[i-1]] if has_prev else word2vec_model.wv[' ']]

  X_train_L.append(X_train_L_new)
  X_train_W.append(X_train_W_new)
  X_train_R.append(X_train_R_new)

X_train_L = np.array(X_train_L)
X_train_W = np.array(X_train_W)
X_train_R = np.array(X_train_R)

# Flatten to one row per word: 50 values per context vector, 15 letter ids.
X_train_L = (X_train_L).reshape((-1, 50))
X_train_W = (X_train_W).reshape((-1, 15))
X_train_R = (X_train_R).reshape((-1, 50))

y_train = padded_output

# Despite its name, `diacritics` holds whatever model.evaluate returns here,
# not predictions; the name is kept because the cell has always bound it.
diacritics = model.evaluate([X_train_L, X_train_W, X_train_R], y_train)

In [None]:
# Run the baseline model over the undiacritized test file and write one
# "ID,label" row per letter to outputs_Model_1.csv.
model_path = "Models/Model.h5"
model = load_model(model_path)
model.summary()

# Read through a context manager so the handle is closed, and decode the
# Arabic text explicitly as UTF-8 instead of relying on the platform default.
with open("dataset/test_no_diacritics.txt", "r", encoding="utf-8") as test_file:
  letters = test_file.readlines()
tokens = tokenize(process.remove_tarkeem(" ".join(letters)))

arabicwords = []
padded_inputs = []

for word in tokens:
  # Tokens rejected by is_arabicrange are skipped and get no output rows.
  if is_arabicrange(word):
    newWord = []
    newLetters = []
    for letter in word:
      newWord.append(ArabicCharacters_Mapping[letter.encode('utf-8')])
      newLetters.append(letter)
    arabicwords.append(newLetters)
    padded_input = pad_sequences([newWord], maxlen=15, padding='post', truncating='post', value=36)
    padded_inputs.append(padded_input)

padded_inputs = np.array(padded_inputs)

# Each word was padded as a batch of one; flatten to one row of 15 ids per word.
padded_inputs = (padded_inputs).reshape((-1, 15))

diacritics = model.predict(padded_inputs)

index = 0
output = ["ID,label\n"]

# NOTE(review): inputs are truncated to 15 letters, so a longer word would
# presumably index past the prediction for that word - confirm none occur.
for i in range(0, len(arabicwords)):
  for j in range(0, len(arabicwords[i])):
    di_index = np.argmax(diacritics[i][j])
    output.append(str(index) + "," +  str(diacritics_dict[ArabicDiacritics_RevMapping[di_index].decode('utf-8')]) + "\n")
    index += 1

# The file is opened only once every row is ready, so a failure above does
# not leave a truncated CSV behind.
with open("outputs_Model_1.csv", 'w') as csvfile:
  csvfile.writelines(output)

In [None]:
# Run the modified model over the undiacritized test file and write one
# "ID,label" row per letter to outputs_Model_2.csv. Each word is fed with the
# embedding of the following token (L), its own padded letter ids (W) and the
# embedding of the preceding token (R).
model_path = "Models/Modified_Model.h5"
model = load_model(model_path)
model.summary()

# Read through a context manager so the handle is closed, and decode the
# Arabic text explicitly as UTF-8 instead of relying on the platform default.
with open("dataset/test_no_diacritics.txt", "r", encoding="utf-8") as test_file:
  letters = test_file.readlines()
tokens = tokenize(process.remove_tarkeem(" ".join(letters)))

arabicwords = []
X_train_L = []
X_train_W = []
X_train_R = []

for i,word in enumerate(tokens):
  # Tokens rejected by is_arabicrange are skipped and get no output rows, but
  # they still count as neighbours of the surrounding words.
  if is_arabicrange(word):
    newWord = []
    newLetters = []
    for letter in word:
      newWord.append(ArabicCharacters_Mapping[letter.encode('utf-8')])
      newLetters.append(letter)
    arabicwords.append(newLetters)
    padded_input = pad_sequences([newWord], maxlen=15, padding='post', truncating='post', value=36)

    # Following token: fall back to the ' ' vector at the end of the text or
    # when the token has no embedding.
    has_next = i+1 < len(tokens) and tokens[i+1] in word2vec_model.wv
    X_train_L_new = [word2vec_model.wv[tokens[i+1]] if has_next else word2vec_model.wv[' ']]

    X_train_W_new = padded_input

    # Preceding token: the bound must be `i-1 >= 0`; with `> 0` the first
    # token (index 0) was never used as context for the second one.
    # NOTE(review): confirm the training pipeline builds its context the same way.
    has_prev = i-1 >= 0 and tokens[i-1] in word2vec_model.wv
    X_train_R_new = [word2vec_model.wv[tokens[i-1]] if has_prev else word2vec_model.wv[' ']]

    X_train_L.append(X_train_L_new)
    X_train_W.append(X_train_W_new)
    X_train_R.append(X_train_R_new)

X_train_L = np.array(X_train_L)
X_train_W = np.array(X_train_W)
X_train_R = np.array(X_train_R)

# Flatten to one row per word: 50 values per context vector, 15 letter ids.
X_train_L = (X_train_L).reshape((-1, 50))
X_train_W = (X_train_W).reshape((-1, 15))
X_train_R = (X_train_R).reshape((-1, 50))

diacritics = model.predict([X_train_L, X_train_W, X_train_R])

index = 0
output = ["ID,label\n"]

# NOTE(review): inputs are truncated to 15 letters, so a longer word would
# presumably index past the prediction for that word - confirm none occur.
for i in range(0, len(arabicwords)):
  for j in range(0, len(arabicwords[i])):
    di_index = np.argmax(diacritics[i][j])
    output.append(str(index) + "," +  str(diacritics_dict[ArabicDiacritics_RevMapping[di_index].decode('utf-8')]) + "\n")
    index += 1

# The file is opened only once every row is ready, so a failure above does
# not leave a truncated CSV behind.
with open("outputs_Model_2.csv", 'w') as csvfile:
  csvfile.writelines(output)

In [None]:
# Demo: diacritize one hard-coded sentence with the baseline model and print
# the letters interleaved with the predicted diacritics.
model_path = "Models/Model.h5"
model = load_model(model_path)
model.summary()

letters = ["قال احمد الحق"]
tokens = tokenize(process.remove_tarkeem(" ".join(letters)))

arabicwords = []
padded_inputs = []

for word in tokens:
  if not is_arabicrange(word):
    continue  # tokens rejected by is_arabicrange are left out of the result
  newLetters = list(word)
  newWord = [ArabicCharacters_Mapping[letter.encode('utf-8')] for letter in newLetters]
  arabicwords.append(newLetters)
  padded_input = pad_sequences([newWord], maxlen=15, padding='post', truncating='post', value=36)
  padded_inputs.append(padded_input)

# Each word was padded as a batch of one; flatten to one row of 15 ids per word.
padded_inputs = np.array(padded_inputs).reshape((-1, 15))

diacritics = model.predict(padded_inputs)

results = ""

for i, word_letters in enumerate(arabicwords):
  for j, letter in enumerate(word_letters):
    # Highest-scoring class at this letter position, mapped back to its diacritic.
    di_index = np.argmax(diacritics[i][j])
    results += letter + ArabicDiacritics_RevMapping[di_index].decode('utf-8')
  results += " "

print(results)

In [None]:
# Demo: diacritize one hard-coded sentence with the modified model and print
# the letters interleaved with the predicted diacritics. Each word is fed with
# the embedding of the following token (L), its own padded letter ids (W) and
# the embedding of the preceding token (R).
model_path = "Models/Modified_Model.h5"
model = load_model(model_path)
model.summary()

letters = ["قال احمد الحق"]
tokens = tokenize(process.remove_tarkeem(" ".join(letters)))

arabicwords = []
X_train_L = []
X_train_W = []
X_train_R = []

for i,word in enumerate(tokens):
  # Tokens rejected by is_arabicrange are left out of the result, but they
  # still count as neighbours of the surrounding words.
  if is_arabicrange(word):
    newWord = []
    newLetters = []
    for letter in word:
      newWord.append(ArabicCharacters_Mapping[letter.encode('utf-8')])
      newLetters.append(letter)
    arabicwords.append(newLetters)
    padded_input = pad_sequences([newWord], maxlen=15, padding='post', truncating='post', value=36)

    # Following token: fall back to the ' ' vector at the end of the text or
    # when the token has no embedding.
    has_next = i+1 < len(tokens) and tokens[i+1] in word2vec_model.wv
    X_train_L_new = [word2vec_model.wv[tokens[i+1]] if has_next else word2vec_model.wv[' ']]

    X_train_W_new = padded_input

    # Preceding token: the bound must be `i-1 >= 0`; with `> 0` the first
    # token (index 0) was never used as context for the second one.
    # NOTE(review): confirm the training pipeline builds its context the same way.
    has_prev = i-1 >= 0 and tokens[i-1] in word2vec_model.wv
    X_train_R_new = [word2vec_model.wv[tokens[i-1]] if has_prev else word2vec_model.wv[' ']]

    X_train_L.append(X_train_L_new)
    X_train_W.append(X_train_W_new)
    X_train_R.append(X_train_R_new)

X_train_L = np.array(X_train_L)
X_train_W = np.array(X_train_W)
X_train_R = np.array(X_train_R)

# Flatten to one row per word: 50 values per context vector, 15 letter ids.
X_train_L = (X_train_L).reshape((-1, 50))
X_train_W = (X_train_W).reshape((-1, 15))
X_train_R = (X_train_R).reshape((-1, 50))

diacritics = model.predict([X_train_L, X_train_W, X_train_R])

results = ""

for i in range(0, len(arabicwords)):
  for j in range(0, len(arabicwords[i])):
    # Highest-scoring class at this letter position, mapped back to its diacritic.
    di_index = np.argmax(diacritics[i][j])
    results += arabicwords[i][j]
    results += ArabicDiacritics_RevMapping[di_index].decode('utf-8')
  results += " "

print(results)