In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# !pip install pyarabic
# !pip install keras_preprocessing
import os
import random
from enum import Enum
import re
import numpy as np
from pyarabic.araby import separate, tokenize, is_arabicrange, strip_tashkeel, strip_tatweel
import nltk
from nltk.tokenize import sent_tokenize
import tensorflow as tf
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential,load_model, Model
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, CategoryEncoding, Bidirectional, Input, Concatenate, Dropout, TimeDistributed, RepeatVector, Flatten
from keras.initializers import glorot_normal
from keras.losses import SparseCategoricalCrossentropy
from keras.metrics import SparseCategoricalAccuracy, F1Score
from gensim.models import Word2Vec, FastText


# import sys
# sys.path.append('/content/drive/MyDrive/NLP_Project/')

from chars_enums import *
from file_reader import FileReader
from preprocessor import Preprocessor

### Train Word2Vec Model

In [None]:
def train_word2vec(file_name="train.txt", vector_size=70, window=5, epochs=10):
  """Train a skip-gram Word2Vec model on the undiacritized corpus.

  Each line of the file is stripped of tarkeem and diacritics, tokenized
  (keeping only tokens accepted by ``is_arabicrange``) and wrapped in
  ``<SOS>`` / ``<EOS>`` markers.  Every token is left-justified with spaces
  to a width of 15, so the vocabulary keys of the returned model are the
  padded strings, not the raw words.

  Args:
    file_name: Path handed to ``FileReader.open_file``.
    vector_size: Dimensionality of the learned word vectors.
    window: Skip-gram context window size.
    epochs: Number of training passes over the corpus.

  Returns:
    The trained ``gensim.models.Word2Vec`` instance.

  NOTE(review): ``create_modified_model`` declares ``word2vec_length = 50``
  while the default ``vector_size`` here is 70 - confirm which one is meant
  before feeding vectors from this model into that network.
  """
  file_reader = FileReader()
  process = Preprocessor()
  data = file_reader.open_file(file_name)
  no_tarkeem_data = process.remove_tarkeem(data)

  corpus = []
  for line in no_tarkeem_data.split('\n'):
    line_no_diacritics = process.remove_diacritics(line)
    words = ['<SOS>']
    words.extend(tokenize(line_no_diacritics, conditions=is_arabicrange))
    words.append('<EOS>')
    corpus.append([word.ljust(15, ' ') for word in words])

  # Create the model WITHOUT a corpus: passing sentences to the constructor
  # already runs a full training cycle, so the explicit train() call that
  # followed trained the model a second time.  Build the vocabulary and train
  # exactly once, for the requested number of epochs.
  model = Word2Vec(sg=1, window=window, vector_size=vector_size)
  model.build_vocab(corpus)

  # Train the model (this might take time depending on dataset size)
  model.train(corpus, total_examples=model.corpus_count, epochs=epochs)

  return model


In [None]:
### UNCOMMENT THE NEXT LINES TO TRAIN THE WORD2VEC MODEL
#word2vec_model = train_word2vec()
#word2vec_model.save("/content/drive/MyDrive/NLP_Project/word2vec_model/my_word2vec.model")

In [None]:
# Load the previously trained Word2Vec model from disk; later cells look up
# neighbouring-word vectors through `word2vec_model.wv`.
word2vec_model = Word2Vec.load("word2vec_model/Word2Vec.model")
# word2vec_model = Word2Vec

### Model Structure

In [None]:
def create_model():
  """Build and compile the character-level diacritization network.

  Architecture: a length-15 sequence of integer character ids goes through
  an Embedding (37 ids -> 37 dims), a bidirectional LSTM with 256 units per
  direction that returns the full sequence, and a per-position Dense softmax
  over 16 classes.

  Returns:
    A Keras ``Model`` compiled with the Adam optimizer, sparse categorical
    cross-entropy loss and sparse categorical accuracy as metric.
  """
  vocabulary_size = 37      # Embedding input_dim
  diacritic_classes = 16    # width of the per-character softmax
  word_length = 15          # fixed number of character positions per word

  with tf.device('/device:GPU:0'):
    char_ids = Input(shape=(word_length,))
    char_vectors = Embedding(input_dim=vocabulary_size, output_dim=37)(char_ids)

    encoder = Bidirectional(LSTM(units=256, return_sequences=True))
    encoded = encoder(char_vectors)

    classifier = TimeDistributed(Dense(units=diacritic_classes, activation='softmax'))
    per_char_probs = classifier(encoded)

    network = Model(char_ids, per_char_probs)
    network.compile(
        optimizer='adam',
        loss=SparseCategoricalCrossentropy(),
        metrics=[SparseCategoricalAccuracy()],
    )

    return network



def create_modified_model():
  """Build and compile the context-aware diacritization network.

  The model takes three inputs, in this order: a 50-dim vector for one
  neighbouring word ("left"), the length-15 sequence of character ids of the
  current word, and a 50-dim vector for the other neighbouring word
  ("right").

  Both context vectors are repeated along the 15 character positions,
  concatenated and encoded by a bidirectional LSTM (128 units per direction).
  That encoding is concatenated with the character embeddings (37 ids ->
  37 dims), passed through a second bidirectional LSTM (256 units per
  direction) and finally through a per-position Dense softmax over 16 classes.

  Returns:
    A Keras ``Model`` compiled with the Adam optimizer, sparse categorical
    cross-entropy loss and sparse categorical accuracy as metric.
  """
  context_vector_size = 50  # length of each neighbouring-word vector
  vocabulary_size = 37      # Embedding input_dim
  diacritic_classes = 16    # width of the per-character softmax
  word_length = 15          # fixed number of character positions per word

  with tf.device('/device:GPU:0'):
    left_context = Input(shape=(context_vector_size,))
    right_context = Input(shape=(context_vector_size,))
    char_ids = Input(shape=(word_length,))

    char_vectors = Embedding(input_dim=vocabulary_size, output_dim=37)(char_ids)

    # Give every character position a copy of each context vector.
    left_repeated = RepeatVector(word_length)(left_context)
    right_repeated = RepeatVector(word_length)(right_context)
    context_sequence = Concatenate()([left_repeated, right_repeated])

    context_encoded = Bidirectional(LSTM(units=128, return_sequences=True))(context_sequence)

    merged = Concatenate()([char_vectors, context_encoded])
    word_encoded = Bidirectional(LSTM(units=256, return_sequences=True))(merged)

    per_char_probs = TimeDistributed(Dense(units=diacritic_classes, activation='softmax'))(word_encoded)

    # Input order matters to callers: [left, word, right].
    network = Model([left_context, char_ids, right_context], per_char_probs)
    network.compile(
        optimizer='adam',
        loss=SparseCategoricalCrossentropy(),
        metrics=[SparseCategoricalAccuracy()],
    )

    return network


In [None]:
# Build the modified model once only to print its layer summary, then drop the
# reference; the training cell below creates its own instance.
model = create_modified_model()
model.summary()
del model

## Models Training

In [None]:
class UseModel:
    """Bundle a dataset with fit settings and apply them to a Keras-style model.

    Attributes:
        X_train: Model inputs (a single array, or a list of arrays for a
            multi-input model); passed unchanged to ``fit`` / ``evaluate``.
        y_train: Targets matching ``X_train``.
        epochs: Number of epochs used by ``train`` / ``train_modified``.
        batch_size: Batch size used by ``train`` / ``train_modified``.
    """

    def __init__(self,X_train, y_train, epochs, batch_size):
        self.X_train = X_train
        self.y_train = y_train
        self.epochs = epochs
        self.batch_size = batch_size

    def train(self, model):
        """Fit ``model`` on the stored data with shuffling enabled.

        Returns:
            Whatever ``model.fit`` returns (previously this was discarded),
            so callers can inspect the training curves.
        """
        # An earlier, disabled variant also passed a `sample_weight` that
        # zeroed positions equal to the padding index 36.
        return model.fit(self.X_train, self.y_train, epochs=self.epochs, batch_size=self.batch_size, shuffle=True)

    def train_modified(self, model):
        """Fit the multi-input model; identical to ``train``.

        Kept as a separate entry point for existing callers.

        Returns:
            Whatever ``model.fit`` returns.
        """
        return self.train(model)

    def evaluate(self,model):
        """Evaluate ``model`` on the stored data, print its summary and score.

        The second entry of the evaluation result (the first compiled metric)
        is printed as a percentage.

        Returns:
            The raw result of ``model.evaluate`` (previously discarded).
        """
        results = model.evaluate(self.X_train, self.y_train)
        model.summary()
        print("Evaluation Results:", results[1]*100)
        return results



## Prepare data utilities


In [None]:
def to_one_hot(ashkal, size):
    """Turn a sequence of diacritics into one-hot rows of length ``size``.

    Each diacritic is UTF-8 encoded and looked up in
    ``ArabicDiacritics_Mapping``; the mapped position is set to 1.  A
    diacritic that is not in the mapping yields an all-zero row.

    Args:
        ashkal: Iterable of diacritic strings.
        size: Length of every one-hot row.

    Returns:
        A list with one ``size``-long list of 0/1 per input diacritic.
    """
    def encode(diacritic):
        row = [0] * size
        key = diacritic.encode('utf-8')
        if key in ArabicDiacritics_Mapping:
            row[ArabicDiacritics_Mapping[key]] = 1
        return row

    return [encode(diacritic) for diacritic in ashkal]

def prepare_data(file_name):
  """Clean ``<file_name>.txt`` and write its tokenized form to pickle files.

  The text is read with ``FileReader``, passed through
  ``Preprocessor.clean_data`` and ``Preprocessor.remove_tarkeem``, and handed
  to ``Preprocessor.tokenize_data`` together with the two output paths
  ``<file_name>_words.pickle`` and ``<file_name>_diacritics.pickle``.

  Args:
    file_name: Dataset path without the ``.txt`` extension.
  """
  reader = FileReader()
  preprocessor = Preprocessor()

  raw_text = reader.open_file(file_name+".txt")
  text_without_tarkeem = preprocessor.remove_tarkeem(preprocessor.clean_data(raw_text))

  preprocessor.tokenize_data(
      text_without_tarkeem,
      file_name + "_words.pickle",
      file_name +"_diacritics.pickle",
  )

def prepare_modified_data(file_name):
  """Tokenize ``<file_name>.txt`` with sentence markers and pickle the result.

  After ``Preprocessor.remove_tarkeem``, every line is tokenized (keeping
  only tokens accepted by ``is_arabicrange``) and surrounded by ``<SOS>`` and
  ``<EOS>``.  All tokens are joined with single spaces and handed to
  ``Preprocessor.tokenize_data`` together with the output paths
  ``<file_name>_words_mod.pickle`` and ``<file_name>_diacritics_mod.pickle``.

  Args:
    file_name: Dataset path without the ``.txt`` extension.
  """
  reader = FileReader()
  preprocessor = Preprocessor()
  text_without_tarkeem = preprocessor.remove_tarkeem(reader.open_file(file_name+".txt"))

  marked_tokens = []
  for line in text_without_tarkeem.split('\n'):
    marked_tokens.append("<SOS>")
    marked_tokens.extend(tokenize(line, conditions=is_arabicrange))
    marked_tokens.append("<EOS>")

  preprocessor.tokenize_data(
      " ".join(marked_tokens),
      file_name + "_words_mod.pickle",
      file_name +"_diacritics_mod.pickle",
  )

def pad_input(lett):
  """Encode tokenized words as fixed-length rows of character ids.

  Every character of every word is UTF-8 encoded and mapped through
  ``ArabicCharacters_Mapping``; the rows are then padded / truncated at the
  end to 15 entries, using 36 as the padding value.

  Args:
    lett: Iterable of word strings to encode.  (Previously this argument was
      ignored and the global ``letters_tokens`` was read instead, so the
      function only worked when called with that exact global.)

  Returns:
    The result of ``pad_sequences`` for the encoded words.

  Raises:
    KeyError: If a character is missing from ``ArabicCharacters_Mapping``.
  """
  sequences = []
  for word in lett:
    if word == "<SOS>" or word == "<EOS>":
      # NOTE(review): sentence markers are forwarded as raw strings, not as
      # id lists - confirm pad_sequences handles them as intended.
      sequences.append(word)
      continue
    sequences.append([ArabicCharacters_Mapping[letter.encode('utf-8')] for letter in word])

  return pad_sequences(sequences, maxlen=15, padding='post', truncating='post', value=36)


def pad_output(diacritics_tokens):
  """Encode per-word diacritic sequences as fixed-length rows of class ids.

  Every diacritic is UTF-8 encoded and mapped through
  ``ArabicDiacritics_Mapping``; the rows are then padded / truncated at the
  end to 15 entries, using 15 as the padding value.

  Args:
    diacritics_tokens: Iterable of words, each an iterable of diacritic
      strings.

  Returns:
    The result of ``pad_sequences`` for the encoded words.
  """
  encoded_words = [
      [ArabicDiacritics_Mapping[shakl.encode('utf-8')] for shakl in ashkaal]
      for ashkaal in diacritics_tokens
  ]
  return pad_sequences(encoded_words, maxlen=15, padding='post', truncating='post', value=15)



## prepare train data

In [None]:
# Uncomment the next line to regenerate the train pickles from train.txt.
#prepare_data("train")
process = Preprocessor()
# Load the tokenized training words and their diacritics from the pickles.
letters_tokens, diacritics_tokens = process.read_tokenized_data("train_words.pickle", "train_diacritics.pickle")
print(len(letters_tokens))
print(len(diacritics_tokens))

# Encode both sides as fixed-length (15) integer rows; these globals are read
# by the training cells below.
padded_input = pad_input(letters_tokens)
padded_output = pad_output(diacritics_tokens)

## Train old Model

In [None]:
# Train the character-only model on the padded training data prepared above.
import gc
gc.collect()
with tf.device('/device:GPU:0'):
    X_train = padded_input
    y_train = padded_output
    epochs = 10
    batch_size = 1000
    model = create_model()
    train_model = UseModel(X_train, y_train, epochs, batch_size)
    train_model.train(model)

## save the old model

In [None]:
# Persist the model trained in the previous cell; the evaluation and test
# cells load it back from this path.
model.save("Models/Model.h5", save_format='h5')

## Training Modified Model

In [None]:
# Train the context-aware model: each sample is one word plus the Word2Vec
# vectors of its two neighbouring tokens.
import gc
gc.collect()
with tf.device('/device:GPU:0'):

    X_train_L = []
    X_train_W = []
    X_train_R = []
    # The first and last tokens are skipped because they lack one neighbour.
    # X_train_L holds the vector of the FOLLOWING token (i+1) and X_train_R the
    # vector of the PRECEDING token (i-1); tokens missing from the Word2Vec
    # vocabulary fall back to the vector stored under ' '.
    for i in range (1,len(padded_input)-1):
      X_train_L.append(word2vec_model.wv[letters_tokens[i+1]] if letters_tokens[i+1] in word2vec_model.wv else word2vec_model.wv[' '])
      X_train_W.append(padded_input[i])
      X_train_R.append(word2vec_model.wv[letters_tokens[i-1]] if letters_tokens[i-1] in word2vec_model.wv else word2vec_model.wv[' '])


    X_train_L = np.array(X_train_L)
    X_train_W = np.array(X_train_W)
    X_train_R = np.array(X_train_R)

    # Targets are trimmed the same way as the inputs (first/last word dropped).
    y_train = padded_output[1:-1]

    epochs = 10

    batch_size = 1000

    model = create_modified_model()
    model.summary()

    # Input order matches the model definition: [left, word, right].
    train_model = UseModel([X_train_L, X_train_W, X_train_R], y_train, epochs, batch_size)

    # NOTE(review): no cell visible in this notebook saves this model to
    # "Models/Modified_Model.h5", the path later cells load - confirm it is
    # saved elsewhere.
    train_model.train_modified(model)



## Prepare validation data

In [None]:
# Uncomment the next line to regenerate the validation pickles from val.txt.
#prepare_data("val")
process = Preprocessor()
# Load the tokenized validation words and their diacritics. This rebinds the
# same globals that previously held the training data.
letters_tokens, diacritics_tokens = process.read_tokenized_data("val_words.pickle", "val_diacritics.pickle")
print(len(letters_tokens))
print(len(diacritics_tokens))

# Encode both sides as fixed-length (15) integer rows for the evaluation cells.
padded_input = pad_input(letters_tokens)
padded_output = pad_output(diacritics_tokens)

## Evaluate Modified model

In [None]:
# Evaluate the saved context-aware model on the validation data loaded above.
# The variables keep their "train" names but hold validation data here.
with tf.device('/device:GPU:0'):
    # X_train is not used below; y_train is reassigned after the loop.
    X_train = padded_input

    y_train = padded_output

    X_train_L = []
    X_train_W = []
    X_train_R = []
    # Same neighbour construction as in training: following token (i+1) goes to
    # X_train_L, preceding token (i-1) to X_train_R, with the ' ' vector as the
    # fallback for tokens missing from the Word2Vec vocabulary.
    for i in range (1,len(padded_input)-1):
      X_train_L.append(word2vec_model.wv[letters_tokens[i+1]] if letters_tokens[i+1] in word2vec_model.wv else word2vec_model.wv[' '])
      X_train_W.append(padded_input[i])
      X_train_R.append(word2vec_model.wv[letters_tokens[i-1]] if letters_tokens[i-1] in word2vec_model.wv else word2vec_model.wv[' '])


    X_train_L = np.array(X_train_L)
    X_train_W = np.array(X_train_W)
    X_train_R = np.array(X_train_R)

    # Drop the first and last targets to line up with the inputs.
    y_train = padded_output[1:-1]

    # epochs and batch_size are only stored by UseModel; evaluate() does not
    # read them.
    epochs = 10

    batch_size = 1000

    model_path = "Models/Modified_Model.h5"
    model = load_model(model_path)
    evaluate_model = UseModel([X_train_L, X_train_W, X_train_R], y_train, epochs, batch_size)
    evaluate_model.evaluate(model)

## Old Model Evaluation


In [None]:
# Evaluate the saved character-only model on the validation data loaded above.
# The variables keep their "train" names but hold validation data here.
import gc
gc.collect()
with tf.device('/device:GPU:0'):
    X_train = padded_input

    y_train = padded_output

    # epochs and batch_size are only stored by UseModel; evaluate() does not
    # read them.
    epochs = 10

    batch_size = 1000

    model_path = "Models/Model.h5"
    model = load_model(model_path)
    train_model = UseModel(X_train, y_train, epochs, batch_size)

    train_model.evaluate(model)


## Test Old model on one line

In [None]:
# Diacritize one sample sentence with the saved character-only model and print
# the result.
letters = " فالخيل والليل والبيداء تعرفني والسيف والرمح والقرطاس والقلم"
tokens = tokenize(letters)

results = ""
model_path = "Models/Model.h5"
model = load_model(model_path)
for word in tokens:
  if is_arabicrange(word):
    # Encode the word as character ids and pad/truncate it to 15 positions.
    # Note: this rebinds the notebook-global `padded_input`.
    newWord = []
    for letter in word:
      newWord.append(ArabicCharacters_Mapping[letter.encode('utf-8')])
    padded_input = pad_sequences([newWord], maxlen=15, padding='post', truncating='post', value=36)
    diacritics = model.predict(padded_input)
    # Emit each letter followed by the most probable diacritic for its position.
    # NOTE(review): the prediction has 15 positions, so a word longer than 15
    # characters would index past the end here - confirm none occur.
    for j in range(0, len(word)):
        results += word[j]
        index = np.argmax(diacritics[0][j])
        results += ArabicDiacritics_RevMapping[index].decode('utf-8')
  else:
    # Non-Arabic tokens are copied through unchanged.
    results += word
  results += " "

print(results)

## Test Modified Model

In [None]:
# Diacritize one sample sentence with the saved context-aware model and print
# the result.
letters = "قال احمد الحق"
tokens = tokenize(letters)

results = ""


model_path = "Models/Modified_Model.h5"
model = load_model(model_path)
for i,word in enumerate(tokens):
  if is_arabicrange(word):
    # Encode the word as character ids and pad/truncate it to 15 positions.
    newWord = []
    for letter in word:
      newWord.append(ArabicCharacters_Mapping[letter.encode('utf-8')])
    padded_input = pad_sequences([newWord], maxlen=15, padding='post', truncating='post', value=36)

    # Neighbour vectors come from THIS sentence's tokens. The previous code
    # checked `tokens[...]` but then indexed the unrelated global
    # `letters_tokens`, and its `i-1 > 0` test skipped the first token as a
    # neighbour of the second one. Missing or out-of-vocabulary neighbours
    # fall back to the vector stored under ' '.
    X_train_L = [word2vec_model.wv[tokens[i+1]] if i+1 < len(tokens) and tokens[i+1] in word2vec_model.wv else word2vec_model.wv[' ']]
    X_train_W = padded_input
    X_train_R = [word2vec_model.wv[tokens[i-1]] if i-1 >= 0 and tokens[i-1] in word2vec_model.wv else word2vec_model.wv[' ']]

    X_train_L = np.array(X_train_L)
    X_train_W = np.array(X_train_W)
    X_train_R = np.array(X_train_R)

    # Input order matches the model definition: [left, word, right].
    diacritics = model.predict([X_train_L, X_train_W, X_train_R])
    # Emit each letter followed by the most probable diacritic for its position.
    for j in range(0, len(word)):
        results += word[j]
        index = np.argmax(diacritics[0][j])
        results += ArabicDiacritics_RevMapping[index].decode('utf-8')
  else:
    # Non-Arabic tokens are copied through unchanged.
    results += word
  results += " "

print(results)

# Upload the data in CSV section

In [None]:
import pickle


# Load the lookup used by the CSV cells to turn a diacritic string into the
# label written to the submission file.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
# The context manager closes the handle, which the previous bare open() leaked.
with open("dataset/diacritic2id.pickle","rb") as file:
    dic = pickle.load(file)

print(dic)

In [None]:
import csv

process = Preprocessor()
arabicwords = []

model_path = "Models/Model.h5"
model = load_model(model_path)

# Read the undiacritized test text. The handle is now closed deterministically
# and the encoding is explicit instead of depending on the platform default.
# The output CSV is no longer opened here: it was truncated without anything
# being written, and the next cell writes it in full.
with open("dataset/test_no_diacritics.txt", "r", encoding="utf-8") as test_file:
  letters = test_file.readlines()
tokens = tokenize(process.remove_tarkeem(" ".join(letters)))

padded_inputs = []

for word in tokens:
  if is_arabicrange(word):
    # Keep the unpadded ids in `arabicwords` (the next cell uses their lengths)
    # and a padded/truncated length-15 copy for the model.
    newWord = [ArabicCharacters_Mapping[letter.encode('utf-8')] for letter in word]
    arabicwords.append(newWord)
    padded_input = pad_sequences([newWord], maxlen=15, padding='post', truncating='post', value=36)
    padded_inputs.append(padded_input)

# Collapse the per-word (1, 15) arrays into one (num_words, 15) batch.
padded_inputs = np.array(padded_inputs).reshape(-1, 15)

diacritics = model.predict(padded_inputs)

In [None]:
# Write one "ID,label" row per character of every Arabic test word, using the
# predictions of the character-only model from the previous cell.
index = 0
output = []

with open("outputs_Model_1.csv", 'w') as csvfile:
  for word_row, encoded_word in enumerate(arabicwords):
    # NOTE(review): predictions cover 15 positions per word, while
    # `arabicwords` keeps the unpadded length - confirm no word exceeds 15.
    for char_pos in range(len(encoded_word)):
      arindex = np.argmax(diacritics[word_row][char_pos])
      label = dic[ArabicDiacritics_RevMapping[arindex].decode('utf-8')]
      output.append(",".join((str(index), str(label))) + "\n")
      index += 1

  # Header first, then all rows in running-ID order.
  csvfile.writelines(["ID,label\n"])
  csvfile.writelines(output)

In [None]:
import csv
import pandas

process = Preprocessor()
arabicwords = []

model_path = "Models/Modified_Model.h5"
model = load_model(model_path)

# Read the undiacritized test text. The handle is now closed deterministically
# and the encoding is explicit instead of depending on the platform default.
# The output CSV is no longer opened here: it was truncated without anything
# being written, and the next cell writes it in full.
with open("dataset/test_no_diacritics.txt", "r", encoding="utf-8") as test_file:
  letters = test_file.readlines()
tokens = tokenize(process.remove_tarkeem(" ".join(letters)))

X_train_L = []
X_train_W = []
X_train_R = []

# `i` must follow the token position. It used to be fixed at 0, so every word
# received the same neighbour vectors regardless of where it stood.
for i, word in enumerate(tokens):
    if is_arabicrange(word):
      # Keep the unpadded ids in `arabicwords` (the next cell uses their
      # lengths) and a padded/truncated length-15 copy for the model.
      newWord = [ArabicCharacters_Mapping[letter.encode('utf-8')] for letter in word]
      arabicwords.append(newWord)
      padded_input = pad_sequences([newWord], maxlen=15, padding='post', truncating='post', value=36)

      # Following token -> "L" input, preceding token -> "R" input; missing or
      # out-of-vocabulary neighbours fall back to the vector stored under ' '.
      # `i-1 >= 0` (previously `> 0`) lets the first token act as a neighbour.
      X_train_L_new = [word2vec_model.wv[tokens[i+1]] if i+1 < len(tokens) and tokens[i+1] in word2vec_model.wv else word2vec_model.wv[' ']]
      X_train_W_new = padded_input
      X_train_R_new = [word2vec_model.wv[tokens[i-1]] if i-1 >= 0 and tokens[i-1] in word2vec_model.wv else word2vec_model.wv[' ']]

      X_train_L.append(X_train_L_new)
      X_train_W.append(X_train_W_new)
      X_train_R.append(X_train_R_new)

X_train_L = np.array(X_train_L)
X_train_W = np.array(X_train_W)
X_train_R = np.array(X_train_R)

# Collapse the per-word singleton batches into (num_words, 50) context
# matrices and a (num_words, 15) character-id matrix.
X_train_L = (X_train_L).reshape((-1, 50))
X_train_W = (X_train_W).reshape((-1, 15))
X_train_R = (X_train_R).reshape((-1, 50))

# Input order matches the model definition: [left, word, right].
diacritics = model.predict([X_train_L, X_train_W, X_train_R])



In [None]:
# Write one "ID,label" row per character of every Arabic test word, using the
# predictions of the context-aware model from the previous cell.
index = 0
output = []

with open("outputs_Model_2.csv", 'w') as csvfile:
  for word_row, encoded_word in enumerate(arabicwords):
    # NOTE(review): predictions cover 15 positions per word, while
    # `arabicwords` keeps the unpadded length - confirm no word exceeds 15.
    for char_pos in range(len(encoded_word)):
      arindex = np.argmax(diacritics[word_row][char_pos])
      label = dic[ArabicDiacritics_RevMapping[arindex].decode('utf-8')]
      output.append(",".join((str(index), str(label))) + "\n")
      index += 1

  # Header first, then all rows in running-ID order.
  csvfile.writelines(["ID,label\n"])
  csvfile.writelines(output)