In [1]:
!pip install pyarabic
!pip install keras_preprocessing
import os
import random
from enum import Enum
import re
import numpy as np
from pyarabic.araby import separate, tokenize, is_arabicrange, strip_tashkeel, strip_tatweel
import nltk
from nltk.tokenize import sent_tokenize
import tensorflow as tf
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential,load_model, Model
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, CategoryEncoding, Bidirectional, Input, Dropout, TimeDistributed
from keras.initializers import glorot_normal
from gensim.models import Word2Vec


import sys

from google.colab import drive

# The project modules below are loaded from Google Drive, so Drive has to be
# mounted before the path is added and the imports run; otherwise a fresh
# kernel fails on the first project import. Mounting an already-mounted Drive
# only prints a notice.
drive.mount('/content/drive')
sys.path.append('/content/drive/MyDrive/NLP_Project/')

# NOTE(review): the wildcard import supplies ArabicCharacters_Mapping,
# ArabicDiacritics_Mapping and ArabicDiacritics_RevMapping used further down;
# consider importing these names explicitly.
from chars_enums import *
from file_reader import FileReader
from preprocessor import Preprocessor



In [2]:
# Mount Google Drive under /content/drive so files stored there are reachable
# from the notebook. If Drive is already mounted this only prints a notice.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Model Structure

In [3]:
def create_model(arabic_chars=36, num_of_ashkaaal=16, max_word_length=15,
                 embedding_dim=36, dropout_rate=0.5):
  """Build and compile the character-level BiLSTM diacritization model.

  The network maps a fixed-length sequence of character ids to one softmax
  distribution over diacritic classes per character position.

  Args:
    arabic_chars: Size of the character vocabulary (Embedding ``input_dim``).
    num_of_ashkaaal: Number of diacritic classes predicted per character.
    max_word_length: Number of character positions per input sequence.
    embedding_dim: Size of each character embedding vector.
    dropout_rate: Dropout rate applied after each bidirectional LSTM.

  Returns:
    A compiled Keras ``Model`` whose output has shape
    ``(batch, max_word_length, num_of_ashkaaal)``.
  """
  with tf.device('/device:GPU:0'):
    inputs = Input(shape=(max_word_length,))

    # NOTE(review): the data-preparation cells pad inputs with -1, which lies
    # outside the [0, input_dim) index range Embedding expects. Confirm how the
    # backend treats it, or reserve a dedicated padding index.
    embeddings = Embedding(input_dim=arabic_chars, output_dim=embedding_dim)(inputs)

    # return_sequences=True keeps one hidden state per character so that every
    # position receives its own prediction.
    blstm1 = Bidirectional(LSTM(units=64, return_sequences=True))(embeddings)
    dropout1 = Dropout(dropout_rate)(blstm1)

    blstm2 = Bidirectional(LSTM(units=128, return_sequences=True))(dropout1)
    dropout2 = Dropout(dropout_rate)(blstm2)

    # Per-position classifier head applied identically at every time step.
    dense1 = TimeDistributed(Dense(units=128, activation='relu'))(dropout2)
    dense2 = TimeDistributed(Dense(units=64, activation='relu'))(dense1)
    output = TimeDistributed(Dense(units=num_of_ashkaaal, activation='softmax'))(dense2)

    model = Model(inputs, output)
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model


### Model Training

In [4]:
class TrainModel:
    """Bundle training data with fit settings and train a fresh model on them."""

    def __init__(self,X_train, y_train, epochs, batch_size):
        """Store the training inputs, targets and fit hyper-parameters."""
        self.X_train, self.y_train = X_train, y_train
        self.epochs, self.batch_size = epochs, batch_size

    def train(self):
        """Create a new model, fit it on the stored data, print its summary and return it."""
        fit_options = {
            "epochs": self.epochs,
            "batch_size": self.batch_size,
            "shuffle": True,
        }

        network = create_model()
        network.fit(self.X_train, self.y_train, **fit_options)
        # The summary is printed after fitting, once the layers are built.
        network.summary()
        return network



### Prepare data for training

In [6]:
# Rebuild the tokenized pickles from the raw training text.
# NOTE: this runs on every execution of the cell. To reuse existing pickles,
# comment out the tokenize_data call only — `process` is still needed by the
# cell that loads the pickles.
file_reader = FileReader()
process = Preprocessor()
data = file_reader.open_file("train.txt")
cleaned_data = process.clean_data(data)
# presumably strips punctuation/numbering ("tarkeem") — confirm in preprocessor.py
no_tarkeem = process.remove_tarkeem(cleaned_data)
# presumably writes the letter tokens and the diacritic tokens to the two named
# pickle files — TODO confirm against Preprocessor.tokenize_data
process.tokenize_data(no_tarkeem, "words.pickle", "diacritics.pickle")

In [7]:
# Load the tokenized letters and diacritics, then print both lengths so the
# two lists can be compared at a glance.
letters_tokens, diacritics_tokens = process.read_tokenized_data(
    "words.pickle", "diacritics.pickle"
)
print(len(letters_tokens), len(diacritics_tokens), sep="\n")

2102068
2102068


### Prepare X_train

In [8]:
# Encode every word as a list of integer character ids, looking each letter up
# by its UTF-8 bytes in ArabicCharacters_Mapping.
sequences = [
  [ArabicCharacters_Mapping[letter.encode('utf-8')] for letter in word]
  for word in letters_tokens
]

# Bring every word to exactly 15 positions: shorter words are filled with -1
# at the end, longer words are cut at the end.
padded_input = pad_sequences(
  sequences,
  maxlen=15,
  padding='post',
  truncating='post',
  value=-1,
)

### One-hot encode the output diacritics


In [9]:
def to_one_hot(ashkal, size):
    """One-hot encode a sequence of diacritics.

    Each element of ``ashkal`` becomes a list of ``size`` zeros with a single 1
    at the index that ArabicDiacritics_Mapping assigns to its UTF-8 bytes.
    Elements missing from the mapping stay all-zero.

    Returns a list with one encoded vector per element of ``ashkal``.
    """
    def encode_one(mark):
        vector = [0] * size
        key = mark.encode('utf-8')
        if key in ArabicDiacritics_Mapping:
            vector[ArabicDiacritics_Mapping[key]] = 1
        return vector

    return [encode_one(mark) for mark in ashkal]

### Prepare y_train

In [10]:
# One-hot encode the diacritics of every word (16 classes per position).
output_hot_encoded = [to_one_hot(ashkaal, 16) for ashkaal in diacritics_tokens]

# Bring every word to exactly 15 positions; padded positions are all-zero
# vectors, longer words are cut at the end.
padded_output = pad_sequences(
    output_hot_encoded,
    maxlen=15,
    padding='post',
    truncating='post',
    value=[0] * 16,
)

In [11]:
# Fit settings for this run.
epochs, batch_size = 10, 1000

with tf.device('/device:GPU:0'):
    # Train on the padded character ids and their one-hot diacritic targets.
    X_train, y_train = padded_input, padded_output

    train_model = TrainModel(X_train, y_train, epochs, batch_size)
    trained_model = train_model.train()


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 15)]              0         
                                                                 
 embedding (Embedding)       (None, 15, 36)            1296      
                                                                 
 bidirectional (Bidirection  (None, 15, 128)           51712     
 al)                                                             
                                                                 
 dropout (Dropout)           (None, 15, 128)           0         
                                                                 
 bidirectional_1 (Bidirecti  (None, 15, 256)           263168    
 onal)                                                           
                

In [24]:
# Diacritize a sample sentence word by word with the trained model.
letters = "ذهب علي الى الشاطيء 3 2 1 ثم لعب الكرة"
tokens = tokenize(letters)

results = ""

for word in tokens:
  if is_arabicrange(word):
    encoded_word = [ArabicCharacters_Mapping[letter.encode('utf-8')] for letter in word]
    # Use a dedicated name here: the training cell reads the global
    # `padded_input`, so overwriting it would make a re-run of training fit on
    # this single word.
    word_input = pad_sequences([encoded_word], maxlen=15, padding='post', truncating='post', value=-1)
    diacritics = trained_model.predict(word_input)
    predicted_steps = len(diacritics[0])
    for j, letter in enumerate(word):
      results += letter
      # Words longer than the model window are truncated by pad_sequences, so
      # letters past the last predicted step are emitted without a diacritic
      # instead of raising IndexError.
      if j < predicted_steps:
        index = np.argmax(diacritics[0][j])
        results += ArabicDiacritics_RevMapping[index].decode('utf-8')
  else:
    # Non-Arabic tokens (digits, Latin text, ...) are copied through unchanged.
    results += word
  results += " "

print(results)

ذَهَبَ عَلِيَّ الَّى الشَّاطِيءِ 3 2 1 ثُمَّ لَعِبٍّ الْكِرَّةِ 
