In [1]:
#Importations

import json
import re
import os

import numpy as np

from keras.models import load_model
from keras.preprocessing.text import Tokenizer, tokenizer_from_json
from keras.preprocessing.sequence import pad_sequences

## Load Files

Load the trained model, the tokenizer used during training, and the required text files.

In [2]:
# Trained lemmatization model used by predicter().
Lemma_model = load_model("Models/Lemma_model.h5")

# Import the tokenizer from the training phase to tokenize new sentences for prediction.

TokenBase = "Models/Tokenizers/"
with open(TokenBase+'Letter_tokenizer.json') as f:
    data = json.load(f)
    Letter_tokenizer = tokenizer_from_json(data)


# Word lists, one entry per line with the trailing newline removed.
# The files are read inside `with` blocks so the handles are closed as soon
# as the lists are built (the bare open(...).readlines() form never closed them).
TxtBase = "Models/txt/"
with open(TxtBase+"StopWords.txt","r",encoding="utf-8") as f:
    StopWords = [line.replace("\n","") for line in f.readlines()]
with open(TxtBase+"Prepositions.txt","r",encoding="utf-8") as f:
    Prepositions = [line.replace("\n","") for line in f.readlines()]


# NOTE(review): this is set after keras has already been imported and the model
# loaded; TensorFlow log filtering usually needs the variable set before the
# import to take full effect -- confirm and move to the top of the notebook if so.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# Fixed length every letter sequence is padded/truncated to before prediction.
seq_length = 25

- Preprocessing functions to prepare text before prediction. 

In [3]:
# Alternation of the diacritic marks to strip; re.VERBOSE makes the regex
# engine ignore the spaces that separate the alternatives.
arabic_diacritics = re.compile("""  ّ| َ| ً| ُ| ٌ| ِ| ٍ| ْ| ٰ """, re.VERBOSE)


def remove_diac(text):
    """Return `text` without diacritics and with "ٱ" normalised to "ا".

    Every character matched by `arabic_diacritics` is deleted, then each
    "ٱ" is replaced by a plain "ا". All other characters are left untouched.
    """
    without_marks = arabic_diacritics.sub('', text)
    return without_marks.replace("ٱ", "ا")

In [4]:
# Turn text lines into integer sequences and pad them to a fixed length.
def encode_sequences(tokenizer, length, lines):
    """Encode `lines` with `tokenizer` and pad/truncate each sequence to `length`.

    Args:
        tokenizer: object exposing `texts_to_sequences`.
        length: target length of every returned sequence.
        lines: texts to encode.

    Returns:
        The result of `pad_sequences`, with padding and truncation both
        applied at the end of each sequence ('post').
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(lines),
        maxlen=length,
        padding='post',
        truncating='post',
    )

In [5]:
# Receives text as input, cleans it, and returns it as a list of word and punctuation tokens.
def splitter(Text):
    """Strip diacritics from `Text` and split it into tokens.

    Args:
        Text: the input string.

    Returns:
        A list of strings in order of appearance: each run of word characters
        is one token and each non-space, non-word character (punctuation) is
        its own token. Whitespace is dropped.
    """
    # Diacritics are removed first because they are not word characters and
    # would otherwise split words apart. remove_diac works character by
    # character, so one call on the whole string gives the same result as
    # calling it once per character.
    cleaned = remove_diac(Text)
    # Separate punctuation marks from words.
    return re.findall(r"\w+|[^\w\s]", cleaned, re.UNICODE)

- Main predicting function to be used in the decoder

In [6]:
# Important function that returns False if the machine wasn't sure of its prediction
def MyArgmax(List, threshold=0.95):
    """Tell whether every score vector of the first sample is confident enough.

    Args:
        List: indexable batch of predictions; only `List[0]` is inspected.
            Presumably shaped (batch, positions, classes) -- confirm against
            the model's output.
        threshold: minimum value the largest score of each vector must reach.
            Defaults to 0.95, the previously hard-coded cutoff.

    Returns:
        False as soon as one vector in `List[0]` has its maximum below
        `threshold`; True otherwise (including when `List[0]` is empty).
    """
    # `not any(... < threshold)` rather than `all(... >= threshold)` so that a
    # NaN maximum is treated exactly as before (it does not trigger a False).
    return not any(np.max(item) < threshold for item in List[0])

In [7]:
def predicter(Text):
    """Predict the lemma letter sequence(s) for the word(s) in `Text`.

    Args:
        Text: string containing one or more whitespace-separated words.

    Returns:
        A 2-D integer array with one row per word. When MyArgmax accepts the
        model output, each row holds the argmax class per position; otherwise
        the encoded input sequences are returned unchanged so the word decodes
        back to itself.
    """
    # Text preprocessing from [Example] to [2,3,4,4,4,0,0,0] so it can be predicted:
    # letters are separated by spaces so the tokenizer encodes each one.
    Text_Split = Text.split()
    Text_join = [(' '.join(elem)) for elem in Text_Split]
    Word_Test = encode_sequences(Letter_tokenizer, seq_length, Text_join)

    # Prediction: run the model once and reuse the output for both the
    # confidence check and the argmax (it used to be computed twice).
    probabilities = Lemma_model.predict(Word_Test)
    if MyArgmax(probabilities):
        return np.argmax(probabilities, axis=-1)
    # The model was below the confidence threshold: return the same input.
    return Word_Test

# The Decoder

The cell below works as the decoder: it takes the output prediction as a sequence and decodes it into text.

In [8]:
# Receives one sequence value and decodes it into one letter
def get_Letter(n, tokenizer):
    """Return the entry of `tokenizer.word_index` whose index equals `n`.

    Args:
        n: the index to look up.
        tokenizer: object exposing a `word_index` mapping of entry -> index.

    Returns:
        The first matching entry in iteration order, or None when no entry
        has index `n`.
    """
    matches = (entry for entry, idx in tokenizer.word_index.items() if idx == n)
    return next(matches, None)

# Receives a word, predicts its result and returns it decoded as text
def PredResult(Text):
    """Predict `Text` and decode the first predicted sequence into a string.

    Each row returned by predicter() is decoded value by value with
    get_Letter(); values with no matching letter are skipped and the
    remaining letters are concatenated. Only the first decoded row is returned.
    """
    decoded_words = []
    for sequence in predicter(Text):
        # Decode every value of the row, keeping only the ones that map to a letter.
        letters = (get_Letter(code, Letter_tokenizer) for code in sequence)
        decoded_words.append(''.join(ch for ch in letters if ch is not None))
    return decoded_words[0]

# Main function to extract lemmas

In [9]:
def GetLemma(Text):
    """Return `Text` with every word replaced by its predicted lemma.

    The text is tokenised with splitter(). Stop words, numeric tokens and
    punctuation tokens are copied through unchanged; every other token is
    sent to PredResult().

    Args:
        Text: the input string.

    Returns:
        A string in which each output token is preceded by a single space
        (so a non-empty result starts with a space).
    """
    stop_words = set(StopWords)          # constant-time membership tests
    pieces = []
    for Word in splitter(Text):          # handle each token separately
        # Any punctuation token is passed through, not only ASCII ".!?:;,-":
        # splitter() emits every non-word, non-space character as its own
        # token, and sending those to the model is meaningless.
        if Word in stop_words or Word.isnumeric() or re.match(r"[^\w\s]", Word):
            pieces.append(Word)
        else:                            # send the word to the decoder
            pieces.append(str(PredResult(Word)))
    return ''.join(" " + piece for piece in pieces)

In [13]:
# Demo: lemmatize a sample sentence and print the space-separated result.
Text = "يأكل يلعب يذهب يمرح يرى يتمشى استراتيجية اليقطين الأحمر تلاعب الحرير"
Result = GetLemma(Text)
print(Result)
# Uncomment to print the result with diacritics stripped:
# print(remove_diac(Result)) 

 أَكَل لَعِبَ ذَهَبَ مَرِحَ رَأَى تَمَشَّى استراتيجية اليقطين أَحْمَر تَلاعُب حَرِير
