In [29]:
#Importations

import json
import re
import os

import numpy as np

from keras.models import load_model
from keras.preprocessing.text import Tokenizer, tokenizer_from_json
from keras.preprocessing.sequence import pad_sequences

## Load Files

Load the trained models, the tokenizer used during training, and the required text files (stop words and prepositions).

In [30]:
# Trained Keras models: Lemma_model is used for lemma prediction and
# POS_model for part-of-speech prediction.
Lemma_model = load_model("Models/Lemma_model.h5")
POS_model = load_model("Models/POS_model.h5")

# Import the tokenizer from the training phase to tokenize new sentences for prediction.

TokenBase = "Models/Tokenizers/"
with open(TokenBase+'Letter_tokenizer.json') as f:
    data = json.load(f)
    Letter_tokenizer = tokenizer_from_json(data)


# Word lists, one entry per line. The files are read inside `with` blocks so
# the handles are closed as soon as the lists are built; only the newline
# character is removed from each line.
TxtBase = "Models/txt/"
with open(TxtBase+"StopWords.txt", "r", encoding="utf-8") as stop_words_file:
    StopWords = [line.replace("\n", "") for line in stop_words_file]
with open(TxtBase+"Prepositions.txt", "r", encoding="utf-8") as prepositions_file:
    Prepositions = [line.replace("\n", "") for line in prepositions_file]


# NOTE(review): TensorFlow usually needs this variable to be set before it is
# imported in order to silence its C++ logs; keras is already imported at this
# point - confirm the setting has the intended effect here.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
seq_length = 25   # length every letter sequence is padded/truncated to

- Preprocessing functions to prepare text before prediction. 

In [119]:
# Compiled with re.VERBOSE, so the spaces inside the pattern are ignored and
# only the diacritic marks themselves act as alternatives.
arabic_diacritics = re.compile("""  ّ| َ| ً| ُ| ٌ| ِ| ٍ| ْ| ٰ """, re.VERBOSE)


def remove_diac(text):
    """Return `text` without the marks matched by `arabic_diacritics`.

    The alef-wasla letter is additionally replaced by a plain alef.
    """
    stripped = arabic_diacritics.sub('', text)
    return stripped.replace("ٱ", "ا")

In [32]:
# Turn texts into integer sequences of one fixed length.
def encode_sequences(tokenizer, length, lines):
    """Tokenize `lines` with `tokenizer` and force every sequence to `length` items.

    Sequences shorter than `length` are padded at the end and longer ones are
    cut at the end (padding='post', truncating='post').
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(lines),
        maxlen=length,
        padding='post',
        truncating='post',
    )

In [33]:
# Receives text as input, cleans it, separates punctuation marks and returns
# the result as a list of tokens.
def splitter(Text):
    """Strip diacritics from `Text` and split it into word/punctuation tokens.

    Args:
        Text: the text to tokenize. A string is the expected input; any other
            iterable of strings is cleaned piece by piece and concatenated,
            as the previous implementation did.

    Returns:
        A list of strings: each run of word characters is one token and each
        character that is neither a word character nor whitespace is its own
        token. Whitespace is dropped.
    """
    if isinstance(Text, str):
        # remove_diac only deletes or replaces single characters, so one call
        # on the whole string gives the same result as cleaning it character
        # by character, without running a regex substitution per character.
        cleaned = remove_diac(Text)
    else:
        cleaned = ''.join(remove_diac(part) for part in Text)

    # Diacritics must be gone before this step: combining marks are not
    # matched by \w, so they would otherwise split a word into fragments.
    return re.findall(r"\w+|[^\w\s]", cleaned, re.UNICODE)

- Main prediction function, used by the decoder.

In [34]:
def predicter(Text):
    """Run Lemma_model on every whitespace-separated word of `Text`.

    Returns the argmax over the last axis of the model output, i.e. one row
    of predicted indices per word.
    """
    # "word" -> "w o r d": the letters of each word are separated by spaces
    # before being handed to Letter_tokenizer.
    spaced_words = [' '.join(word) for word in Text.split()]
    encoded = encode_sequences(Letter_tokenizer, seq_length, spaced_words)

    # The batch is reshaped to its own (rows, columns) before prediction.
    n_rows, n_cols = encoded.shape[0], encoded.shape[1]
    scores = Lemma_model.predict(encoded.reshape((n_rows, n_cols)))
    return np.argmax(scores, axis=-1)

# The Decoder

The cell below is the decoder: it takes the predicted sequences and decodes them back into text.

In [35]:
# Decode one integer code back into the token it stands for.
def get_Letter(n, tokenizer):
    """Return the key of `tokenizer.word_index` whose value equals `n`.

    Returns None when no entry has that index.
    """
    matches = (token for token, code in tokenizer.word_index.items() if code == n)
    return next(matches, None)

# Predict the lemma of a word and decode the prediction back into text.
def PredResult(Text):
    """Return the decoded prediction for the first word of `Text`.

    Every row returned by predicter() is decoded letter by letter with
    get_Letter(); codes that have no entry in Letter_tokenizer are skipped.
    Only the first decoded word is returned.
    """
    decoded_words = []
    for sequence in predicter(Text):
        letters = []
        for code in sequence:
            letter = get_Letter(code, Letter_tokenizer)
            if letter is None:
                continue            # no token for this code, nothing to add
            letters.append(letter)
        decoded_words.append(''.join(letters))
    return decoded_words[0]

# Main function to extract lemmas

In [184]:
def GetLemma(Text):
    """Return the lemmas of `Text` as one string.

    The text is tokenized with splitter(). Stop words, numeric tokens and
    punctuation tokens are copied unchanged; every other token is replaced
    by the model prediction from PredResult().

    Args:
        Text: the sentence to lemmatize.

    Returns:
        A string in which every output token is preceded by a single space
        (so a non-empty result starts with a space); '' for empty input.
    """
    pieces = []
    for Word in splitter(Text):         # handle each token separately
        # splitter() emits every character that is neither a word character
        # nor whitespace as its own token. All of them are passed through,
        # not only ".!?:;,-", so that other punctuation (Arabic comma and
        # question mark, brackets, quotes, ...) is not fed to the lemma model.
        keep_as_is = (
            Word in StopWords
            or Word.isnumeric()
            or re.match(r"[^\w\s]", Word) is not None
        )
        pieces.append(Word if keep_as_is else str(PredResult(Word)))
    return ''.join(" " + piece for piece in pieces)

# Main function to extract POS

In [199]:
def GetPOS(Text):
    """Tag every token of `Text` with a part-of-speech label.

    Lemmas are computed first with GetLemma(). A lemma found in Prepositions
    (after diacritic removal) is emitted as "<lemma>(حرف)"; every other token
    is emitted as "<word> (<label>)", where the label is the POS_model
    prediction for the lemma.

    Args:
        Text: the sentence to tag.

    Returns:
        The tagged tokens joined by single spaces; '' when there are no tokens.
    """
    Lemmas = GetLemma(Text).split()        # First get lemmas for better result
    Words = Text.split()
    if len(Words) != len(Lemmas):
        # GetLemma() tokenizes with splitter(), which separates punctuation
        # from words; a plain split() does not, so the two lists can differ
        # in length. Fall back to the same tokenization to keep the indices
        # aligned (note that splitter() also strips diacritics).
        Words = splitter(Text)

    labels = ['اسم - صفة', 'اسم - حال', 'اسم', 'حرف', 'اسم - ضمير', 'فعل']
    result = []
    for i, lemma in enumerate(Lemmas):
        if remove_diac(lemma) in Prepositions:
            result.append(lemma+"(حرف)")
        else:                                        # Preprocess before prediction
            padded = encode_sequences(Letter_tokenizer, seq_length, [' '.join(list(lemma))])
            pred = POS_model.predict(padded)
            result.append(Words[i]+" "+"("+labels[np.argmax(pred)]+")")

    # Joined once, after the loop, so that trailing prepositions are included
    # and the result is defined even when no token went through the model.
    return ' '.join(result)

In [200]:
# Helper for GetAssociation: reduces the POS prediction of one word to a short code.
# Smaller version of GetPOS.

def GetSmallPOS(word):
    """Return the association code for `word`.

    The word's letters are space-separated, encoded and passed to POS_model;
    the index of the highest score selects one of '1', '2', '3' or 'n'.
    """
    codes = ['3', 'n', '2', 'n', 'n', '1']
    spaced = ' '.join(list(word))
    encoded = encode_sequences(Letter_tokenizer, seq_length, [spaced])
    scores = POS_model.predict(encoded)
    return codes[np.argmax(scores)]

# Main function to extract associations

In [201]:
def GetAssociation(Text):
    """Return the adjacent word pairs of `Text` that form an association.

    Each token is paired with the one that follows it. A pair is reported
    when GetSmallPOS() gives codes "1" then "2", or "2" then "3" (with the
    label order used by GetPOS these appear to be verb + noun and
    noun + adjective - confirm against the model's classes). Pairs whose
    first lemma is a preposition are skipped.

    Args:
        Text: the sentence to analyse.

    Returns:
        A list of "<word>-<next word>" strings; empty when nothing matches.
    """
    Lemma = GetLemma(Text).split()        # First get lemmas for better result
    Words = Text.split()
    if len(Words) != len(Lemma):
        # GetLemma() tokenizes with splitter(), which separates punctuation
        # from words; use the same tokens so Words[i] matches Lemma[i]
        # (note that splitter() also strips diacritics).
        Words = splitter(Text)

    pos_codes = {}

    def _pos_code(index):
        # Each lemma takes part in two pairs; predict its code only once.
        if index not in pos_codes:
            pos_codes[index] = GetSmallPOS(Lemma[index])
        return pos_codes[index]

    result = []
    for i in range(len(Lemma) - 1):       # the last token has no successor
        if remove_diac(Lemma[i]) in Prepositions:
            continue
        CurrentWord = _pos_code(i)
        NextWord = _pos_code(i + 1)
        if (CurrentWord == "1" and NextWord == "2") or (CurrentWord == "2" and NextWord == "3"):
            result.append(Words[i]+"-"+Words[i+1])
    return result

In [210]:
# Sample Arabic sentence used as input by the demo cells that follow.
Text = "يذهب الرجل في مساء كل يوم لشراء العرض المجاني"

In [211]:
# Lemmatize the sample sentence; the returned string is the cell output.
GetLemma(Text)

' ذَهَبَ رَجُل في مساء كُلّ يَوْم شِراء عَرْض مَجّانِيّ'

In [212]:
# Tag each word of the sample sentence with its part of speech.
GetPOS(Text)

'يذهب (فعل) الرجل (اسم) في(حرف) مساء (اسم) كل (اسم) يوم (اسم) لشراء (اسم) العرض (اسم) المجاني (اسم - صفة)'

In [213]:
# List the adjacent word pairs of the sample sentence reported as associations.
GetAssociation(Text)

['يذهب-الرجل', 'العرض-المجاني']

In [22]:
# Scratch check: confirms that the ASCII punctuation pattern matches ":".
# NOTE(review): `re` is already imported in the first cell, so this import is
# redundant; this cell looks like leftover debugging.
import re

bool(re.match("[.!?:;,\\-]", ":"))

True