In [1]:
#Importations

import json
import re
import io
import os

import tensorflow as tf
import pandas as pd

from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import optimizers
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Embedding, RepeatVector, SpatialDropout1D

## Load Files

Load the dataset into one list per field (Words, Lemmas, POS) for use in model training.

In [None]:
# JSON dataset; every record is read through its 'Word', 'Lemma' and 'POS' keys.
Data_file = "Madmira_data.json"

with open(Data_file, 'r', encoding='utf-8') as f:
    datastore = json.load(f)

# Words and lemmas are rewritten as space-separated characters
# (e.g. "abc" -> "a b c"); POS tags are kept as they are.
Words = [' '.join(record['Word']) for record in datastore]
Lemmas = [' '.join(record['Lemma']) for record in datastore]
POS = [record['POS'] for record in datastore]

## Preprocessing

- Preprocessing functions that prepare the text (tokenize, convert to integer sequences, pad) before training.

In [4]:
def tokenization(Data):
    """Create a Keras ``Tokenizer`` and fit it on ``Data``.

    Args:
        Data: texts handed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(Data)
    return fitted_tokenizer

In [5]:
def Tokenizer_saver(Name, tokenizer):
    """Write the tokenizer configuration to ``<Name>.json`` (UTF-8).

    Args:
        Name: output path without the ``.json`` extension.
        tokenizer: object exposing ``to_json()``.

    Note:
        The value returned by ``to_json()`` is passed through ``json.dumps``
        once more before being written. Assuming ``to_json()`` returns a JSON
        string (as the Keras ``Tokenizer`` does), the file therefore holds a
        JSON-encoded string, and ``json.load`` on it yields that string back.
    """
    config_str = tokenizer.to_json()
    target_path = Name + ".json"
    with io.open(target_path, mode='w', encoding='utf-8') as out_file:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        encoded = json.dumps(config_str, ensure_ascii=False)
        out_file.write(encoded)

In [8]:
def encode_sequences(tokenizer, length, lines):
    """Encode texts as integer sequences of a fixed length.

    Args:
        tokenizer: fitted tokenizer exposing ``texts_to_sequences``.
        length: length every returned sequence is padded or truncated to.
        lines: texts to encode.

    Returns:
        The array returned by ``pad_sequences``; sequences shorter than
        ``length`` are padded at the end and longer ones are cut at the end.
    """
    seq = tokenizer.texts_to_sequences(lines)
    # The former debug ``print(seq)`` was removed: it dumped the encoded array
    # on every call without affecting the returned value.
    return pad_sequences(seq, maxlen=length, padding='post', truncating='post')

In [None]:
# Fit a single character tokenizer on words and lemmas together, then save it
# to "Letter_tokenizer.json" so the same mapping can be reused for prediction.
Letter_tokenizer = tokenization(Words + Lemmas)
Tokenizer_saver("Letter_tokenizer", Letter_tokenizer)

# Vocabulary size: number of known tokens plus one
# (presumably to leave room for index 0, the padding value - confirm).
Word_letter_size = 1 + len(Letter_tokenizer.word_index)

# Max sequence length used when padding/truncating the encoded sequences.
Word_length = 25

In [None]:
# Lemma data preprocessing: encoded words and encoded lemmas, both padded
# or truncated to Word_length.
Word_Train = encode_sequences(tokenizer=Letter_tokenizer, length=Word_length, lines=Words)
Lemma_Train = encode_sequences(tokenizer=Letter_tokenizer, length=Word_length, lines=Lemmas)

# POS data preprocessing.
# NOTE(review): the POS input is encoded from Lemmas, not Words - confirm this
# is intended and matches what is fed to the POS model at prediction time.
POS_Train = encode_sequences(tokenizer=Letter_tokenizer, length=Word_length, lines=Lemmas)

# One-hot encode the POS tags; the columns of the dummy frame give the tag
# that corresponds to each output position.
pos_one_hot = pd.get_dummies(POS)
POS_Lable = pos_one_hot.values

## Lemma Model

In [11]:
# Build the encoder-decoder (NMT-style) lemma model:
# embedding -> LSTM encoder -> repeated context vector -> LSTM decoder
# -> per-position softmax over the character vocabulary.
units = 512
LemmaModel = Sequential([
    Embedding(Word_letter_size, units, input_length=Word_length, mask_zero=True),
    LSTM(units),                          # encoder: returns only its final output
    RepeatVector(Word_length),            # repeat that output for every output position
    LSTM(units, return_sequences=True),   # decoder: one output per position
    Dense(Word_letter_size, activation='softmax'),
])

In [12]:
# Loss and optimizer for the lemma model. The sparse loss takes integer class
# ids as targets; from_logits=False because the model ends in a softmax layer.
lss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=False,
    name='CategoricalCrossentropy',
)
rms = optimizers.RMSprop(learning_rate=0.001)
LemmaModel.compile(loss=lss, optimizer=rms)

In [None]:
# Keep only the best model (lowest validation loss) on disk.
# The HDF5 format is selected by the '.h5' extension of the file name;
# ModelCheckpoint has no documented `save_format` argument, so it is not passed.
filename = 'Lemma_model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1,
                             save_best_only=True, mode='min')

# Targets get a trailing axis of size 1: (samples, Word_length, 1).
lemma_targets = Lemma_Train.reshape(Lemma_Train.shape[0], Lemma_Train.shape[1], 1)

history = LemmaModel.fit(Word_Train, lemma_targets,
          epochs=100, batch_size=512,
          validation_split = 0.05,
          callbacks=[checkpoint], verbose=1)

## POS Model

In [None]:
# POS classifier: character embedding -> dropout -> LSTM -> softmax over tags.
POSmodel = Sequential()
# The embedding vocabulary must cover every index produced by Letter_tokenizer,
# so use Word_letter_size instead of a hard-coded 50 (an index >= 50 would be
# out of range for the embedding table).
POSmodel.add(Embedding(Word_letter_size, 16, input_length=POS_Train.shape[1]))
POSmodel.add(SpatialDropout1D(0.2))
POSmodel.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# One output unit per one-hot column of POS_Lable.
POSmodel.add(Dense(POS_Lable.shape[1], activation='softmax'))
POSmodel.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


# Stop once val_loss has not improved by at least 0.0001 for 3 consecutive epochs.
history = POSmodel.fit(POS_Train, POS_Lable, epochs=50, batch_size=128,validation_split=0.1,
                    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

In [None]:
# Save the trained POS model to 'POS_model.h5'.
POSmodel.save('POS_model.h5')