In [18]:
import json
import numpy as np
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.layers import Input
from tensorflow.keras.preprocessing.text import tokenizer_from_json

# Path of the saved Keras model (HDF5) that every later cell takes its layers from.
MODEL_PATH = 'phoneme_model.h5'
model = load_model(MODEL_PATH)

In [19]:
# Fetch the layers of the trained model by their position in `model.layers`.
# NOTE(review): the indices are tied to the saved architecture — confirm against
# model.summary() if the model file is ever regenerated.

# Input layers: characters for the encoder, phonemes for the decoder.
char_input, ph_input = model.get_layer(index=0), model.get_layer(index=1)

# Encoder layer; its output is later unpacked into (sequence output, state_h, state_c).
encoder = model.get_layer(index=4)

# Decoder side: phoneme embedding -> LSTM -> softmax projection.
embedding_layer, decoder_lstm, softmax_dense = (
    model.get_layer(index=layer_idx) for layer_idx in (3, 5, 6)
)

In [20]:
# Symbolic tensors needed to assemble the stand-alone inference models.
# Every tensor is looked up from `model` itself instead of through the `encoder`,
# `char_input` and `ph_input` variables: those names get rebound (here and when the
# inference encoder is built), so going through them made this cell fail on a re-run.
# Indices 4, 0 and 1 are the encoder layer and the two input layers respectively.
output_y, state_h, state_c = model.get_layer(index=4).output
char_input = model.get_layer(index=0).output
ph_input = model.get_layer(index=1).output

In [21]:
# Restore the two tokenizers saved next to the model.
# Each file is decoded with json.load and the decoded value is handed to
# tokenizer_from_json; the file is only kept open for the read itself.

# Encodes the input text before it is fed to the encoder.
with open('char_tokenizer.json') as f:
    data = json.load(f)
char_tokenizer = tokenizer_from_json(data)

# Maps decoder output indices back to phoneme tokens (and provides 'startseq').
with open('phone_tokenizer.json') as f:
    data = json.load(f)
phone_tokenizer = tokenizer_from_json(data)

In [22]:
# Inference encoder: maps the character input tensor to the encoder's two state
# tensors, which seed the decoder.
encoder = Model(char_input, [state_h, state_c])

#Decoder
# The decoder is rebuilt so it can be stepped one token at a time: its LSTM state is
# exposed as explicit inputs and outputs instead of being wired to the encoder.
# The state width is read from the trained layer rather than hard-coded (was 256),
# so the cell keeps working if the model is retrained with a different size.
latent_dim = decoder_lstm.units
decoder_input_h = Input(shape=(latent_dim,))
decoder_input_c = Input(shape=(latent_dim,))

# Reuse the trained layers: embedding -> LSTM (started from the supplied state) -> softmax.
x = embedding_layer(ph_input)
x, decoder_output_h, decoder_output_c = decoder_lstm(
    x, initial_state=[decoder_input_h, decoder_input_c]
)
x = softmax_dense(x)

# Inputs:  [phoneme token, state_h, state_c]
# Outputs: [token probabilities, updated state_h, updated state_c]
decoder = Model([ph_input] + [decoder_input_h, decoder_input_c],
                [x] + [decoder_output_h, decoder_output_c])

In [16]:
def predict_pronunciation(ch_input, max_len=21):
    """Greedily decode the phoneme sequence for a piece of text.

    The text is encoded with ``char_tokenizer`` and passed through ``encoder`` to
    obtain the initial decoder state. ``decoder`` is then stepped one token at a
    time, starting from ``'startseq'``, always taking the most probable token.

    Args:
        ch_input: Text (a single string) to pronounce.
        max_len: Maximum number of decoding steps. Defaults to 21, the limit that
            was previously hard-coded.

    Returns:
        The predicted phoneme tokens as one string in which every token is
        preceded by a single space (so a non-empty result starts with a space).
        Returns ``''`` when no character of ``ch_input`` is known to the tokenizer.
    """
    # predict() is given an ndarray of shape (1, n_chars); a nested Python list is
    # not interpreted the same way by every Keras version.
    input_seq = np.array(char_tokenizer.texts_to_sequences([ch_input]))
    if input_seq.size == 0:
        # Nothing to encode (empty text or only unknown characters).
        return ''

    next_h, next_c = encoder.predict(input_seq, verbose=0)

    # Single-token decoder input, overwritten in place after every step.
    curr_token = np.zeros((1, 1))
    curr_token[0] = phone_tokenizer.word_index['startseq']

    pred_sentence = ''

    for _ in range(max_len):
        output, next_h, next_c = decoder.predict([curr_token, next_h, next_c], verbose=0)
        next_token = int(np.argmax(output[0, 0, :]))
        # index_word has no entry for the padding index 0; treat such a
        # prediction like 'endseq' instead of raising KeyError.
        next_word = phone_tokenizer.index_word.get(next_token)
        if next_word is None or next_word == 'endseq':
            break
        pred_sentence += ' ' + next_word
        curr_token[0] = next_token

    # Strip marker tokens in case the model emitted one mid-sequence.
    return pred_sentence.replace("startseq", "").replace("endseq", "")

In [52]:
# Housekeeping: clear the kernel's user namespace.
# NOTE(review): without -f this magic waits for a y/n confirmation, and once it has
# run, `model`, the tokenizers and predict_pronunciation are gone until the cells
# above are executed again.
%reset