In [2]:
import re
import io
import json
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.utils import Sequence
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Input, LSTM, Dense, Bidirectional, Concatenate, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping




In [3]:
# Parse the pronouncing dictionary.  On every line the text before the first
# space is the headword; whatever follows that space is kept verbatim as the
# phoneme string.
words = []
phonics_list = []
with open('./cmudict-01.7b', 'r') as f:
    phonics = [line.rstrip('\n') for line in f]
for entry in phonics:
    headword, _sep, pronunciation = entry.partition(' ')
    words.append(headword)
    phonics_list.append(pronunciation)
data = pd.DataFrame({'Word': words, 'Phonics': phonics_list})

# Discard the entries whose headword begins with an apostrophe.
leading_apostrophe = data["Word"].str.find("'") == 0
data = data.drop(data.index[leading_apostrophe])
data

Unnamed: 0,Word,Phonics
0,!EXCLAMATION-POINT,EH2 K S K L AH0 M EY1 SH AH0 N P OY2 N T
1,"""CLOSE-QUOTE",K L OW1 Z K W OW1 T
2,"""DOUBLE-QUOTE",D AH1 B AH0 L K W OW1 T
3,"""END-OF-QUOTE",EH1 N D AH0 V K W OW1 T
4,"""END-QUOTE",EH1 N D K W OW1 T
...,...,...
133849,{BRACE,B R EY1 S
133850,{LEFT-BRACE,L EH1 F T B R EY1 S
133851,{OPEN-BRACE,OW1 P EH0 N B R EY1 S
133852,}CLOSE-BRACE,K L OW1 Z B R EY1 S


In [4]:
# A word containing any character other than A-Z or an apostrophe is rejected.
ILLEGAL_CHAR_REGEX = "[^A-Z']"

# Inclusive bounds on the accepted word length, in characters.
MAX_DICT_WORD_LEN = 20
MIN_DICT_WORD_LEN = 2

# Work on a copy: the column assignments further down this cell would
# otherwise also rewrite the raw `data` frame displayed by the previous cell.
df = data.copy()

def is_alternate_pho_spelling(word):
    """Return ``word`` without a trailing alternate-pronunciation marker.

    A marker is exactly three characters at the end of the word: an opening
    parenthesis, one digit, and a closing parenthesis (for example ``"(1)"``).

    Args:
        word: Dictionary headword, possibly ending in such a marker.

    Returns:
        The headword with the three marker characters removed, or ``word``
        unchanged when it does not end in a marker (including when it is
        shorter than three characters or empty).
    """
    # The length guard keeps word[-3] from raising on very short strings.
    has_marker = (
        len(word) >= 3
        and word[-1] == ')'
        and word[-3] == '('
        and word[-2].isdigit()
    )
    if has_marker:
        # Slice off the marker itself; searching for the first '(' would cut
        # too much from a headword that contains an earlier parenthesis.
        return word[:-3]
    return word

def should_skip(word):
    """Decide whether a dictionary headword should be left out of the data set.

    Args:
        word: Headword to check.

    Returns:
        ``True`` when the word is empty, does not start with a letter, ends
        with a period, contains a character matched by ``ILLEGAL_CHAR_REGEX``,
        or has a length outside ``MIN_DICT_WORD_LEN``..``MAX_DICT_WORD_LEN``;
        ``False`` otherwise.
    """
    if not word:  # nothing to index into; an empty headword is never usable
        return True
    if not word[0].isalpha():  # skip symbols
        return True
    if word[-1] == '.':  # skip abbreviations
        return True
    if re.search(ILLEGAL_CHAR_REGEX, word):
        return True
    if len(word) > MAX_DICT_WORD_LEN:
        return True
    if len(word) < MIN_DICT_WORD_LEN:
        return True
    return False
# Seed for the row shuffle below.  The later train/test split slices the rows
# by position, so an unseeded shuffle would change the held-out set on every run.
SHUFFLE_SEED = 42

# Fold alternate-pronunciation headwords such as "WORD(1)" onto "WORD".
df["Word"] = df["Word"].map(is_alternate_pho_spelling)
# Keep only the rows whose word passes every should_skip() rule.
df = df[~df["Word"].apply(should_skip)]
# Shuffle the rows reproducibly and renumber them from zero.
shuffle_rng = np.random.default_rng(SHUFFLE_SEED)
df = df.iloc[shuffle_rng.permutation(len(df))].reset_index(drop=True)
# Strip the digits attached to phoneme symbols (e.g. "EH2" -> "EH").
df["Phonics"] = df["Phonics"].str.replace(r'[0-9]', '', regex=True)
df

Unnamed: 0,Word,Phonics
0,SEQUENCING,S IY K W AH N S IH NG
1,REITERA,R IY IH T ER AH
2,DIETERICH,D IY T ER IH K
3,NOVACEK,N AA V AH CH EH K
4,MEDICINAL,M AH D IH S AH N AH L
...,...,...
132550,KRAVCHUK,K R AA V CH UH K
132551,DORRELL,D AO R EY L
132552,MOUNTAINS,M AW N T AH N Z
132553,TELMEX,T EH L M EH K S


In [5]:
# Wrap every phoneme string in the decoder's start/end marker tokens.  Rows
# that already begin with the start marker are left as they are, so running
# this cell a second time cannot stack another pair of markers onto them.
already_wrapped = df['Phonics'].str.startswith('startseq ')
df['Phonics'] = df['Phonics'].where(already_wrapped, 'startseq ' + df['Phonics'] + ' endseq')
df

Unnamed: 0,Word,Phonics
0,SEQUENCING,startseq S IY K W AH N S IH NG endseq
1,REITERA,startseq R IY IH T ER AH endseq
2,DIETERICH,startseq D IY T ER IH K endseq
3,NOVACEK,startseq N AA V AH CH EH K endseq
4,MEDICINAL,startseq M AH D IH S AH N AH L endseq
...,...,...
132550,KRAVCHUK,startseq K R AA V CH UH K endseq
132551,DORRELL,startseq D AO R EY L endseq
132552,MOUNTAINS,startseq M AW N T AH N Z endseq
132553,TELMEX,startseq T EH L M EH K S endseq


In [6]:
# Longest word, in characters, and longest phoneme string, in
# whitespace-separated tokens.
max_len_ip = df['Word'].str.len().max()
max_len_op = df['Phonics'].str.split().str.len().max()

# Plain Python lists of the two columns, in row order.
words = df['Word'].tolist()
phonemes = df['Phonics'].tolist()

In [7]:
# Character-level tokenizer fitted on the spellings.
char_tokenizer = Tokenizer(char_level=True, oov_token='OOV')
char_tokenizer.fit_on_texts(words)
# One more than the number of indexed tokens, leaving room for index 0.
ch_vocab = char_vocab_size = len(char_tokenizer.word_index) + 1
print("Character Vocab Size: ", char_vocab_size)

# Token-level tokenizer fitted on the phoneme strings.
phone_tokenizer = Tokenizer(oov_token='OOV')
phone_tokenizer.fit_on_texts(phonemes)
ph_vocab = phone_vocab_size = len(phone_tokenizer.word_index) + 1
print("Phoneme Vocab Size: ", phone_vocab_size)

Character Vocab Size:  29
Phoneme Vocab Size:  43


In [8]:
# Persist both fitted tokenizers next to the notebook.
# NOTE(review): Tokenizer.to_json() is documented as returning a JSON string,
# so json.dumps() here encodes that string a second time; a reader presumably
# has to json.load() the file before passing the text to a tokenizer loader.
# The on-disk format is kept as-is - confirm against whatever loads these files.
char_tokenizer_json = char_tokenizer.to_json()
phone_tokenizer_json = phone_tokenizer.to_json()

for json_path, tokenizer_json in (
    ('char_tokenizer.json', char_tokenizer_json),
    ('phone_tokenizer.json', phone_tokenizer_json),
):
    with io.open(json_path, 'w', encoding='utf-8') as f:
        f.write(json.dumps(tokenizer_json, ensure_ascii=False))

In [9]:
# Turn each spelling / phoneme string into its list of tokenizer indices.
ch_sequences, ph_sequences = (
    char_tokenizer.texts_to_sequences(words),
    phone_tokenizer.texts_to_sequences(phonemes),
)

In [10]:
# Decoder input is each phoneme sequence without its last token; the target is
# the same sequence without its first token, i.e. shifted one step ahead.
decoder_inputs = [sequence[:-1] for sequence in ph_sequences]
decoder_outputs = [sequence[1:] for sequence in ph_sequences]

# Right-pad everything to a fixed width so it stacks into rectangular arrays.
encoder_inputs = pad_sequences(ch_sequences, maxlen=max_len_ip, padding='post')
decoder_inputs = pad_sequences(decoder_inputs, maxlen=max_len_op, padding='post')
decoder_outputs = pad_sequences(decoder_outputs, maxlen=max_len_op, padding='post')

In [11]:
# The first 99% of the rows train the model; the rest are held out.
n = len(words)
split_index = int(0.99 * n)
train_rows = slice(None, split_index)
held_out_rows = slice(split_index, None)

# Training uses the padded index arrays.
X_train = [encoder_inputs[train_rows], decoder_inputs[train_rows]]
y_train = decoder_outputs[train_rows]

# The held-out set keeps the raw strings (spellings and phoneme strings).
X_test = words[held_out_rows]
y_test = phonemes[held_out_rows]

In [12]:
# Encoder: embed the characters and keep only the LSTM's final (h, c) state.
char_input = Input(shape=(None,))
char_embedding = Embedding(ch_vocab, 256, mask_zero=True)
encoder_lstm = LSTM(256, return_state=True)
output_y, state_h, state_c = encoder_lstm(char_embedding(char_input))

# Decoder: the layers are kept in named variables because the inference
# models built later in the notebook call the same layer objects again.
ph_input = Input(shape=(None,))
embedding_layer = Embedding(ph_vocab, 256, mask_zero=True)
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
softmax_dense = Dense(ph_vocab, activation='softmax')

# The decoder LSTM starts from the encoder's final state.
x = embedding_layer(ph_input)
output_y, _, _ = decoder_lstm(x, initial_state=[state_h, state_c])
output = softmax_dense(output_y)

model = Model(inputs=[char_input, ph_input], outputs=output)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
)
model.summary()



Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, None, 256)            7424      ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, None, 256)            11008     ['input_2[0][0]']             
                                                                                            

In [13]:
# Write the model to disk every time val_loss reaches a new minimum.
model_name = "model.h5"
checkpoint = ModelCheckpoint(
    filepath=model_name,
    monitor="val_loss",
    mode="min",
    save_best_only=True,
    verbose=1,
)

# Stop after 3 epochs without a val_loss improvement and roll the weights
# back to the best epoch seen.
earlystopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=3,
    restore_best_weights=True,
    verbose=1,
)

In [34]:
# Training settings gathered in one place; 1% of the training rows are set
# aside by Keras for validation.
fit_options = dict(
    batch_size=32,
    epochs=20,
    validation_split=0.01,
    validation_batch_size=32,
    callbacks=[checkpoint, earlystopping],
)
history = model.fit(X_train, y_train, **fit_options)

Epoch 1/20

Epoch 1: val_loss improved from inf to 0.36097, saving model to model.h5
Epoch 2/20


  saving_api.save_model(


Epoch 2: val_loss improved from 0.36097 to 0.28175, saving model to model.h5
Epoch 3/20
Epoch 3: val_loss improved from 0.28175 to 0.24947, saving model to model.h5
Epoch 4/20
Epoch 4: val_loss improved from 0.24947 to 0.23816, saving model to model.h5
Epoch 5/20
Epoch 5: val_loss improved from 0.23816 to 0.22695, saving model to model.h5
Epoch 6/20
Epoch 6: val_loss improved from 0.22695 to 0.22109, saving model to model.h5
Epoch 7/20
Epoch 7: val_loss did not improve from 0.22109
Epoch 8/20
Epoch 8: val_loss did not improve from 0.22109
Epoch 9/20
Epoch 9: val_loss did not improve from 0.22109
Restoring model weights from the end of the best epoch: 6.
Epoch 9: early stopping


In [19]:
# Export the in-memory model under its final file name.  The checkpoint that
# ModelCheckpoint wrote at the lowest val_loss could be reloaded instead with:
# tf.keras.models.load_model(model_name)

model.save("phoneme_model.h5")


  saving_api.save_model(


In [16]:
# Inference encoder: character indices in, final LSTM (h, c) state out.
encoder = Model(inputs=char_input, outputs=[state_h, state_c])

# Inference decoder: reuses the trained embedding, LSTM and softmax layers,
# but takes the recurrent state as explicit inputs and hands the updated
# state back so that it can be driven one step at a time.
decoder_input_h = Input(shape=(256,))
decoder_input_c = Input(shape=(256,))
decoder_state_inputs = [decoder_input_h, decoder_input_c]

x = embedding_layer(ph_input)
x, decoder_output_h, decoder_output_c = decoder_lstm(x, initial_state=decoder_state_inputs)
x = softmax_dense(x)

decoder = Model(
    inputs=[ph_input, decoder_input_h, decoder_input_c],
    outputs=[x, decoder_output_h, decoder_output_c],
)

In [17]:
def predict_pronunciation(ch_input):
    """Greedily decode a phoneme sequence for one spelling.

    The spelling is encoded once; the decoder is then run one token at a time,
    always feeding back the highest-scoring token, until it emits ``endseq``,
    emits an index the phoneme tokenizer has no entry for, or ``max_len_op``
    steps have been taken.

    Args:
        ch_input: The word to pronounce, as a string.

    Returns:
        The predicted tokens as the tokenizer spells them, each preceded by a
        single space (so a non-empty result starts with a space).  Returns an
        empty string when ``ch_input`` produces no character tokens.
    """
    input_seq = char_tokenizer.texts_to_sequences([ch_input])
    if not input_seq[0]:
        # No characters to encode, hence nothing to pronounce.
        return ''

    # verbose=0 matches the decoder call below and keeps Keras from printing a
    # progress bar for every word that gets scored.
    next_h, next_c = encoder.predict(input_seq, verbose=0)

    curr_token = np.zeros((1, 1))
    curr_token[0] = phone_tokenizer.word_index['startseq']

    predicted = []
    for _step in range(max_len_op):
        output, next_h, next_c = decoder.predict([curr_token] + [next_h, next_c], verbose=0)
        next_token = int(np.argmax(output[0, 0, :]))
        # The softmax also has a slot for index 0, which the tokenizer never
        # assigns; .get() turns that case into a clean stop, not a KeyError.
        next_word = phone_tokenizer.index_word.get(next_token)
        if next_word is None or next_word == 'endseq':
            break
        predicted.append(next_word)
        curr_token[0] = next_token

    pred_sentence = ''.join(' ' + word for word in predicted)
    return pred_sentence.replace("startseq", "").replace("endseq", "")

In [38]:

def _drop_digits(text):
    """Return ``text`` with every digit character removed."""
    return ''.join(ch for ch in text if not ch.isdigit())


def predict_acc(x,y):
    """Print each reference/predicted pronunciation pair and the exact-match rate.

    Args:
        x: Sequence of words to pronounce.
        y: Sequence of reference phoneme strings, aligned with ``x``; any
            ``startseq`` / ``endseq`` markers in them are removed before
            comparison.

    Prints one line per word (reference, then upper-cased prediction) and
    finally the fraction of words whose prediction matches the reference once
    surrounding whitespace and digits are ignored.  Prints ``0.0`` for an
    empty ``x`` and does not divide by zero.
    """
    x_len = len(x)
    if x_len == 0:
        # Nothing to score; report zero rather than raising ZeroDivisionError.
        print(0.0)
        return

    right = 0
    for i in range(x_len):
        phoneme = predict_pronunciation(x[i])
        expected = y[i].replace("startseq","").replace("endseq","")
        predicted = phoneme.upper()
        print(expected, predicted)
        if _drop_digits(expected.strip()) == _drop_digits(predicted.strip()):
            right = right + 1

    print(right/x_len)

# Report the size of the held-out set, then score the model on it.
held_out_count = len(X_test)
print(held_out_count)
predict_acc(X_test, y_test)


1326
  K AE L IH F AO R N Y AH N Z   K AE L IH F AO R N IY AH N Z
  IH G Z AE M   IH G Z AA M
  N AE SH AH N AE L IH T IY   N AE SH AH N AE L AH T IY
  D EH L AH HH AH N T IY   D EH L AH HH AE N T IY
  G R AE N D AO T ER   G R AE D N AO R T ER
  OW V ER Z EH L AH S   OW V ER Z IY L AH S
  G R EY N JH ER   G R EY N JH ER
  R IH CH K R IY K   R IH CH R IY K
  R AH S K IH N   R AH S K IH N
  JH AH S K OW   JH UW S K OW
  M AE S T ER P IY S IH Z   M AE S T ER P IY AH S IH Z
  M AA R Z   M AA R Z
  S IH G N IH T   S IH N IH T
  IH M IH T S   EH M IH T S
  EH N T ER T EY N IH NG   EH N ER T EY N IH NG
  M AH M B AH L   M AH M B AH L
  K IH L ER Z   K IH L ER Z
  B AO L S AH M   B AE L S AH M
  T W AA NG   T W AE NG
  M AA R JH R IY   M AA R JH ER IY
  B EH L Z B ER G Z   B EH L T S B ER G Z
  M OW P T   M OW P T
  AW T G R OW Z   AW T G R OW Z
  N UW T ER   N UW T ER
  S EY F K AA R D Z   S AE F IY K AA D Z
  AA N T OW N IY N IY   AA N T OW N IY N IY
  SH AH L T AH N   SH AH L T AH N
  B IH 