<a href="https://colab.research.google.com/github/BucketofJava/EnglishToPhonetics/blob/main/EnglishToPhonetics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#English to Phonetics

####Imports

In [None]:
!pip install torchinfo

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchinfo
  Downloading torchinfo-1.7.1-py3-none-any.whl (22 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.7.1


In [None]:
import urllib.request as URL
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import Sequential
from keras.preprocessing.text import Tokenizer
import keras.preprocessing.sequence as kps
from torch import nn;
from torch import Tensor;
from torch import optim;
from torch import reshape, argmax;
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader;
from torchinfo import summary;
import torch.cuda


In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

##Data Processing

####Fetching Data

In [None]:
dictionary_url="https://github.com/cmusphinx/cmudict/raw/master/"
dictionary_fileName="cmudict.dict"
URL.urlretrieve(dictionary_url+dictionary_fileName, dictionary_fileName)
dict_file="/content/"+dictionary_fileName

In [None]:
f=open(dict_file, "r")
full_text=f.read()
f.close()
full_text_split=full_text.split("\n")
word_list=[]
phonetics_list=[]
for i in range(len(full_text_split)):
  full_text_split[i]=full_text_split[i].split(" ")
  full_text_split[i].insert(1, "SOS")
  full_text_split[i].append("EOS")
  word_list.append(list(full_text_split[i][0]))
  phonetics_list.append(full_text_split[i][1:])
print(full_text_split[:1000])
print(phonetics_list[:1000])

[["'bout", 'SOS', 'B', 'AW1', 'T', 'EOS'], ["'cause", 'SOS', 'K', 'AH0', 'Z', 'EOS'], ["'course", 'SOS', 'K', 'AO1', 'R', 'S', 'EOS'], ["'cuse", 'SOS', 'K', 'Y', 'UW1', 'Z', 'EOS'], ["'em", 'SOS', 'AH0', 'M', 'EOS'], ["'frisco", 'SOS', 'F', 'R', 'IH1', 'S', 'K', 'OW0', 'EOS'], ["'gain", 'SOS', 'G', 'EH1', 'N', 'EOS'], ["'kay", 'SOS', 'K', 'EY1', 'EOS'], ["'m", 'SOS', 'AH0', 'M', 'EOS'], ["'n", 'SOS', 'AH0', 'N', 'EOS'], ["'round", 'SOS', 'R', 'AW1', 'N', 'D', 'EOS'], ["'s", 'SOS', 'EH1', 'S', 'EOS'], ["'til", 'SOS', 'T', 'IH1', 'L', 'EOS'], ["'tis", 'SOS', 'T', 'IH1', 'Z', 'EOS'], ["'twas", 'SOS', 'T', 'W', 'AH1', 'Z', 'EOS'], ['a', 'SOS', 'AH0', 'EOS'], ['a(2)', 'SOS', 'EY1', 'EOS'], ["a's", 'SOS', 'EY1', 'Z', 'EOS'], ['a.', 'SOS', 'EY1', 'EOS'], ["a.'s", 'SOS', 'EY1', 'Z', 'EOS'], ['a.d.', 'SOS', 'EY2', 'D', 'IY1', 'EOS'], ['a.m.', 'SOS', 'EY2', 'EH1', 'M', 'EOS'], ['a.s', 'SOS', 'EY1', 'Z', 'EOS'], ['aaa', 'SOS', 'T', 'R', 'IH2', 'P', 'AH0', 'L', 'EY1', 'EOS'], ['aaberg', 'SOS', 'AA

###Tokenize phonetics & letters

In [None]:
letter_tokenizer=Tokenizer()

flattened_wordlist=[character for word in word_list for character in word]
letter_tokenizer.fit_on_texts(flattened_wordlist)
tokenized_words=letter_tokenizer.texts_to_sequences(word_list)
tokenized_words=kps.data_utils.pad_sequences(tokenized_words, padding="post", maxlen=32)[:]

In [None]:
phonetic_tokenizer=Tokenizer()
flattened_phoneticlist=[phoneme for word in phonetics_list for phoneme in word]
phonetic_tokenizer.fit_on_texts(flattened_phoneticlist)
tokenized_phonetics=phonetic_tokenizer.texts_to_sequences(phonetics_list)
tokenized_phonetics=kps.data_utils.pad_sequences(tokenized_phonetics, padding="post", maxlen=32)[:]

In [None]:
start_token=phonetic_tokenizer.texts_to_sequences([['SOS']])[0][0]

In [None]:
print(phonetic_tokenizer.texts_to_sequences([['SOS']]))

[[1]]


In [None]:
print(tokenized_phonetics)

[[ 1 16 52 ...  0  0  0]
 [ 1  9  3 ...  0  0  0]
 [ 1  9 33 ...  0  0  0]
 ...
 [ 1 13 36 ...  0  0  0]
 [ 1 13 11 ...  0  0  0]
 [ 1  2  0 ...  0  0  0]]


In [None]:
phonetic_length=tokenized_phonetics.shape[1]
word_length=tokenized_words.shape[1]

In [None]:
tokenized_phonetics=tokenized_phonetics.astype('float32')
tokenized_words=tokenized_words.reshape(tokenized_words.shape[0], tokenized_words.shape[1]).astype('float32')


In [None]:
print(tokenized_phonetics.shape)

(135156, 32)


In [None]:
indices=np.random.shuffle(np.array(range(tokenized_words.shape[0])))
tokenized_phonetics=tokenized_phonetics[indices]
tokenized_words=tokenized_words[indices]

In [None]:
tokenized_words=tokenized_words[0]
tokenized_phonetics=tokenized_phonetics[0]

In [None]:
split_index=int(tokenized_words.shape[0]*0.8)

words_train, words_test=(tokenized_words[:split_index], tokenized_words[split_index:])
phonetics_train, phonetics_test=(tokenized_phonetics[:split_index], tokenized_phonetics[split_index:])


In [None]:
print(words_train.shape)

(108124, 32)


In [None]:
words_train_tensor, words_test_tensor=(Tensor(words_train), Tensor(words_test))
phonetics_train_tensor, phonetics_test_tensor=(Tensor(phonetics_train), Tensor(phonetics_test))

In [None]:
print(phonetics_train_tensor[:6])

tensor([[ 1., 16., 52.,  7.,  2.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.],
        [ 1.,  9.,  3., 13.,  2.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.],
        [ 1.,  9., 33.,  8.,  5.,  2.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.],
        [ 1.,  9., 38., 36., 13.,  2.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.],
        [ 1.,  3., 12.,  2.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.],
        [ 1., 22.,  8., 21.,  5.,  9., 32.,  2.,  0.,  0.,  

In [None]:
print(words_train_tensor)

tensor([[24., 17.,  7.,  ...,  0.,  0.,  0.],
        [24., 10.,  2.,  ...,  0.,  0.,  0.],
        [24., 10.,  7.,  ...,  0.,  0.,  0.],
        ...,
        [ 5.,  1.,  2.,  ...,  0.,  0.,  0.],
        [ 5.,  1.,  2.,  ...,  0.,  0.,  0.],
        [ 5.,  1.,  2.,  ...,  0.,  0.,  0.]])


In [None]:
etp_train_dataset=TensorDataset(words_train_tensor.long(), phonetics_train_tensor.long())
etp_train_dataloader=DataLoader(etp_train_dataset, batch_size=1)
etp_test_dataset=TensorDataset(words_test_tensor.long(), phonetics_test_tensor.long())
etp_test_dataloader=DataLoader(etp_test_dataset, batch_size=1)

## Model Definition and Training

In [None]:
class ETPEncoder(nn.Module):
  def __init__(self, num_word_chars, embedding_size, hidden_state_size, num_encoder_layers):
    super().__init__()
    #Defining Encoder Model: 
    #Input: Sequence of tokens representing letters of a given word
    #Output: Cell states for each letter and the final hidden state
    self.encoder=nn.Sequential(
        #Embedding layer: Tokens -> Vectors
        nn.Embedding(num_word_chars, embedding_size),
        #LSTM layer(s): Vectorized Tokens -> Hidden and Output states
        nn.LSTM(input_size=embedding_size, hidden_size=hidden_state_size, num_layers=num_encoder_layers)
    );
  def forward(self, x):
    #Returns: LSTM output & Final Hidden State
    return self.encoder(x)

In [None]:
class ETPDecoder(nn.Module):
  def __init__(self, num_phonetic_chars, embedding_size, hidden_state_size, num_decoder_layers, desired_output_length=32):
    super().__init__()
    #Embedding: Tokens representing phonemes -> Vectors, Output Size=embedding_size
    self.decoder_embedding=nn.Embedding(num_phonetic_chars, embedding_size)
    #LSTM: Sequence of Embedding Vectors -> Output Value, Output Size=hidden_state_size
    self.decoder_lstm=nn.LSTM(input_size=embedding_size, hidden_size=hidden_state_size, num_layers=num_decoder_layers)
    #Feedforward: LSTM Output -> Weight for each phoneme for a given character, Output Size=num_phonetic_chars
    self.decoder_feedforward= nn.Linear(hidden_state_size, num_phonetic_chars)
    #Log Softmax: Feedforward output -> Softmax (Percent of each) -> Natural Logarithm
    self.decoder_activation=   nn.LogSoftmax(dim=1)
    self.desired_output_length=desired_output_length
  def forward(self, x, lstm_hidden):
    lstm_result=self.decoder_lstm(self.decoder_embedding(x), lstm_hidden)
    #Returns: Result of decoder given input & final hidden state
    return (self.decoder_activation(self.decoder_feedforward(lstm_result[0])), lstm_result[1])

In [76]:
class ETPModel(nn.Module):
  def __init__(self, num_word_chars, num_phonetic_chars, embedding_size, hidden_state_size, num_encoder_layers, num_decoder_layers, desired_output_length=32):
    super().__init__()
    #Defines Encoder Model
    self.encoder_model=ETPEncoder(num_word_chars, embedding_size, hidden_state_size, num_encoder_layers)
    #Defines Decoder Model
    self.decoder_model=ETPDecoder(num_phonetic_chars, embedding_size, hidden_state_size, num_decoder_layers, desired_output_length)
    #Defines Maximum Output Length
    self.desired_output_length=desired_output_length
    #self.decoder=nn.Sequential()
  def forward(self, x):
    #Encodes word
    encoder_result= self.encoder_model(x);
    #Creates the initial list of zeroes for the output sequence
    y0=np.zeros(self.desired_output_length)
    #Sets the first item of the output sequence to the start token (SOS)
    y0[0]=start_token
    #print(y0)
    #Initializes Empty list     
    rv=[]
    #Dummy variable for reporting last set of token probabilities in the case of an error
    pob=None
    #Defines the initial "y" values based on the encoder model
    y=(torch.tensor([y0]).long(), encoder_result[1]);
    for i in range(self.desired_output_length-1):
      #Gets the decoder output for the y value
      _y=self.decoder_model(y[0], y[1]);
      #Gets the probability of each token as outputted by the decoder model
      pprob=_y[0]
      #Grabs the current sequence
      y0=y[0][0]
    # #  print("a")
    #  # print(self.decoder_feedforward(_y[0]))
    #   _y0=Variable(_y[0].data, requires_grad=True)
    #   lin=self.decoder_feedforward(_y0)
    #   pprob=self.decoder_activation(lin)
      #Defines the token probabilities as that for the most recent token
      pprob=pprob[0][i]

      #Defines the next token in the sequence as the index of the greatest element of the phonetic probs 
      y0[i+1]=argmax(pprob)
    #print(y0)
      #Reshapes by wrapping in it a big mama array
      y0=reshape(y0, tuple([1]+list(y0.size())))
      rv.append(pprob)
   #   print(list(y0.size())) 
    #  print(reshape(y0, tuple([1]+list(y0.size()))))
     # print("b")
      #Sets y to the sequence and the last hidden state for the next
      y=(y0, _y[1])
     # print("c")
      pob=pprob
    return (y, rv);

      
  def train_model(self, dataloader, loss_func, encoder_optimizer, decoder_optimizer):
    c=0;
    #Runs the train functions in normal nn.Module
    self.train();
    #Loops through every (x, y) pair in every batch in the Dataloader
    for batch, (x, y) in enumerate(dataloader):
      #Gets the sequence prediction for x 
      prediction=self(x)
      if(c%997==0):
        print(x)
        print(prediction[1])
        print(y)
      prediction=prediction[1][0]
      # print("c")
      # print(x)
      # print(y)
      # print(prediction.float())
      prediction=prediction.float()
      prediction=Variable(prediction.data, requires_grad=True)
      #Gets the loss between the prediction and y
      loss=loss_func(prediction[0], y.float()[0])
     # print("d")
      #Zeroes the gradients of the optimizer
      encoder_optimizer.zero_grad()
      decoder_optimizer.zero_grad()
      #Connects loss to rest of network
      loss.backward()
      #Find dem gradients
      encoder_optimizer.step()
      decoder_optimizer.step()
      c+=1;




In [77]:
etpmodel=ETPModel(len(letter_tokenizer.word_index)+1, len(phonetic_tokenizer.word_index)+1, 32, 8, 1, 1).to(device)
#summary(etpmodel, (108124, 28), dtypes=[torch.long])

In [None]:
print(phonetic_tokenizer.word_index)

{'sos': 1, 'eos': 2, 'ah0': 3, 'n': 4, 's': 5, 'l': 6, 't': 7, 'r': 8, 'k': 9, 'd': 10, 'ih0': 11, 'm': 12, 'z': 13, 'er0': 14, 'iy0': 15, 'b': 16, 'eh1': 17, 'p': 18, 'aa1': 19, 'ae1': 20, 'ih1': 21, 'f': 22, 'g': 23, 'v': 24, 'iy1': 25, 'ng': 26, 'hh': 27, 'ey1': 28, 'w': 29, 'sh': 30, 'ow1': 31, 'ow0': 32, 'ao1': 33, 'ay1': 34, 'ah1': 35, 'uw1': 36, 'jh': 37, 'y': 38, 'aa0': 39, 'ch': 40, 'er1': 41, 'ih2': 42, 'eh2': 43, 'ey2': 44, 'ae2': 45, 'ay2': 46, 'aa2': 47, 'th': 48, 'eh0': 49, 'iy2': 50, 'ow2': 51, 'aw1': 52, 'uw0': 53, 'ao2': 54, 'ae0': 55, 'uh1': 56, 'ao0': 57, 'ay0': 58, 'uw2': 59, 'ah2': 60, 'ey0': 61, 'oy1': 62, 'aw2': 63, 'er2': 64, 'dh': 65, 'zh': 66, 'uh2': 67, 'aw0': 68, 'uh0': 69, 'oy2': 70, 'oy0': 71, 'irish': 72, 'place': 73, 'name': 74, 'foreign': 75, 'french': 76, 'org': 77, 'dutch': 78, 'abbrev': 79, 'danish': 80, 'german': 81, 'finnish': 82, 'old': 83, 'title': 84}


In [78]:
etpmodel.train_model(etp_train_dataloader, nn.CrossEntropyLoss(), optim.Adam(etpmodel.encoder_model.parameters(), lr=0.05), optim.Adam(etpmodel.decoder_model.parameters(), lr=0.05))

tensor([[24, 17,  7, 13,  8,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])
[tensor([-3.3089, -3.5110, -3.3575, -3.3494, -3.7236, -3.5284, -3.4594, -3.6572,
        -3.6540, -3.6239, -3.5278, -3.5449, -3.5387, -3.1702, -3.8113, -3.5920,
        -3.5515, -3.2771, -3.5053, -3.1429, -3.5682, -3.4974, -3.2894, -3.4059,
        -3.4667, -3.3935, -3.6003, -3.2293, -3.4714, -3.2142, -3.8569, -3.3112,
        -3.6542, -3.4347, -3.6175, -3.0987, -3.4800, -3.5788, -3.4758, -3.3821,
        -3.4088, -3.3220, -3.4233, -3.3916, -3.4564, -3.8460, -3.5064, -3.5276,
        -3.4680, -3.5163, -3.2747, -3.5556, -3.2794, -3.5634, -3.2123, -3.3417,
        -3.2674, -3.5565, -3.4726, -3.3813, -3.4477, -3.5359, -3.4294, -3.5250,
        -3.6248, -3.3326, -3.7012, -3.5700, -3.4019, -3.4109, -3.6433, -3.5471,
        -3.3325, -3.6211, -3.5982, -3.4930, -3.5061, -3.3384, -3.3658, -3.4414,
        -3.5551, -3.1571, -3.3961, -3.3008, -3.5122

IndexError: ignored

In [None]:
print(len(list(enumerate(etp_train_dataloader))))

108124


In [None]:
for batch, (x, y) in enumerate(etp_train_dataloader):
  print(etpmodel(x))

In [None]:

etpencodermodel=Sequential()
#tokenized_phonetics=tokenized_phonetics.reshape(tokenized_phonetics.shape[0], tokenized_phonetics.shape[1], 1).astype('float32')

print(tokenized_words[:1000])
etpmodel.add(Bidirectional(LSTM(word_length, return_sequences=True)))
etpmodel.add(Bidirectional(LSTM(2, return_sequences=True)))

etpmodel.compile(loss="categorical_crossentropy", optimizer="adam", metrics=['acc'])

In [None]:
print()

28


In [None]:
etpmodel.fit(tokenized_words, tokenized_phonetics, epochs=3, validation_split=0.1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7ff84fda7610>

In [None]:
word="balls"
tokenized_word_test=np.array(letter_tokenizer.texts_to_sequences([list(word)]))
print(tokenized_word_test)

[[17  2  9  9  5]]


In [None]:
print(etpmodel(tokenized_word_test.reshape(tokenized_word_test.shape[0], tokenized_word_test.shape[1], 1)))

tf.Tensor(
[[1.17963425e-26 4.67159884e-26 2.56787429e-26 2.14984857e-26
  3.22911522e-26 1.49747339e-26 2.03534264e-26 2.97221404e-26
  2.19658689e-26 4.62203712e-26 1.21531270e-25 1.12930615e-24
  1.66428336e-23 9.15431353e-22 9.30941438e-20 1.69180114e-16
  5.07423747e-13 1.65561387e-09 6.24991969e-07 2.17812922e-04
  3.88885126e-03 8.19307044e-02 7.55403265e-02 9.99388546e-02
  1.32787436e-01 3.24861586e-01 1.51023388e-01 1.29810423e-01]], shape=(1, 28), dtype=float32)
