In [1]:
import nltk
import os
import pickle
import cmudict
from more_itertools import pairwise

In [2]:
# Load the CMU Pronouncing Dictionary through NLTK as a plain dict.
# The rest of the notebook indexes it as arpabet[word][0] and iterates the
# result as phoneme strings, i.e. word -> list of pronunciations -> list of
# phoneme labels.
# NOTE(review): presumably requires the corpus to be installed beforehand
# (nltk.download('cmudict')) -- confirm for fresh environments.
arpabet = nltk.corpus.cmudict.dict()

In [3]:
def to_phoneme(input_word):
    """
    Convert a word to its list of ARPAbet phonemes without stress markers.

    The word is looked up in the module-level ``arpabet`` mapping and only
    the first listed pronunciation is used. Each phoneme label is cut to
    its first two characters, which drops the trailing stress digit
    (for example ``['HH', 'AH0', 'L', 'OW1']`` becomes
    ``['HH', 'AH', 'L', 'OW']``).

    :param input_word: word to convert; must be a key of ``arpabet``
    :returns:          list of phoneme strings for the first pronunciation
    :raises KeyError:  if ``input_word`` is not present in ``arpabet``
    """
    # Take the first variation of the phoneme representations.
    first_pronunciation = arpabet[input_word][0]

    # The phoneme labels used by this notebook's sonority table are one or
    # two letters long, so keeping two characters removes the stress digit
    # and leaves digit-free labels untouched.
    return [phone[:2] for phone in first_pronunciation]

In [4]:
# Sonority rank of every stress-free ARPAbet phoneme handled by the
# syllabifier: 11 = vowels, 10 = glides (Y, W), 9 = R, 8 = L, 7 = nasals,
# 6 = voiced fricatives, 5 = voiceless fricatives, 4 = JH, 3 = CH,
# 2 = voiced stops, 1 = voiceless stops.
# Built once at definition time instead of on every call.
_SONORITY = {
    'AA': 11,
    'AE': 11,
    'AH': 11,
    'AO': 11,
    'AW': 11,
    'AY': 11,
    'EH': 11,
    'ER': 11,
    'EY': 11,
    'IH': 11,
    'IY': 11,
    'OW': 11,
    'OY': 11,
    'UH': 11,
    'UW': 11,
    'Y': 10,
    'W': 10,
    'R': 9,
    'L': 8,
    'M': 7,
    'N': 7,
    'NG': 7,
    'Z': 6,
    'ZH': 6,
    'V': 6,
    'DH': 6,
    'S': 5,
    'SH': 5,
    'F': 5,
    'TH': 5,
    'HH': 5,
    'JH': 4,
    'CH': 3,
    'B': 2,
    'D': 2,
    'G': 2,
    'P': 1,
    'T': 1,
    'K': 1
}


def _starts_syllable(word, i):
    """Return True when the phoneme at index ``i`` (``i >= 1``) opens a new syllable.

    The consonant rules only apply when a following phoneme exists; each
    one checks the current phoneme, sometimes the previous one, and the
    sonority of the next one. The vowel rule at the end needs no lookahead.
    """
    cur = word[i]
    prev = word[i - 1]

    if i + 1 < len(word):
        nxt = word[i + 1]
        if cur == 'K':
            # Voiceless stops do not start a syllable right after S
            # (the S rule below claims the whole cluster instead).
            if prev != 'S' and _SONORITY[nxt] >= 8:
                return True
        elif cur == 'T':
            # T only clusters with R, glides or vowels (rank >= 9), not L.
            if prev != 'S' and _SONORITY[nxt] >= 9:
                return True
        elif cur == 'P':
            if prev != 'S' and _SONORITY[nxt] >= 8 and nxt != 'W':
                return True
        elif cur == 'B':
            if _SONORITY[nxt] >= 8 and nxt != 'W':
                return True
        elif cur == 'G':
            if _SONORITY[nxt] >= 8 and nxt not in ('W', 'Y'):
                return True
        elif cur == 'D':
            if _SONORITY[nxt] >= 9:
                return True
        elif cur in ('CH', 'JH', 'HH', 'SH', 'DH', 'ZH', 'Z'):
            # These only start a syllable directly before a vowel.
            if _SONORITY[nxt] == 11:
                return True
        elif cur == 'TH':
            if _SONORITY[nxt] >= 9 and nxt != 'Y':
                return True
        elif cur in ('F', 'V'):
            if _SONORITY[nxt] >= 8:
                return True
        elif cur in ('N', 'M'):
            if prev != 'S' and _SONORITY[nxt] == 11:
                return True
        elif cur == 'L':
            # L after one of these consonants belongs to that consonant's onset.
            if (prev not in ('K', 'P', 'G', 'B', 'F', 'S', 'V')
                    and _SONORITY[nxt] == 11):
                return True
        elif cur == 'R':
            if (prev not in ('K', 'T', 'P', 'G', 'D', 'B', 'F', 'V')
                    and _SONORITY[nxt] == 11):
                return True
        elif cur == 'W':
            if (prev not in ('K', 'T', 'D', 'TH', 'DH')
                    and _SONORITY[nxt] == 11):
                return True
        elif cur == 'Y':
            if (prev not in ('K', 'P', 'F', 'V', 'B')
                    and _SONORITY[nxt] == 11):
                return True
        elif cur == 'S':
            # S + nasal/liquid/glide/vowel (rank >= 7).
            if _SONORITY[nxt] >= 7:
                return True
            # S + T + (R, glide or vowel).
            if (nxt == 'T'
                    and i + 2 < len(word)
                    and _SONORITY[word[i + 2]] >= 9):
                return True
            # S + K/P + (L, R, glide or vowel).
            if (nxt in ('K', 'P')
                    and i + 2 < len(word)
                    and _SONORITY[word[i + 2]] >= 8):
                return True

    # A vowel starts a new syllable when it directly follows another vowel
    # or NG.
    return _SONORITY[cur] == 11 and (_SONORITY[prev] == 11 or prev == 'NG')


def to_syllables(input_phoneme):
    """
    Split one word's phoneme sequence into syllables.

    The sequence is scanned left to right from the second phoneme on; a new
    syllable begins at every position accepted by the rule set in
    ``_starts_syllable``, which is driven by the sonority ranks in
    ``_SONORITY``.

    Example: ``['HH', 'AH', 'L', 'OW']`` -> ``[['HH', 'AH'], ['L', 'OW']]``.

    :param input_phoneme: sequence of stress-free ARPAbet phonemes for one
                          word (as produced by ``to_phoneme``)
    :returns:             list of syllables, each a slice of the input;
                          an empty input yields ``[[]]``
    :raises KeyError:     if a phoneme that has to be ranked is missing from
                          the sonority table (e.g. one still carrying a
                          stress digit such as ``'AH0'``)
    """
    word = input_phoneme
    syllables = []
    boundary = 0  # index where the syllable currently being built starts
    for i in range(1, len(word)):
        if _starts_syllable(word, i):
            syllables.append(word[boundary:i])
            boundary = i

    # Whatever follows the last boundary is the final syllable.
    syllables.append(word[boundary:])
    return syllables


In [5]:
# Output file for the syllable list, relative to the working directory.
fname = "syllables.txt"

# Remove any output left over from a previous run. Attempting the removal
# and ignoring "not found" avoids the gap between checking for the file and
# deleting it.
try:
    os.remove(fname)
except FileNotFoundError:
    pass

In [6]:
# Syllabify every word in the dictionary. Each syllable is collected in
# all_syllables and written to the output file on its own line, with every
# phoneme followed by a single space.
all_syllables = []
with open(fname, "w") as f:  # write-only: the file is never read back here
    for word in arpabet:
        phoneme_word = to_phoneme(word)
        syllables_list = to_syllables(phoneme_word)
        for _syll in syllables_list:
            all_syllables.append(_syll)
            for _phoneme in _syll:
                f.write(_phoneme + " ")
            f.write("\n")





















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































In [7]:
# Serialize `all_syllables` to disk with pickle.
# The output directory is created first: open(..., "wb") does not create
# missing parent directories and would raise FileNotFoundError when
# "pickleDumps/" is absent (e.g. on a fresh checkout).
os.makedirs("pickleDumps", exist_ok=True)
with open("pickleDumps/syllables.pki", "wb") as f:
    pickle.dump(all_syllables, f)

## Find adjacent phonemes in dict

In [8]:
# Find words whose pronunciation contains no vowel phoneme at all.
# The vowel inventory is taken from the phones that cmudict labels
# exactly ['vowel'].
vowels = [entry[0] for entry in cmudict.phones() if entry[1] == ['vowel']]
for word in arpabet:
    syllables_list = to_syllables(to_phoneme(word))
    # Look across every syllable of the word; report the word only when
    # not a single phoneme is a vowel.
    has_vowel = any(
        phone in vowels
        for syllable in syllables_list
        for phone in syllable
    )
    if not has_vowel:
        print(word, ": ", syllables_list)

fs :  [['F', 'S']]
mmmm :  [['M', 'M']]
shh :  [['SH']]
ths :  [['TH', 'S']]


In [9]:
# Find obscure phoneme pairs: list each word in which some syllable has
# the phoneme "D" immediately followed by "T".
for word in arpabet:
    syllables_list = to_syllables(to_phoneme(word))
    for syll in syllables_list:
        # One line is printed per matching adjacent pair, so a word is
        # repeated if the pair occurs more than once.
        for adjacent in pairwise(syll):
            if adjacent == ("D", "T"):
                print(word, ": ", syllables_list)

borchardt :  [['B', 'ER'], ['SH', 'AA', 'R', 'D', 'T']]
bradt :  [['B', 'R', 'AE', 'D', 'T']]
brodt :  [['B', 'R', 'AA', 'D', 'T']]
cordts :  [['K', 'AO', 'R', 'D', 'T', 'S']]
friedt :  [['F', 'R', 'IY', 'D', 'T']]
gerdts :  [['JH', 'ER', 'D', 'T', 'S']]
heidt :  [['HH', 'AY', 'D', 'T']]
heydt :  [['HH', 'EY', 'D', 'T']]
kludt :  [['K', 'L', 'AH', 'D', 'T']]
quandt :  [['K', 'W', 'AO', 'N', 'D', 'T']]
reidt :  [['R', 'IY', 'D', 'T']]
smidt :  [['S', 'M', 'IH', 'D', 'T']]
stadt :  [['S', 'T', 'AE', 'D', 'T']]
staudt :  [['S', 'T', 'AO', 'D', 'T']]
stoudt :  [['S', 'T', 'AH', 'D', 'T']]
studt :  [['S', 'T', 'AH', 'D', 'T']]
tiedt :  [['T', 'IY', 'D', 'T']]
todt :  [['T', 'AA', 'D', 'T']]
traudt :  [['T', 'R', 'AO', 'D', 'T']]
troudt :  [['T', 'R', 'AW', 'D', 'T']]
tvedt :  [['T'], ['V', 'EH', 'D', 'T']]
twedt :  [['T', 'W', 'EH', 'D', 'T']]
yundt :  [['Y', 'AH', 'N', 'D', 'T']]


In [10]:
# Find obscure phoneme triples: list each word that has a syllable
# *starting* with "HH" followed by "R".
#
# nltk.ngrams() only emits a start marker when left padding is requested
# explicitly. Without pad_left/left_pad_symbol no trigram ever contains
# "<s>", so the comparison below could never succeed and the cell would
# silently print nothing.
for word in arpabet:
    syllables_list = to_syllables(to_phoneme(word))
    for syll in syllables_list:
        # With n=3 the padded trigrams begin ("<s>", "<s>", x0),
        # ("<s>", x0, x1), ...; the second one exposes the first two
        # phonemes of the syllable.
        trigrams = nltk.ngrams(syll, 3, pad_left=True, left_pad_symbol="<s>")
        for p1, p2, p3 in trigrams:
            if p1 == "<s>" and p2 == "HH" and p3 == "R":
                print(word, ": ", syllables_list)

In [11]:
# ARPAbet vowel phonemes without stress digits (to_phoneme keeps only the
# first two characters of each phone label). Rebinds the `vowels` name
# that was earlier derived from cmudict.phones().
vowels = [
    "AA", "AE", "AH", "AO", "AW",
    "AY", "EH", "ER", "EY", "IH",
    "IY", "OW", "OY", "UH", "UW",
]

In [12]:
# Report every syllable that contains more than one vowel phoneme.
# The vowel count is taken per syllable, not per word.
for word in arpabet:
    syllables_list = to_syllables(to_phoneme(word))
    for syll in syllables_list:
        num_vowels = sum(1 for phone in syll if phone in vowels)
        if num_vowels > 1:
            print(syll)

['HH', 'EY', 'DH', 'W', 'EY']


In [13]:
# Find obscure phoneme pairs: list each word in which some syllable has
# the phoneme "IY" immediately followed by "Y".
for word in arpabet:
    syllables_list = to_syllables(to_phoneme(word))
    for syll in syllables_list:
        # One line is printed per matching adjacent pair.
        for adjacent in pairwise(syll):
            if adjacent == ("IY", "Y"):
                print(word, ":", syllables_list)

In [14]:
# Ad-hoc spot check of the two helpers on hand-picked inputs.
# NOTE(review): "hatheway" is presumably the word behind the multi-vowel
# syllable printed by the earlier check - confirm.
# Only the value of the last expression is echoed as cell output, so the
# to_syllables() result on the first line is computed but not displayed.
to_syllables(["HH", "EY", "DH", "ER"])
to_phoneme("hatheway")

['HH', 'EY', 'DH', 'W', 'EY']

NameError: name 'pronouncable' is not defined