In [1]:
import nltk
import os
import pickle
from more_itertools import pairwise

In [2]:
arpabet = nltk.corpus.cmudict.dict()

In [3]:
def to_phoneme(input_word):
    """
    This function responds to a request for /api/words
    with the complete lists of words

    :param: input_word: word to convert to arpabet
    :returns:        string of translated word
    """
    # Generate list of phonemes for the word
    phones = []

    # Take first variation of phoneme representations
    phones_w = arpabet[input_word][0]
    for p in phones_w:
        # omits the numbers from phone labels
        new_p = p[:2]
        phones.append(new_p)

    return phones

In [4]:
def to_syllables(input_phoneme):
    # Phoneme sonority values
    syl_dic = {
        'AA': 11,
        'AE': 11,
        'AH': 11,
        'AO': 11,
        'AW': 11,
        'AY': 11,
        'EH': 11,
        'ER': 11,
        'EY': 11,
        'IH': 11,
        'IY': 11,
        'OW': 11,
        'OY': 11,
        'UH': 11,
        'UW': 11,
        'Y': 10,
        'W': 10,
        'R': 9,
        'L': 8,
        'M': 7,
        'N': 7,
        'NG': 7,
        'Z': 6,
        'ZH': 6,
        'V': 6,
        'DH': 6,
        'S': 5,
        'SH': 5,
        'F': 5,
        'TH': 5,
        'HH': 5,
        'JH': 4,
        'CH': 3,
        'B': 2,
        'D': 2,
        'G': 2,
        'P': 1,
        'T': 1,
        'K': 1
    }

    # Split phoneme lists into syllables
    syllables = []
    phones = []
    phones.append(input_phoneme)
    for word in phones:
        boundary = 0
        for i in range(1, len(word)):
            if ((word[i] == 'K'
                    and word[i-1] != 'S'
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] >= 8)
                or (word[i] == 'T'
                    and word[i-1] != 'S'
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] >= 9)
                or (word[i] == 'P'
                    and word[i-1] != 'S'
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] >= 8
                    and word[i+1] != 'W')
                or (word[i] == 'B'
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] >= 8
                    and word[i+1] != 'W')
                or (word[i] == 'G'
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] >= 8
                    and not(word[i+1] in ['W', 'Y']))
                or (word[i] == 'D'
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] >= 9)
                or (word[i] in ['CH', 'JH', 'HH', 'SH', 'DH', 'ZH', 'Z']
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] == 11)
                or (word[i] == 'TH'
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] >= 9
                    and word[i+1] != 'Y')
                or (word[i] in ['F', 'V']
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] >= 8)
                or (word[i] in ['N', 'M']
                    and word[i-1] != 'S'
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] == 11)
                or (word[i] == 'L'
                    and not(word[i-1]
                    in ['K', 'P', 'G', 'B', 'F', 'S', 'V'])
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] == 11)
                or (word[i] == 'R'
                    and not(word[i-1]
                    in ['K', 'T', 'P', 'G', 'D', 'B', 'F', 'V'])
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] == 11)
                or (word[i] == 'W'
                    and not (word[i-1] in ['K', 'T', 'D', 'TH', 'DH'])
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] == 11)
                or (word[i] == 'Y'
                    and not (word[i-1] in ['K', 'P', 'F', 'V', 'B'])
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] == 11)
                or (word[i] == 'S'
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] >= 7)
                or (word[i] == 'S'
                    and i+1 < len(word)
                    and word[i+1] == 'T'
                    and i+2 < len(word)
                    and syl_dic[word[i+2]] >= 9)
                or (word[i] == 'S'
                    and i+1 < len(word)
                    and (word[i+1] == 'K' or word[i+1] == 'P')
                    and i+2 < len(word) and syl_dic[word[i+2]] >= 8)
                or (syl_dic[word[i]] == 11
                    and (syl_dic[word[i-1]] == 11 or word[i-1] == 'NG'))):
                syllables.append(word[boundary:i])
                boundary = i

        syllables.append(word[boundary:])
        return syllables


In [5]:
fname = "syllables.txt"
if os.path.exists(fname):
    os.remove(fname)

In [6]:
all_syllables = []
with open(fname, "w+") as f:
    for word in arpabet:
        phoneme_word = to_phoneme(word)
        syllables_list = to_syllables(phoneme_word)
        for _syll in syllables_list:
            #f.write(word + ": ")
            all_syllables.append(_syll)
            for _phoneme in _syll:
                    f.write(_phoneme + " ")
            f.write("\n")

In [None]:
with open("pickleDumps/syllables.pki", "wb") as f:
    pickle.dump(all_syllables, f)

## Find adjacent phonemes in dict

In [62]:
#Find obsure phoneme pairs:
for word in arpabet:
    syllables_list = to_syllables(to_phoneme(word))
    for syll in syllables_list:
        for p1,p2 in pairwise(syll):
            if p1 == "T" and p2 == "W":
                print(word, ":", syllables_list)

antoine : [['AA', 'N'], ['T', 'W', 'AA', 'N']]
antoinette : [['AE', 'N'], ['T', 'W', 'AH'], ['N', 'EH', 'T']]
antwerp : [['AE', 'N'], ['T', 'W', 'ER', 'P']]
antwine : [['AE', 'N'], ['T', 'W', 'AY', 'N']]
artois : [['AA', 'R'], ['T', 'W', 'AA']]
artwork : [['AA', 'R'], ['T', 'W', 'ER', 'K']]
artworks : [['AA', 'R'], ['T', 'W', 'ER', 'K', 'S']]
attwood : [['AE'], ['T', 'W', 'UH', 'D']]
attwoods : [['AE'], ['T', 'W', 'UH', 'D', 'Z']]
atwater : [['AE'], ['T', 'W', 'AO'], ['T', 'ER']]
atwell : [['AH'], ['T', 'W', 'EH', 'L']]
atwood : [['AE'], ['T', 'W', 'UH', 'D']]
atworth : [['AE'], ['T', 'W', 'ER', 'TH']]
beltway : [['B', 'EH', 'L'], ['T', 'W', 'EY']]
best-western : [['B', 'EH'], ['S', 'T', 'W', 'EH'], ['S', 'T', 'ER', 'N']]
bestwick : [['B', 'EH'], ['S', 'T', 'W', 'IH', 'K']]
between : [['B', 'IH'], ['T', 'W', 'IY', 'N']]
betweens : [['B', 'IH'], ['T', 'W', 'IY', 'N', 'Z']]
betwixt : [['B', 'IY'], ['T', 'W', 'IH', 'K', 'S', 'T']]
bostwick : [['B', 'AA'], ['S', 'T', 'W', 'IH', 'K']]
boutw

repertoire : [['R', 'EH'], ['P', 'ER'], ['T', 'W', 'AA', 'R']]
rightward : [['R', 'AY'], ['T', 'W', 'ER', 'D']]
rightwing : [['R', 'AY'], ['T', 'W', 'IH', 'NG']]
right-winger : [['R', 'AY'], ['T', 'W', 'IH', 'NG'], ['ER']]
right-wingers : [['R', 'AY'], ['T', 'W', 'IH', 'NG'], ['ER', 'Z']]
saltwater : [['S', 'AO', 'L'], ['T', 'W', 'AA'], ['T', 'ER']]
sartwell : [['S', 'AA', 'R'], ['T', 'W', 'EH', 'L']]
shortwave : [['SH', 'AO', 'R'], ['T', 'W', 'EY', 'V']]
short-winded : [['SH', 'AO', 'R'], ['T', 'W', 'IH', 'N'], ['D', 'IH', 'D']]
shotwell : [['SH', 'AA'], ['T', 'W', 'EH', 'L']]
software : [['S', 'AO', 'F'], ['T', 'W', 'EH', 'R']]
software's : [['S', 'AO', 'F'], ['T', 'W', 'EH', 'R', 'Z']]
softwood : [['S', 'AO', 'F'], ['T', 'W', 'UH', 'D']]
statewide : [['S', 'T', 'EY'], ['T', 'W', 'AY', 'D']]
streetwise : [['S', 'T', 'R', 'IY'], ['T', 'W', 'AY', 'Z']]
swartwood : [['S'], ['W', 'AO', 'R'], ['T', 'W', 'UH', 'D']]
swartwout : [['S'], ['W', 'AO', 'R'], ['T', 'W', 'AW', 'T']]
sweetwater : 

In [13]:
vowels = ['AA',
        'AE',
        'AH',
        'AO',
        'AW',
        'AY',
        'EH',
        'ER',
        'EY',
        'IH',
        'IY',
        'OW',
        'OY',
        'UH',
        'UW']

In [None]:
# Find syllables with more than one vowel sound
for word in arpabet:
    syllables_list = to_syllables(to_phoneme(word))
    for syll in syllables_list:
        num_vowels = 0
        for p in syll:
            if p in vowels:
                num_vowels += 1
        if num_vowels > 1:
               print(syll)

In [28]:
#Find obsure phoneme pairs:
for word in arpabet:
    syllables_list = to_syllables(to_phoneme(word))
    for syll in syllables_list:
        for p1,p2 in pairwise(syll):
            if p1 == "DH" and p2 == "W":
                print(word, ":", syllables_list)

hatheway : [['HH', 'EY', 'DH', 'W', 'EY']]


In [27]:
to_syllables(["HH", "EY", "DH", "ER"])
to_phoneme("hatheway")

['HH', 'EY', 'DH', 'W', 'EY']

In [None]:
def to_syllables2(input_phoneme):
    # Phoneme sonority values
    syl_dic = {
        'AA': 11,
        'AE': 11,
        'AH': 11,
        'AO': 11,
        'AW': 11,
        'AY': 11,
        'EH': 11,
        'ER': 11,
        'EY': 11,
        'IH': 11,
        'IY': 11,
        'OW': 11,
        'OY': 11,
        'UH': 11,
        'UW': 11,
        'Y': 10,
        'W': 10,
        'R': 9,
        'L': 8,
        'M': 7,
        'N': 7,
        'NG': 7,
        'Z': 6,
        'ZH': 6,
        'V': 6,
        'DH': 6,
        'S': 5,
        'SH': 5,
        'F': 5,
        'TH': 5,
        'HH': 5,
        'JH': 4,
        'CH': 3,
        'B': 2,
        'D': 2,
        'G': 2,
        'P': 1,
        'T': 1,
        'K': 1
    }

    # Split phoneme lists into syllables
    syllables = []
    phones = []
    phones.append(input_phoneme)
    for word in phones:
        boundary = 0
        for i in range(1, len(word)):
            if ((word[i] == 'K'
                    and word[i-1] != 'S'
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] >= 8)
                or (word[i] == 'T'
                    and word[i-1] != 'S'
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] >= 9)
                or (word[i] == 'P'
                    and word[i-1] != 'S'
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] >= 8
                    and word[i+1] != 'W')
                or (word[i] == 'B'
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] >= 8
                    and word[i+1] != 'W')
                or (word[i] == 'G'
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] >= 8
                    and not(word[i+1] in ['W', 'Y']))
                or (word[i] == 'D'
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] >= 9)
                or (word[i] in ['CH', 'JH', 'HH', 'SH', 'DH', 'ZH', 'Z']
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] == 11)
                or (word[i] == 'TH'
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] >= 9
                    and word[i+1] != 'Y')
                or (word[i] in ['F', 'V']
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] >= 8)
                or (word[i] in ['N', 'M']
                    and word[i-1] != 'S'
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] == 11)
                or (word[i] == 'L'
                    and not(word[i-1]
                    in ['K', 'P', 'G', 'B', 'F', 'S', 'V'])
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] == 11)
                or (word[i] == 'R'
                    and not(word[i-1]
                    in ['K', 'T', 'P', 'G', 'D', 'B', 'F', 'V'])
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] == 11)
                or (word[i] == 'W'
                    and not (word[i-1] in ['K', 'T', 'D', 'TH', 'DH'])
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] == 11)
                or (word[i] == 'Y'
                    and not (word[i-1] in ['K', 'P', 'F', 'V', 'B'])
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] == 11)
                or (word[i] == 'S'
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] >= 7)
                or (word[i] == 'S'
                    and i+1 < len(word)
                    and word[i+1] == 'T'
                    and i+2 < len(word)
                    and syl_dic[word[i+2]] >= 9)
                or (word[i] == 'S'
                    and i+1 < len(word)
                    and (word[i+1] == 'K' or word[i+1] == 'P')
                    and i+2 < len(word) and syl_dic[word[i+2]] >= 8)
                or (syl_dic[word[i]] == 11
                    and (syl_dic[word[i-1]] == 11 or word[i-1] == 'NG'))):
                syllables.append(word[boundary:i])
                boundary = i

        syllables.append(word[boundary:])
        return syllables


In [56]:
import cmudict
consonants  = [i[0] for i in cmudict.phones() if not i[1] == ['vowel']]

In [57]:
len(consonants)

24

In [58]:
vowels  = [i[0] for i in cmudict.phones() if i[1] == ['vowel']]

In [59]:
phonemes = consonants + vowels
phonemes

['B',
 'CH',
 'D',
 'DH',
 'F',
 'G',
 'HH',
 'JH',
 'K',
 'L',
 'M',
 'N',
 'NG',
 'P',
 'R',
 'S',
 'SH',
 'T',
 'TH',
 'V',
 'W',
 'Y',
 'Z',
 'ZH',
 'AA',
 'AE',
 'AH',
 'AO',
 'AW',
 'AY',
 'EH',
 'ER',
 'EY',
 'IH',
 'IY',
 'OW',
 'OY',
 'UH',
 'UW']