In [1]:
import os
import pickle
import cmudict
from nltk import ngrams
from nltk import word_tokenize 
import nltk 
from itertools import permutations
from collections import OrderedDict

In [3]:
# Phoneme symbols: the first field of every entry returned by cmudict.phones().
accepted_phonemes = [entry[0] for entry in cmudict.phones()]

In [261]:
# Locations of the pickled artefacts used throughout the notebook.
pickleDumpsPath = "pickleDumps/"
# Bare file name; it is concatenated with pickleDumpsPath where the file is opened.
syllablesPath = "syllables.pki"
# Fixed: this line referenced the undefined name `pickleDumps`, which raised
# NameError on a fresh kernel. The directory prefix is `pickleDumpsPath`.
phonemeCondProbsPath = pickleDumpsPath + "phonemeCondProbs.pki"
bigramsDictPath = pickleDumpsPath + "bigramsDict.pki"

In [5]:
# Occurrence counter for every accepted phoneme, starting at zero.
unigramsDict = dict.fromkeys(accepted_phonemes, 0)

In [6]:
# All ordered phoneme pairs, including a phoneme followed by itself.
# itertools.permutations(accepted_phonemes, 2) leaves out every (p, p) pair, so
# a repeated phoneme was missing from anything keyed on this list.
phoneme_pairs = [
    (first, second)
    for first in accepted_phonemes
    for second in accepted_phonemes
]

In [7]:
# Zero-initialised tables keyed by phoneme pair: one for raw bigram counts,
# one for the conditional probabilities computed from those counts.
bigramsDict = dict.fromkeys(phoneme_pairs, 0)
condProbsDict = dict.fromkeys(phoneme_pairs, 0)

In [346]:
# Load the pickled object stored in the syllables file.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open(pickleDumpsPath + syllablesPath, "rb") as syllables_file:
    all_syllables = pickle.load(syllables_file)

In [347]:
# Tally phoneme (unigram) and adjacent-phoneme (bigram) counts over every syllable.
for syllable in all_syllables:
    # Unigram counts; each phoneme must already be a key of unigramsDict.
    for symbol in syllable:
        unigramsDict[symbol] += 1

    # Bigram counts keyed by tuple, e.g. {("AH", "T"): 1, ("AH", "K"): 3, ...}.
    # A pair that was not pre-seeded is inserted the first time it is seen.
    for pair in ngrams(syllable, 2):
        bigramsDict[pair] = bigramsDict.get(pair, 0) + 1

In [348]:
# for p in sorted(unigramsDict, key=unigramsDict.get, reverse=True):
#     print (p,":", unigramsDict[p])

In [349]:
# for p, q in sorted(bigramsDict, key=bigramsDict.get, reverse=True):
#     print (p,q,":", bigramsDict[(p,q)])

In [249]:
# Conditional probability of the second phoneme given the first:
#   P(p2 | p1) = count(p1, p2) / count(p1)
for (p1, p2), count in bigramsDict.items():
    p1_total = unigramsDict.get(p1, 0)
    # A first phoneme with a zero (or missing) unigram count would make the
    # division fail; such pairs get probability 0.0 instead of raising
    # ZeroDivisionError / KeyError.
    condProbsDict[(p1, p2)] = count / p1_total if p1_total else 0.0

In [250]:
# Write the conditional-probability table to disk as a pickle.
with open(phonemeCondProbsPath, "wb") as probs_file:
    pickle.dump(condProbsDict, probs_file)

## TESTING

In [335]:
import pickle
import nltk
from collections import OrderedDict
import heapq
import cmudict
import random

In [252]:
# Locations of the pickled artefacts used by the testing cells.
pickleDumpsPath = "pickleDumps/"
# test_pronouncable() opens pickleDumpsPath + syllablesPath, so define it here too.
syllablesPath = "syllables.pki"
# Fixed: this line referenced the undefined name `pickleDumps`, which raised
# NameError on a fresh kernel. The directory prefix is `pickleDumpsPath`.
phonemeCondProbsPath = pickleDumpsPath + "phonemeCondProbs.pki"
bigramsDictPath = pickleDumpsPath + "bigramsDict.pki"

In [253]:
# Reload the conditional-probability table from its pickle.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open(phonemeCondProbsPath, "rb") as probs_file:
    condProbsDict = pickle.load(probs_file)

In [324]:
def pronouncable(syllable: str, thresh: float) -> bool:
    """
    Decide whether a syllable, given as a phoneme string, is pronounceable.

    :param syllable: Space-separated phonemes to test, e.g. "AH K T"
    :param thresh: Value that every phoneme-bigram conditional probability must exceed
    :returns: True if the syllable is empty, or if it has at least one
        non-consonant phoneme and every adjacent phoneme pair has a conditional
        probability strictly greater than ``thresh``; False otherwise
    """
    syllable_split = nltk.word_tokenize(syllable)
    if not syllable_split:  # Empty syllable
        return True

    # Built as a set so the membership tests below are constant-time.
    phoneme_consonants = {i[0] for i in cmudict.phones() if not i[1] == ['vowel']}
    if all(p in phoneme_consonants for p in syllable_split):  # No vowel sounds
        return False

    # Adjacent phoneme pairs. Same pairs as list(ngrams(syllable_split, 2)), but
    # built with zip so the function does not rely on `ngrams` being imported.
    bigrams = zip(syllable_split, syllable_split[1:])
    # A pair with no entry in condProbsDict is treated as probability 0.0
    # instead of raising KeyError.
    return all(condProbsDict.get(pair, 0.0) > thresh for pair in bigrams)

In [325]:
# Test unpronouncable words
assert(pronouncable("T AH T K", 0.001) == False)
assert(pronouncable("ER T L", 0.001) == False)
assert(pronouncable("S D", 0.001) == False)
assert(pronouncable("F NG L T R", 0.001) == False)

In [319]:
def test_unpronouncable(thresh=0.001):
    """Assert that pronouncable() rejects each of a fixed set of phoneme strings at ``thresh``."""
    rejected_words = ("T AH T K", "ER T L", "S D", "F NG L T")
    for word in rejected_words:
        assert not pronouncable(word, thresh)

In [342]:
def test_pronouncable(thresh=0.001):
    """Assert that pronouncable() accepts one randomly chosen entry of the syllables pickle at ``thresh``."""
    syllables_file_path = pickleDumpsPath + syllablesPath
    with open(syllables_file_path, "rb") as syllables_file:
        corpus_syllables = pickle.load(syllables_file)

    # Join the sampled phoneme sequence into the space-separated string form
    # that pronouncable() takes.
    sampled = random.choice(corpus_syllables)
    assert pronouncable(" ".join(sampled), thresh)

In [343]:
# Run the rejection checks with the default threshold; raises AssertionError if any string is accepted.
test_unpronouncable()

In [344]:
# Run the acceptance check with the default threshold; the sampled syllable is random, so results can vary between runs.
test_pronouncable()

## More Testing

In [351]:
# Fixed typo: the function is named `pronouncable`; `pronounable` raised NameError.
pronouncable("OW S T K T", 0.01)

False

In [257]:
# Step through the same operations pronouncable() performs, on a sample phoneme string.
test = "T K L M NG ER"
test_split = nltk.word_tokenize(test)  # split the string into phoneme tokens
bigramsf = list(ngrams(test_split, 2))  # adjacent phoneme pairs
test_split

['T', 'K', 'L', 'M', 'NG', 'ER']

In [230]:
# Conditional probability of each bigram of the sample string, in order.
cond_probs = [condProbsDict[pair] for pair in bigramsf]
cond_probs

[0.0, 0.04846338687239155, 0.0022081812020376484, 0.0, 0.0]

In [231]:
thresh = 0.004  # value each bigram's conditional probability must exceed
res = all(cond_prob > thresh for cond_prob in cond_probs)  # True only if every probability is above thresh
res

False

In [232]:
# Phonemes whose second field in cmudict.phones() is anything other than exactly ['vowel'].
phoneme_consonants = [entry[0] for entry in cmudict.phones() if entry[1] != ['vowel']]
phoneme_consonants

['B',
 'CH',
 'D',
 'DH',
 'F',
 'G',
 'HH',
 'JH',
 'K',
 'L',
 'M',
 'N',
 'NG',
 'P',
 'R',
 'S',
 'SH',
 'T',
 'TH',
 'V',
 'W',
 'Y',
 'Z',
 'ZH']

In [233]:
# True only when every token of the sample is in phoneme_consonants (the case pronouncable() rejects outright).
all(p in phoneme_consonants for p in test_split)

False

In [234]:
# Display the tokenised sample again for reference.
test_split

['T', 'K', 'L', 'M', 'NG', 'ER']