In [1]:
import nltk
import os
from more_itertools import pairwise

In [2]:
arpabet = nltk.corpus.cmudict.dict()

In [3]:
def to_phoneme(input_word):
    """
    This function responds to a request for /api/words
    with the complete lists of words

    :param: input_word: word to convert to arpabet
    :returns:        string of translated word
    """
    # Generate list of phonemes for the word
    phones = []

    # Take first variation of phoneme representations
    phones_w = arpabet[input_word][0]
    for p in phones_w:
        # omits the numbers from phone labels
        new_p = p[:2]
        phones.append(new_p)

    return phones

In [4]:
def to_syllables(input_phoneme):
    # Phoneme sonority values
    syl_dic = {
        'AA': 11,
        'AE': 11,
        'AH': 11,
        'AO': 11,
        'AW': 11,
        'AY': 11,
        'EH': 11,
        'ER': 11,
        'EY': 11,
        'IH': 11,
        'IY': 11,
        'OW': 11,
        'OY': 11,
        'UH': 11,
        'UW': 11,
        'Y': 10,
        'W': 10,
        'R': 9,
        'L': 8,
        'M': 7,
        'N': 7,
        'NG': 7,
        'Z': 6,
        'ZH': 6,
        'V': 6,
        'DH': 6,
        'S': 5,
        'SH': 5,
        'F': 5,
        'TH': 5,
        'HH': 5,
        'JH': 4,
        'CH': 3,
        'B': 2,
        'D': 2,
        'G': 2,
        'P': 1,
        'T': 1,
        'K': 1
    }

    # Split phoneme lists into syllables
    syllables = []
    phones = []
    phones.append(input_phoneme)
    for word in phones:
        boundary = 0
        for i in range(1, len(word)):
            if ((word[i] == 'K'
                    and word[i-1] != 'S'
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] >= 8)
                or (word[i] == 'T'
                    and word[i-1] != 'S'
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] >= 9)
                or (word[i] == 'P'
                    and word[i-1] != 'S'
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] >= 8
                    and word[i+1] != 'W')
                or (word[i] == 'B'
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] >= 8
                    and word[i+1] != 'W')
                or (word[i] == 'G'
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] >= 8
                    and not(word[i+1] in ['W', 'Y']))
                or (word[i] == 'D'
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] >= 9)
                or (word[i] in ['CH', 'JH', 'HH', 'SH', 'DH', 'ZH', 'Z']
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] == 11)
                or (word[i] == 'TH'
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] >= 9
                    and word[i+1] != 'Y')
                or (word[i] in ['F', 'V']
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] >= 8)
                or (word[i] in ['N', 'M']
                    and word[i-1] != 'S'
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] == 11)
                or (word[i] == 'L'
                    and not(word[i-1]
                    in ['K', 'P', 'G', 'B', 'F', 'S', 'V'])
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] == 11)
                or (word[i] == 'R'
                    and not(word[i-1]
                    in ['K', 'T', 'P', 'G', 'D', 'B', 'F', 'V'])
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] == 11)
                or (word[i] == 'W'
                    and not (word[i-1] in ['K', 'T', 'D', 'TH', 'DH'])
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] == 11)
                or (word[i] == 'Y'
                    and not (word[i-1] in ['K', 'P', 'F', 'V', 'B'])
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] == 11)
                or (word[i] == 'S'
                    and i+1 < len(word)
                    and syl_dic[word[i+1]] >= 7)
                or (word[i] == 'S'
                    and i+1 < len(word)
                    and word[i+1] == 'T'
                    and i+2 < len(word)
                    and syl_dic[word[i+2]] >= 9)
                or (word[i] == 'S'
                    and i+1 < len(word)
                    and (word[i+1] == 'K' or word[i+1] == 'P')
                    and i+2 < len(word) and syl_dic[word[i+2]] >= 8)
                or (syl_dic[word[i]] == 11
                    and (syl_dic[word[i-1]] == 11 or word[i-1] == 'NG'))):
                syllables.append(word[boundary:i])
                boundary = i

        syllables.append(word[boundary:])
        return syllables


In [5]:
fname = "syllables.txt"
if os.path.exists(fname):
    os.remove(fname)

In [6]:
unique_sylls = [[]]

In [7]:
with open(fname, "w+") as f:
    for word in arpabet:
        phoneme_word = to_phoneme(word)
        syllables_list = to_syllables(phoneme_word)
        for _syll in syllables_list:
            if _syll not in unique_sylls:
                unique_sylls.append(_syll)
                for _phoneme in _syll:
                        f.write(_phoneme + " ")
                f.write("\n")

In [26]:
#Find obsure phoneme pairs:
for word in arpabet:
    phoneme_word = to_phoneme(word)
    syllables_list = to_syllables(phoneme_word)
    for syll in syllables_list:
        for p1,p2 in pairwise(syll):
            if p1=="T" and p2=="L":
                print(syllables_list)

[['D', 'AY'], ['AH', 'T', 'L']]


## Create markov model for training using syllables file

In [9]:
import math
import pickle
import cmudict

In [10]:
accepted_chars = [i[0] for i in cmudict.phones()]
#accepted_chars

In [11]:
pos = dict([(char, idx) for idx, char in enumerate(accepted_chars)])
pos

{'AA': 0,
 'AE': 1,
 'AH': 2,
 'AO': 3,
 'AW': 4,
 'AY': 5,
 'B': 6,
 'CH': 7,
 'D': 8,
 'DH': 9,
 'EH': 10,
 'ER': 11,
 'EY': 12,
 'F': 13,
 'G': 14,
 'HH': 15,
 'IH': 16,
 'IY': 17,
 'JH': 18,
 'K': 19,
 'L': 20,
 'M': 21,
 'N': 22,
 'NG': 23,
 'OW': 24,
 'OY': 25,
 'P': 26,
 'R': 27,
 'S': 28,
 'SH': 29,
 'T': 30,
 'TH': 31,
 'UH': 32,
 'UW': 33,
 'V': 34,
 'W': 35,
 'Y': 36,
 'Z': 37,
 'ZH': 38}

In [12]:
def normalize(line):
    """ Return only the subset of chars from accepted_chars.
    This helps keep the  model relatively small by ignoring punctuation,
    infrequenty symbols, etc. """
    return [c.lower() for c in line if c.lower() in accepted_chars]


In [34]:
def ngram(n, l):
    """ Return all n grams from l after normalizing """
    print("REST")
    filtered = normalize(l)
    for start in range(0, len(filtered) - n + 1):
        yield ''.join(filtered[start:start + n])

In [31]:
def avg_transition_prob(l, log_prob_mat):
    """ Return the average transition prob from l through log_prob_mat. """
    log_prob = 0.0
    transition_ct = 0
    for a, b in ngram(2, l):
        log_prob += log_prob_mat[pos[a]][pos[b]]
        transition_ct += 1
    # The exponentiation translates from log probs to probs.
    return math.exp(log_prob / (transition_ct or 1))

In [32]:
k = len(accepted_chars)
counts = [[2 for i in range(k)] for i in range(k)]

In [36]:
ngram(2, "LINE")

<generator object ngram at 0x0000024A80488C00>

In [16]:
# Count transitions from big text file, taken
for line in open('syllables.txt'):
    syllables = line.split()
    for a, b in ngram(2, syllables):
        counts[pos[a]][pos[b]] += 1

In [17]:
for i, row in enumerate(counts):
    s = float(sum(row))
    for j in range(len(row)):
        row[j] = math.log(row[j] / s)

In [18]:
thresh=0.018782003473122023

In [19]:
pickle.dump({'mat': counts, 'thresh': thresh}, open('phon_model.pki', 'wb'))

In [20]:
model_data = pickle.load(open('.pki', 'rb'))

In [23]:
word = input()
print(avg_transition_prob(word, counts) > thresh)

ers
1.0
