### This notebook generates the rhyming dictionary file used by our rhyming score functions

In [1]:
import pickle
import os
import numpy as np

In [2]:
# Read the raw pronunciation file; each kept line is "<word>\t<space-separated phonemes>".
with open("phonetic_dictionary.txt", "r") as dict_file:
    all_lines = dict_file.read().split("\n")

# [:-1] drops the last piece of the split (presumably the empty string after the
# final newline); [64:-5] then skips the first 64 and last 5 entries so that the
# punctuation-mark spellings are not included.
raw = all_lines[:-1][64:-5]

# Map every lower-cased word to its list of lower-cased phonemes.
# When a word occurs more than once, the later line overwrites the earlier one.
phonetic_dictionary = {}
for entry in raw:
    columns = entry.split("\t")
    phonetic_dictionary[columns[0].lower()] = columns[1].lower().split()

# Sanity check: number of entries and a small slice of them.
print(len(phonetic_dictionary))
print(list(phonetic_dictionary.items())[100:110])

# old format
# phonetic_dictionary = [[line.split("\t")[0].lower(), list(line.split("\t")[1].lower().split()), pos] for pos, line in enumerate(raw)]

132962
[('abdicate', ['ae', 'b', 'd', 'ah', 'k', 'ey', 't']), ('abdicated', ['ae', 'b', 'd', 'ah', 'k', 'ey', 't', 'ah', 'd']), ('abdicates', ['ae', 'b', 'd', 'ah', 'k', 'ey', 't', 's']), ('abdicating', ['ae', 'b', 'd', 'ih', 'k', 'ey', 't', 'ih', 'ng']), ('abdication', ['ae', 'b', 'd', 'ih', 'k', 'ey', 'sh', 'ah', 'n']), ('abdnor', ['ae', 'b', 'd', 'n', 'er']), ('abdo', ['ae', 'b', 'd', 'ow']), ('abdollah', ['ae', 'b', 'd', 'aa', 'l', 'ah']), ('abdomen', ['ae', 'b', 'd', 'ow', 'm', 'ah', 'n']), ('abdomen(2)', ['ae', 'b', 'd', 'ah', 'm', 'ah', 'n'])]


In [3]:
# Upper-case phoneme codes that count as vowels when locating the rhyme of a word
# (the codes look like ARPAbet symbols - TODO confirm against the dictionary source).
# Note that "W" and "Y" are deliberately listed here, and "ER" is appended last.
phonemic_vowels = "AA AE AH AO AW AY EH EY IH IY OW OY UH UW W Y ER".split()
# phonemic_consonants = ["B","CH","D","DH","F","G","HH","K","L","M","N","NG","JH","P","R","S","SH","T","TH","V","Z","ZH","SIL"]

In [4]:
# Rhyme definition quoted from Van de Cruys: "final group of vowels, optionally
# followed by a group of consonants, as well as the group of consonants that
# precedes the group of vowels".
# The version computed below is simpler: the rhyme starts at the LAST vowel
# phoneme of the word and runs to the end; the preceding consonant is not included.

word2rhymes = {}

for word, phones in phonetic_dictionary.items():
    # Walk the phonemes from the end to find the last vowel. When the
    # pronunciation holds no vowel at all, fall back to index 0, so the first
    # phoneme (a consonant) is then used as the "vowel".
    last_vowel_idx = next(
        (idx for idx in reversed(range(len(phones))) if phones[idx].upper() in phonemic_vowels),
        0,
    )

    # Assonant rhyme: the last vowel alone. Perfect rhyme: that vowel plus
    # everything after it, concatenated without separators.
    vowel_sound = phones[last_vowel_idx]
    rhyme_tail = "".join(phones[last_vowel_idx:])

    # A disabled variant of this cell also prepended the phoneme located just
    # before the last vowel whenever that phoneme was not itself a vowel.

    word2rhymes[word] = [rhyme_tail, vowel_sound]

In [5]:
# Spot-check 100 randomly drawn entries (drawn with replacement, as before).
# A locally seeded generator is used so that the printed sample is the same on
# every "Restart & Run All" instead of changing on each execution, and so that
# the global NumPy random state is left untouched.
sample_rng = np.random.default_rng(0)
for key in sample_rng.choice(list(word2rhymes.keys()), 100):
    print(key, word2rhymes[key])

busch ['uhsh', 'uh']
alsdorf ['aorf', 'ao']
spotts ['aats', 'aa']
lambert's ['erts', 'er']
pickerel ['ahl', 'ah']
copycodes ['owdz', 'ow']
portability ['iy', 'iy']
walkington ['ahn', 'ah']
recreated ['ihd', 'ih']
coverdale ['eyl', 'ey']
funches ['ihz', 'ih']
faggot ['aht', 'ah']
nonacademic ['ihk', 'ih']
rinne ['ihn', 'ih']
canupp ['ahp', 'ah']
dimenaci ['iy', 'iy']
behead(2) ['ehd', 'eh']
athlone ['own', 'ow']
knocked ['aakt', 'aa']
wheaton ['ahn', 'ah']
tradeable ['ahl', 'ah']
franchised ['ayzd', 'ay']
discolored ['erd', 'er']
myrilla ['ah', 'ah']
beset ['eht', 'eh']
ransburg's ['ergz', 'er']
revives ['ayvz', 'ay']
whitted ['ihd', 'ih']
overextended ['ahd', 'ah']
inconstancy ['iy', 'iy']
conflict ['ihkt', 'ih']
medaphis ['ihs', 'ih']
salmi ['iy', 'iy']
sampre ['iy', 'iy']
kempson ['ahn', 'ah']
caution(2) ['ahn', 'ah']
exasperated ['ihd', 'ih']
residence ['ahns', 'ah']
jojoba ['ah', 'ah']
theocratic ['ihk', 'ih']
ikenberry ['iy', 'iy']
rine ['ayn', 'ay']
rudner ['er', 'er']
fauroux ['

In [6]:
# Drop every key that contains a parenthesis, i.e. numbered entries such as
# "abdomen(2)". The entry count is printed before and after the filter.
print(len(word2rhymes))
word2rhymes = {
    word: rhymes
    for word, rhymes in word2rhymes.items()
    if not ("(" in word or ")" in word)
}
print(len(word2rhymes))

# Optional extra filter (disabled): remove keys with less than 3 characters
# word2rhymes = {key: val for key, val in word2rhymes.items() if len(key) >= 3}
# print(len(word2rhymes))

132962
123631


In [7]:
# Invert word2rhymes into two lookup tables, each mapping a rhyme string to the
# list of words that carry it (words are appended in word2rhymes order).
perf_rhyme = {}
assonant_rhyme = {}

for word, rhymes in word2rhymes.items():
    tail, vowel = rhymes[0], rhymes[1]
    # setdefault creates the bucket on first sight of a rhyme, then we append.
    perf_rhyme.setdefault(tail, []).append(word)
    assonant_rhyme.setdefault(vowel, []).append(word)

# Quick look at the first few words of one bucket from each table.
print(perf_rhyme["erz"][:10])
print(assonant_rhyme["ah"][:10])

['abductors', 'absorbers', 'abusers', 'accelerators', 'accelerometers', "accor's", 'accumulators', 'accusers', 'achievers', "acker's"]
['a', 'aachen', 'aamodt', 'aardema', 'aaron', "aaron's", 'aarons', 'aaronson', "aaronson's", 'aasen']


In [8]:
# Persist the three tables as a single pickled list, in this order:
# [word -> rhymes, perfect rhyme -> words, assonant rhyme -> words].
rhyme_tables = [word2rhymes, perf_rhyme, assonant_rhyme]
with open("../utils/data/rhyming_dictionaries.pickle", "wb") as pickle_file:
    pickle.dump(rhyme_tables, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
# With the rhyme defined as "final group of vowels, optionally followed by a
# group of consonants, as well as the group of consonants that precedes the
# group of vowels" (Van de Cruys), we get 9522 unique perfect rhymes; if we
# just forget about the preceding consonant we get 1356.

# Number of distinct perfect rhymes, then number of distinct assonant rhymes.
print(len(perf_rhyme))
print(len(assonant_rhyme))

# All assonant rhyme keys, followed by a check on two words expected to rhyme.
print(list(assonant_rhyme))
print(word2rhymes["girl"], word2rhymes["hurl"])

1356
19
['ah', 'ey', 'er', 'eh', 'ao', 'aa', 'iy', 'ae', 'ow', 'ih', 'aw', 'uw', 'ay', 'oy', 'uh', 'w', 'f', 'y', 'th']
['erl', 'er'] ['erl', 'er']
