In [1]:
import itertools
import os

import cmudict
import numpy as np
import pandas as pd
from sklearn import preprocessing

In [2]:
def read_matrix(matrix_name):
    """Load a distance matrix CSV from the ``matrices/`` directory.

    The first CSV column becomes the index, the literal string ``"null"`` is
    parsed as a missing value, and every row that contains a missing value is
    dropped.

    Parameters
    ----------
    matrix_name : str
        File name of the matrix, relative to ``matrices/``.

    Returns
    -------
    pandas.DataFrame
        The matrix with incomplete rows removed.
    """
    raw_matrix = pd.read_csv(
        'matrices/' + matrix_name,
        index_col=0,
        na_values="null",
    )
    return raw_matrix.dropna()

In [3]:
def normalize_matrix(dataset):
    """Min-max scale a DataFrame using its global minimum and maximum.

    The smallest value in the whole frame maps to 0 and the largest to 1;
    scaling is applied to the frame as a whole, not column by column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Numeric matrix to rescale.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same labels and rescaled values.
    """
    lowest = dataset.min().min()
    value_range = dataset.max().max() - lowest
    return (dataset - lowest) / value_range

In [4]:
# Load the consonant distance matrix and min-max scale it.
# Swap the file name here to try an alternative matrix.
consonant_dist = normalize_matrix(read_matrix('bailey_consonants.csv'))
consonant_dist.head()

Unnamed: 0,P,B,F,V,M,W,TH,DH,T,D,...,R,CH,JH,SH,ZH,Y,K,G,NG,HH
P,0.0,0.25,0.25,0.5,0.75,0.75,0.5,0.75,0.25,0.5,...,1.0,0.5,0.75,0.5,0.75,1.0,0.25,0.5,1.0,0.5
B,0.25,0.0,0.5,0.25,0.5,0.5,0.75,0.5,0.5,0.25,...,0.75,0.75,0.5,0.75,0.5,0.75,0.5,0.25,0.75,0.75
F,0.25,0.5,0.0,0.25,0.75,0.75,0.25,0.5,0.5,0.75,...,1.0,0.5,0.75,0.25,0.5,1.0,0.5,0.75,1.0,0.25
V,0.5,0.25,0.25,0.0,0.5,0.5,0.5,0.25,0.75,0.5,...,0.75,0.75,0.5,0.5,0.25,0.75,0.75,0.5,0.75,0.5
M,0.75,0.5,0.75,0.5,0.0,0.25,1.0,0.75,1.0,0.75,...,0.5,1.0,0.75,1.0,0.75,0.5,1.0,0.75,0.25,1.0


In [5]:
# Load the vowel distance matrix, min-max scale it, then halve every value so
# the largest vowel-to-vowel distance is 0.5. (The vowels are joined to the
# consonant matrix in a later cell.)
vowel_dist = normalize_matrix(read_matrix('bailey_vowels.csv')) * 0.5
vowel_dist

Unnamed: 0,AA,AE,AH,AO,AW,AY,EH,ER,EY,IH,IY,OW,OY,UH,UW
AA,0.0,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
AE,0.5,0.0,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
AH,0.5,0.5,0.0,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
AO,0.5,0.5,0.5,0.0,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
AW,0.5,0.5,0.5,0.5,0.0,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
AY,0.5,0.5,0.5,0.5,0.5,0.0,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
EH,0.5,0.5,0.5,0.5,0.5,0.5,0.0,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
ER,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.0,0.5,0.5,0.5,0.5,0.5,0.5,0.5
EY,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.0,0.5,0.5,0.5,0.5,0.5,0.5
IH,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.0,0.5,0.5,0.5,0.5,0.5


In [6]:
# Cost of aligning a phoneme against a gap ('-'); read by
# get_phoneme_distance and needle.
gap_penalty = 0.5

In [7]:
# Stack the vowel rows underneath the consonant rows. The two matrices have
# disjoint columns, so every consonant/vowel pairing is missing after the join.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
phoneme_dist = pd.concat([consonant_dist, vowel_dist], sort=False)

# Every cell still missing after the join gets a fixed cost of 1.25.
phoneme_dist = phoneme_dist.fillna(1.25)

# Gaps ('-') deliberately have no row/column in this matrix:
# get_phoneme_distance returns gap_penalty for them directly.
phoneme_dist.round(2)

Unnamed: 0,P,B,F,V,M,W,TH,DH,T,D,...,AY,EH,ER,EY,IH,IY,OW,OY,UH,UW
P,0.0,0.25,0.25,0.5,0.75,0.75,0.5,0.75,0.25,0.5,...,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25
B,0.25,0.0,0.5,0.25,0.5,0.5,0.75,0.5,0.5,0.25,...,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25
F,0.25,0.5,0.0,0.25,0.75,0.75,0.25,0.5,0.5,0.75,...,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25
V,0.5,0.25,0.25,0.0,0.5,0.5,0.5,0.25,0.75,0.5,...,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25
M,0.75,0.5,0.75,0.5,0.0,0.25,1.0,0.75,1.0,0.75,...,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25
W,0.75,0.5,0.75,0.5,0.25,0.0,1.0,0.75,1.0,0.75,...,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25
TH,0.5,0.75,0.25,0.5,1.0,1.0,0.0,0.25,0.5,0.75,...,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25
DH,0.75,0.5,0.5,0.25,0.75,0.75,0.25,0.0,0.75,0.5,...,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25
T,0.25,0.5,0.5,0.75,1.0,1.0,0.5,0.75,0.0,0.25,...,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25
D,0.5,0.25,0.75,0.5,0.75,0.75,0.75,0.5,0.25,0.0,...,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25


## NEEDLEMAN WUNSCH

In [13]:
# Vowel phoneme symbols (the same labels that index the vowel distance matrix).
vowels = [
    'AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'EH',
    'ER', 'EY', 'IH', 'IY', 'OW', 'OY', 'UH', 'UW',
]

def get_phoneme_distance(alpha, beta):
    """Return the cost of aligning symbol ``alpha`` with symbol ``beta``.

    Identical symbols cost 0. If either symbol is the gap marker ``'-'`` the
    cost is the module-level ``gap_penalty``. Otherwise the cost is looked up
    in ``phoneme_dist`` (column ``alpha``, row ``beta``).
    """
    if alpha == beta:
        return 0
    if "-" in (alpha, beta):
        return gap_penalty
    return phoneme_dist[alpha][beta]

def compute_dist(align1, align2):
    """Reverse two aligned sequences in place and total their pairwise cost.

    Parameters
    ----------
    align1, align2 : list
        Aligned symbol lists in reverse order. Both are reversed **in place**.

    Returns
    -------
    tuple
        ``(score, align1, align2)`` where ``score`` is the sum of
        ``get_phoneme_distance`` over each aligned position and the two lists
        are the (now reversed) input objects.
    """
    for aligned in (align1, align2):
        aligned.reverse()

    total = 0
    # Index by position so a length mismatch surfaces as an IndexError.
    for position in range(len(align1)):
        total = total + get_phoneme_distance(align1[position], align2[position])
    return total, align1, align2


def needle(seq1, seq2):
    """Globally align two phoneme sequences (Needleman-Wunsch, minimum cost).

    Substitution costs come from ``get_phoneme_distance`` and each gap costs
    the module-level ``gap_penalty``.

    Parameters
    ----------
    seq1, seq2 : sequence of str
        Phoneme symbols to align. Either may be empty.

    Returns
    -------
    tuple
        The value of ``compute_dist`` for the alignment:
        ``(distance, aligned1, aligned2)``, where the aligned lists contain
        ``'-'`` wherever a gap was inserted.

    Raises
    ------
    ValueError
        If a cell's score cannot be reproduced from any neighbour during the
        traceback (for example because a distance is NaN). Without this check
        the traceback loop would never terminate.
    """
    m, n = len(seq1), len(seq2)

    # score[i, j] is the minimum cost of aligning seq1[:i] with seq2[:j].
    score = np.zeros((m + 1, n + 1))

    # A prefix aligned against the empty sequence consists only of gaps.
    for i in range(m + 1):
        score[i, 0] = gap_penalty * i
    for j in range(n + 1):
        score[0, j] = gap_penalty * j

    for i in range(1, m + 1):
        for j in range(1, n + 1):
            substitute = score[i - 1, j - 1] + get_phoneme_distance(seq1[i - 1], seq2[j - 1])
            gap_in_seq2 = score[i - 1, j] + gap_penalty
            gap_in_seq1 = score[i, j - 1] + gap_penalty
            score[i, j] = min(substitute, gap_in_seq2, gap_in_seq1)

    # Trace back from the bottom-right cell until the top or left edge is hit.
    # The alignment is collected back-to-front; compute_dist reverses it.
    align1, align2 = [], []
    i, j = m, n
    while i > 0 and j > 0:
        current = score[i, j]
        # Recompute the three candidates with the same expressions used when
        # filling the table, so the equality checks below compare like with like.
        via_diagonal = score[i - 1, j - 1] + get_phoneme_distance(seq1[i - 1], seq2[j - 1])
        via_above = score[i - 1, j] + gap_penalty
        via_left = score[i, j - 1] + gap_penalty

        # Tie-break order: substitution, then gap in seq2, then gap in seq1.
        if current == via_diagonal:
            align1.append(seq1[i - 1])
            align2.append(seq2[j - 1])
            i -= 1
            j -= 1
        elif current == via_above:
            align1.append(seq1[i - 1])
            align2.append('-')
            i -= 1
        elif current == via_left:
            align1.append('-')
            align2.append(seq2[j - 1])
            j -= 1
        else:
            raise ValueError(
                f"traceback failed at cell ({i}, {j}): score {current!r} "
                "matches no neighbouring cell (NaN distance?)"
            )

    # Whatever is left of either sequence is aligned against gaps.
    while i > 0:
        align1.append(seq1[i - 1])
        align2.append('-')
        i -= 1
    while j > 0:
        align1.append('-')
        align2.append(seq2[j - 1])
        j -= 1

    return compute_dist(align1, align2)

In [21]:
# Demo: align S T OW against B EH S T and report the alignment and its cost.
word1, word2 = ["S", "T", "OW"], ["B", "EH", "S", "T"]
distance, seq1, seq2 = needle(word1, word2)
print(seq1, "\n", seq2)
print(f"Distance = {distance}")

['-', '-', 'S', 'T', 'OW'] 
 ['B', 'EH', 'S', 'T', '-']
Distance = 1.5


In [20]:
# Demo: K R AA against K R EY - same consonants, different final vowel.
# (Original author's note on the second sequence: "rarity".)
word1, word2 = ["K", "R", "AA"], ["K", "R", "EY"]
distance, seq1, seq2 = needle(word1, word2)
print(seq1, "\n", seq2)
print(f"Distance = {distance}")

['K', 'R', 'AA'] 
 ['K', 'R', 'EY']
Distance = 0.5


In [17]:
# Get individual phoneme distances: column "OW", row "ER" of phoneme_dist.
print(phoneme_dist["OW"]["ER"])

0.5


# Compare with RhymeZone

In [None]:
import datamuse

In [None]:
# Datamuse client shared by the RhymeZone comparison cells.
api = datamuse.Datamuse()

In [None]:
# Query Datamuse with sl='rocket', max=2 and print each returned entry's
# word and score.
# NOTE(review): `sl` is presumably the "sounds like" constraint and `max` the
# result cap - confirm against the Datamuse API docs.
sim_rz = api.words(sl='rocket', max=2)
for entry in sim_rz:
    print(entry["word"],entry["score"])

In [None]:
def rhymezone_score(target, choice):
    """Yield the Datamuse entry for ``choice`` among the results for ``target``.

    This is a generator: it yields at most one item, the first result
    dictionary whose ``"word"`` equals ``choice``. If there is no such entry
    the generator finishes without yielding, so ``next(rhymezone_score(...))``
    raises ``StopIteration`` and ``next(rhymezone_score(...), None)`` returns
    ``None``.

    Parameters
    ----------
    target : str
        Word passed to the Datamuse query as ``sl``.
    choice : str
        Word to look for in the returned entries.
    """
    # For target, get the list of similar-word dictionaries
    # (word, score, numSyllables).
    sim_rz = api.words(sl=target)
    # An explicit loop avoids calling next() without a default inside a
    # generator, which turns "no match" into a RuntimeError (PEP 479).
    for entry in sim_rz:
        if entry["word"] == choice:
            yield entry
            return

In [None]:
# Pull the single entry yielded for "bakes" among the results for "bake".
next(rhymezone_score("bake", "bakes"))

## Compare by syllables:

In [None]:
import numpy as np

In [None]:
def compare_syllables(word1, word2):
    """Sum the alignment distances of two words, syllable by syllable.

    Syllables are paired by position and each pair is aligned with
    ``needle``. When one word has fewer syllables, its missing syllables are
    replaced by the single-gap syllable ``['-']``.

    Parameters
    ----------
    word1, word2 : list of list of str
        Each word is a list of syllables; each syllable is a list of phoneme
        symbols.

    Returns
    -------
    number
        Total of the per-syllable distances (0 when both words are empty).
    """
    total_dist = 0
    for s1, s2 in itertools.zip_longest(word1, word2, fillvalue=['-']):
        curr_dist, _, _ = needle(s1, s2)
        total_dist += curr_dist
    return total_dist

In [None]:
# Demo: a two-syllable word against a one-syllable word; the extra syllable
# is scored against a gap.
word1, word2 = [["K", "AY", "T"], ["ER"]], [["K", "AY", "T"]]
print(compare_syllables(word1, word2))

In [None]:
# NOTE(review): `dd` is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel. Looks like a stray keystroke -
# confirm and delete the cell.
dd

In [None]:
t1 = [["K"], ["R"], ["T"]]
t2 = [["ER"], ["AA"]]

# Print every (t1 item, t2 item) pairing, iterating t1 in the outer position.
for x, y in ((outer, inner) for outer in t1 for inner in t2):
    print(x, y)

In [None]:
# NOTE(review): `consonants` is not defined in the visible notebook; unless it
# is created elsewhere this cell raises NameError on Restart & Run All.
consonants