In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from metaphone import doublemetaphone
import scipy.stats as stats

import nltk
nltk.download('wordnet')
nltk.download('cmudict')
from nltk.corpus import wordnet as wn
from nltk.corpus import cmudict

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abbyv\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\abbyv\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


### Some Rambling

After much trial and error with simpler approaches like clustering and metaphones, I'm resolving to use pretrained phonetic embeddings.\
This team https://github.com/rahulsrma26/phonetic-word-embedding published a pretrained set of embeddings for the `cmudict` english phonetic dictionary.\
A scan of their paper suggests their basic approach was:
- Encode phonemes into Arpabet articulatory features -- (using phoneme spellings from the `cmudict`)
- Compute a custom Jaccard similarity score for bigrams of phoneme features that weights terminating vowel phonemes more highly (e.g. "marrY")
- Chain these bigram similarities into word similarity sums
- Compute a similarity matrix $M$ by computing this similarity across all pairs of words
- Use SGD to learn word embeddings $V$ by minimizing loss $||M - VV^\intercal||^2$

**ABOUT THE LOSS**:\
This is pretty neat. For a dictionary of $d$ words and an embedding space of $m$ dimensions, $M \in \mathbb{R}^{d\times d}$ and $V \in \mathbb{R}^{d\times m}$. Thus, $VV^\intercal$ is actually a ***factorization*** of $M$. It gives us $m$-dimensional representations of each word that *preserve* the similarities we defined and computed in $M$. Indeed, $V$ could be derived with matrix factorization since $M$ is positive symmetric. The authors use SGD instead since $M$ is prohibitively large for factorization.\
It is important to note as well that because we are effectively learning a factorization of the similarity matrix, the choice of the similarity scores used to comprise the matrix is *the most crucial factor* in making these embeddings meaningful. Not the loss function, or the model used for SGD.

I think using these embeddings locks me into the words in `cmudict`. While they provide code for their SGD implementation, I don't think I can learn new word embeddings via fine-tuning because I would need to recompute a new similarity matrix $M$ that includes those new words....


In [None]:
# Using pretrained embeddings from: https://github.com/rahulsrma26/phonetic-word-embedding
# That team computes a similarity score between the phonetic spellings of every pair of
# cmudict words (matrix M), then learns an embedding for each word (rows of V) by
# minimizing ||M - V V'||^2 with SGD.
# Due to the nature of this loss function, I don't think we can add new words without
# recomputing M.
with open(r"C:\Users\abbyv\scripts\phonetic-word-embedding\vector_embeddings\simvecs", 'r') as file:
    texts = list(file)  # raw lines, kept under the original name for later cells

# Assumes every non-blank line is `word v1 v2 ... vk` with the same k -- TODO confirm
# against the simvecs file. Each line is split once; blank lines are skipped instead of
# raising IndexError.
rows = [fields for fields in (line.split() for line in texts) if fields]

words = np.array([fields[0].lower() for fields in rows])
# Parse the components as floats: split() yields strings, and an array of strings cannot
# be used for dot products, norms, or any other vector arithmetic.
emb = np.array([fields[1:] for fields in rows], dtype=float)

In [None]:
class WordWrapper:
    """
    Shared metaphone vocabulary and the character <-> integer code tables built from it.

    Everything is computed once, at class-definition time, and stored as class attributes
    so subclasses can read it through `cls` / `self`.
    """
    # get_mdf is defined outside this cell; presumably it returns a DataFrame with a
    # 'metaphone' column (that is the only column read here) -- verify against its definition.
    mdf = get_mdf()
    meta_list = pd.unique(mdf['metaphone'])  # distinct metaphone strings

    # Sort the distinct characters before numbering them. Enumerating a bare set gives an
    # order that depends on string hashing, which is randomized per interpreter session,
    # so the codes (and every vector built from them) would change between kernel restarts.
    ctoi = sorted(set(''.join(meta_list)))
    # Codes are 1, 11, 21, ... so that 0 never names a character.
    ctoi = {c: (i * 10) + 1 for i, c in enumerate(ctoi)}
    itoc = {i: c for c, i in ctoi.items()}  # inverse lookup: code -> character

class MetaMatcher(WordWrapper):
    """
    Find phonetically similar words to provided word based on cosine-similarity to metaphone encodings

    Each metaphone is encoded as a fixed-length vector of per-character integer codes
    (see `enc`), right-padded with 0. Similarity between two metaphones is the cosine of
    the angle between their encodings.

    NOTE(review): the character codes are arbitrary integers assigned by enumeration, so
    the cosine similarity reflects code magnitudes and positions rather than any phonetic
    distance between characters.
    """

    @classmethod
    def one_hot(cls, c: str):
        """Return a 0/1 vector over the known characters with a 1 where the character equals `c`.

        Not used by `enc`, which encodes characters as integer codes instead.
        """
        return np.array([1 if char == c else 0 for char in cls.ctoi.keys()])

    @classmethod
    def enc(cls, meta, length: int = 10):
        """Encode a metaphone string as a vector of `length` integer codes.

        Args:
            meta: metaphone string; every character must be a key of `ctoi`.
            length: size of the returned vector; unused trailing slots are filled with 0.

        Returns:
            1-D array of `length` codes.

        Raises:
            KeyError: if `meta` contains a character that is not in `ctoi`.
            ValueError: if `meta` has more than `length` characters.
        """
        codes = np.array([cls.ctoi[c] for c in meta])
        if len(codes) > length:
            # np.pad would also raise ValueError here, but with an opaque message.
            raise ValueError(
                f"metaphone {meta!r} has {len(codes)} characters; at most {length} can be encoded"
            )
        return np.pad(codes, (0, length - len(codes)), 'constant', constant_values=0)

    @classmethod
    def dec(cls, vector):
        """Decode a vector produced by `enc` back into its metaphone string.

        Zero entries are padding, not characters, and are skipped; looking them up in
        `itoc` would raise KeyError because no character is assigned code 0.
        """
        return ''.join([cls.itoc[int(i)] for i in vector if i != 0])

    @classmethod
    def vectorize(cls, meta_list, length: int = 10):
        """Encode every metaphone in `meta_list`; returns one row of `length` codes per metaphone."""
        res = np.array([cls.enc(m, length) for m in meta_list])
        return res

    def __init__(self, n: int = 10):
        """
        Args:
            n: length of every encoded vector. Must be at least as long as the longest
               metaphone in `meta_list`, otherwise encoding raises ValueError.
        """
        super().__init__()
        self.n = n
        self.M = self.vectorize(self.meta_list, self.n)  # row i encodes meta_list[i]

    def get_similarities(self, meta: str):
        """Cosine similarity between `meta` and every metaphone in `meta_list`.

        Returns:
            1-D float array aligned with `meta_list`. A pair involving an all-zero
            encoding (an empty metaphone) gets similarity 0 instead of NaN.
        """
        v = self.enc(meta, self.n)
        dots = self.M @ v
        denom = np.linalg.norm(v) * np.linalg.norm(self.M, axis=1)
        sims = np.zeros(len(denom), dtype=float)
        # Divide only where the denominator is non-zero; the remaining entries stay 0.
        np.divide(dots, denom, out=sims, where=denom > 0)
        return sims

    def sample_similar(self, meta: str, n: int = 1):
        """Randomly sample a similar metaphone from the distribution of cosine similarites

        Args:
            meta: metaphone to compare against.
            n: number of independent draws.

        Returns:
            Array of `n` metaphones taken from `meta_list` (repeats are possible).
        """
        cs = self.get_similarities(meta)              # get cosine similarities
        cs *= np.exp(cs)  # try scaling: widens the gap between high and low similarities
        weights = np.exp(cs)
        probs = weights / np.sum(weights)             # softmax over the scaled similarities
        rv = np.random.multinomial(1, probs, size=n)  # n one-hot rows, one per draw
        idx = rv.argmax(axis=1)                       # position of the 1 in each row

        return self.meta_list[idx]

In [119]:
# Draw 10 metaphones at random, weighted by their similarity to 'ARTFRK'.
# The draws come from NumPy's global RNG and no seed is set in this cell, so the
# output below changes from run to run.
mm = MetaMatcher(n=10)
mm.sample_similar('ARTFRK', n=10)

array(['SKNNK', 'AKNTNT', 'KNTRXNJ', 'APTRTL', 'KRTSSMS', 'ARPPNK',
       'ALMNTTF', 'AXR', 'STKSR', 'FLNTT'], dtype=object)