In [42]:
import panel as pn

In [43]:
#obtain embeddings for words A, B, C and compute embedding for D
from flair.data import Sentence

def compute_embedding_D(A,B,C,embedding):
    """Estimate the embedding of word D in the analogy "A is to B as C is to D".

    The estimate follows the king - man = queen - woman pattern:
    ``D = B + C - A``.

    Parameters
    ----------
    A, B, C : str
        The three known words of the analogy. Surrounding whitespace is
        ignored.
    embedding
        Object exposing ``embed(sentence)`` that attaches an ``embedding``
        to every token of the sentence it is given.

    Returns
    -------
    list
        The derived embedding for D, converted with ``tolist()``.

    Raises
    ------
    ValueError
        If any of the three words is empty or whitespace-only.
    """
    words = [A.strip(), B.strip(), C.strip()]
    if not all(words):
        raise ValueError('words A, B and C must all be non-empty')

    # Hand Flair an explicit token list instead of a joined string: a joined
    # string is re-tokenized, so an input that the tokenizer splits (or drops)
    # would shift positions 0/1/2 onto the wrong tokens without any error.
    wordsABC_sentence = Sentence(words)
    embedding.embed(wordsABC_sentence)

    # one token per input word, in the order A, B, C
    A_embedded = wordsABC_sentence[0].embedding
    B_embedded = wordsABC_sentence[1].embedding
    C_embedded = wordsABC_sentence[2].embedding

    # derive D by vector arithmetic on the three known embeddings
    D_embedding = B_embedded + C_embedded - A_embedded

    return D_embedding.tolist()

In [44]:
# obtain embeddings for all english words in Flair
from flair import datasets
from flair.data import Sentence

def get_embedding_english_vocab(embedding):
    """Embed the vocabulary of the UD English corpus and return it as one Sentence.

    Parameters
    ----------
    embedding
        Object exposing ``embed(sentence)``; it is applied to the sentence
        built from the vocabulary items.

    Returns
    -------
    Sentence
        A single sentence built from all vocabulary items, after
        ``embedding.embed`` has been called on it.
    """
    # corpus whose vocabulary dictionary supplies the candidate words
    corpus = datasets.UD_ENGLISH()
    vocab_items = corpus.make_vocab_dictionary().get_items()

    # NOTE(review): the items are joined with spaces and tokenized again by
    # Sentence, so the resulting tokens may not map one-to-one onto the
    # dictionary items -- confirm this is acceptable.
    joined_vocab = ' '.join(vocab_items)
    vocab_sentence = Sentence(joined_vocab)

    embedding.embed(vocab_sentence)
    return vocab_sentence

In [45]:
# find the closest matching word
from sklearn.metrics.pairwise import cosine_similarity as sim

def find_closest_matching_word(D, vocab, ABC):
    """Return the text of the vocabulary word most similar to the embedding D.

    Parameters
    ----------
    D
        Target embedding as a flat sequence of numbers.
    vocab
        Iterable of objects exposing ``text`` and ``embedding`` (the latter
        must provide ``tolist()``).
    ABC
        Collection of word texts that must not be returned (the analogy's
        input words).

    Returns
    -------
    str or None
        Text of the best-scoring candidate by cosine similarity, or ``None``
        when ``vocab`` is empty or every word in it is excluded.
    """
    closest_matching_word = None
    # -inf rather than -1 so that a candidate scoring exactly -1 (the lowest
    # possible cosine similarity) can still be selected
    max_match = float('-inf')
    for word in vocab:
        # skip excluded words before paying for the similarity computation
        if word.text in ABC:
            continue
        match = sim([D], [word.embedding.tolist()])[0][0]
        if match > max_match:
            max_match = match
            closest_matching_word = word.text

    return closest_matching_word

In [46]:
# Free-text inputs for the three known words of the analogy
# "A is to B as C is to D".
word_A = pn.widgets.TextInput(
    name='word A',
    placeholder='Enter a string here...',
)
word_B = pn.widgets.TextInput(
    name='word B',
    placeholder='Enter a string here...',
)
word_C = pn.widgets.TextInput(
    name='word C',
    placeholder='Enter a string here...',
)

In [67]:
from flair.embeddings import WordEmbeddings

# Filled on first use and then reused, so that the embedding model and the
# embedded vocabulary are not rebuilt on every widget change.
_analogy_resources = {}


def _get_analogy_resources():
    """Return the (embedding, embedded vocabulary) pair, building it on first use."""
    if not _analogy_resources:
        fasttext = WordEmbeddings('crawl')
        vocab = get_embedding_english_vocab(fasttext)
        # store both only after both were built successfully
        _analogy_resources.update(embedding=fasttext, vocab=vocab)
    return _analogy_resources['embedding'], _analogy_resources['vocab']


@pn.depends(word_A, word_B, word_C)
def solve_analogies(A, B, C):
    """Solve "A is to B as C is to D" for D and return it as Markdown text.

    Parameters
    ----------
    A, B, C : str
        Current values of the three word inputs. Surrounding whitespace is
        ignored.

    Returns
    -------
    str
        A Markdown sentence containing the three inputs and the derived word
        D, or a short prompt while any of the inputs is still blank.
    """
    A, B, C = (word.strip() for word in (A, B, C))
    # The text inputs start out empty; there is nothing to embed until all
    # three words have been entered.
    if not (A and B and C):
        return 'Enter words A, B and C in the sidebar to solve the analogy.'

    fasttext, vocab = _get_analogy_resources()
    result = compute_embedding_D(A, B, C, fasttext)
    D = find_closest_matching_word(result, vocab, {A, B, C})

    return f'**{A}** is to **{B}** as **{C}** is to **{D}**'

# Row whose content is re-rendered from solve_analogies when an input changes.
anal_solv = pn.Row(solve_analogies)

2023-01-31 12:01:43,332 Reading data from /homes/cbecht/.flair/datasets/ud_english
2023-01-31 12:01:43,333 Train: /homes/cbecht/.flair/datasets/ud_english/en_ewt-ud-train.conllu
2023-01-31 12:01:43,334 Dev: /homes/cbecht/.flair/datasets/ud_english/en_ewt-ud-dev.conllu
2023-01-31 12:01:43,335 Test: /homes/cbecht/.flair/datasets/ud_english/en_ewt-ud-test.conllu


In [75]:
# Heading shown in the sidebar.
# margin=25 is the same value as the former margin=(25): parentheses around a
# single number do not make a tuple.
sidebar1 = pn.pane.Markdown("""
#### Selectors for the Analogy solver
""", width=250, margin=25)

# Introductory text for the main area.
# The blank line before the dashed rule keeps it a horizontal rule; directly
# under a paragraph, CommonMark-style renderers read a dashed line as a
# heading underline for that paragraph.
Analogy_solver = pn.pane.Markdown("""
# Analogy Solver
All the language models we use up to date are based on word (or document) embeddings. The Hello World of Natural Language Processing is the equation king - man ~ queen - woman.
This analogy can be used to make the following statement: Word A is to word B as word C is to word D. The following tool allows to enter the words A, B, and C to derive the closest word which is word D.

------------
""", width=750, margin=25)

In [76]:
# Page template for the dashboard.
dashboard = pn.template.BootstrapTemplate(
    title='Natural Language Processing Playground',
    sidebar_width=250,
    header_background='#80b1d3',
)

# Sidebar: the heading first, then the three word inputs in A, B, C order.
for _sidebar_item in (sidebar1, word_A, word_B, word_C):
    dashboard.sidebar.append(_sidebar_item)

# Main area: the introduction on a tinted row, then the analogy result.
intro_row = pn.Row(Analogy_solver, background='WhiteSmoke')
dashboard.main.append(intro_row)
result_row = pn.Row(anal_solv)
dashboard.main.append(result_row)

dashboard.show()

Launching server at http://localhost:33061


<bokeh.server.server.Server at 0x7f2156ffc610>