# Query Expansion
### Using FastText Word Embedding
Based on this paper: https://arxiv.org/pdf/1606.07608.pdf

Pre-made vector models: https://fasttext.cc/docs/en/aligned-vectors.html

In [18]:
from gensim.models import KeyedVectors
import numpy as np
# import natural language toolkit
from nltk.corpus   import stopwords
from nltk.tokenize import word_tokenize
# prepare stopword list
stop_words = stopwords.words('english')

In [2]:
wiki_en_align = '../../data/fasttext/wiki.en.align.vec'
# get fasttext wiki embeddings for english
wv_wiki_en = KeyedVectors.load_word2vec_format(wiki_en_align)
print('english words {}'.format(len(list(wv_wiki_en.vocab.keys()))))

english words 2519370


## Pre-retrieval kNN Based Approach

In [19]:
def tokenize(text, stopwords):
    """Tokenizes and removes stopwords from the document"""
    tokens = word_tokenize(text)
    filtered = [w.lower() for w in tokens if not w in stopwords]
    return filtered

In [20]:
def get_candidate_expansion_terms(tokens, k, wv):
    """Gets the candidate expansion terms"""
    candidates = set()
    for token in tokens:
        # check if the token is in the vocabulary
        if token in wv.vocab.keys():
            result = wv.similar_by_word(token)
            limit = k if len(result) > k else len(result)
            # iterate through the most similar words
            for i in range(limit):
                candidates.add(result[i][0])
    # return the candidates
    return candidates
        

In [21]:
candidates = get_candidate_expansion_terms(['deep', 'learning'], 5, wv_wiki_en)

In [36]:
def get_top_expansion_terms(tokens, candidates, k, wv):
    """Gets the actual expansion terms"""
    similarity_pairs = []
    for candidate in candidates:
        # calculate the similarity of the candidate to all tokens
        similarity = 0
        num_of_tokens = 0
        for token in tokens:
            # check if the token is in the vocabulary
            if token in wv.vocab.keys():
                num_of_tokens += 1
                similarity += wv.similarity(candidate, token)
        similarity_pairs.append((candidate, similarity / num_of_tokens))
    # return the list of expansion terms with their similarities
    return similarity_pairs

In [37]:
get_top_expansion_terms(['deep', 'learning'], candidates, 5, wv_wiki_en)

[('depths', 0.38366837799549103),
 ('learnings', 0.5210396274924278),
 ('learning—in', 0.5062408745288849),
 ('shallow', 0.37273794412612915),
 ('deeps', 0.40078432112932205),
 ('deeper', 0.4950985312461853),
 ('#learning', 0.5073417872190475),
 ('learning,', 0.5336430072784424),
 ('relearning', 0.5090842545032501),
 ('deepest', 0.42268526554107666)]

In [38]:
def pre_retrieval_KNN(string, k, wv):
    """Find the most similar tokens to the given query"""
    tokens = tokenize(string, stop_words)
    candidates = get_candidate_expansion_terms(tokens, k, wv)
    candidates_sim = get_top_expansion_terms(tokens, candidates, k, wv)
    def takeSecond(elem):
        return elem[1]
    sort = sorted(candidates_sim, key=takeSecond)
    return sort[:k]

In [39]:
pre_retrieval_KNN('deep learning', 5, wv_wiki_en)

[('shallow', 0.37273794412612915),
 ('depths', 0.38366837799549103),
 ('deeps', 0.40078432112932205),
 ('deepest', 0.42268526554107666),
 ('deeper', 0.4950985312461853)]