# Query Expansion
### Using FastText Word Embedding
Based on this paper: https://arxiv.org/pdf/1606.07608.pdf

Pre-made vector models: https://fasttext.cc/docs/en/aligned-vectors.html

In [1]:
from gensim.models import KeyedVectors
import numpy as np
# import natural language toolkit
from nltk.corpus   import stopwords
from nltk.tokenize import word_tokenize
# prepare stopword list: the English stopword list provided by NLTK
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded beforehand — confirm
stop_words = stopwords.words('english')



In [3]:
import os
# list the entries of the current working directory (sanity check before using relative paths)
os.listdir()

['.ipynb_checkpoints',
 'document_embeddings.ipynb',
 'enviroLENS-deliverable-D4.2-images.ipynb',
 'query-expansion.ipynb']

In [4]:
# path to the pre-made aligned English fastText vectors
wiki_en_align = '../../data/fasttext/wiki.en.align.vec'
# load the English fastText wiki embeddings as keyed vectors
wv_wiki_en = KeyedVectors.load_word2vec_format(wiki_en_align)
# report how many words the loaded vocabulary holds
print('english words {}'.format(len(wv_wiki_en.vocab)))

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


english words 2519370


## Pre-retrieval kNN Based Approach

In [5]:
#list of terms
def tokenize(text, stopwords):
    """Tokenizes the text, lowercases the tokens and removes stopwords.

    Args:
        text (str): The text to tokenize.
        stopwords (iterable of str): The words to drop. They are compared
            case-insensitively against the tokens.

    Returns:
        list of str: The lowercased tokens that are not stopwords, in their
            original order.
    """
    # normalise the stopwords once; a set keeps every membership test cheap
    stopword_set = {word.lower() for word in stopwords}
    # lowercase BEFORE filtering, otherwise capitalised stopwords such as
    # 'The' slip past a lowercase stopword list
    lowered = (token.lower() for token in word_tokenize(text))
    return [token for token in lowered if token not in stopword_set]

queen
3


In [32]:
#extended list of terms ###
def extend_tokens(token_list, wv):
    """Extends the token list with one term per pair of adjacent tokens.

    Tokens that are missing from the vocabulary of `wv` are dropped first.
    For every pair of neighbouring tokens that remain, the top-ranked result
    of `wv.most_similar(positive=[left, right])` is collected.

    Args:
        token_list (list of str): The query tokens.
        wv: The word vectors; must expose `vocab` and `most_similar`.

    Returns:
        list of str: One new term per adjacent pair of in-vocabulary tokens;
            empty when fewer than two tokens are in the vocabulary.
    """
    # keep only the tokens that are present in the vocabulary
    known = [token for token in token_list if token in wv.vocab]
    extension = []
    for left, right in zip(known, known[1:]):
        # query the model passed in as `wv`, not a notebook-level global
        neighbours = wv.most_similar(positive=[left, right])
        # skip the pair if the model returned nothing for it
        if neighbours:
            extension.append(neighbours[0][0])
    return extension

In [38]:
# tokenize a sample query, dropping the English stopwords
test = tokenize('water pollution underground', stop_words)
print(test)
# derive one extra term for every pair of adjacent query tokens
ext = extend_tokens(test,wv_wiki_en)
print(ext)

['water', 'pollution', 'underground']
['pollutions', 'undergrounding']


In [6]:
# knn nearest
def get_candidate_expansion_terms(tokens, k, wv):
    """Gets the candidate expansion terms.

    Collects, for every token found in the vocabulary, its `k` nearest terms
    as reported by `wv.similar_by_word`.

    Args:
        tokens (list of str): The query tokens.
        k (int): The number of nearest terms to take per token.
        wv: The word vectors; must expose `vocab` and `similar_by_word`.

    Returns:
        set of str: The union of the nearest terms of all in-vocabulary
            tokens; empty when `k` is not positive.
    """
    candidates = set()
    # nothing to collect; also avoids handing a non-positive topn to the model
    if k <= 0:
        return candidates
    for token in tokens:
        # check if the token is in the vocabulary
        if token not in wv.vocab:
            continue
        # request exactly k neighbours; relying on gensim's default topn (10)
        # would silently cap the result for any larger k
        nearest = wv.similar_by_word(token, topn=k)
        # each entry is a (term, similarity) pair; only the term is kept
        candidates.update(entry[0] for entry in nearest[:k])
    # return the candidates
    return candidates
        

In [45]:
# candidates for the query extended with the terms from extend_tokens
candidates = get_candidate_expansion_terms(test+ext, 5, wv_wiki_en)
print(candidates)
# candidates for the plain query, for comparison
# (the variable was misspelled as `witout`, so the print below raised a
# NameError on a fresh kernel)
without = get_candidate_expansion_terms(test, 5, wv_wiki_en)
print(without)

{'water—', '#pollution', 'potable', 'biopollution', 'undergrounders', 'sewage', 'seawater', 'pollutions', 'undergrounded', 'pollution', 'earpollution', 'pollution,', 'pollutants', 'undergrounder', 'groundwater', 'undergroung', 'undergrounds', 'undergrounding', 'undergroun'}
{'water—', 'undergroun', 'pollutions', 'groundwater', 'undergrounded', 'undergroung', 'earpollution', '#pollution', 'undergrounds', 'undergrounding', 'potable', 'biopollution', 'pollution,', 'sewage', 'seawater'}


In [8]:
# calculate similarity by angle
def get_top_expansion_terms(tokens, candidates, k, wv):
    """Scores every candidate by its mean similarity to the query tokens.

    Args:
        tokens (list of str): The query tokens; tokens that are not in the
            vocabulary of `wv` are ignored.
        candidates (iterable of str): The candidate expansion terms.
        k (int): Currently unused; kept so existing callers keep working.
            The result is neither sorted nor truncated here.
        wv: The word vectors; must expose `vocab` and `similarity`.

    Returns:
        list of (str, float): One (candidate, mean similarity) pair per
            candidate, in the iteration order of `candidates`. Empty when
            none of the tokens is in the vocabulary.
    """
    # the in-vocabulary tokens are the same for every candidate: filter once
    known = [token for token in tokens if token in wv.vocab]
    # without a known token there is nothing to average over; return no
    # scores instead of dividing by zero
    if not known:
        return []
    similarity_pairs = []
    for candidate in candidates:
        # calculate the similarity of the candidate to all known tokens
        total = sum(wv.similarity(candidate, token) for token in known)
        similarity_pairs.append((candidate, total / len(known)))
    # return the list of expansion terms with their similarities
    return similarity_pairs

In [46]:
def takeSecond(elem):
    """Sort key: the value stored at index 1 of a (term, similarity) pair."""
    return elem[1]

# score the same candidate set against the extended query and the plain query
top = get_top_expansion_terms(test+ext, candidates, 5, wv_wiki_en)
topwithout = get_top_expansion_terms(test, candidates, 5, wv_wiki_en)
# sort ascending by similarity, then reverse so the best term comes first
top = list(reversed(sorted(top, key=takeSecond)))
topw = list(reversed(sorted(topwithout, key=takeSecond)))
print(top)
print(topw)


[('pollution', 0.6260276407951794), ('pollutions', 0.5989783010567384), ('undergrounding', 0.5868486694404182), ('earpollution', 0.5791309410113681), ('pollution,', 0.5584437076938438), ('pollutants', 0.5473505877035749), ('groundwater', 0.5472185974670384), ('sewage', 0.5438533174483885), ('undergrounds', 0.5373294534380542), ('#pollution', 0.5307803259871704), ('biopollution', 0.5239432296935493), ('undergrounded', 0.5166503920961191), ('undergrounders', 0.5048834686641379), ('undergroun', 0.4968187167322256), ('undergrounder', 0.4930006599112927), ('undergroung', 0.48966882540846307), ('seawater', 0.48062588166948944), ('potable', 0.47213486261999227), ('water—', 0.43395093386930084)]
[('pollution', 0.6073097696558457), ('groundwater', 0.5722256140853779), ('sewage', 0.5672777675468184), ('earpollution', 0.5479508482970044), ('pollutions', 0.5392823009654661), ('pollution,', 0.5361565112564385), ('pollutants', 0.5332377192240293), ('seawater', 0.5219145986771884), ('undergrounding',

In [16]:
# all functions together, finds k nearest for each term, returns top n
def pre_retrieval_KNN(string, k, wv, n):
    """Returns the `n` best-scored expansion terms for the query string.

    The query is tokenized with the notebook-level `stop_words`, the
    candidates are gathered with `get_candidate_expansion_terms`, scored with
    `get_top_expansion_terms` and ordered from highest to lowest score.
    """
    query_tokens = tokenize(string, stop_words)
    pool = get_candidate_expansion_terms(query_tokens, k, wv)
    scored = get_top_expansion_terms(query_tokens, pool, k, wv)
    # ascending sort followed by a reversal, so the best score comes first
    ranked = sorted(scored, key=lambda pair: pair[1])
    ranked.reverse()
    return ranked[:n]

In [17]:
pre_retrieval_KNN('deep learning', 5, wv_wiki_en, 10)

[('learning,', 0.5336429871558637),
 ('learnings', 0.5210396371333685),
 ('relearning', 0.509084270159704),
 ('#learning', 0.507341799782255),
 ('learning—in', 0.5062408798333075),
 ('deeper', 0.49509854997328506),
 ('deepest', 0.42268526313403443),
 ('deeps', 0.4007843076134384),
 ('depths', 0.3836683940147054),
 ('shallow', 0.3727379655276737)]

In [12]:
# import postgresql
import os
import sys
from getpass import getpass
# make the parent directory importable so that the `modules` package resolves
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from modules.library.postgresql import PostgresQL
# connection settings are read from the environment so that no credentials
# are stored in the notebook; database and user fall back to the previous values
pg_database = os.environ.get("PGDATABASE", "eurlex_environment_only")
pg_user = os.environ.get("PGUSER", "postgres")
# prompt for the password when it is not provided through the environment
pg_password = os.environ.get("PGPASSWORD") or getpass("PostgreSQL password: ")
# connect to the postgresql database
pg = PostgresQL()
pg.connect(database=pg_database, user=pg_user, password=pg_password)

Error while connecting to PostgreSQL FATAL:  password authentication failed for user "postgres"

