# Performing search with the built Whoosh and Annoy modules

After building the indices for our search engine(s), we can simply read in the pickled objects. The resulting hit indices can be carried forward to another pipeline, or rendered here by looking them up in the corpus.

In [None]:
# -*- coding: utf-8 -*-

import json, os, spacy, re, gensim, string, collections, pickle, sys, time
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim import corpora
import pandas as pd
import numpy as np

from pathos.helpers import cpu_count, freeze_support
from pathos.multiprocessing import ProcessingPool
from tqdm import tqdm

from whoosh.index import create_in
from whoosh.fields import *
from whoosh.qparser import QueryParser, OrGroup, MultifieldParser

from annoy import AnnoyIndex




Don't forget to add COVID-19-specific information to the spaCy model.

In [None]:
# Load the scispaCy pipeline used for lemmatisation and document vectors.
spacy_nlp = spacy.load('en_core_sci_lg')

# Free-text description whose document vector stands in for the COVID-19
# related terms registered below.
_covid_description = """Positive-sense single‐stranded ribonucleic acid virus, subgenus 
                   sarbecovirus of the genus Betacoronavirus. 
                   Also known as severe acute respiratory syndrome coronavirus 2, 
                   also known by 2019 novel coronavirus. It is 
                   contagious in humans and is the cause of the ongoing pandemic of 
                   coronavirus disease. Coronavirus disease 2019 is a zoonotic infectious 
                   disease."""
new_vector = spacy_nlp(_covid_description).vector

# Every alias shares the very same vector object.
vector_data = dict.fromkeys(("COVID-19", "2019-nCoV", "SARS-CoV-2"), new_vector)
for word, vector in vector_data.items():
    # Register the alias in the model vocabulary with the shared vector.
    spacy_nlp.vocab.set_vector(word, vector)


# Currently disabled helpers, kept for reference:
#spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
#cpu_number = cpu_count()

# Silence all warnings for the remainder of the notebook session.
import warnings
warnings.filterwarnings("ignore")

Our current search engine model is actually twofold: a Whoosh word-based search and an Annoy doc2vec-based nearest neighbour search. In this case we mix both result lists to obtain the final search hits.

In [None]:
# Run the query against both engines (Whoosh word-based search and Annoy
# nearest-neighbour search) and print the two hit lists.

# --- Inputs ---
# Free-text search query and the number of hits requested from each engine.
search_query = 'covid19 heart diseases risks'
max_nb_docs = 10

# --- Whoosh word-based search ---
# Load the pickled Whoosh index. The context manager guarantees the file
# handle is closed again (the bare open() used previously leaked it).
# NOTE(review): unpickling executes arbitrary code - only load index files
# that were produced by our own indexing notebook.
with open("ix_whoosh_doc.p", "rb") as index_file:
    ix = pickle.load(index_file)

# Transform the search query into its lemma forms before querying Whoosh.
sq_nlp = spacy_nlp(search_query)
search_query_lemmas = ' '.join(token.lemma_ for token in sq_nlp)
# NOTE(review): search_index is not defined in this notebook section; it is
# presumably provided by the indexing code - confirm it is in scope here.
whoosh_result = search_index(ix, search_query_lemmas, max_nb_docs)

# --- Annoy vector-based search ---
# Document vector of the search query, taken from the scispaCy pipeline.
search_query_vector = sq_nlp.vector

# The dimension passed to AnnoyIndex has to match the vectors stored in the
# saved tree (assumed to be 200-dimensional - TODO confirm against the build step).
u = AnnoyIndex(200, 'angular')
u.load('semantic_search_doc.tree')

# The query is an embedding, not the id of an item stored in the index, so
# the lookup has to go through get_nns_by_vector; get_nns_by_item expects an
# integer item id and cannot accept a vector.
# NOTE(review): search_k=10 inspects very few nodes and may hurt recall;
# consider a larger value or the library default (-1).
annoy_results = u.get_nns_by_vector(search_query_vector, max_nb_docs, search_k=10, include_distances=False)

# --- Combination of results ---
print(whoosh_result)
print(annoy_results)