<a href="https://colab.research.google.com/github/AlexanderCoudijzer/BM25-VSM-Search-Engine/blob/main/Search_engine_v0.1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Search Engine Implementation
see https://ir-datasets.com/beir.html#beir/scifact  
and https://ir-datasets.com/python.html

## Data loading and preprocessing

In [1]:
%%capture
#!pip install --upgrade ir_datasets

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import ir_datasets
import string
import re
import unidecode
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

In [2]:
dataset = ir_datasets.load("beir/scifact/train")

print(dataset.docs_count(), 'documents with:', dataset.docs_cls().__annotations__)
print(dataset.queries_count(), 'queries with:', dataset.queries_cls().__annotations__)
print(dataset.qrels_count(), 'qrels with:',dataset.qrels_cls().__annotations__)

5183 documents with: {'doc_id': <class 'str'>, 'text': <class 'str'>, 'title': <class 'str'>}
809 queries with: {'query_id': <class 'str'>, 'text': <class 'str'>}
919 qrels with: {'query_id': <class 'str'>, 'doc_id': <class 'str'>, 'relevance': <class 'int'>, 'iteration': <class 'str'>}


In [3]:
%%capture
#this actually downloads the data, muted to avoid clutter
for doc in dataset.docs_iter()[:1]: None
for q in dataset.queries_iter(): None
for qrel in dataset.qrels_iter(): None

In [4]:
corpus=[]
for doc in dataset.docs_iter():
    corpus.append([doc[0],doc[1],doc[2]])

documents = [doc[1] for doc in corpus] 
# only taking the text for simplicity, but we'll need the doc_id for the evaluation

## Indexing framework

In [5]:
def indexing_framework(docs):
    
    def clean_text(text):
        # remove numbers
        text_nonum = re.sub(r'\d+', '', text)
        # remove punctuation and convert characters to lower case
        text_nopunct = "".join([char.lower() for char in text_nonum if char not in string.punctuation]) 
        # substitute multiple whitespace with single whitespace and remove leading and trailing whitespaces
        text_no_doublespace = re.sub('\s+', ' ', text_nopunct).strip()
        return text_no_doublespace
    
    documents_cleaned = []   # Cleaning

    for d in docs:
        x = clean_text(d)
        documents_cleaned.append(x)
    
    documents_tokenized = []   # Tokenization

    for d in documents_cleaned:
        x = word_tokenize(d)
        documents_tokenized.append(x)
    
    documents_sw_removed = []

    stop_words = set(stopwords.words('english'))

    for a in documents_tokenized:
        filtered_sentence = [w for w in a if not w in stop_words]
        filtered_sentence = [] # Stop word removal

        for w in a:  
            if w not in stop_words:  
                filtered_sentence.append(w)
        documents_sw_removed.append(filtered_sentence)
    
    documents_lemmatized = []   # Lemmatization
    
    wordnet_lemmatizer = WordNetLemmatizer()

    for d in documents_sw_removed:
        y = []
        for word in d:
            x = wordnet_lemmatizer.lemmatize(word)
            y.append(x)
        documents_lemmatized.append(y)
    
    snow_stemmer = SnowballStemmer(language='english')

    documents_stemmed = []   # Stemming
        
    for d in documents_lemmatized:
        stems = []
        for word in d:
            x = snow_stemmer.stem(word)
            stems.append(x)
        documents_stemmed.append(stems)
    
    documents_decoded = [] # Accent stripping
    
    for d in documents_stemmed:
        a = []
        for word in d:
            x = unidecode.unidecode(word)
            a.append(x)
        documents_decoded.append(a)
    
    documents_indexed = []   # Final formatting

    for doc in documents_decoded:
        documents_indexed.append(' '.join(doc))
    return documents_indexed

In [6]:
documents_indexed = indexing_framework(documents)

In [7]:
print(documents_indexed[0])

alter architectur cerebr white matter develop human brain affect cortic develop result function disabl line scan diffusionweight magnet reson imag mri sequenc diffus tensor analysi appli measur appar diffus coeffici calcul relat anisotropi delin threedimension fiber architectur cerebr white matter preterm n fullterm infant n ass effect prematur cerebr white matter develop earli gestat preterm infant n studi second time term central white matter mean appar diffus coeffici wk high micromm decreas toward term micromm posterior limb intern capsul mean appar diffus coeffici time similar versus micromm relat anisotropi higher closer birth term greater absolut valu intern capsul central white matter preterm infant term show higher mean diffus coeffici central white matter versus micromm p lower relat anisotropi area compar fullterm infant white matter versus p intern capsul versus p nonmyelin fiber corpus callosum visibl diffus tensor mri earli wk fullterm preterm infant term show mark differ

In [8]:
queries = []
for q in dataset.queries_iter(): # reading in all the queries
    queries.append(q[1])

In [9]:
queries_indexed = indexing_framework(queries) # Indexing queries to allow retrieve_ranking function to work as query terms need to match with indexed document terms in the BM25 dataframe

### Vectorizing:

In [10]:
vectorizer = CountVectorizer() # taken out sw removal and accent stripping as these are now included in the indexing framework
documents_vectorized = vectorizer.fit_transform(documents_indexed)
#vocabulary = vectorizer.get_feature_names_out() ---- I was having issues with updating sklearn so I had to use a different function that has similar functionality
vocabulary = vectorizer.get_feature_names()
dataframe = pd.DataFrame(documents_vectorized.toarray(), columns=vocabulary)
dataframe

Unnamed: 0,42,aa,aaa,aaaatpas,aaafamili,aab,aabenhus,aacr,aacrthi,aactinin,...,zygos,zygot,zymographi,zymosan,zymosaninduc,zyxin,zz,zzw,zzz,zzzw
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5178,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5179,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5180,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5181,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Retrieval framework

### Query

In [11]:
qid = []
for q in dataset.queries_iter():
    qid.append(q[0])
qid = [eval(x) for x in qid]  # Reading in query IDs and converting to int

In [12]:
queries = [list(a) for a in zip(qid, queries_indexed)]

In [13]:
query_num = 1 #query number to test
print([queries[query_num][1]])
q_vectorizer = CountVectorizer()
q = q_vectorizer.fit_transform([queries[query_num][1]])
#q_voc = q_vectorizer.get_feature_names_out()
q_voc = q_vectorizer.get_feature_names()
print(q_voc)

q_vector =np.array([])
for t in dataframe.columns:
    if t in q_voc: q_vector = np.append(q_vector,1)
    else: q_vector = np.append(q_vector,0)

['million uk abnorm prp posit']
['abnorm', 'million', 'posit', 'prp', 'uk']


In [14]:
documents_id=[]

for doc in corpus:
    documents_id.append([doc[0],doc[1]])

### VSM

In [15]:
dfs = (dataframe > 0).sum(axis=0) # doc frequency
N = dataframe.shape[0] # total number of docs
idfs = np.log10(N/dfs) # inverse doc frequency
doc_tfidf = np.array(dataframe * idfs)

In [16]:
from numpy.linalg import norm

VSM_scores = []
for d in doc_tfidf:
    VSM_scores.append(np.dot(d, q_vector)/(norm(d)*norm(q_vector)))
sorted(zip(documents_id,VSM_scores), key = lambda tup:tup[1], reverse=True)[:10]

[(['13734012',
   'OBJECTIVES To carry out a further survey of archived appendix samples to understand better the differences between existing estimates of the prevalence of subclinical infection with prions after the bovine spongiform encephalopathy epizootic and to see whether a broader birth cohort was affected, and to understand better the implications for the management of blood and blood products and for the handling of surgical instruments. DESIGN Irreversibly unlinked and anonymised large scale survey of archived appendix samples. SETTING Archived appendix samples from the pathology departments of 41 UK hospitals participating in the earlier survey, and additional hospitals in regions with lower levels of participation in that survey. SAMPLE 32,441 archived appendix samples fixed in formalin and embedded in paraffin and tested for the presence of abnormal prion protein (PrP). RESULTS Of the 32,441 appendix samples 16 were positive for abnormal PrP, indicating an overall prevale

### BM25

In [17]:
# Defining all the variables we need:
k_1 = 1.2
b = 0.8
#dfs = (dataframe > 0).sum(axis=0) # doc frequency
#N = dataframe.shape[0] # total number of docs
#idfs = np.log10(N/dfs) # inverse doc frequency
dls = dataframe.sum(axis=1).tolist() # considering words minus stop words in doc (better option)
avgdl = np.mean(dls) # single value, mean doc length (minus stop words)

# Applying the BM25 formula:
numerator = np.array((k_1 + 1) * dataframe)
denominator = np.array(k_1 *((1 - b) + b * (dls / avgdl))).reshape(N,1) + np.array(dataframe)
BM25_tf = numerator / denominator
idfs = np.array(idfs)
BM25_tfidf = BM25_tf * idfs

bm25_idf = pd.DataFrame(BM25_tfidf, columns=vocabulary)
del numerator
del denominator
del BM25_tf
del idfs
del BM25_tfidf
#bm25_idf

In [18]:
bm25_idf

Unnamed: 0,42,aa,aaa,aaaatpas,aaafamili,aab,aabenhus,aacr,aacrthi,aactinin,...,zygos,zygot,zymographi,zymosan,zymosaninduc,zyxin,zz,zzw,zzz,zzzw
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5180,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5181,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
q_voc = [term for term in q_voc if term in bm25_idf.columns] #filtering out any terms not present in documents
BM25_scores = bm25_idf[q_voc].sum(axis=1)
sorted(zip(documents_id,BM25_scores.values), key = lambda tup:tup[1], reverse=True)[:10]

[(['13734012',
   'OBJECTIVES To carry out a further survey of archived appendix samples to understand better the differences between existing estimates of the prevalence of subclinical infection with prions after the bovine spongiform encephalopathy epizootic and to see whether a broader birth cohort was affected, and to understand better the implications for the management of blood and blood products and for the handling of surgical instruments. DESIGN Irreversibly unlinked and anonymised large scale survey of archived appendix samples. SETTING Archived appendix samples from the pathology departments of 41 UK hospitals participating in the earlier survey, and additional hospitals in regions with lower levels of participation in that survey. SAMPLE 32,441 archived appendix samples fixed in formalin and embedded in paraffin and tested for the presence of abnormal prion protein (PrP). RESULTS Of the 32,441 appendix samples 16 were positive for abnormal PrP, indicating an overall prevale

### VSM-BM25 combo

In [20]:
BM25_VSM_scores = []
for d in np.array(bm25_idf):
    BM25_VSM_scores.append(np.dot(d, q_vector)/(norm(d)*norm(q_vector)))
sorted(zip(documents_id,BM25_VSM_scores), key = lambda tup:tup[1], reverse=True)[:10]

[(['13734012',
   'OBJECTIVES To carry out a further survey of archived appendix samples to understand better the differences between existing estimates of the prevalence of subclinical infection with prions after the bovine spongiform encephalopathy epizootic and to see whether a broader birth cohort was affected, and to understand better the implications for the management of blood and blood products and for the handling of surgical instruments. DESIGN Irreversibly unlinked and anonymised large scale survey of archived appendix samples. SETTING Archived appendix samples from the pathology departments of 41 UK hospitals participating in the earlier survey, and additional hospitals in regions with lower levels of participation in that survey. SAMPLE 32,441 archived appendix samples fixed in formalin and embedded in paraffin and tested for the presence of abnormal prion protein (PrP). RESULTS Of the 32,441 appendix samples 16 were positive for abnormal PrP, indicating an overall prevale

### Semantic Textual Similarity

In [21]:
import tensorflow as tf
import tensorflow_hub as hub

model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [22]:
embeddings = np.array(model(documents))

In [23]:
q_embed = np.array(model([queries[query_num][1]]))
STS_scores = np.dot(embeddings, q_embed.T)

In [24]:
sorted(zip(documents_id,list(STS_scores.flatten())), key = lambda tup:tup[1], reverse=True)[:10]

[(['26045237',
   'BACKGROUND Transthyretin amyloidosis is caused by the deposition of hepatocyte-derived transthyretin amyloid in peripheral nerves and the heart. A therapeutic approach mediated by RNA interference (RNAi) could reduce the production of transthyretin. METHODS We identified a potent antitransthyretin small interfering RNA, which was encapsulated in two distinct first- and second-generation formulations of lipid nanoparticles, generating ALN-TTR01 and ALN-TTR02, respectively. Each formulation was studied in a single-dose, placebo-controlled phase 1 trial to assess safety and effect on transthyretin levels. We first evaluated ALN-TTR01 (at doses of 0.01 to 1.0 mg per kilogram of body weight) in 32 patients with transthyretin amyloidosis and then evaluated ALN-TTR02 (at doses of 0.01 to 0.5 mg per kilogram) in 17 healthy volunteers. RESULTS Rapid, dose-dependent, and durable lowering of transthyretin levels was observed in the two trials. At a dose of 1.0 mg per kilogram, 

## Evaluation

In [25]:
qrels = []
for qrel in dataset.qrels_iter(): # reading in all the relevancy scores
    qrels.append(qrel)

In [26]:
queries = dict(zip(qid,queries_indexed)) # Combining into a dict to emulate format of queries in Lab 4

In [27]:
print(queries[2])

million uk abnorm prp posit


In [28]:
def retrieve_ranking(query, bm25_idf):
    q_terms = query.split(' ')
    q_terms_only = bm25_idf[q_terms]
    score_q_d = q_terms_only.sum(axis=1)
    return sorted(zip(bm25_idf.index.values,score_q_d.values), key = lambda tup:tup[1], reverse=True)

In [29]:
doc_ranking = retrieve_ranking(queries[2], bm25_idf)
print(doc_ranking[:10])

[(2232, 13.009034499081798), (3863, 7.475744112398933), (155, 4.750421408142046), (2407, 4.739075033656577), (4283, 4.244058344228909), (928, 4.15352597592546), (2862, 4.084513275532066), (2240, 3.523894621714997), (1896, 3.4742632671355835), (71, 3.45598984622202)]


In [30]:
print(documents_id[2232]) # To verify with above 

['13734012', 'OBJECTIVES To carry out a further survey of archived appendix samples to understand better the differences between existing estimates of the prevalence of subclinical infection with prions after the bovine spongiform encephalopathy epizootic and to see whether a broader birth cohort was affected, and to understand better the implications for the management of blood and blood products and for the handling of surgical instruments. DESIGN Irreversibly unlinked and anonymised large scale survey of archived appendix samples. SETTING Archived appendix samples from the pathology departments of 41 UK hospitals participating in the earlier survey, and additional hospitals in regions with lower levels of participation in that survey. SAMPLE 32,441 archived appendix samples fixed in formalin and embedded in paraffin and tested for the presence of abnormal prion protein (PrP). RESULTS Of the 32,441 appendix samples 16 were positive for abnormal PrP, indicating an overall prevalence o

In [31]:
def precision_at_k(query_id, k=10):

    doc_ranking = retrieve_ranking(queries[query_id], bm25_idf)
    retrieved = [doc[0] for doc in doc_ranking[:k]] # take only the document id, rather than score

    TP = np.array([int((query_id, doc, 1) in qrels) for doc in retrieved]).sum()
    FP = np.array([int((query_id, doc, 0) in qrels) for doc in retrieved]).sum()

    precision = TP / (TP+FP)

    return TP, FP, precision

In [32]:
def f1_score_at_k(query_id, k=10):
  # calculate f_1 score
  # hint: you need to find TP's etc in a similar way to precision at k
    doc_ranking = retrieve_ranking(queries[query_id], bm25_idf)
    retrieved = [doc[0] for doc in doc_ranking[:k]] # take only the document id, rather than score
  
    TP, FP, precision = precision_at_k(query_id, k)
    relevant_docs = np.array(qrels)
    relevant_docs = relevant_docs[relevant_docs[:, 0] == query_id][:,2].sum()
    FN = relevant_docs - TP

    recall = TP / (TP+FN)
    f1 = (2 * precision * recall) / (precision + recall)
  
    return f1

In [33]:
for query_id, query in queries.items():
    y_score = np.array(sorted(retrieve_ranking(queries[query_id], bm25_idf)))[:,1]
    y_true = np.zeros(y_score.size)
    np_qrels = np.array(qrels)

    for data in np_qrels[np_qrels[:, 0] == query_id]:
        y_true[data[1]] = data[2]

    ndcg = ndcg_score(np.expand_dims(y_true,axis=0), np.expand_dims(y_score,axis=0), k=k)
    print(f'retrieved for {query} with NGCD@{k} of {ndcg}')

  for data in np_qrels[np_qrels[:, 0] == query_id]:


NameError: name 'ndcg_score' is not defined

In [None]:
# To retrieve and calculate accuracy metrics for each query lets loop over them
k = 10
for query_id, query in queries.items():
    tp, fp, precision = precision_at_k(query_id, k=k)
    f1_score = f1_score_at_k(query_id, k=k)
    print('retrieved query "{}" with Precision@{} = {} and F1-score = {}'.format(query, k, precision, f1_score))