# Search Engine Implementation
see https://ir-datasets.com/beir.html#beir/scifact  
and https://ir-datasets.com/python.html

## Data loading and preprocessing

In [1]:
%%capture
#!pip install --upgrade ir_datasets

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import ir_datasets

In [2]:
dataset = ir_datasets.load("beir/scifact/train")

print(dataset.docs_count(), 'documents with:', dataset.docs_cls().__annotations__)
print(dataset.queries_count(), 'queries with:', dataset.queries_cls().__annotations__)
print(dataset.qrels_count(), 'qrels with:',dataset.qrels_cls().__annotations__)

5183 documents with: {'doc_id': <class 'str'>, 'text': <class 'str'>, 'title': <class 'str'>}
809 queries with: {'query_id': <class 'str'>, 'text': <class 'str'>}
919 qrels with: {'query_id': <class 'str'>, 'doc_id': <class 'str'>, 'relevance': <class 'int'>, 'iteration': <class 'str'>}


In [3]:
#%%capture
#this actually downloads the data, muted to avoid clutter
#for doc in dataset.docs_iter()[:1]: 
#    None
#for q in dataset.queries_iter():
 #   None
#for qrel in dataset.qrels_iter():
 #   None

In [4]:
corpus=[]
for doc in dataset.docs_iter():
    corpus.append([doc[0],doc[1],doc[2]])

documents = [doc[1] for doc in corpus] 
# only taking the text for simplicity, but we'll need the doc_id for the evaluation

## Indexing framework

Removing all numbers, punctuation and excess whitespace. This can potentially be a part of the data preprocessing section.

In [5]:
import string
import re

def clean_text(text):
    # remove numbers
    text_nonum = re.sub(r'\d+', '', text)
    # remove punctuation and convert characters to lower case
    text_nopunct = "".join([char.lower() for char in text_nonum if char not in string.punctuation]) 
    # substitute multiple whitespace with single whitespace and remove leading and trailing whitespaces
    text_no_doublespace = re.sub('\s+', ' ', text_nopunct).strip()
    return text_no_doublespace

documents_cleaned = []

for d in documents:
    x = clean_text(d)
    documents_cleaned.append(x)

Tokenization:

In [6]:
from nltk.tokenize import word_tokenize

documents_tokenized = []

for d in documents_cleaned:
    x = word_tokenize(d)
    documents_tokenized.append(x)

Lemmatization:

In [8]:
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

documents_lemmatized = []

for d in documents_tokenized:
    y = []
    for word in d:
        x = wordnet_lemmatizer.lemmatize(word)
        y.append(x)
    documents_lemmatized.append(y)

Stemming:

In [9]:
from nltk.stem import SnowballStemmer

snow_stemmer = SnowballStemmer(language='english')

documents_stemmed = []
        
for d in documents_lemmatized:
    stems = []
    for word in d:
        x = snow_stemmer.stem(word)
        stems.append(x)
    documents_stemmed.append(stems)

Converting back to full strings:

In [10]:
documents_indexed = []

for doc in documents_stemmed:
    documents_indexed.append(' '.join(doc))

Vectorizing (with included stop word removal):

In [11]:
vectorizer = CountVectorizer(stop_words='english', strip_accents='ascii')
documents_vectorized = vectorizer.fit_transform(documents_indexed)
vocabulary = vectorizer.get_feature_names_out()
dataframe = pd.DataFrame(documents_vectorized.toarray(), columns=vocabulary)
dataframe

Unnamed: 0,14,a142,aa,aaa,aaaatpas,aaafamili,aab,aabenhus,aacr,aacrthi,...,zygos,zygot,zymographi,zymosan,zymosaninduc,zyxin,zz,zzw,zzz,zzzw
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5178,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5179,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5180,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5181,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Retrieval framework

In [12]:
# Defining all the variables we need:
k_1 = 1.2
b = 0.8
dfs = (dataframe > 0).sum(axis=0) # doc frequency
N = dataframe.shape[0] # total number of docs
idfs = np.log10(N/dfs) # inverse doc frequency
#dls = [len(d.split(' ')) for d in documents] # considering all words in doc
dls = dataframe.sum(axis=1).tolist() # considering words minus stop words in doc (better option)
avgdl = np.mean(dls) # single value, mean doc length (minus stop words)

# Applying the BM25 formula:
numerator = np.array((k_1 + 1) * dataframe)
denominator = np.array(k_1 *((1 - b) + b * (dls / avgdl))).reshape(N,1) + np.array(dataframe)
BM25_tf = numerator / denominator
idfs = np.array(idfs)
BM25_score = BM25_tf * idfs

bm25_idf = pd.DataFrame(BM25_score, columns=vocabulary)
bm25_idf

Unnamed: 0,14,a142,aa,aaa,aaaatpas,aaafamili,aab,aabenhus,aacr,aacrthi,...,zygos,zygot,zymographi,zymosan,zymosaninduc,zyxin,zz,zzw,zzz,zzzw
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5180,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5181,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
query_num = 1 #query number to test
queries = []
for q in dataset.queries_iter(): # reading in all the queries
    queries.append([q[0],q[1]])
print([queries[query_num][1]])

vectorizer_q = CountVectorizer(stop_words='english', strip_accents='ascii')
query_vectorized = vectorizer_q.fit_transform([queries[query_num][1]])
q_terms = vectorizer_q.get_feature_names_out()
q_terms

['1 in 5 million in UK have abnormal PrP positivity.']


array(['abnormal', 'million', 'positivity', 'prp', 'uk'], dtype=object)

In [14]:
q_terms = [term for term in q_terms if term in bm25_idf.columns]
q_terms_only_df = bm25_idf[q_terms]
score_q_d = q_terms_only_df.sum(axis=1)
documents = [doc[1] for doc in corpus]
sorted(zip(documents,score_q_d.values), key = lambda tup:tup[1], reverse=True)[:10]

[('OBJECTIVES To carry out a further survey of archived appendix samples to understand better the differences between existing estimates of the prevalence of subclinical infection with prions after the bovine spongiform encephalopathy epizootic and to see whether a broader birth cohort was affected, and to understand better the implications for the management of blood and blood products and for the handling of surgical instruments. DESIGN Irreversibly unlinked and anonymised large scale survey of archived appendix samples. SETTING Archived appendix samples from the pathology departments of 41 UK hospitals participating in the earlier survey, and additional hospitals in regions with lower levels of participation in that survey. SAMPLE 32,441 archived appendix samples fixed in formalin and embedded in paraffin and tested for the presence of abnormal prion protein (PrP). RESULTS Of the 32,441 appendix samples 16 were positive for abnormal PrP, indicating an overall prevalence of 493 per m

In [15]:
qrels = []
for qrel in dataset.qrels_iter(): # reading in all the relevancy scores
    qrels.append(qrel)