<a href="https://colab.research.google.com/github/AlexanderCoudijzer/BM25-VSM-Search-Engine/blob/main/Search_engine_v0.1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Search Engine Implementation
see https://ir-datasets.com/beir.html#beir/scifact  
and https://ir-datasets.com/python.html

## Data loading and preprocessing

In [1]:
%%capture
!pip install --upgrade ir_datasets

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import ir_datasets

In [2]:
dataset = ir_datasets.load("beir/scifact/train")

print(dataset.docs_count(), 'documents with:', dataset.docs_cls().__annotations__)
print(dataset.queries_count(), 'queries with:', dataset.queries_cls().__annotations__)
print(dataset.qrels_count(), 'qrels with:',dataset.qrels_cls().__annotations__)

5183 documents with: OrderedDict([('doc_id', <class 'str'>), ('text', <class 'str'>), ('title', <class 'str'>)])
809 queries with: OrderedDict([('query_id', <class 'str'>), ('text', <class 'str'>)])
919 qrels with: OrderedDict([('query_id', <class 'str'>), ('doc_id', <class 'str'>), ('relevance', <class 'int'>), ('iteration', <class 'str'>)])


In [3]:
%%capture
#this actually downloads the data, muted to avoid clutter
for doc in dataset.docs_iter()[:1]: None
for q in dataset.queries_iter(): None
for qrel in dataset.qrels_iter(): None

In [4]:
corpus=[]
for doc in dataset.docs_iter():
    corpus.append([doc[0],doc[1],doc[2]])

documents = [doc[1] for doc in corpus] 
# only taking the text for simplicity, but we'll need the doc_id for the evaluation

## Indexing framework

Removing all numbers, punctuation and excess whitespace. This can potentially be a part of the data preprocessing section.

In [5]:
import string
import re

def clean_text(text):
    # remove numbers
    text_nonum = re.sub(r'\d+', '', text)
    # remove punctuation and convert characters to lower case
    text_nopunct = "".join([char.lower() for char in text_nonum if char not in string.punctuation]) 
    # substitute multiple whitespace with single whitespace and remove leading and trailing whitespaces
    text_no_doublespace = re.sub('\s+', ' ', text_nopunct).strip()
    return text_no_doublespace

documents_cleaned = []

for d in documents:
    x = clean_text(d)
    documents_cleaned.append(x)

Tokenization:

In [6]:
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt') # I had to add this in to make word_tokenize work, not sure why

documents_tokenized = []

for d in documents_cleaned:
    x = word_tokenize(d)
    documents_tokenized.append(x)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Lemmatization:

In [7]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

wordnet_lemmatizer = WordNetLemmatizer()

documents_lemmatized = []

for d in documents_tokenized:
    y = []
    for word in d:
        x = wordnet_lemmatizer.lemmatize(word)
        y.append(x)
    documents_lemmatized.append(y)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


Stemming:

In [8]:
from nltk.stem import SnowballStemmer

snow_stemmer = SnowballStemmer(language='english')

documents_stemmed = []
        
for d in documents_lemmatized:
    stems = []
    for word in d:
        x = snow_stemmer.stem(word)
        stems.append(x)
    documents_stemmed.append(stems)

Converting back to full strings:

In [9]:
documents_indexed = []

for doc in documents_stemmed:
    documents_indexed.append(' '.join(doc))

Vectorizing (with included stop word removal):

In [10]:
vectorizer = CountVectorizer(stop_words='english', strip_accents='ascii')
documents_vectorized = vectorizer.fit_transform(documents_indexed)
vocabulary = vectorizer.get_feature_names_out()
dataframe = pd.DataFrame(documents_vectorized.toarray(), columns=vocabulary)
dataframe

Unnamed: 0,14,a142,aa,aaa,aaaatpas,aaafamili,aab,aabenhus,aacr,aacrthi,...,zygos,zygot,zymographi,zymosan,zymosaninduc,zyxin,zz,zzw,zzz,zzzw
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5178,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5179,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5180,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5181,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Retrieval framework

### Query

In [11]:
query_num = 1 #query number to test
queries = []
for q in dataset.queries_iter(): # reading in all the queries
    queries.append([q[0],q[1]])
print([queries[query_num][1]])
q_vectorizer = CountVectorizer(stop_words='english', strip_accents='ascii')
q = q_vectorizer.fit_transform([queries[query_num][1]])
q_voc = q_vectorizer.get_feature_names_out()
print(q_voc)

q_vector =np.array([])
for t in dataframe.columns:
    if t in q_voc: q_vector = np.append(q_vector,1)
    else: q_vector = np.append(q_vector,0)

['1 in 5 million in UK have abnormal PrP positivity.']
['abnormal' 'million' 'positivity' 'prp' 'uk']


### VSM

In [12]:
dfs = (dataframe > 0).sum(axis=0) # doc frequency
N = dataframe.shape[0] # total number of docs
idfs = np.log10(N/dfs) # inverse doc frequency
doc_tfidf = np.array(dataframe * idfs)

In [13]:
from numpy.linalg import norm

VSM_scores =[]
for d in doc_tfidf:
    VSM_scores.append(np.dot(d, q_vector)/(norm(d)*norm(q_vector)))
sorted(zip(documents,VSM_scores), key = lambda tup:tup[1], reverse=True)[:10]

[('BACKGROUND The Global Burden of Diseases, Injuries, and Risk Factors Study 2015 provides an up-to-date synthesis of the evidence for risk factor exposure and the attributable burden of disease. By providing national and subnational assessments spanning the past 25 years, this study can inform debates on the importance of addressing risks in context. METHODS We used the comparative risk assessment framework developed for previous iterations of the Global Burden of Disease Study to estimate attributable deaths, disability-adjusted life-years (DALYs), and trends in exposure by age group, sex, year, and geography for 79 behavioural, environmental and occupational, and metabolic risks or clusters of risks from 1990 to 2015. This study included 388 risk-outcome pairs that met World Cancer Research Fund-defined criteria for convincing or probable evidence. We extracted relative risk and exposure estimates from randomised controlled trials, cohorts, pooled cohorts, household surveys, census

### BM25

In [14]:
# Defining all the variables we need:
k_1 = 1.2
b = 0.8
#dfs = (dataframe > 0).sum(axis=0) # doc frequency
#N = dataframe.shape[0] # total number of docs
#idfs = np.log10(N/dfs) # inverse doc frequency
dls = dataframe.sum(axis=1).tolist() # considering words minus stop words in doc (better option)
avgdl = np.mean(dls) # single value, mean doc length (minus stop words)

# Applying the BM25 formula:
numerator = np.array((k_1 + 1) * dataframe)
denominator = np.array(k_1 *((1 - b) + b * (dls / avgdl))).reshape(N,1) + np.array(dataframe)
BM25_tf = numerator / denominator
idfs = np.array(idfs)
BM25_tfidf = BM25_tf * idfs

bm25_idf = pd.DataFrame(BM25_tfidf, columns=vocabulary)
del numerator
del denominator
del BM25_tf
del idfs
del BM25_tfidf
#bm25_idf

In [15]:
q_voc = [term for term in q_voc if term in bm25_idf.columns] #filtering out any terms not present in documents
BM25_scores = bm25_idf[q_voc].sum(axis=1)
sorted(zip(documents,BM25_scores.values), key = lambda tup:tup[1], reverse=True)[:10]

[('OBJECTIVES To carry out a further survey of archived appendix samples to understand better the differences between existing estimates of the prevalence of subclinical infection with prions after the bovine spongiform encephalopathy epizootic and to see whether a broader birth cohort was affected, and to understand better the implications for the management of blood and blood products and for the handling of surgical instruments. DESIGN Irreversibly unlinked and anonymised large scale survey of archived appendix samples. SETTING Archived appendix samples from the pathology departments of 41 UK hospitals participating in the earlier survey, and additional hospitals in regions with lower levels of participation in that survey. SAMPLE 32,441 archived appendix samples fixed in formalin and embedded in paraffin and tested for the presence of abnormal prion protein (PrP). RESULTS Of the 32,441 appendix samples 16 were positive for abnormal PrP, indicating an overall prevalence of 493 per m

### VSM-BM25 combo

In [16]:
BM25_VSM_scores =[]
for d in np.array(bm25_idf):
    BM25_VSM_scores.append(np.dot(d, q_vector)/(norm(d)*norm(q_vector)))
sorted(zip(documents,BM25_VSM_scores), key = lambda tup:tup[1], reverse=True)[:10]

[('OBJECTIVES To carry out a further survey of archived appendix samples to understand better the differences between existing estimates of the prevalence of subclinical infection with prions after the bovine spongiform encephalopathy epizootic and to see whether a broader birth cohort was affected, and to understand better the implications for the management of blood and blood products and for the handling of surgical instruments. DESIGN Irreversibly unlinked and anonymised large scale survey of archived appendix samples. SETTING Archived appendix samples from the pathology departments of 41 UK hospitals participating in the earlier survey, and additional hospitals in regions with lower levels of participation in that survey. SAMPLE 32,441 archived appendix samples fixed in formalin and embedded in paraffin and tested for the presence of abnormal prion protein (PrP). RESULTS Of the 32,441 appendix samples 16 were positive for abnormal PrP, indicating an overall prevalence of 493 per m

### Semantic Textual Similarity

In [35]:
import tensorflow as tf
import tensorflow_hub as hub

model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [23]:
embeddings = np.array(model(documents))

In [36]:
q_embed = np.array(model([queries[query_num][1]]))
STS_scores = np.dot(embeddings, q_embed.T)

In [37]:
sorted(zip(documents,list(STS_scores.flatten())), key = lambda tup:tup[1], reverse=True)[:10]

[("CONTEXT Maternal depressive symptoms during pregnancy have been reported in some, but not all, studies to be associated with an increased risk of preterm birth (PTB), low birth weight (LBW), and intrauterine growth restriction (IUGR). OBJECTIVE To estimate the risk of PTB, LBW, and IUGR associated with antenatal depression. DATA SOURCES AND STUDY SELECTION We searched for English-language and non-English-language articles via the MEDLINE, PsycINFO, CINAHL, Social Work Abstracts, Social Services Abstracts, and Dissertation Abstracts International databases (January 1980 through December 2009). We aimed to include prospective studies reporting data on antenatal depression and at least 1 adverse birth outcome: PTB (<37 weeks' gestation), LBW (<2500 g), or IUGR (<10th percentile for gestational age). Of 862 reviewed studies, 29 US-published and non-US-published studies met the selection criteria. DATA EXTRACTION Information was extracted on study characteristics, antenatal depression me

## Evaluation

In [None]:
qrels = []
for qrel in dataset.qrels_iter(): # reading in all the relevancy scores
    qrels.append(qrel)