<a href="https://colab.research.google.com/github/AlexanderCoudijzer/IR_data_test/blob/main/Search_engine_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data loading testing
See the dataset page for BEIR Climate-FEVER (<https://ir-datasets.com/beir.html#beir/climate-fever>)  
and the `ir_datasets` Python documentation (<https://ir-datasets.com/python.html>).

In [None]:
%%capture
!pip install --upgrade ir_datasets

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import ir_datasets

## Climate-fever test

In [None]:
# Load the dataset registered under the id "beir/climate-fever"; every cell in this section reads from `dataset`.
dataset = ir_datasets.load("beir/climate-fever")

In [None]:
# Which record types does this dataset provide? Order: docs, queries, qrels, scoreddocs.
dataset.has_docs(), dataset.has_queries(), dataset.has_qrels(), dataset.has_scoreddocs()

(True, True, True, False)

In [None]:
%%capture
# Touch only the first document; %%capture hides everything this cell prints.
# NOTE(review): presumably this exists to trigger the corpus download quietly — confirm.
for doc in dataset.docs_iter()[:1]:
    print(doc)

In [None]:
# Field names and types of a document record.
dataset.docs_cls().__annotations__

OrderedDict([('doc_id', str), ('text', str), ('title', str)])

In [None]:
# Peek at ten documents further into the collection by slicing docs_iter() directly
# (slice 1000:1010) instead of iterating from the start.
for doc in dataset.docs_iter()[1000:1010]:
    print(doc)

BeirTitleDoc(doc_id='179th_Fighter_Squadron', text='The 179th Fighter Squadron ( 179 FS ) is a unit of the Minnesota Air National Guard 148th Fighter Wing located at Duluth Air National Guard Base , Minnesota . The 179th is equipped with the General Dynamics F-16C Fighting Falcon .', title='179th Fighter Squadron')
BeirTitleDoc(doc_id='1922_VFL_Grand_Final', text="The 1922 VFL Grand Final was an Australian rules football game contested between the Fitzroy Football Club and Collingwood Football Club , held at the Melbourne Cricket Ground in Melbourne on 21 October 1922 . It was the 26th annual Grand Final of the Victorian Football League , staged to determine the premiers for the 1922 VFL season .   The half-time break was more than thirty minutes .   The match , attended by 50,064 spectators , was won by Fitzroy by a margin of 11 points , marking that club 's seventh premiership victory .", title='1922 VFL Grand Final')
BeirTitleDoc(doc_id='11th_Infantry_Regiment_(Greece)', text="The 1

In [None]:
# Show how many queries the dataset has, then preview a few of them.
# The count is printed explicitly: only the final expression of a cell is displayed
# automatically, so a bare expression on an earlier line is evaluated and thrown away.
print(dataset.queries_count())

N_PREVIEW = 10  # printing every query floods the cell output; raise this to see more
for i, q in enumerate(dataset.queries_iter()):
    if i >= N_PREVIEW:
        break
    print(q)

In [None]:
# Inspect the relevance-judgement (qrels) record schema, label definitions and size.
# The schema is printed explicitly: only the final expression of a cell is displayed
# automatically, so a bare expression on an earlier line would be silently discarded.
print(dataset.qrels_cls().__annotations__)
dataset.qrels_defs(), dataset.qrels_count()

({}, 4681)

In [None]:
# Collect field 1 of every qrel record (one entry per judgement, duplicates included).
# NOTE(review): field 1 is presumably the document id — confirm against qrels_cls().
rel_doc = [qrel[1] for qrel in dataset.qrels_iter()]

# Number of distinct values collected.
len(set(rel_doc))

[INFO] [starting] opening zip file
[INFO] [finished] opening zip file [3ms]


1344

In [None]:
# Build the working corpus: keep only documents whose id appears in `rel_doc`.
# `rel_doc` is a list with repeated ids, so testing membership against it rescans the
# list for every document in the collection; a set makes each lookup constant-time.
relevant_ids = set(rel_doc)

# Each entry is [doc_id, text, title], matching the field order of the document records.
corpus = [
    [doc[0], doc[1], doc[2]]
    for doc in dataset.docs_iter()
    if doc[0] in relevant_ids
]

In [None]:
# Number of documents kept in the filtered corpus.
len(corpus)

1344

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

In [None]:
# Bag-of-words term counts over the document texts (field 1 of each corpus entry).
doc_texts = [entry[1] for entry in corpus]
vectorizer = CountVectorizer(strip_accents='ascii', stop_words='english')
documents_vectorized = vectorizer.fit_transform(doc_texts)
vocabulary = vectorizer.get_feature_names_out()

# Dense documents x terms table with one column per vocabulary term.
term_counts = documents_vectorized.toarray()
dataframe = pd.DataFrame(data=term_counts, columns=vocabulary)
dataframe

Unnamed: 0,00,000,0005,001,003,005,006,007,01,010,...,zoogeographical,zoology,zoon,zoonotic,zooplankton,zoos,zooxanthellae,zooxanthellate,zosteraceae,zvezda
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1339,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1340,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1341,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1342,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Document frequency per term: the number of rows in which the term's count is positive.
dfs = dataframe.gt(0).sum(axis=0)
dfs

00                  4
000               173
0005                1
001                 1
003                 1
                 ... 
zoos                1
zooxanthellae       1
zooxanthellate      1
zosteraceae         1
zvezda              1
Length: 22638, dtype: int64

In [None]:
# BM25-style weighting of the document-term count matrix `dataframe`,
# using the per-term document frequencies `dfs` computed earlier.
N = dataframe.shape[0]  # number of documents (rows)
idfs = np.asarray(np.log10(N / dfs))  # per-term IDF as a plain array, aligned with the columns

# Let's first define all the variables we need:
k_1 = 1.2  # single value
b = 0.8  # single value
# Document length is measured in counted tokens (stop words excluded by the vectorizer),
# not in raw whitespace-separated words.
dls = dataframe.sum(axis=1).tolist()  # vector
avgdl = np.mean(dls)  # single value

tf = dataframe.to_numpy()
numerator = (k_1 + 1) * tf
# Per-document length normaliser, shaped (N, 1) so it broadcasts across all term columns.
length_norm = k_1 * ((1 - b) + b * (np.asarray(dls) / avgdl))
denominator = length_norm.reshape(N, 1) + tf
BM25_tf = numerator / denominator
BM25_score = BM25_tf * idfs  # idfs broadcasts along the term axis

bm25_idf = pd.DataFrame(BM25_score, columns=vocabulary)
bm25_idf

Unnamed: 0,00,000,0005,001,003,005,006,007,01,010,...,zoogeographical,zoology,zoon,zoonotic,zooplankton,zoos,zooxanthellae,zooxanthellate,zosteraceae,zvezda
0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.143564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.102436,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1339,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1340,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1341,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1342,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
query_num = 23

# Materialise every query as a [field 0, field 1] pair so one can be picked by position.
queries = [[q[0], q[1]] for q in dataset.queries_iter()]
query_text = queries[query_num][1]
print([query_text])

# Tokenise the chosen query with the same CountVectorizer settings as the documents;
# the fitted feature names are the query's distinct terms.
vectorizer_q = CountVectorizer(strip_accents='ascii', stop_words='english')
query_vectorized = vectorizer_q.fit_transform([query_text])
q_terms = vectorizer_q.get_feature_names_out()
q_terms

['Earth’s mean temperature over the last 2,000 years shows two previous periods when temperatures were warmer than they are now; from 1–200 A.D., an epoch called the Roman Warm Period, and more recently the Medieval Warm Period from 900–1100 A.D.']


array(['000', '1200', '9001100', 'called', 'earths', 'epoch', 'mean',
       'medieval', 'period', 'periods', 'previous', 'recently', 'roman',
       'shows', 'temperature', 'temperatures', 'warm', 'warmer', 'years'],
      dtype=object)

In [None]:
# Drop query terms that have no column in the BM25 table.
q_terms = [t for t in q_terms if t in bm25_idf.columns]

# A document's score is the sum of its BM25 weights over the remaining query terms.
q_terms_only_df = bm25_idf[q_terms]
score_q_d = q_terms_only_df.sum(axis=1)

# Pair each document text with its score and show the ten highest-scoring pairs.
documents = [entry[1] for entry in corpus]
ranked = sorted(
    zip(documents, score_q_d.values),
    key=lambda pair: pair[1],
    reverse=True,
)
ranked[:10]


[("For information on the description of the Medieval Warm Period and Little Ice Age in various IPCC reports see MWP and LIA in IPCC reportsThe temperature record of the past 1,000 years is reconstructed using data from climate proxy records in conjunction with the modern instrumental temperature record which only covers the last 150 years at a global scale . Large-scale reconstructions covering part or all of the 1st millennium and 2nd millennium have shown that recent temperatures are exceptional : the Intergovernmental Panel on Climate Change Fourth Assessment Report of 2007 concluded that `` Average Northern Hemisphere temperatures during the second half of the 20th century were very likely higher than during any other 50-year period in the last 500 years and likely '' the highest in at least the past 1,300 years . '' The curve shown in graphs of these reconstructions is widely known as the hockey stick graph because of the sharp increase in temperatures during the last century . A

## Scifact test

In [None]:
# Rebind `dataset` to the dataset registered under "beir/scifact/train"; the cells below now read from it.
dataset = ir_datasets.load("beir/scifact/train")

In [None]:
%%capture
# Touch only the first document; %%capture hides everything this cell prints.
# NOTE(review): presumably this exists to trigger the corpus download quietly — confirm.
for doc in dataset.docs_iter()[:1]:
    print(doc)

In [None]:
# Field names and types of a document record.
dataset.docs_cls().__annotations__

OrderedDict([('doc_id', str), ('text', str), ('title', str)])

In [None]:
# Take every document in the collection, stored as [doc_id, text, title].
corpus = [[doc[0], doc[1], doc[2]] for doc in dataset.docs_iter()]
print(len(corpus))

# Plain list of the document texts, in corpus order.
documents = [entry[1] for entry in corpus]

5183


In [None]:
# Bag-of-words term counts over the document texts in `documents`.
vectorizer = CountVectorizer(strip_accents='ascii', stop_words='english')
documents_vectorized = vectorizer.fit_transform(documents)
vocabulary = vectorizer.get_feature_names_out()

# Dense documents x terms table with one column per vocabulary term.
term_counts = documents_vectorized.toarray()
dataframe = pd.DataFrame(data=term_counts, columns=vocabulary)
dataframe

Unnamed: 0,00,000,0000,00001,00002,000035,00004,00005,00006,00008,...,zygosity,zygote,zygotes,zygotic,zymography,zymosan,zyxin,zz,zzw,zzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5178,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5179,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5180,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5181,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# BM25-style weighting of the document-term count matrix `dataframe`.
dfs = (dataframe > 0).sum(axis=0)  # per-term document frequency
N = dataframe.shape[0]  # number of documents (rows)
idfs = np.asarray(np.log10(N / dfs))  # per-term IDF as a plain array, aligned with the columns

# Let's first define all the variables we need:
k_1 = 1.2  # single value
b = 0.8  # single value
# Document length is measured in counted tokens (stop words excluded by the vectorizer),
# not in raw whitespace-separated words.
dls = dataframe.sum(axis=1).tolist()  # vector
avgdl = np.mean(dls)  # single value

tf = dataframe.to_numpy()
numerator = (k_1 + 1) * tf
# Per-document length normaliser, shaped (N, 1) so it broadcasts across all term columns.
length_norm = k_1 * ((1 - b) + b * (np.asarray(dls) / avgdl))
denominator = length_norm.reshape(N, 1) + tf
BM25_tf = numerator / denominator
BM25_score = BM25_tf * idfs  # idfs broadcasts along the term axis

bm25_idf = pd.DataFrame(BM25_score, columns=vocabulary)
bm25_idf

Unnamed: 0,00,000,0000,00001,00002,000035,00004,00005,00006,00008,...,zygosity,zygote,zygotes,zygotic,zymography,zymosan,zyxin,zz,zzw,zzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5180,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5181,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
query_num = 1

# Materialise every query as a [field 0, field 1] pair so one can be picked by position.
queries = [[q[0], q[1]] for q in dataset.queries_iter()]
query_text = queries[query_num][1]
print([query_text])

# Tokenise the chosen query with the same CountVectorizer settings as the documents;
# the fitted feature names are the query's distinct terms.
vectorizer_q = CountVectorizer(strip_accents='ascii', stop_words='english')
query_vectorized = vectorizer_q.fit_transform([query_text])
q_terms = vectorizer_q.get_feature_names_out()
q_terms

['1 in 5 million in UK have abnormal PrP positivity.']


array(['abnormal', 'million', 'positivity', 'prp', 'uk'], dtype=object)

In [None]:
# Drop query terms that have no column in the BM25 table.
q_terms = [t for t in q_terms if t in bm25_idf.columns]

# A document's score is the sum of its BM25 weights over the remaining query terms.
q_terms_only_df = bm25_idf[q_terms]
score_q_d = q_terms_only_df.sum(axis=1)

# Pair each document text with its score and show the ten highest-scoring pairs.
documents = [entry[1] for entry in corpus]
ranked = sorted(
    zip(documents, score_q_d.values),
    key=lambda pair: pair[1],
    reverse=True,
)
ranked[:10]

[('OBJECTIVES To carry out a further survey of archived appendix samples to understand better the differences between existing estimates of the prevalence of subclinical infection with prions after the bovine spongiform encephalopathy epizootic and to see whether a broader birth cohort was affected, and to understand better the implications for the management of blood and blood products and for the handling of surgical instruments. DESIGN Irreversibly unlinked and anonymised large scale survey of archived appendix samples. SETTING Archived appendix samples from the pathology departments of 41 UK hospitals participating in the earlier survey, and additional hospitals in regions with lower levels of participation in that survey. SAMPLE 32,441 archived appendix samples fixed in formalin and embedded in paraffin and tested for the presence of abnormal prion protein (PrP). RESULTS Of the 32,441 appendix samples 16 were positive for abnormal PrP, indicating an overall prevalence of 493 per m