In [1]:
from pyserini.index.lucene import IndexReader
import json

index_reader = IndexReader('indexes/lucene-index-msmarco-passage')
tf = index_reader.get_document_vector('7187158')
bm25_weights = \
    {term: index_reader.compute_bm25_term_weight('7187158', term, analyzer=None) \
     for term in tf.keys()}

print(json.dumps(bm25_weights, indent=4, sort_keys=True))

{
    "be": 2.637899875640869,
    "brother": 4.09124231338501,
    "bubba": 7.102361679077148,
    "bubba's\u00e2": 11.091651916503906,
    "deen": 7.4197235107421875,
    "earl": 5.663764953613281,
    "former": 3.8262834548950195,
    "gener": 2.2932770252227783,
    "her": 2.7393782138824463,
    "hier": 8.24051284790039,
    "manag": 2.832794189453125,
    "paula": 6.438521862030029,
    "su": 5.404428005218506,
    "uncl": 5.362298488616943,
    "w": 3.9339818954467773
}


In [2]:
from pyserini.analysis import Analyzer, get_lucene_analyzer

analyzer = Analyzer(get_lucene_analyzer())
query_tokens = analyzer.analyze('what is paula deen\'s brother')
multihot_query_weights = {k: 1 for k in query_tokens}

In [3]:
query_tokens

['what', 'paula', 'deen', 'brother']

In [4]:
multihot_query_weights

{'what': 1, 'paula': 1, 'deen': 1, 'brother': 1}

In [5]:
import numpy as np

# Gather up the dimensions (i.e., the combined dictionary).
terms = set.union(set(bm25_weights.keys()), set(multihot_query_weights.keys()))

bm25_vec = np.array([ bm25_weights.get(t, 0) for t in terms ])
multihot_qvec = np.array([ multihot_query_weights.get(t, 0) for t in terms ])

np.dot(multihot_qvec, bm25_vec)

17.949487686157227

In [6]:
sum({term: bm25_weights[term] \
     for term in bm25_weights.keys() & \
     multihot_query_weights.keys()}.values())

17.949487686157227

In [7]:
from pyserini.search.lucene import LuceneSearcher

searcher = LuceneSearcher('indexes/lucene-index-msmarco-passage')
hits = searcher.search('what is paula deen\'s brother')

for i in range(0, 10):
    print(f'{i+1:2} {hits[i].docid:7} {hits[i].score:.5f}')

 1 7187158 17.94950
 2 7187157 17.66560
 3 7187163 17.39060
 4 7546327 17.03410
 5 7187160 16.56520
 6 8227279 15.74180
 7 2298838 15.60820
 8 7617404 15.40040
 9 7187156 15.27550
10 2298839 14.97780


In [1]:
from pyserini.search.faiss import FaissSearcher, AutoQueryEncoder

encoder = AutoQueryEncoder('facebook/contriever-msmarco', device='cpu', pooling='mean')
searcher = FaissSearcher('indexes/faiss.nfcorpus.contriever-msmacro', encoder)
hits = searcher.search('How to Help Prevent Abdominal Aortic Aneurysms')

for i in range(0, 10):
    print(f'{i+1:2} {hits[i].docid:7} {hits[i].score:.6f}')
    

 1 MED-4555 1.472201
 2 MED-3180 1.125014
 3 MED-1309 1.067153
 4 MED-2224 1.059536
 5 MED-4423 1.038440
 6 MED-4887 1.032622
 7 MED-2530 1.020758
 8 MED-2372 1.016142
 9 MED-1006 1.013599
10 MED-2587 1.010811


In [2]:
import faiss

index = faiss.read_index('indexes/faiss.nfcorpus.contriever-msmacro/index')
num_vectors = index.ntotal

In [4]:
for i in range(num_vectors):
    vector = index.reconstruct(i)
    print(f"Vector {i}: {vector}")

Vector 0: [-3.71267349e-02 -3.88003439e-02 -2.59502623e-02 -4.85447161e-02
 -2.57052779e-02  2.79756133e-02  3.15171070e-02 -3.78646068e-02
  2.04621181e-02  5.30080907e-02 -1.13070235e-01  5.93898073e-03
 -2.58902516e-02 -2.72342041e-02 -5.88500220e-03  4.49550413e-02
  7.18338927e-03  1.03232689e-01 -5.70061170e-02  1.78443659e-02
 -1.81763284e-02  5.71612716e-02 -1.64714009e-02 -5.75788431e-02
 -2.65387110e-02 -8.80167335e-02 -6.04181103e-02 -1.22654311e-01
  6.50560856e-03  6.32748604e-02 -8.69280472e-02  1.34953901e-01
 -1.79489348e-02 -2.07273867e-02 -2.78854370e-03  6.89641759e-03
 -2.58775260e-02  9.10568759e-02 -4.67669368e-02 -2.90269451e-03
 -3.19594704e-02 -1.98259857e-02 -8.79268441e-03  3.46660241e-02
 -6.27628788e-02 -6.29161522e-02 -6.37166277e-02 -1.23599395e-01
 -1.16034215e-02  6.85494021e-02 -1.47189364e-01  1.50794182e-02
  4.42653485e-02 -6.33233562e-02 -6.66711666e-03 -1.08846312e-03
 -9.58894379e-03 -1.60400420e-02  1.78580526e-02  1.80137511e-02
  6.94414750e-0

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [5]:
docids = []
with open('indexes/faiss.nfcorpus.contriever-msmacro/docid', 'r') as fin:
    docids = [line.rstrip() for line in fin.readlines()]

v1 = index.reconstruct(docids.index('MED-4555'))

In [6]:
# This is the string contents of doc MED-4555
doc_text = 'Analysis of risk factors for abdominal aortic aneurysm in a cohort of more than 3 million individuals. BACKGROUND: Abdominal aortic aneurysm (AAA) disease is an insidious condition with an 85% chance of death after rupture. Ultrasound screening can reduce mortality, but its use is advocated only for a limited subset of the population at risk. METHODS: We used data from a retrospective cohort of 3.1 million patients who completed a medical and lifestyle questionnaire and were evaluated by ultrasound imaging for the presence of AAA by Life Line Screening in 2003 to 2008. Risk factors associated with AAA were identified using multivariable logistic regression analysis. RESULTS: We observed a positive association with increasing years of smoking and cigarettes smoked and a negative association with smoking cessation. Excess weight was associated with increased risk, whereas exercise and consumption of nuts, vegetables, and fruits were associated with reduced risk. Blacks, Hispanics, and Asians had lower risk of AAA than whites and Native Americans. Well-known risk factors were reaffirmed, including male gender, age, family history, and cardiovascular disease. A predictive scoring system was created that identifies aneurysms more efficiently than current criteria and includes women, nonsmokers, and individuals aged <65 years. Using this model on national statistics of risk factors prevalence, we estimated 1.1 million AAAs in the United States, of which 569,000 are among women, nonsmokers, and individuals aged <65 years. CONCLUSIONS: Smoking cessation and a healthy lifestyle are associated with lower risk of AAA. We estimated that about half of the patients with AAA disease are not eligible for screening under current guidelines. We have created a high-yield screening algorithm that expands the target population for screening by including at-risk individuals not identified with existing screening criteria.'

from pyserini.encode import AutoDocumentEncoder
encoder = AutoDocumentEncoder('facebook/contriever-msmarco', device='cpu', pooling='mean')

v2 = encoder.encode(doc_text)

In [10]:
v2[0]

array([-1.15654320e-01, -3.10214758e-02, -4.09645401e-03,  2.19829939e-02,
       -7.59008676e-02,  8.16636756e-02, -6.72997087e-02,  4.92867529e-02,
        3.95880826e-02,  9.74194892e-03, -1.22898638e-01,  1.42303118e-02,
        2.26314589e-02, -3.63633479e-03,  1.19752228e-01,  1.29842497e-02,
       -1.54617243e-02,  8.28077570e-02, -2.11061165e-02, -4.76031378e-02,
       -4.78967577e-02,  9.33353603e-03, -1.70076285e-02, -2.05107369e-02,
        1.03752732e-01, -4.21517044e-02, -1.13575868e-01,  1.04648825e-02,
        2.21371688e-02,  1.01485141e-01, -8.67812559e-02,  1.30011395e-01,
       -1.23853356e-01,  4.62748408e-02, -3.26634571e-02, -2.38091275e-02,
       -3.37111466e-02,  6.00792319e-02, -3.96381840e-02,  5.08259516e-03,
       -5.64277060e-02,  6.25779778e-02, -1.11561313e-01, -7.51701966e-02,
        2.56778207e-02, -2.61058938e-02, -5.12458831e-02, -1.19577587e-01,
       -1.03980221e-01, -6.18074946e-02, -2.78474316e-02,  2.89081912e-02,
        5.56373447e-02, -

In [8]:
import numpy as np
np.linalg.norm(v2[0] - v1)

0.0

In [11]:
from pyserini.search.faiss import FaissSearcher, AutoQueryEncoder

encoder = AutoQueryEncoder('facebook/contriever-msmarco', device='cpu', pooling='mean')
searcher = FaissSearcher('indexes/faiss.nfcorpus.contriever-msmacro', encoder)
hits = searcher.search('How to Help Prevent Abdominal Aortic Aneurysms')

for i in range(0, 10):
    print(f'{i+1:2} {hits[i].docid:7} {hits[i].score:.6f}')

 1 MED-4555 1.472201
 2 MED-3180 1.125014
 3 MED-1309 1.067153
 4 MED-2224 1.059536
 5 MED-4423 1.038440
 6 MED-4887 1.032622
 7 MED-2530 1.020758
 8 MED-2372 1.016142
 9 MED-1006 1.013599
10 MED-2587 1.010811


In [12]:
from pyserini.encode import AutoQueryEncoder

q_encoder = AutoQueryEncoder('facebook/contriever-msmarco', device='cpu', pooling='mean')
q_vec = q_encoder.encode('How to Help Prevent Abdominal Aortic Aneurysms')

In [13]:
np.dot(q_vec, v1)

1.4722011

In [14]:
from tqdm import tqdm

scores = []
# Iterate through all document vectors and compute dot product.
for i in tqdm(range(num_vectors)):
    vector = index.reconstruct(i)
    score = np.dot(q_vec, vector)
    scores.append([docids[i], score])

# Sort by score descending.
scores.sort(key=lambda x: -x[1])

for s in scores[:10]:
    print(f'{s[0]} {s[1]:.6f}')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|███████████████████████████████████| 3633/3633 [00:00<00:00, 438816.60it/s]

MED-4555 1.472201
MED-3180 1.125014
MED-1309 1.067153
MED-2224 1.059536
MED-4423 1.038440
MED-4887 1.032622
MED-2530 1.020758
MED-2372 1.016142
MED-1006 1.013599
MED-2587 1.010811





In [15]:
import json

with open('collections/nfcorpus/pyserini-corpus/corpus.jsonl', 'w') as out:
    with open('collections/nfcorpus/corpus.jsonl', 'r') as f:
        for line in f:
            l = json.loads(line)
            s = json.dumps({'id': l['_id'], 'contents': l['title'] + ' ' + l['text']})
            out.write(s + '\n')

In [16]:
from pyserini.search.lucene import LuceneSearcher

searcher = LuceneSearcher('indexes/lucene.nfcorpus')
hits = searcher.search('How to Help Prevent Abdominal Aortic Aneurysms')

for i in range(0, 10):
    print(f'{i+1:2} {hits[i].docid:7} {hits[i].score:.4f}')

 1 MED-4555 11.9305
 2 MED-4423 8.4771
 3 MED-3180 7.1896
 4 MED-2718 6.0102
 5 MED-1309 5.8181
 6 MED-4424 5.7448
 7 MED-1705 5.6101
 8 MED-4902 5.3639
 9 MED-1009 5.2533
10 MED-1512 5.2068


In [17]:
from pyserini.index.lucene import IndexReader
import json

index_reader = IndexReader('indexes/lucene.nfcorpus')
tf = index_reader.get_document_vector('MED-4555')
bm25_weights = \
    {term: index_reader.compute_bm25_term_weight('MED-4555', term, analyzer=None) \
     for term in tf.keys()}

print(json.dumps(bm25_weights, indent=4, sort_keys=True))

{
    "1.1": 2.2132339477539062,
    "2003": 2.0071027278900146,
    "2008": 1.9285697937011719,
    "3": 0.8006289005279541,
    "3.1": 2.387873649597168,
    "569,000": 3.995551586151123,
    "65": 2.5261287689208984,
    "85": 2.2027695178985596,
    "aaa": 6.861149311065674,
    "abdomin": 2.810744047164917,
    "about": 1.2124857902526855,
    "advoc": 2.9084181785583496,
    "after": 0.6753702163696289,
    "ag": 0.9848723411560059,
    "algorithm": 2.9084181785583496,
    "american": 1.5901331901550293,
    "among": 0.8441131114959717,
    "analysi": 1.1517744064331055,
    "aneurysm": 5.2744975090026855,
    "aortic": 3.8452444076538086,
    "asian": 2.3056604862213135,
    "associ": 0.7863264083862305,
    "background": 0.8268404006958008,
    "black": 1.860482931137085,
    "can": 0.8828715085983276,
    "cardiovascular": 1.1131560802459717,
    "cessat": 3.4776077270507812,
    "chanc": 2.563375949859619,
    "cigarett": 2.2459728717803955,
    "cohort": 1.5951440334320068,


In [18]:
from pyserini.analysis import Analyzer, get_lucene_analyzer

analyzer = Analyzer(get_lucene_analyzer())
query_tokens = analyzer.analyze('How to Help Prevent Abdominal Aortic Aneurysms')
multihot_query_weights = {k: 1 for k in query_tokens}

In [19]:
multihot_query_weights

{'how': 1, 'help': 1, 'prevent': 1, 'abdomin': 1, 'aortic': 1, 'aneurysm': 1}

In [20]:
sum({term: bm25_weights[term] \
     for term in bm25_weights.keys() & \
     multihot_query_weights.keys()}.values())

11.930485963821411

In [21]:
def dot(q_weights, d_weights):
    return sum({term: d_weights[term] \
                for term in d_weights.keys() & \
                q_weights.keys()}.values())

dot(multihot_query_weights, bm25_weights)

11.930485963821411

In [22]:
from pyserini.search.lucene import LuceneSearcher
from pyserini.index.lucene import IndexReader
from tqdm import tqdm

searcher = LuceneSearcher('indexes/lucene.nfcorpus')
index_reader = IndexReader('indexes/lucene.nfcorpus')

scores = []
# Iterate through all docids in the index.
for i in tqdm(range(0, searcher.num_docs)):
    docid = searcher.doc(i).get('id')
    # Reconstruct the BM25 document vector.
    tf = index_reader.get_document_vector(docid)
    bm25_weights = \
        {term: index_reader.compute_bm25_term_weight(docid, term, analyzer=None) \
         for term in tf.keys()}
    # Compute and retain the query-document score.
    score = dot(multihot_query_weights, bm25_weights)
    scores.append([docid, score])

# Sort by score descending.
scores.sort(key=lambda x: -x[1])

for s in scores[:10]:
    print(f'{s[0]} {s[1]:.4f}')

100%|██████████████████████████████████████| 3633/3633 [00:06<00:00, 602.06it/s]

MED-4555 11.9305
MED-4423 8.4771
MED-3180 7.1896
MED-2718 6.0102
MED-1309 5.8181
MED-4424 5.7448
MED-1705 5.6101
MED-4902 5.3639
MED-1009 5.2533
MED-1512 5.2068



