In [1]:
import faiss
# Read the Faiss index stored under indexes/nfcorpus.bge-base-en-v1.5.
index = faiss.read_index('indexes/nfcorpus.bge-base-en-v1.5/index')
# Number of vectors held by the index; later used as the bound of the brute-force scan.
num_vectors = index.ntotal

In [2]:
# Peek at the first ten vectors stored in the index, reconstructed by position.
for i in range(10):
    print(f"Vector {i}: {index.reconstruct(i)}")


Vector 0: [ 8.6221062e-03 -6.3371010e-02 -2.3351848e-02  2.8421082e-02
  3.5792995e-02  7.9422817e-03  2.9640123e-02  4.3222451e-04
 -2.2976553e-02 -2.3847319e-02  3.2308351e-02  4.0674087e-02
 -2.4176741e-02  1.3898658e-02  1.1943600e-02  6.4755663e-02
  5.2282590e-02  3.9689336e-02 -1.3745768e-02  4.1048728e-02
  6.5053073e-03 -9.6825697e-03  2.8484771e-02  4.3091562e-02
  4.3893036e-02  1.4030558e-02 -5.5567413e-03 -1.4659890e-02
 -5.7544667e-02  2.2318367e-02  5.1890805e-02 -1.2022385e-02
  1.9964257e-03 -4.7140201e-03  3.1958994e-02  2.9172298e-02
  1.6287817e-02  3.1455860e-02 -1.1849582e-02 -1.1328312e-02
 -8.5079186e-02  1.0444752e-03  9.4285980e-03  1.4255876e-02
 -7.9691857e-03  5.5594575e-03  9.3329810e-03  4.9483452e-02
  2.5086505e-02  1.6514344e-02 -4.2699240e-02 -7.7233445e-03
  2.7758935e-02 -7.3832464e-03  2.0016754e-02  5.3666618e-02
  2.0165112e-02 -1.6645530e-02 -3.2427788e-02 -4.1105837e-02
  2.6860474e-02 -2.0371094e-02 -8.4710876e-03 -2.4583715e-03
  5.5356082e-0

In [3]:
# Load the docid file that sits next to the index: one docid per line, where the
# line position is used as the vector position in the Faiss index.
# Iterate the file handle directly (no intermediate readlines() copy) and drop
# the dead `docids = []` pre-initialisation, which was always overwritten.
with open('indexes/nfcorpus.bge-base-en-v1.5/docid', 'r') as fin:
    docids = [line.rstrip() for line in fin]
# Stored vector for document 'MED-4555', looked up via its position in docids.
v1 = index.reconstruct(docids.index('MED-4555'))


In [4]:
from pyserini.encode import AutoDocumentEncoder
# Document encoder for BAAI/bge-base-en-v1.5, run on CPU with mean pooling and
# l2_norm enabled.
encoder = AutoDocumentEncoder('BAAI/bge-base-en-v1.5', device='cpu', pooling='mean', l2_norm=True)
# NOTE(review): this is a placeholder string, not real document text. v1 is the
# stored vector for 'MED-4555', so substitute that document's actual contents
# before comparing v2 against v1 — otherwise the distance is not meaningful.
doc_text = '...document content...'
# Encode the text; the result is indexed as v2[0] when compared with v1.
v2 = encoder.encode(doc_text)


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import numpy as np
# L2 distance between the freshly encoded vector (v2[0]) and the vector
# reconstructed from the index (v1).
# assumes encoder.encode returns a batch-shaped array, hence v2[0] — TODO confirm
np.linalg.norm(v2[0] - v1)


0.96815604

In [6]:
from pyserini.search.faiss import FaissSearcher, AutoQueryEncoder
# Query encoder configured like the document encoder (same model, mean pooling,
# l2_norm enabled), wired into a Faiss-backed searcher over the index directory.
encoder = AutoQueryEncoder('BAAI/bge-base-en-v1.5', device='cpu', pooling='mean', l2_norm=True)
searcher = FaissSearcher('indexes/nfcorpus.bge-base-en-v1.5', encoder)
hits = searcher.search('How to Help Prevent Abdominal Aortic Aneurysms')
# Slice rather than index a fixed range(0, 10): a result list shorter than ten
# hits then prints what is available instead of raising IndexError.
for rank, hit in enumerate(hits[:10], start=1):
    print(f'{rank:2} {hit.docid:7} {hit.score:.6f}')


 1 MED-4555 0.791378
 2 MED-4560 0.710725
 3 MED-4421 0.688938
 4 MED-4993 0.686238
 5 MED-4424 0.686214
 6 MED-1663 0.682199
 7 MED-3436 0.680585
 8 MED-2750 0.677033
 9 MED-4324 0.675772
10 MED-2939 0.674646


In [7]:
# Encode the same query string that was passed to searcher.search earlier in
# the notebook.
q_encoder = AutoQueryEncoder('BAAI/bge-base-en-v1.5', device='cpu', pooling='mean', l2_norm=True)
q_vec = q_encoder.encode('How to Help Prevent Abdominal Aortic Aneurysms')
# Inner product of the query vector with v1 (the stored vector for 'MED-4555');
# expected to reproduce the score the searcher reported for that document.
np.dot(q_vec, v1)


0.79137856

In [8]:
from tqdm import tqdm
# Brute-force retrieval: score every stored vector against the query vector
# with an inner product, then list the ten best-scoring docids.
scores = [
    [docids[pos], np.dot(q_vec, index.reconstruct(pos))]
    for pos in tqdm(range(num_vectors))
]
# Sort descending by score (negated key keeps ties in index order).
scores.sort(key=lambda pair: -pair[1])
for doc_id, doc_score in scores[:10]:
    print(f'{doc_id} {doc_score:.6f}')


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 3633/3633 [00:00<00:00, 410282.89it/s]

MED-4555 0.791379
MED-4560 0.710725
MED-4421 0.688938
MED-4993 0.686238
MED-4424 0.686214
MED-1663 0.682199
MED-3436 0.680585
MED-2750 0.677033
MED-4324 0.675772
MED-2939 0.674646





In [9]:
import json
from pathlib import Path

# Convert the corpus into the JSONL layout written below: one object per line
# with an 'id' field and a 'contents' field (title and text joined by a space).
src_path = Path('collections/nfcorpus/corpus.jsonl')
dst_path = Path('collections/nfcorpus/pyserini-corpus/corpus.jsonl')
# Create the output directory so the cell also works on a fresh checkout.
dst_path.parent.mkdir(parents=True, exist_ok=True)
# The source is opened first, so a missing input fails before the destination
# file is created or truncated. JSON text is UTF-8, so pin the encoding instead
# of relying on the platform default.
with open(src_path, 'r', encoding='utf-8') as f, \
        open(dst_path, 'w', encoding='utf-8') as out:
    for line in f:
        record = json.loads(line)
        s = json.dumps({'id': record['_id'], 'contents': record['title'] + ' ' + record['text']})
        out.write(s + '\n')


In [10]:
from pyserini.search.lucene import LuceneSearcher
# BM25 search over the Lucene index for the same query used in the dense example.
searcher = LuceneSearcher('indexes/lucene.nfcorpus')
hits = searcher.search('How to Help Prevent Abdominal Aortic Aneurysms')
# Slice rather than index a fixed range(0, 10): a result list shorter than ten
# hits then prints what is available instead of raising IndexError.
for rank, hit in enumerate(hits[:10], start=1):
    print(f'{rank:2} {hit.docid:7} {hit.score:.4f}')

 1 MED-4555 11.9305
 2 MED-4423 8.4771
 3 MED-3180 7.1896
 4 MED-2718 6.0102
 5 MED-1309 5.8181
 6 MED-4424 5.7448
 7 MED-1705 5.6101
 8 MED-4902 5.3639
 9 MED-1009 5.2533
10 MED-1512 5.2068


Jul 07, 2024 12:32:41 AM org.apache.lucene.store.MemorySegmentIndexInputProvider <init>
INFO: Using MemorySegmentIndexInput with Java 21; to disable start with -Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false


In [11]:
from pyserini.index.lucene import IndexReader
import json
# Open the Lucene index and fetch the term vector of document 'MED-4555'.
index_reader = IndexReader('indexes/lucene.nfcorpus')
tf = index_reader.get_document_vector('MED-4555')
# Map every term of that document to its BM25 weight. analyzer=None is passed
# because the terms come straight from the index.
bm25_weights = {
    term: index_reader.compute_bm25_term_weight('MED-4555', term, analyzer=None)
    for term in tf
}
print(json.dumps(bm25_weights, sort_keys=True, indent=4))


{
    "1.1": 2.2132339477539062,
    "2003": 2.0071027278900146,
    "2008": 1.9285697937011719,
    "3": 0.8006289005279541,
    "3.1": 2.387873649597168,
    "569,000": 3.995551586151123,
    "65": 2.5261287689208984,
    "85": 2.2027695178985596,
    "aaa": 6.861149311065674,
    "abdomin": 2.810744047164917,
    "about": 1.2124857902526855,
    "advoc": 2.9084181785583496,
    "after": 0.6753702163696289,
    "ag": 0.9848723411560059,
    "algorithm": 2.9084181785583496,
    "american": 1.5901331901550293,
    "among": 0.8441131114959717,
    "analysi": 1.1517744064331055,
    "aneurysm": 5.2744975090026855,
    "aortic": 3.8452444076538086,
    "asian": 2.3056604862213135,
    "associ": 0.7863264083862305,
    "background": 0.8268404006958008,
    "black": 1.860482931137085,
    "can": 0.8828715085983276,
    "cardiovascular": 1.1131560802459717,
    "cessat": 3.4776077270507812,
    "chanc": 2.563375949859619,
    "cigarett": 2.2459728717803955,
    "cohort": 1.5951440334320068,


In [12]:
from pyserini.analysis import Analyzer, get_lucene_analyzer
# Run the query through the default Lucene analyzer so its tokens are produced
# by the same kind of analysis as the terms in the index.
analyzer = Analyzer(get_lucene_analyzer())
query_tokens = analyzer.analyze('How to Help Prevent Abdominal Aortic Aneurysms')
# Multi-hot query representation: every distinct query token gets weight 1.
multihot_query_weights = dict.fromkeys(query_tokens, 1)


In [13]:
# Score = sum of the document's BM25 weights over the terms it shares with the query.
shared_terms = bm25_weights.keys() & multihot_query_weights.keys()
sum(bm25_weights[term] for term in shared_terms)


11.930485963821411

In [14]:
def dot(q_weights, d_weights):
    """Return the sparse dot product of a query and a document weight mapping.

    Args:
        q_weights: mapping of term -> query weight.
        d_weights: mapping of term -> document weight.

    Returns:
        The sum of ``q_weights[term] * d_weights[term]`` over the terms present
        in both mappings; ``0`` when they share no term.

    The previous version summed only the document weights, which equals a dot
    product only when every query weight is 1. Multiplying by the query weight
    gives the same result for such multi-hot queries and makes the function
    correct for arbitrarily weighted queries.
    """
    return sum(q_weights[term] * d_weights[term]
               for term in d_weights.keys() & q_weights.keys())
# Score the multi-hot query against the BM25 weights currently bound to
# bm25_weights (built for 'MED-4555' in the earlier cell).
dot(multihot_query_weights, bm25_weights)


11.930485963821411

In [15]:
from pyserini.search.lucene import LuceneSearcher
from pyserini.index.lucene import IndexReader
from tqdm import tqdm

searcher = LuceneSearcher('indexes/lucene.nfcorpus')
index_reader = IndexReader('indexes/lucene.nfcorpus')

# Brute-force BM25 retrieval: build the BM25 weight vector of every document,
# score it against the multi-hot query with dot(), and list the ten best.
# The per-document term vector and weights use their own names (doc_tf,
# doc_bm25_weights) so the loop does not clobber the notebook-level `tf` and
# `bm25_weights` built for 'MED-4555'; cells that read those stay re-runnable.
scores = []
for i in tqdm(range(0, searcher.num_docs)):
    docid = searcher.doc(i).get('id')
    doc_tf = index_reader.get_document_vector(docid)
    doc_bm25_weights = {
        term: index_reader.compute_bm25_term_weight(docid, term, analyzer=None)
        for term in doc_tf
    }
    score = dot(multihot_query_weights, doc_bm25_weights)
    scores.append([docid, score])
# Sort descending by score.
scores.sort(key=lambda x: -x[1])
for s in scores[:10]:
    print(f'{s[0]} {s[1]:.4f}')


100%|██████████| 3633/3633 [00:05<00:00, 654.19it/s]

MED-4555 11.9305
MED-4423 8.4771
MED-3180 7.1896
MED-2718 6.0102
MED-1309 5.8181
MED-4424 5.7448
MED-1705 5.6101
MED-4902 5.3639
MED-1009 5.2533
MED-1512 5.2068



