In [1]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk, parallel_bulk
from sentence_transformers import SentenceTransformer, util
import ir_measures
from ir_measures import *
import pandas as pd
import json
from tqdm import tqdm
from time import time


2023-03-21 03:10:28.260215: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-21 03:10:28.580739: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-03-21 03:10:29.663083: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-03-21 03:10:29.663132: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] 

### Connection

In [2]:
# Client for the Elasticsearch node at http://localhost:9200; used by the indexing and search cells.
es = Elasticsearch('http://localhost:9200')


### Index Configuration

In [3]:
index_name = 'wiki'

# Single field: the document text, analysed with the custom 'white' analyzer defined in settings.
mappings = {
    'properties': {
        'text': {'type': 'text', 'analyzer': 'white'},
    },
}

settings = {
    'number_of_shards': 5,
    'index': {
        # BM25 is requested explicitly, although it is the default configuration anyway.
        'similarity': {'default': {'type': 'BM25'}},
    },
    'analysis': {
        # 'white' analyzer: whitespace tokenizer, nothing else configured.
        'analyzer': {'white': {'tokenizer': 'whitespace'}},
    },
}

# Start from a clean slate: drop an existing index of the same name, then create it.
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)
es.indices.create(index=index_name, settings=settings, mappings=mappings)


{'acknowledged': True, 'shards_acknowledged': True, 'index': 'wiki'}

### WikiIR Collection

In [4]:
# Load the document collection; the path is relative to the working directory.
df = pd.read_csv('wikIR1k/documents.csv')

# Print the frame's dimensions, then show its first rows as the cell output.
print(df.shape)
df.head()


(369721, 2)


Unnamed: 0,id_right,text_right
0,1781133,it was used in landing craft during world war ...
1,2426736,after rejecting an offer from cambridge univer...
2,2224122,mat zan coached kuala lumpur fa in 1999 and wo...
3,219642,a barcode is a machine readable optical label ...
4,1728654,since the subordination of the monarchy under ...


### Indexing documents

In [5]:
def create_es_action(index, doc_id, document):
    """Build one bulk action dict for a single document.

    Args:
        index: value stored under '_index'.
        doc_id: value stored under '_id'.
        document: value stored under '_source'.
    Returns:
        dict with the keys '_index', '_id' and '_source'.
    """
    action = dict(_index=index, _id=doc_id, _source=document)
    return action


def es_action_generator(df):
    """Yield one bulk action per row of ``df`` while showing a tqdm progress bar.

    Args:
        df: DataFrame with an 'id_right' column (used as the document id) and a
            'text_right' column (stored under the 'text' field).
    Yields:
        Action dicts built by create_es_action() for the module-level ``index_name``.
    """
    # Walk the two needed columns directly instead of df.iterrows(): no per-row
    # Series is constructed and the unused row label is no longer unpacked.
    rows = zip(df['id_right'], df['text_right'])
    for doc_id, text in tqdm(rows, total=df.shape[0], bar_format='{l_bar}{bar:30}{r_bar}{bar:-10b}'):
        yield create_es_action(index_name, doc_id, {'text': text})


# Wall-clock timing of the whole bulk-indexing pass.
start = time()
# Stream the generated actions to Elasticsearch: 4 threads, chunks of 1000 actions.
for ok, result in parallel_bulk(es, es_action_generator(df), queue_size=4, thread_count=4, chunk_size=1000):
    # Print the result of any action reported as not ok.
    # NOTE(review): parallel_bulk presumably raises on errors by default, in which
    # case this branch only fires with raise_on_error=False — confirm.
    if not ok:
        print(result)
stop = time()

print('Indexing time:', stop-start)
        
# Refresh the index, then show the document count as the cell output.
es.indices.refresh(index=index_name)
es.cat.count(index=index_name, format='json')


100%|██████████████████████████████| 369721/369721 [00:34<00:00, 10672.43it/s]  


Indexing time: 35.22864580154419


[{'epoch': '1679346670', 'timestamp': '21:11:10', 'count': '369721'}]

### Test queries

In [6]:
# Test-split queries; the cells below read the 'id_left' and 'text_left' columns.
test_queries = pd.read_csv('wikIR1k/test/queries.csv')
test_queries

Unnamed: 0,id_left,text_left
0,158491,southern methodist university
1,5728,halakha
2,13554,chief justice of the united states
3,32674,patsy cline
4,406391,dierks bentley
...,...,...
95,679227,hiv aids
96,2136797,maren morris
97,5622,homer
98,1313598,south pole


In [7]:
def pretty_print_result(search_result, fields=()):
    """Print the total hit count and a one-line summary of every returned hit.

    Args:
        search_result: search response mapping; reads
            search_result['hits']['total']['value'] and the list of hits in
            search_result['hits']['hits'].
        fields: iterable of '_source' field names to print under each hit.
            Defaults to an immutable empty tuple instead of a shared mutable list.
    Returns:
        None; everything is written to stdout.
    Raises:
        KeyError: if a requested field is absent from a hit's '_source'.
    """
    res = search_result['hits']
    print(f'Total documents: {res["total"]["value"]}')
    for hit in res['hits']:
        print(f'Doc {hit["_id"]}, score is {hit["_score"]}')
        for field in fields:
            print(f'{field}: {hit["_source"][field]}')
    
def search(query, *args):
    """Run ``query`` against the index (top 20 hits) and print the results.

    Extra positional arguments are the '_source' field names to print per hit.
    Returns whatever pretty_print_result() returns.
    """
    response = es.search(index=index_name, query=query, size=20)
    return pretty_print_result(response, args)

def get_doc_by_id(doc_id):
    """Fetch the document stored under ``doc_id`` and return its '_source'."""
    stored = es.get(index=index_name, id=doc_id)
    return stored['_source']


In [8]:
def make_query(text):
    """Build a bool query whose single 'must' clause matches ``text`` on the 'text' field."""
    match_clause = {'match': {'text': text}}
    return {'bool': {'must': match_clause}}

# Sanity check: print the hits returned for the first test query.
search(make_query(test_queries['text_left'][0]))

Total documents: 10000
Doc 1880296, score is 17.230719
Doc 607552, score is 17.198406
Doc 2261272, score is 17.183655
Doc 1957435, score is 16.908918
Doc 625257, score is 16.856976
Doc 635537, score is 16.771313
Doc 1774491, score is 16.640131
Doc 663828, score is 16.487574
Doc 158491, score is 15.997955
Doc 1956922, score is 15.973572
Doc 1180246, score is 15.590252
Doc 1170039, score is 15.534702
Doc 945068, score is 15.526761
Doc 589549, score is 15.501228
Doc 360918, score is 15.501228
Doc 685181, score is 15.335788
Doc 2411344, score is 15.325968
Doc 1158969, score is 15.273922
Doc 1093529, score is 15.163386
Doc 742912, score is 15.109789


## Initial evaluation

In [9]:
# Build the run: query id (as str) -> {document id: score} for the top 20 hits of each test query.
run = {}

for _, row in test_queries.iterrows():
    query_id = str(row['id_left'])
    search_res = es.search(index=index_name, query=make_query(row['text_left']), size=20)['hits']
    run[query_id] = {hit['_id']: hit['_score'] for hit in search_res['hits']}
                 

In [10]:
# Relevance judgements for the test split, read with the TREC qrels reader.
qrels = ir_measures.read_trec_qrels('wikIR1k/test/qrels')

# Aggregate P@10, P@20 and MAP@20 over all queries of the Elasticsearch run.
ir_measures.calc_aggregate([P@10, P@20, MAP@20], qrels, run)


{P@20: 0.14800000000000005,
 P@10: 0.20599999999999988,
 AP@20: 0.14619425811737782}

## MSMARCO Model and Cosine similarity

In [11]:
def get_run_from_model(model, run):
    '''
    Re-score every (query, document) pair of ``run`` with embedding cosine similarity.

    Query and document texts are taken from the module-level ``test_queries``
    and ``df`` frames.

    Args:
        model: SentenceTransformer model
        run: query-document-score dict-of-dict (query ids are strings)
    Returns:
        run_cosine: query-document-cosine_similarity_score dict-of-dict;
            a query with no documents in ``run`` maps to an empty dict
    Raises:
        KeyError: if a query or document id from ``run`` is missing from the frames
    '''

    # Build the id -> text lookups once. The former per-document boolean-mask
    # filter rescanned the whole documents frame for every single hit.
    # drop_duplicates keeps the first row per id, matching the former `.iloc[0]`.
    query_text_by_id = (
        test_queries.drop_duplicates('id_left').set_index('id_left')['text_left'].to_dict()
    )
    doc_text_by_id = (
        df.drop_duplicates('id_right').set_index('id_right')['text_right'].to_dict()
    )

    run_cosine = {}

    for q_id, scored_docs in tqdm(list(run.items()), total=len(run), bar_format='{l_bar}{bar:30}{r_bar}{bar:-10b}'):
        run_cosine[q_id] = {}
        doc_ids = list(scored_docs)

        # nothing to re-score: skip before spending any encoder time
        if not doc_ids:
            continue

        # query encoding
        query_embedding = model.encode(query_text_by_id[int(q_id)])

        # documents encoding (one batch per query)
        docs_embedding = model.encode([doc_text_by_id[int(doc_id)] for doc_id in doc_ids])

        # computing cosine similarity; row 0 holds the scores of the single query
        cos_sim = util.cos_sim(query_embedding, docs_embedding)[0]
        for doc_id, score in zip(doc_ids, cos_sim):
            run_cosine[q_id][doc_id] = score.item()

    return run_cosine


In [12]:
# Sentence-embedding model used to re-score the retrieved documents.
model = SentenceTransformer('msmarco-distilbert-cos-v5')
# Alternative checkpoint (currently disabled):
#model = SentenceTransformer('msmarco-MiniLM-L6-cos-v5')

# Re-score the hits of every query in `run` with embedding cosine similarity.
run_cosine = get_run_from_model(model, run)

100%|██████████████████████████████| 100/100 [04:08<00:00,  2.48s/it]                                                                                                                                              


In [13]:
# Read the qrels again before evaluating the re-scored run.
# NOTE(review): presumably required because read_trec_qrels returns a one-shot iterator — confirm.
qrels = ir_measures.read_trec_qrels('wikIR1k/test/qrels')

# Same metrics as for the Elasticsearch run, now computed on the cosine-similarity scores.
ir_measures.calc_aggregate([P@10, P@20, MAP@20], qrels, run_cosine)


{P@20: 0.14800000000000005,
 P@10: 0.2349999999999999,
 AP@20: 0.17031915835189756}