In [1]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk, parallel_bulk
from sentence_transformers import SentenceTransformer, util
import ir_measures
from ir_measures import *
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
from time import time


2023-03-21 14:20:23.488824: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-21 14:20:23.591763: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-03-21 14:20:24.106937: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-03-21 14:20:24.106988: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] 

### Connection

In [2]:
# Client for the Elasticsearch node at http://localhost:9200 (the address is hard-coded here).
es = Elasticsearch('http://localhost:9200')


### Index Configuration

In [3]:
# Name of the index that stores the document collection; used by every search/index call below.
index_name = 'wiki'

# One full-text field, `text`, analyzed with the custom 'white' analyzer declared in `settings`.
mappings = {
    'properties': {
        'text': {
            'type': 'text',
            'analyzer': 'white'
        }
    }
}

# Index settings: BM25 as the default similarity, plus the custom 'white' analyzer,
# which is configured with the whitespace tokenizer only (no token filters are declared here).
settings = {
    'index' : {
        'similarity' : {
          'default' : {
            'type' : 'BM25'   # stated explicitly, although BM25 is the default configuration anyway
          }
        }
    },
    'analysis' : {
        'analyzer' : {
            'white' : {
                'tokenizer' : 'whitespace'
            }
        }
    }
}

# Recreate the index from scratch: an existing index with this name is deleted (with its documents) first.
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)
es.indices.create(index=index_name, settings=settings, mappings=mappings)


{'acknowledged': True, 'shards_acknowledged': True, 'index': 'wiki'}

### WikiIR Collection

In [4]:
# Load the document collection. The columns used later are `id_right` (document id)
# and `text_right` (document text).
# NOTE(review): the CSV's first column is read as a regular column and shows up as "Unnamed: 0"
# in the preview below; it is not used anywhere else in this notebook.
df = pd.read_csv('wikIR1k/documents.csv')

print(df.shape)
df.head()


(369721, 2)


Unnamed: 0,id_right,text_right
0,1781133,it was used in landing craft during world war ...
1,2426736,after rejecting an offer from cambridge univer...
2,2224122,mat zan coached kuala lumpur fa in 1999 and wo...
3,219642,a barcode is a machine readable optical label ...
4,1728654,since the subordination of the monarchy under ...


### Indexing documents

In [5]:
def create_es_action(index, doc_id, document):
    '''
    Desc:
        Build a single action dict in the shape consumed by the Elasticsearch bulk helpers
    Args:
        index: name of the target index
        doc_id: value stored as the document `_id`
        document: dict stored as the document `_source` (passed through as-is, not copied)
    Returns:
        action: dict with `_index`, `_id` and `_source` keys
    '''
    action = dict(_index=index, _id=doc_id, _source=document)
    return action


def es_action_generator(df):
    '''
    Desc:
        Lazily yield one bulk-index action per document row while showing a progress bar
    Args:
        df: pandas DataFrame with `id_right` (document id) and `text_right` (document text) columns
    Yields:
        action: dict built by create_es_action for the index named by the notebook-level `index_name`;
                the `_source` holds the document text under the `text` key
    '''
    # Iterate over the two needed columns directly: iterrows() would build a full Series
    # for every row and also hand back a row label that is never used.
    id_text_pairs = zip(df['id_right'], df['text_right'])

    for doc_id, text in tqdm(id_text_pairs, total=df.shape[0], bar_format='{l_bar}{bar:30}{r_bar}{bar:-10b}'):
        yield create_es_action(index_name, doc_id, {'text': text})


# Bulk-index the whole collection with the parallel helper and measure the wall-clock time.
start = time()
for ok, result in parallel_bulk(es, es_action_generator(df), queue_size=4, thread_count=4, chunk_size=1000):
    # Only actions that were reported as failed are printed; successful ones are silently consumed.
    if not ok:
        print(result)
stop = time()

print('Indexing time:', stop-start)

# Refresh the index before counting so that the freshly indexed documents are included in the count.
es.indices.refresh(index=index_name)
es.cat.count(index=index_name, format='json')


100%|██████████████████████████████| 369721/369721 [00:28<00:00, 13149.87it/s]  


Indexing time: 28.21251106262207


[{'epoch': '1679386856', 'timestamp': '08:20:56', 'count': '369721'}]

### Train queries

In [6]:
# Training queries; the columns used below are `id_left` (query id) and `text_left` (query text).
train_queries = pd.read_csv('wikIR1k/training/queries.csv')
train_queries

Unnamed: 0,id_left,text_left
0,123839,yanni
1,188629,k pop
2,13898,venice film festival
3,316959,downtown brooklyn
4,515031,pennsylvania house of representatives
...,...,...
1439,896124,british ceylon
1440,12319,scottish national party
1441,4421,cinema of china
1442,296526,gold mining


In [7]:
def pretty_print_result(search_result, fields=()):
    '''
    Desc:
        Print the total number of matches and, for every returned hit, its id, its score
        and the requested `_source` fields
    Args:
        search_result: Elasticsearch search response; `hits.total.value` and `hits.hits` are read
        fields: iterable of `_source` field names to print for each hit (none by default)
    Returns:
        None, everything is written to stdout
    Raises:
        KeyError: if a requested field is missing from a hit's `_source`
    '''
    # The default is an immutable tuple rather than a list, so no mutable object is shared between calls.
    res = search_result['hits']
    print(f'Total documents: {res["total"]["value"]}')
    for hit in res['hits']:
        print(f'Doc {hit["_id"]}, score is {hit["_score"]}')
        for field in fields:
            print(f'{field}: {hit["_source"][field]}')
    
def search(query, *args):
    '''
    Desc:
        Run a query against the notebook's index and print up to 20 hits
    Args:
        query: Elasticsearch query body (for example the output of make_query)
        *args: names of `_source` fields to print for every hit
    Returns:
        whatever pretty_print_result returns (it only prints)
    '''
    response = es.search(index=index_name, query=query, size=20)
    return pretty_print_result(response, args)

def get_doc_by_id(doc_id):
    '''
    Desc:
        Fetch one document from the notebook's index by its id
    Args:
        doc_id: id of the document to fetch
    Returns:
        the `_source` part of the Elasticsearch `get` response
    '''
    document = es.get(index=index_name, id=doc_id)
    return document['_source']


In [8]:
def make_query(text):
    '''
    Desc:
        Build the Elasticsearch query body used for retrieval:
        a bool query whose single `must` clause is a match on the `text` field
    Args:
        text: query string
    Returns:
        query: nested dict describing the bool/must/match query
    '''
    match_clause = {'match': {'text': text}}
    return {'bool': {'must': match_clause}}

# Sanity check: run the first training query and print the ids and scores of the returned documents.
search(make_query(train_queries['text_left'][0]))

Total documents: 11
Doc 806300, score is 18.998993
Doc 123839, score is 18.381817
Doc 806326, score is 17.52774
Doc 836567, score is 17.52774
Doc 1793430, score is 16.267975
Doc 806075, score is 16.267975
Doc 806263, score is 14.223423
Doc 799188, score is 10.328993
Doc 1727316, score is 10.328993
Doc 817579, score is 10.328993
Doc 1901730, score is 10.328993


# Re-ranking

## Sampling train queries

In [9]:
# Random sample of 500 training queries; the fixed random_state makes the sample reproducible.
train_queries_sample = train_queries.sample(500, random_state=42)
train_queries_sample

Unnamed: 0,id_left,text_left
413,32411,the salvation army
316,37955,duisburg
1034,128473,north somerset
65,32945,seine et marne
1024,22478,brisbane river
...,...,...
1368,90040,episcopal see
1289,575724,telugu cinema
838,77289,oviedo
266,605568,upper carniola


## Top50 documents by BM25

In [10]:
# Build the BM25 run as a dict-of-dict: {query id (str): {doc id: BM25 score}},
# holding up to 50 hits for every sampled training query

run = {}

for row_label, query_row in train_queries_sample.iterrows():
    response = es.search(index=index_name, query=make_query(query_row['text_left']), size=50)
    run[str(query_row['id_left'])] = {hit['_id']: hit['_score'] for hit in response['hits']['hits']}
                 

In [11]:
def normalize_run(run):
    '''
    Desc:
        Min-Max normalization of scores into [0,1] range, done separately for every query
    Args:
        run: dict-of-dict with query-doc-scores
    Returns:
        run_norm: dict-of-dict with normalized scores. A query without documents maps to an
                  empty dict; a query whose documents all share the same score (including
                  the single-document case) gets 0.0 for each of its documents
    '''

    run_norm = {}

    for query, doc_scores in run.items():
        if not doc_scores:
            run_norm[query] = {}
            continue

        scores_min = min(doc_scores.values())
        score_span = max(doc_scores.values()) - scores_min

        if score_span == 0:
            # min == max, so plain min-max scaling would be 0/0. Every numerator (score - min)
            # is zero, hence 0.0 is assigned directly. Dividing that zero by the max score
            # instead gives the same value but fails with ZeroDivisionError when the shared
            # score itself is 0.
            run_norm[query] = dict.fromkeys(doc_scores, 0.0)
        else:
            run_norm[query] = {
                doc: (score - scores_min) / score_span
                for doc, score in doc_scores.items()
            }

    return run_norm


# Min-max normalize the BM25 scores of every query
run_norm = normalize_run(run)
# Example: normalized document scores of the first query in the run
run_norm[next(iter(run_norm))]


{'911461': 1.0,
 '2088120': 0.9622073314809564,
 '646427': 0.9468590626380333,
 '2454505': 0.7793849840246231,
 '2326181': 0.7793821713550938,
 '857789': 0.6798974641323109,
 '1617489': 0.6359824344100116,
 '325069': 0.6359824344100116,
 '2179593': 0.635981848437193,
 '1233281': 0.635981848437193,
 '1197474': 0.6359813796589382,
 '1639868': 0.6359804421024283,
 '2056307': 0.5648390057260092,
 '32411': 0.548223746454718,
 '337682': 0.4801999198623575,
 '408298': 0.47452699981096536,
 '864603': 0.4408318048232361,
 '452908': 0.4335817975278744,
 '403464': 0.40324680507110267,
 '864601': 0.4032456331254655,
 '2075605': 0.4032456331254655,
 '1271293': 0.4032448127635195,
 '137941': 0.40324410959613705,
 '1567748': 0.40324410959613705,
 '613220': 0.40324410959613705,
 '651196': 0.4032405937592254,
 '1870777': 0.4032371951168776,
 '688623': 0.2844793731122153,
 '2283637': 0.26747315277635675,
 '1820798': 0.26747104327420995,
 '954099': 0.2406886868220222,
 '1326315': 0.21091236085049964,
 '8

## Getting cosines for query-document embeddings

In [12]:
def get_run_from_model(model, queries, run, docs=None):
    '''
    Desc:
        Calculating cosine similarity scores of query and doc embeddings obtained by the model
    Args:
        model: SentenceTransformer model
        queries: pandas DataFrame containing queries (`id_left` and `text_left` columns are read)
        run: query-document-score dict-of-dict; only its query ids and doc ids are used,
             the scores themselves are ignored
        docs: optional pandas DataFrame with `id_right` and `text_right` columns holding the
              document texts; when omitted, the notebook-level `df` is used
    Returns:
        run_cosine: query-document-cosine_similarity_score dict-of-dict; a query without
                    candidate documents maps to an empty dict
    Raises:
        KeyError: if a query id or doc id of `run` is not present in `queries` / `docs`
    '''

    if docs is None:
        docs = df

    # Build the id -> text lookups once. Filtering the whole frame with a boolean mask for
    # every single candidate document costs a full scan of the collection per lookup.
    # NOTE: should an id occur more than once, the last occurrence is the one that is used.
    query_text_by_id = dict(zip(queries['id_left'], queries['text_left']))
    doc_text_by_id = dict(zip(docs['id_right'], docs['text_right']))

    run_cosine = {}

    for q_id, candidates in tqdm(run.items(), total=len(run), bar_format='{l_bar}{bar:30}{r_bar}{bar:-10b}'):
        run_cosine[q_id] = {}

        # nothing to re-rank for this query, so skip before spending time on encoding
        doc_ids = list(candidates)
        if not doc_ids:
            continue

        # query encoding (ids are strings in the run, integers in the data frames)
        query_embedding = model.encode(query_text_by_id[int(q_id)])

        # documents encoding
        docs_texts = [doc_text_by_id[int(doc_id)] for doc_id in doc_ids]
        docs_embedding = model.encode(docs_texts)

        # computing cosine similarity: row 0 holds the scores of the single query
        cos_sim = util.cos_sim(query_embedding, docs_embedding)[0]
        for i, doc_id in enumerate(doc_ids):
            run_cosine[q_id][doc_id] = cos_sim[i].item()

    return run_cosine


In [13]:
# Sentence-embedding model used for re-ranking; the commented-out line is an alternative checkpoint.
model = SentenceTransformer('msmarco-distilbert-cos-v5')
#model = SentenceTransformer('msmarco-MiniLM-L6-cos-v5')

# Cosine similarities between every sampled training query and its BM25 candidates from `run`
run_cosine = get_run_from_model(model, train_queries_sample, run)


100%|██████████████████████████████| 500/500 [42:01<00:00,  5.04s/it]           


## Searching for the optimal alpha

In [14]:
def combine_results(run1, run2, alpha):
    '''
    Desc:
        Linear interpolation of two runs: alpha*score1 + (1-alpha)*score2
    Args:
        run1: 1st scores (query-doc-score dict-of-dict); its queries and docs define the output
        run2: 2nd scores (query-doc-score dict-of-dict); must hold a score for every
              query-doc pair of run1
        alpha: scalar, weight of the 1st scores
    Returns:
        run_mix: combination of score1 and score2
    '''

    return {
        query_id: {
            doc_id: alpha*first_score + (1-alpha)*run2[query_id][doc_id]
            for doc_id, first_score in doc_scores.items()
        }
        for query_id, doc_scores in run1.items()
    }


In [15]:
# Taking the qrels that belong to the sampled training queries
samples_q_ids = train_queries_sample['id_left'].values.tolist()
# A set gives constant-time membership checks; testing against the list
# would rescan all sampled ids for every single qrel.
samples_q_id_set = set(samples_q_ids)

qrels = list(ir_measures.read_trec_qrels('wikIR1k/training/qrels'))

# qrel[0] is the query id; it is cast to int to match the integer ids of the sample
qrels_sample = [qrel for qrel in qrels if int(qrel[0]) in samples_q_id_set]


In [16]:
# 100 evenly spaced candidates for alpha from 0 to 1 (both included), i.e. a step of 1/99, not 0.01
alpha_range = np.linspace(0, 1, 100)

In [17]:
# MAP@20 on the sampled training queries for every candidate alpha: {alpha: MAP@20}
results = {}

for alpha in alpha_range:
    run_mix = combine_results(run_norm, run_cosine, alpha)
    # exactly one measure is requested, so the aggregate holds exactly one value
    map20, = ir_measures.calc_aggregate([MAP@20], qrels_sample, run_mix).values()
    results[alpha] = map20


In [18]:
# Rank the alphas by MAP@20, best first (alphas with equal scores keep their original order)
ranked_alphas = sorted(results.items(), key=lambda alpha_score: alpha_score[1], reverse=True)
results_sorted = dict(ranked_alphas)

print('alpha \t MAP@20\n---------------')
for alpha, score in ranked_alphas:
    print(f'{round(alpha,4)}\t{round(score,5)}')


# the first key of the ranked dict is the alpha with the highest MAP@20
alpha_best = next(iter(results_sorted))
print(f'\nBest alpha: {alpha_best},\nMAP@20 Score: {results_sorted[alpha_best]}')


alpha 	 MAP@20
---------------
0.1515	0.18491
0.1414	0.18486
0.1313	0.18482
0.1616	0.18481
0.1919	0.18477
0.1212	0.18469
0.2323	0.18461
0.2222	0.18454
0.1717	0.18453
0.202	0.18448
0.2424	0.18435
0.2121	0.18433
0.1818	0.18433
0.2626	0.18425
0.2525	0.18425
0.1111	0.18408
0.2828	0.18395
0.2727	0.18393
0.101	0.18367
0.2929	0.1834
0.0909	0.18338
0.303	0.183
0.0808	0.18287
0.3131	0.18258
0.3232	0.18249
0.3333	0.18228
0.0707	0.18202
0.0606	0.182
0.3434	0.1817
0.0505	0.18131
0.3535	0.18076
0.3636	0.18038
0.0404	0.18029
0.0303	0.18019
0.3737	0.17999
0.3838	0.17953
0.0202	0.17929
0.3939	0.17895
0.0101	0.17892
0.404	0.17798
0.0	0.17795
0.4141	0.17777
0.4242	0.17711
0.4343	0.17652
0.4444	0.17608
0.4545	0.17576
0.4646	0.17558
0.4747	0.17493
0.4848	0.17451
0.4949	0.17406
0.5051	0.17378
0.5152	0.1731
0.5253	0.17278
0.5354	0.17258
0.5455	0.1721
0.5556	0.17145
0.5657	0.17085
0.5758	0.17039
0.5859	0.16997
0.596	0.16917
0.6061	0.16882
0.6162	0.16829
0.6263	0.16781
0.6364	0.16768
0.6465	0.16738
0.6566	0.1

##  Applying the formula to test queries

In [19]:
# Test queries, read with the same columns as the training ones (`id_left`, `text_left`)
test_queries = pd.read_csv('wikIR1k/test/queries.csv')
test_queries

Unnamed: 0,id_left,text_left
0,158491,southern methodist university
1,5728,halakha
2,13554,chief justice of the united states
3,32674,patsy cline
4,406391,dierks bentley
...,...,...
95,679227,hiv aids
96,2136797,maren morris
97,5622,homer
98,1313598,south pole


### BM25 scores

In [20]:
# BM25 baseline on the test queries: {query id (str): {doc id: BM25 score}}, up to 50 hits per query
run_bm25 = {}

for row_label, query_row in test_queries.iterrows():
    response = es.search(index=index_name, query=make_query(query_row['text_left']), size=50)
    run_bm25[str(query_row['id_left'])] = {hit['_id']: hit['_score'] for hit in response['hits']['hits']}

# Evaluate the raw BM25 ranking against the test relevance judgements
qrels = ir_measures.read_trec_qrels('wikIR1k/test/qrels')
print(ir_measures.calc_aggregate([P@10, P@20, MAP@20], qrels, run_bm25))


{P@20: 0.14900000000000005, P@10: 0.20699999999999988, AP@20: 0.1482622229022594}


### Cosine similarity results

In [21]:
# Cosine similarities for the test queries over their BM25 candidates.
# NOTE: this rebinds `run_cosine`, so the similarities computed earlier for the training sample are replaced.
run_cosine = get_run_from_model(model, test_queries, run_bm25)


100%|██████████████████████████████| 100/100 [08:04<00:00,  4.85s/it]           


In [22]:
# Evaluate the ranking by cosine similarity alone.
# NOTE(review): the test qrels are re-read before every evaluation, presumably because
# read_trec_qrels returns a one-shot iterator -- confirm.
qrels = ir_measures.read_trec_qrels('wikIR1k/test/qrels')
print(ir_measures.calc_aggregate([P@10, P@20, MAP@20], qrels, run_cosine))


{P@20: 0.15900000000000003, P@10: 0.23200000000000007, AP@20: 0.17708629178647126}


### BM25 + Cosine (with alpha)

In [23]:
# Min-max normalize the test BM25 scores per query before they are mixed with the cosine similarities
run_bm25_norm = normalize_run(run_bm25)

In [24]:
# Combine the normalized BM25 scores and the cosine similarities as
# alpha_best*BM25 + (1-alpha_best)*cosine, with alpha_best chosen on the training sample
run_final = combine_results(run_bm25_norm, run_cosine, alpha_best)

In [25]:
# Evaluate the combined (BM25 + cosine) ranking on the test queries with the same measures as above
qrels = ir_measures.read_trec_qrels('wikIR1k/test/qrels')
print(ir_measures.calc_aggregate([P@10, P@20, MAP@20], qrels, run_final))


{P@20: 0.16500000000000006, P@10: 0.24200000000000002, AP@20: 0.18205409450202997}
