In [8]:
from elasticsearch import Elasticsearch
from pprint import pprint
import pandas as pd
import numpy as np
import string
from typing import Callable
import nltk

### Settings for ElasticSearch

In [9]:
INDEX_NAME = "passage_index"
INDEX_SETTINGS = {
    'settings': {
        'index': {
            'number_of_shards': 1,
            'number_of_replicas': 1,
            'similarity': {
                'default': {
                    'type': 'BM25'
                }
            }
        },
        "analysis": {
            "analyzer": {
                "my_english_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "stopwords": "_english_",
                    "filter": [
                        "lowercase",
                        "english_stop",
                        "filter_english_minimal"
                    ]                
                }
            },
            "filter" : {
                "filter_english_minimal" : {
                    "type": "stemmer",
                    "name": "minimal_english"
                },
                "english_stop": {
                    "type": "stop",
                    "stopwords": "_english_"
                }
            },
        }
    }
}

### Create ElasticSearch object

In [10]:
es = Elasticsearch()
es.info()

{'name': 'DESKTOP-46IMAQM',
 'cluster_name': 'elasticsearch',
 'cluster_uuid': 'c8tMOObLS2WyDCagWeT0hg',
 'version': {'number': '7.17.6',
  'build_flavor': 'default',
  'build_type': 'zip',
  'build_hash': 'f65e9d338dc1d07b642e14a27f338990148ee5b6',
  'build_date': '2022-08-23T11:08:48.893373482Z',
  'build_snapshot': False,
  'lucene_version': '8.11.1',
  'minimum_wire_compatibility_version': '6.8.0',
  'minimum_index_compatibility_version': '6.0.0-beta1'},
 'tagline': 'You Know, for Search'}

In [11]:
# Check indexing
es.indices.get_alias("*")

  es.indices.get_alias("*")


{}

In [12]:
# Delete if necessary
if es.indices.exists(INDEX_NAME):
    es.indices.delete(index=INDEX_NAME)

  if es.indices.exists(INDEX_NAME):


### Create indexes for the passages

In [13]:
es.indices.create(index=INDEX_NAME, body=INDEX_SETTINGS)

  es.indices.create(index=INDEX_NAME, body=INDEX_SETTINGS)


{'acknowledged': True, 'shards_acknowledged': True, 'index': 'passage_index'}

### Add documents to the index

In [14]:
# Bulk indexing
filename = "data/collection/collection.tsv"

bulk_data = []
bulk_size = 50000 # However many documents can be stored in memory
with open(filename, encoding="utf-8") as file:
    for line in file:
        if len(bulk_data) > bulk_size:
            es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True, request_timeout=60)
            bulk_data.clear()

        l = line.split('\t')
        docid = int(l[0])
        text = l[1].strip()

        doc = {"doc_id": docid, "content": text}

        bulk_data.append({"index": {"_index": INDEX_NAME, "_id": doc.pop("doc_id")}})
        bulk_data.append(doc)
    
    es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True, request_timeout=60)
    bulk_data.clear()

In [15]:
doc = es.get(index=INDEX_NAME, id=1)
pprint(doc)

{'_id': '1',
 '_index': 'passage_index',
 '_primary_term': 1,
 '_seq_no': 1,
 '_source': {'content': 'The Manhattan Project and its atomic bomb helped '
                        'bring an end to World War II. Its legacy of peaceful '
                        'uses of atomic energy continues to have an impact on '
                        'history and science.'},
 '_type': '_doc',
 '_version': 1,
 'found': True}


### Search in the document

In [16]:
query = "atomic bomb"
res = es.search(index=INDEX_NAME, q=query, _source=False, size=10, request_timeout=60)

In [17]:
print("Scores for query: " + query)
for hit in res["hits"]["hits"]:
    print("Doc ID: %3r  Score: %5.2f" % (hit["_id"], hit["_score"]))

Scores for query: atomic bomb
Doc ID: '1478667'  Score: 25.84
Doc ID: '2980807'  Score: 25.46
Doc ID: '749030'  Score: 24.82
Doc ID: '1653933'  Score: 24.65
Doc ID: '1737931'  Score: 24.62
Doc ID: '5169557'  Score: 24.53
Doc ID: '2840370'  Score: 24.43
Doc ID: '2991343'  Score: 24.43
Doc ID: '6142042'  Score: 24.42
Doc ID: '3386237'  Score: 24.39


In [18]:
print("Document with Doc ID: 1478667")
es.get(index=INDEX_NAME, id=1478667)["_source"]["content"]

Document with Doc ID: 1478667


'1945: US drops atomic bomb on Hiroshima The first atomic bomb has been dropped by a United States aircraft on the Japanese city of Hiroshima.1945: US drops atomic bomb on Hiroshima.945: US drops atomic bomb on Hiroshima The first atomic bomb has been dropped by a United States aircraft on the Japanese city of Hiroshima. 1945: US drops atomic bomb on Hiroshima.'

In [19]:
top_k_scores = [hit["_id"] for hit in res["hits"]["hits"]]
top_k_scores

['1478667',
 '2980807',
 '749030',
 '1653933',
 '1737931',
 '5169557',
 '2840370',
 '2991343',
 '6142042',
 '3386237']

### Make QRELS evaluation-able
- "query_id": ["doc_id1", "doc_id2"...] -> Relevant ones, ground truth, Set() in this case

In [20]:
# Bulk indexing
qrelspath = "data/qrels/qrels.txt"

qrels = {}
with open(qrelspath, encoding="utf-8") as file:
    for line in file:
        l = line.split(' ')

        qid = l[0]
        pid = l[2]
        relevance = int(l[3])

        if relevance > 0:
            if qid in qrels.keys():
                qrels[qid].add(pid)
            else:
                qrels[qid] = set([pid])

In [21]:
qrels["19335"]

{'1720389',
 '1720395',
 '1729',
 '2046505',
 '3045565',
 '3045567',
 '3175481',
 '3175484',
 '527690',
 '527692',
 '527697',
 '6452949',
 '7122355',
 '7320614',
 '819168',
 '8412681',
 '8412682',
 '8412683',
 '8412684',
 '8412685'}

In [22]:
len(qrels.keys())

43

In [23]:
qrel_query_ids = list(qrels.keys())
qrel_query_ids[0]

'19335'

### Read queries

In [24]:
queries_eval = pd.read_csv("data/queries/queries.eval.tsv", sep='\t', header=None).values

In [25]:
queries_id = np.array(queries_eval[:, 0])
queries = np.array(queries_eval[:, 1])
print(queries_id[0])
print(queries[0])

786436
what is prescribed to treat thyroid storm


In [26]:
def relevant_queries(queries, qrels):
    relevant_queries = []
    relevant_queries_id = []

    for idx, query in enumerate(queries):
        query_id = str(queries_id[idx])
        if query_id in qrels:
            relevant_queries.append(query)
            relevant_queries_id.append(query_id)

    return relevant_queries, relevant_queries_id

In [27]:
# Keep only queries in the QRELS
queries, queries_id = relevant_queries(queries, qrel_query_ids)
print(queries_id[0])
print(queries[0])

527433
types of dysarthria from cerebral palsy


### Non bulk query search

In [28]:
# Non-bulk
query_topK = {}
for idx, query_id in enumerate(queries_id):
    query = queries[idx]
    res = es.search(index=INDEX_NAME, q=query, _source=False, size=1000, request_timeout=60)
    top_k_scores = [hit["_id"] for hit in res["hits"]["hits"]]
    query_topK[query_id] = top_k_scores



### Evaluation
- MAP (Mean Average Precision)
- MRR (Mean Reciprocal Recipient)

In [29]:
def get_average_precision(system_ranking, ground_truth) -> float:
    vals = []
    over = 1
    for rank_idx, rank in enumerate(system_ranking):
        under = rank_idx+1
        if rank in ground_truth:
            vals.append(over / under)
            over += 1
    AP = sum(vals) / len(ground_truth)

    return AP

In [30]:
system_ranking = query_topK[queries_id[0]] # List
system_truth = qrels[queries_id[0]] # Set
score = get_average_precision(system_ranking, system_truth)
score

0.04166987630156881

In [31]:
def get_reciprocal_rank(system_ranking, ground_truth) -> float:
    AP = 0
    for rank_idx, rank in enumerate(system_ranking):
        under = rank_idx+1
        if rank in ground_truth:
            AP = 1 / under
            break
    
    return AP

In [32]:
system_ranking = query_topK[queries_id[0]] # List
system_truth = qrels[queries_id[0]] # Set
score = get_reciprocal_rank(system_ranking, system_truth)
score

1.0

In [33]:
def get_mean_eval_measure(system_rankings, ground_truths, eval_function: Callable) -> float:
    results = []
    for query in system_rankings:
        if query in ground_truths.keys():
            results.append(eval_function(system_rankings[query], ground_truths[query]))
        else:
            continue
            # results.append(0) -> ?
    return sum(results) / len(results)

In [34]:
map = get_mean_eval_measure(query_topK, qrels, get_average_precision)
mrr = get_mean_eval_measure(query_topK, qrels, get_reciprocal_rank)

In [35]:
print("Evaluation of the baseline: BM25")
print("Mean average precision: " + str(np.round(map, 5)))
print("Mean average precision: " + str(np.round(mrr, 5)))

Evaluation of the baseline: BM25
Mean average precision: 0.20797
Mean average precision: 0.68902
