In [1]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk, parallel_bulk
import ir_measures
from ir_measures import *
import pandas as pd
import json
from tqdm import tqdm
from time import time
import spacy


### Connection

In [2]:
# Client for the Elasticsearch server reachable over plain HTTP at localhost:9200.
es = Elasticsearch('http://localhost:9200')


In [3]:
# Name of the index used by every case in this notebook; each case deletes and recreates it.
index_name = 'wiki'


# Case 1: Without Stemming

### Index Configuration

In [4]:
# The 'text' field of every document is analyzed with the custom 'white' analyzer.
mappings = dict(
    properties=dict(
        text=dict(type='text', analyzer='white'),
    ),
)

# 'white' only configures the whitespace tokenizer; no token filters are listed.
settings = dict(
    analysis=dict(
        analyzer=dict(
            white=dict(tokenizer='whitespace'),
        ),
    ),
)

# Start from a clean slate: drop the index if it is already there, then create it.
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)
es.indices.create(index=index_name, settings=settings, mappings=mappings)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'wiki'})

### Checking analyzer

In [5]:
def check_analyzer(analyzer, text):
    """Analyze `text` against the index and return the resulting tokens.

    Args:
        analyzer: dict of analyze-API parameters, e.g. ``{'analyzer': 'white'}``.
            The dict is copied; the caller's object is left untouched.
        text: the string to analyze.

    Returns:
        List of token strings, in the order returned by Elasticsearch.
    """
    # Build a fresh request instead of adding a 'text' key to the caller's dict.
    request = {**analyzer, 'text': text}

    # Keyword arguments replace `body=`, which emitted a warning when this cell was run.
    response = es.indices.analyze(index=index_name, **request)
    return [token_info['token'] for token_info in response['tokens']]

# Sample sentence used to inspect what the 'white' analyzer produces.
text = 'I am using elastic search'
analyzer = {
    'analyzer': 'white'
}

# The returned token list is the cell output.
check_analyzer(analyzer, text)


  tokens = es.indices.analyze(index=index_name, body=body)['tokens']


['I', 'am', 'using', 'elastic', 'search']

### WikiIR collection

In [6]:
# Load the document collection; the indexing code below reads the
# 'id_right' (document id) and 'text_right' (document text) columns.
df = pd.read_csv('wikIR1k/documents.csv')

# Print (rows, columns), then preview the frame as the cell output.
print(df.shape)
df


(369721, 2)


Unnamed: 0,id_right,text_right
0,1781133,it was used in landing craft during world war ...
1,2426736,after rejecting an offer from cambridge univer...
2,2224122,mat zan coached kuala lumpur fa in 1999 and wo...
3,219642,a barcode is a machine readable optical label ...
4,1728654,since the subordination of the monarchy under ...
...,...,...
369716,59396,the population was 416 at the 2010 census the ...
369717,1950034,the surface of the river is frozen from novemb...
369718,1984468,the first anti thrombin aptamer tba was genera...
369719,33966,state of oklahoma as of the 2010 census the po...


### Indexing documents

In [7]:
def create_es_action(index, doc_id, document):
    """Build one bulk action dict that stores `document` under `doc_id` in `index`.

    Returns:
        dict with the keys '_index', '_id' and '_source'.
    """
    action = dict(_index=index, _id=doc_id, _source=document)
    return action


def es_action_generator(df):
    """Yield one bulk-index action per row of `df`, with a tqdm progress bar.

    Args:
        df: DataFrame with an 'id_right' column (used as the Elasticsearch
            document id) and a 'text_right' column (stored in the 'text' field).

    Yields:
        Action dicts built by `create_es_action`, targeting `index_name`.
    """
    # Walk the two needed columns directly: `iterrows()` builds a Series for
    # every row, and the row index it returned was never used.
    id_text_pairs = zip(df['id_right'], df['text_right'])
    for doc_id, text in tqdm(id_text_pairs, total=df.shape[0]):
        yield create_es_action(index_name, doc_id, {'text': text})


# Bulk-index the whole collection and time it; any action reported as not ok is printed.
start = time()
for ok, result in parallel_bulk(es, es_action_generator(df), queue_size=4, thread_count=4, chunk_size=1000):
    if not ok:
        print(result)
stop = time()

print('Indexing time:', stop-start)
        
# Refresh the index, then show its document count as the cell output.
es.indices.refresh(index=index_name)
es.cat.count(index=index_name, format='json')


100%|████████████████████████████████| 369721/369721 [00:30<00:00, 11942.82it/s]


Indexing time: 31.101529836654663


ListApiResponse([{'epoch': '1676905279', 'timestamp': '15:01:19', 'count': '369721'}])

In [8]:
def pretty_print_result(search_result, fields=()):
    """Print the total hit count, then id and score of every returned hit.

    Args:
        search_result: search response mapping; read via
            ``['hits']['total']['value']`` and ``['hits']['hits']``.
        fields: iterable of `_source` field names to print under each hit.
            Defaults to an empty tuple (an immutable default, unlike ``[]``).

    Returns:
        None; everything is written to stdout.
    """
    hits_section = search_result['hits']
    print(f'Total documents: {hits_section["total"]["value"]}')
    for hit in hits_section['hits']:
        print(f'Doc {hit["_id"]}, score is {hit["_score"]}')
        for field in fields:
            print(f'{field}: {hit["_source"][field]}')
    
def search(query, *args):
    """Run `query` against `index_name` (top 20 hits) and pretty-print the response.

    Any extra positional arguments are forwarded as the `_source` field names
    to print for each hit. Returns whatever `pretty_print_result` returns.
    """
    response = es.search(index=index_name, query=query, size=20)
    return pretty_print_result(response, args)

def get_doc_by_id(doc_id):
    """Fetch the document with id `doc_id` from `index_name` and return its `_source`."""
    stored_document = es.get(index=index_name, id=doc_id)
    return stored_document['_source']

# Custom helpers: extract document ids and scores from Elasticsearch search responses
def get_results(search_result):
    """Return ``(_id, _score)`` of the first hit only, or None if there are no hits.

    NOTE(review): despite the plural name, only the first hit is ever returned;
    confirm whether callers expect all hits.
    """
    hit_iterator = iter(search_result['hits']['hits'])
    try:
        top_hit = next(hit_iterator)
    except StopIteration:
        return None
    return top_hit['_id'], top_hit['_score']

def search_results(query_id, query, size=20):
    """Search `index_name` and return the hits as ranked tuples for one query.

    Args:
        query_id: identifier of the query; converted to ``str`` in the output.
        query: Elasticsearch query dict passed to ``es.search``.
        size: maximum number of hits to request (defaults to 20, the value
            that was previously hard-coded).

    Returns:
        List of ``(query_id, doc_id, score, rank)`` tuples, where `rank` is the
        0-based position of the hit in the response.
    """
    hits = es.search(index=index_name, query=query, size=size)['hits']['hits']
    return [
        (str(query_id), str(hit['_id']), hit['_score'], rank)
        for rank, hit in enumerate(hits)
    ]


### Queries

In [9]:
# Load the test queries; later cells read the 'id_left' (query id) and
# 'text_left' (query text) columns.
test_queries = pd.read_csv('wikIR1k/test/queries.csv')
test_queries


Unnamed: 0,id_left,text_left
0,158491,southern methodist university
1,5728,halakha
2,13554,chief justice of the united states
3,32674,patsy cline
4,406391,dierks bentley
...,...,...
95,679227,hiv aids
96,2136797,maren morris
97,5622,homer
98,1313598,south pole


### Generating documents' scores

In [10]:
def make_query(text):
    """Wrap `text` in a bool query whose single `must` clause is a match on field 'text'."""
    match_clause = {'match': {'text': text}}
    return {'bool': {'must': match_clause}}

def generate_run(test_queries, save_to_file=False, filename=None):
    """Run every test query and collect the scored documents.

    Args:
        test_queries: DataFrame with 'id_left' (query id) and 'text_left'
            (query text) columns.
        save_to_file: when true, also write the results to `filename`, one
            ``<query_id> Q0 <doc_id> <rank> <score> BM25`` line per hit.
        filename: output path; required when `save_to_file` is true.

    Returns:
        List of ``ir_measures.ScoredDoc`` in query order, then hit order.

    Raises:
        TypeError: if `save_to_file` is true but no `filename` was given.
    """
    # Fail before running any query rather than when the file is opened.
    if save_to_file and filename is None:
        raise TypeError('filename is required when save_to_file is True')

    run = []
    output_lines = []
    for _, row in test_queries.iterrows():
        hits = search_results(row['id_left'], make_query(row['text_left']))
        for query_id, doc_id, score, rank in hits:
            run.append(ir_measures.ScoredDoc(query_id, doc_id, score))
            if save_to_file:
                output_lines.append(f'{query_id} Q0 {doc_id} {rank} {score} BM25\n')

    if save_to_file:
        # The context manager closes the file even if writing raises.
        with open(filename, 'w') as f:
            f.writelines(output_lines)

    return run

def print_scores(run, total=-1):
    """Print the first `total` entries of `run`, one per line.

    `total=-1` (the default) or a value larger than ``len(run)`` prints every entry.
    """
    available = len(run)
    # Use the requested count only when it is an explicit value within bounds.
    limit = total if (total != -1 and total <= available) else available
    for position in range(limit):
        print(run[position])

# Run all test queries against the unstemmed index, save the results to
# 'search_without_stem.res', and print the first 10 scored documents.
run = generate_run(test_queries, True, 'search_without_stem.res')
print_scores(run, 10)


ScoredDoc(query_id='158491', doc_id='1880296', score=17.35782)
ScoredDoc(query_id='158491', doc_id='2261272', score=17.199305)
ScoredDoc(query_id='158491', doc_id='607552', score=17.118353)
ScoredDoc(query_id='158491', doc_id='1957435', score=16.929768)
ScoredDoc(query_id='158491', doc_id='625257', score=16.877832)
ScoredDoc(query_id='158491', doc_id='635537', score=16.80612)
ScoredDoc(query_id='158491', doc_id='1774491', score=16.570059)
ScoredDoc(query_id='158491', doc_id='663828', score=16.55547)
ScoredDoc(query_id='158491', doc_id='158491', score=16.063732)
ScoredDoc(query_id='158491', doc_id='1956922', score=15.829921)


In [11]:
# Time one pass over all test queries; nothing is written to disk and the
# returned run is discarded.
start = time()
generate_run(test_queries)
stop = time()

print(f'Query execution time (total): {stop-start} s')

Query execution time (total): 0.5423238277435303 s


### Evaluation of BM25 results

In [12]:
# Reference results: load the run stored in wikIR1k/test/BM25.res and the test qrels.
run = ir_measures.read_trec_run('wikIR1k/test/BM25.res')
qrels = ir_measures.read_trec_qrels('wikIR1k/test/qrels')

# Aggregate P@5, P@20 and AP; the result is the cell output.
ir_measures.calc_aggregate([P@5, P@20, AP], qrels, run)


{P@20: 0.09499999999999999, AP: 0.11196168401599797, P@5: 0.18399999999999994}

### Evaluation

In [13]:
# Our results: the run written earlier for the index without stemming.
run = ir_measures.read_trec_run('search_without_stem.res')

# Relevance judgements are re-read here.
qrels = ir_measures.read_trec_qrels('wikIR1k/test/qrels')

# Same measures as for the BM25.res run, so the two outputs can be compared.
ir_measures.calc_aggregate([P@5, P@20, AP], qrels, run)


{P@20: 0.14750000000000005, AP: 0.14794540941671017, P@5: 0.3059999999999997}

# Case 2: With Stemming

In [14]:
# The 'text' field is now analyzed with the custom 'porter_stemmer' analyzer.
mappings = dict(
    properties=dict(
        text=dict(type='text', analyzer='porter_stemmer'),
    ),
)

# 'porter_stemmer' = whitespace tokenizer followed by the 'porter_stem' filter
# declared in the 'filter' section below.
# NOTE(review): confirm that the 'porter_stem' filter type actually honours the
# 'language' setting; no lowercasing step is configured before it.
settings = dict(
    analysis=dict(
        analyzer=dict(
            porter_stemmer=dict(tokenizer='whitespace', filter=['porter_stem']),
        ),
        filter=dict(
            porter_stem=dict(type='porter_stem', language='English'),
        ),
    ),
)

# Recreate the index from scratch with the new analysis settings.
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)
es.indices.create(index=index_name, settings=settings, mappings=mappings)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'wiki'})

In [15]:
# Same sample sentence as before, now passed through the 'porter_stemmer' analyzer.
text = 'I am using elastic search'
analyzer = {
    'analyzer': 'porter_stemmer'
}

# The returned token list is the cell output.
check_analyzer(analyzer, text)


  tokens = es.indices.analyze(index=index_name, body=body)['tokens']


['I', 'am', 'us', 'elast', 'search']

In [16]:
# Re-index the original collection into the recreated (stemming) index and time it;
# any action reported as not ok is printed.
start = time()
for ok, result in parallel_bulk(es, es_action_generator(df), queue_size=4, thread_count=4, chunk_size=1000):
    if not ok:
        print(result)
stop = time()

print('Indexing time:', stop-start)
        
# Refresh the index, then show its document count as the cell output.
es.indices.refresh(index=index_name)
es.cat.count(index=index_name, format='json')


100%|████████████████████████████████| 369721/369721 [00:35<00:00, 10513.27it/s]


Indexing time: 35.28420877456665


ListApiResponse([{'epoch': '1676905317', 'timestamp': '15:01:57', 'count': '369721'}])

In [17]:
# Run all test queries against the stemming index, save the results to
# 'search_with_stem.res', and print the first 10 scored documents.
run = generate_run(test_queries=test_queries, save_to_file=True, filename='search_with_stem.res')
print_scores(run, 10)


ScoredDoc(query_id='158491', doc_id='1880296', score=17.132378)
ScoredDoc(query_id='158491', doc_id='2261272', score=16.981)
ScoredDoc(query_id='158491', doc_id='607552', score=16.919212)
ScoredDoc(query_id='158491', doc_id='625257', score=16.701923)
ScoredDoc(query_id='158491', doc_id='1957435', score=16.695692)
ScoredDoc(query_id='158491', doc_id='635537', score=16.567877)
ScoredDoc(query_id='158491', doc_id='663828', score=16.392046)
ScoredDoc(query_id='158491', doc_id='1774491', score=16.352182)
ScoredDoc(query_id='158491', doc_id='158491', score=16.30711)
ScoredDoc(query_id='158491', doc_id='1956922', score=15.618095)


In [18]:
# Evaluate the stemming run against the test qrels.
run = ir_measures.read_trec_run('search_with_stem.res')
qrels = ir_measures.read_trec_qrels('wikIR1k/test/qrels')

# Aggregate P@5, P@20 and AP; the result is the cell output.
ir_measures.calc_aggregate([P@5, P@20, AP], qrels, run)


{P@20: 0.14400000000000002, AP: 0.1464517410121761, P@5: 0.3119999999999997}

# Case 3: Lemmatized Collection

### WikiIR documents (lemmatization)

In [19]:
#################### for the first run ###########################
# takes some time (~2 hours)

# docs_list = []
# nlp = spacy.load('en_core_web_sm')

# for i, row in tqdm(df.iterrows(), total=df.shape[0]):
#     nlp_doc = nlp(row['text_right'])
#     new_doc = ''
#     for token in nlp_doc:
#         new_doc = new_doc + ' ' + token.lemma_
#     docs_list.append(new_doc)

# df_lemmatized = pd.DataFrame({'id_right': df['id_right'].values, 'text_right': docs_list})
# df_lemmatized.to_csv('docs_lemmatized.csv', index=None)

#################### Later: Reading from files ###########################

# Load the lemmatized collection from 'docs_lemmatized.csv', the file the
# commented-out first-run code above writes.
df_lemmatized = pd.read_csv('docs_lemmatized.csv')
df_lemmatized


Unnamed: 0,id_right,text_right
0,1781133,it be use in landing craft during world war i...
1,2426736,after reject an offer from cambridge universi...
2,2224122,mat zan coach kuala lumpur fa in 1999 and win...
3,219642,a barcode be a machine readable optical label...
4,1728654,since the subordination of the monarchy under...
...,...,...
369716,59396,the population be 416 at the 2010 census the ...
369717,1950034,the surface of the river be freeze from novem...
369718,1984468,the first anti thrombin aptamer tba be genera...
369719,33966,state of oklahoma as of the 2010 census the p...


### Queries (lemmatization)

In [20]:
# Load the spaCy pipeline 'en_core_web_sm' used to lemmatize the queries.
nlp = spacy.load('en_core_web_sm')

# Replace every token of every query by its lemma. Joining with single spaces
# avoids the stray leading space that repeated `' ' + lemma` concatenation
# left at the start of each query.
queries_list = [
    ' '.join(token.lemma_ for token in nlp(query_text))
    for query_text in tqdm(test_queries['text_left'], total=test_queries.shape[0])
]

# Same query ids as the original test set, with lemmatized query text.
test_queries_lemmatized = pd.DataFrame({'id_left': test_queries['id_left'].values, 'text_left': queries_list})
# test_queries_lemmatized.to_csv('queries_lemmatized.csv', index=None)

test_queries_lemmatized

100%|████████████████████████████████████████| 100/100 [00:00<00:00, 253.33it/s]


Unnamed: 0,id_left,text_left
0,158491,southern methodist university
1,5728,halakha
2,13554,chief justice of the united states
3,32674,patsy cline
4,406391,dierk bentley
...,...,...
95,679227,hiv aid
96,2136797,maren morris
97,5622,homer
98,1313598,south pole


In [21]:
# The lemmatized collection uses the plain 'white' analyzer on the 'text' field.
mappings = dict(
    properties=dict(
        text=dict(type='text', analyzer='white'),
    ),
)

# 'white' only configures the whitespace tokenizer; no token filters are listed.
settings = dict(
    analysis=dict(
        analyzer=dict(
            white=dict(tokenizer='whitespace'),
        ),
    ),
)

# Recreate the index from scratch before indexing the lemmatized documents.
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)
es.indices.create(index=index_name, settings=settings, mappings=mappings)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'wiki'})

In [22]:
# Index the lemmatized collection into the recreated index and time it;
# any action reported as not ok is printed.
start = time()
for ok, result in parallel_bulk(es, es_action_generator(df_lemmatized), queue_size=4, thread_count=4, chunk_size=1000):
    if not ok:
        print(result)
stop = time()

print('Indexing time:', stop-start)
        
# Refresh the index, then show its document count as the cell output.
es.indices.refresh(index=index_name)
es.cat.count(index=index_name, format='json')


100%|████████████████████████████████| 369721/369721 [00:33<00:00, 11151.17it/s]


Indexing time: 33.26123762130737


ListApiResponse([{'epoch': '1676905356', 'timestamp': '15:02:36', 'count': '369721'}])

In [23]:
def make_query(text, phrase_boost=5):
    """Build a bool query that requires a term match and rewards exact phrases.

    Args:
        text: the query string.
        phrase_boost: boost given to the optional `match_phrase` clause
            (defaults to 5, the value that was previously hard-coded).

    Returns:
        Query dict with a `must` match clause and a `should` match_phrase
        clause, both on the 'text' field.
    """
    query = {
        'bool': {
            # Mandatory clause: the document has to match the query text.
            'must': {
                'match': {
                    'text': text
                }
            },
            # Optional clause: a phrase match adds a boosted score contribution.
            'should': {
                'match_phrase': {
                    'text': {
                        'query': text,
                        'boost': phrase_boost
                    }
                }
            }
        }
    }
    return query

def generate_run(test_queries, save_to_file=False, filename=None):
    """Run every test query and collect the scored documents.

    Args:
        test_queries: DataFrame with 'id_left' (query id) and 'text_left'
            (query text) columns.
        save_to_file: when true, also write the results to `filename`, one
            ``<query_id> Q0 <doc_id> <rank> <score> BM25`` line per hit.
        filename: output path; required when `save_to_file` is true.

    Returns:
        List of ``ir_measures.ScoredDoc`` in query order, then hit order.

    Raises:
        TypeError: if `save_to_file` is true but no `filename` was given.
    """
    # Fail before running any query rather than when the file is opened.
    if save_to_file and filename is None:
        raise TypeError('filename is required when save_to_file is True')

    run = []
    output_lines = []
    for _, row in test_queries.iterrows():
        hits = search_results(row['id_left'], make_query(row['text_left']))
        for query_id, doc_id, score, rank in hits:
            run.append(ir_measures.ScoredDoc(query_id, doc_id, score))
            if save_to_file:
                output_lines.append(f'{query_id} Q0 {doc_id} {rank} {score} BM25\n')

    if save_to_file:
        # The context manager closes the file even if writing raises.
        with open(filename, 'w') as f:
            f.writelines(output_lines)

    return run


# Run the lemmatized queries against the lemmatized index, save the results to
# 'search_with_lemma.res', and print the first 10 scored documents.
run = generate_run(test_queries=test_queries_lemmatized, save_to_file=True, filename='search_with_lemma.res')
print_scores(run, 10)


ScoredDoc(query_id='158491', doc_id='1880296', score=101.613235)
ScoredDoc(query_id='158491', doc_id='2261272', score=101.45615)
ScoredDoc(query_id='158491', doc_id='607552', score=90.771095)
ScoredDoc(query_id='158491', doc_id='1957435', score=90.58272)
ScoredDoc(query_id='158491', doc_id='1180246', score=89.26848)
ScoredDoc(query_id='158491', doc_id='685181', score=89.11139)
ScoredDoc(query_id='158491', doc_id='1093529', score=88.879684)
ScoredDoc(query_id='158491', doc_id='1158969', score=88.879684)
ScoredDoc(query_id='158491', doc_id='1397771', score=88.503624)
ScoredDoc(query_id='158491', doc_id='637819', score=88.503624)


In [24]:
# Evaluate the lemmatization run against the test qrels.
run = ir_measures.read_trec_run('search_with_lemma.res')
qrels = ir_measures.read_trec_qrels('wikIR1k/test/qrels')

# Aggregate P@5, P@20 and AP; the result is the cell output.
ir_measures.calc_aggregate([P@5, P@20, AP], qrels, run)


{P@20: 0.13249999999999998, AP: 0.12182953321922929, P@5: 0.24399999999999986}