In [None]:
import os
import json

import pandas as pd
from tqdm.auto import tqdm
from haystack.utils.doc_store import launch_es, stop_elasticsearch
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import ElasticsearchRetriever
from haystack.pipelines import DocumentSearchPipeline

In [None]:
# Root directory of the dataset. The cells below read `passages.jl` and
# `questions-test.jl` from it and write `submission.tsv` into it.
dataset_path = 'data/poleval-passage-retrieval/allegro-faq/'

## Launch Elasticsearch in Docker

In [None]:
# Start Elasticsearch through Haystack's helper before connecting to it.
# NOTE(review): sleep=20 presumably waits 20 seconds for the server to come
# up before returning — confirm against the Haystack documentation.
launch_es(sleep=20)

In [None]:
# Connect to the Elasticsearch instance on localhost (empty credentials) and
# use the 'document' index for everything written and searched below.
document_store = ElasticsearchDocumentStore(
    host='localhost',
    username='',
    password='',
    index='document',
)

## Index Passages

In [None]:
# Stream the passages file (JSON Lines) in chunks instead of loading it whole;
# with `chunksize` set, `read_json` returns an iterator of DataFrames.
passages = pd.read_json(
    os.path.join(dataset_path, 'passages.jl'),
    lines=True,
    chunksize=1_000_000,  # integer literal: `chunksize` is documented as an int
)

for batch in tqdm(passages, desc='Indexing passages', unit='batch'):
    if 'title' in batch:
        # Prepend the title to the passage text so both end up in the indexed
        # content. Vectorised concatenation replaces the row-wise
        # `apply(axis=1)`, which is far slower on chunks of this size.
        batch['title'] = batch['title'].fillna('')
        batch['text'] = batch['title'] + ' ' + batch['text']

    # Rename 'id' to 'passage-id' and 'text' to 'content' before writing;
    # the retrieval cell later reads 'passage-id' back from each document's meta.
    batch = batch.rename(columns={'id': 'passage-id', 'text': 'content'})
    batch_as_dicts = batch.to_dict(orient='records')
    document_store.write_documents(batch_as_dicts)

In [None]:
# Sanity check: display how many documents the store reports after indexing.
document_store.get_document_count()

## Retrieve Similar Passages

In [None]:
# Build a retriever on top of the document store and wrap it in a
# document-search pipeline; `pipe` is what the prediction loop queries.
retriever = ElasticsearchRetriever(document_store=document_store)
pipe = DocumentSearchPipeline(retriever)

In [None]:
# Load the test questions (JSON Lines, one record per line) into a DataFrame
# and display its shape as a quick check of how many were read.
questions_path = os.path.join(dataset_path, 'questions-test.jl')
questions = pd.read_json(questions_path, lines=True)
questions.shape

In [None]:
# Number of passages requested from the retriever for every question.
top_k = 10

# Collected as plain records first; the DataFrame is built once at the end so
# `preds` never changes type half-way through the cell.
pred_records = []

# `iterrows()` is a generator with no length, so tqdm needs an explicit
# `total` to render progress and an ETA.
for _, row in tqdm(questions.iterrows(), total=len(questions), desc='Retrieving passages'):
    top_passages = pipe.run(
        query=row['text'],
        params={'Retriever': {'top_k': top_k}},
    )

    # One record per (question, retrieved passage) pair. `meta` and `score`
    # are read straight from the Document instead of converting it to a dict.
    pred_records.extend(
        {
            'question-id': row['id'],
            'passage-id': passage.meta['passage-id'],
            'score': passage.score,
        }
        for passage in top_passages['documents']
    )

# Explicit columns keep the expected header even when nothing was retrieved.
preds = pd.DataFrame(pred_records, columns=['question-id', 'passage-id', 'score'])

In [None]:
# Save the predictions as a tab-separated file inside the dataset directory,
# leaving out the DataFrame index.
submission_path = os.path.join(dataset_path, 'submission.tsv')
preds.to_csv(submission_path, sep='\t', index=False)

In [None]:
# Shut Elasticsearch down now that the submission has been written.
# NOTE(review): delete_container=True presumably removes the container as
# well, discarding the indexed data — confirm before relying on it.
stop_elasticsearch(delete_container=True)