# BM25 Retriever for Open Question Answering on Pirá

This Jupyter notebook evaluates the performance of the BM25 retriever on the Pirá dataset.

The code is based on DPR Haystack library implementation: https://haystack.deepset.ai/overview/intro

See the full Pirá repository on GitHub: https://github.com/C4AI/Pira

## Imports

In [None]:
import ast
import os
from subprocess import Popen, PIPE, STDOUT

import pandas as pd
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.nodes import PreProcessor
from haystack.pipelines import DocumentSearchPipeline
from haystack.retriever.sparse import ElasticsearchRetriever
from haystack.utils import launch_es

## Choosing tasks

DO_PREPROCESSING -> If set to True, the supporting texts are split into chunks of NUMBER_OF_WORDS words (100 by default).

PUSH_DOCUMENTS -> Push documents to ElasticSearch. It is only needed the first time.

SAVE_QUESTIONS_WITH_PASSAGES -> Save the questions with the passages retrieved by BM25.

DO_EVAL -> Performs an evaluation of the Retriever.

In [None]:
# Task switches for this run; each flag gates the matching step further down.
DO_PREPROCESSING = True  # split the supporting texts into chunks before indexing
PUSH_DOCUMENTS = True  # write the documents to the Elasticsearch document store
SAVE_QUESTIONS_WITH_PASSAGES = True  # export questions together with their retrieved passages
DO_EVAL = False  # compute and save the retriever accuracy for several k values


In [1]:

# Directory that holds train.csv, validation.csv and test.csv.
PATH_BASE = 'splitted_data/'
# Positional indices of the cells read from each CSV row.
ABSTRACT_COLUMN = 18
ANSWER_COLUMN = 7
QUESTION_COLUMN = 3

# Name of the Elasticsearch index used by the document store.
INDEX_KNOWLEDGE_BASE = "abstracts_100_pt"
# Chunk length in words, passed to the PreProcessor as split_length.
NUMBER_OF_WORDS = 100
# top_k used when retrieving passages for the exported question files.
NUMBER_OF_PASSAGES = 5

# CSV file that receives the per-k accuracies when DO_EVAL is set.
PATH_SAVE_BM25_EVAL = "Retriever_Results/BM25_results_Abstract_Translated.csv"

# Output directory for the question/passage CSV files.
PATH_SAVE_QUESTIONS_BM25 = 'finetune_PT_PT_100Words_5Passages/' 



## Loading Dataset

It is important to ensure that we do not use the same supporting text more than once.

In [2]:

# Load the three splits as lists of rows (one list of cell values per CSV row).
pira_train = pd.read_csv(PATH_BASE + "train.csv").values.tolist()
pira_val = pd.read_csv(PATH_BASE + "validation.csv").values.tolist()
pira_test = pd.read_csv(PATH_BASE + "test.csv").values.tolist()

# The concatenation is shallow: pira_dataset holds the very same row lists as
# the three splits, so the id appended below is also visible through them.
pira_dataset = pira_train + pira_val + pira_test

# Give every distinct supporting text a 1-based id, in order of first
# appearance, and append that id as a new last element of each row.
# A single dict pass replaces the previous list scans, which were quadratic.
abstract_ids = {}
for row in pira_dataset:
    abstract = row[ABSTRACT_COLUMN]
    # The default is evaluated before insertion, so a new text gets the next id.
    row.append(abstract_ids.setdefault(abstract, len(abstract_ids) + 1))

# [text, id] pairs, one per distinct supporting text.
abstracts = [[abstract, article_id] for abstract, article_id in abstract_ids.items()]

# Documents in the dict form handed to the preprocessor / document store.
dicts = [
    {'content': abstract, 'meta': {'idarticle': article_id}}
    for abstract, article_id in abstracts
]

In [None]:
if DO_PREPROCESSING:
    # Split each supporting text by words, using NUMBER_OF_WORDS as the split
    # length, no overlap, and sentence boundaries respected.
    processor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="word",
        split_length=NUMBER_OF_WORDS,
        split_respect_sentence_boundary=True,
        split_overlap=0
    )

    docs = []
    for cont, dict1 in enumerate(dicts):
        # Progress indicator: print the index of every 100th document.
        if cont % 100 == 0:
            print(cont)
        # extend() appends in place; `docs = docs + doc` copied the whole list
        # on every iteration.
        docs.extend(processor.process(dict1))
else:
    # Without preprocessing, the full supporting texts are used as documents.
    docs = dicts

## Initializing ElasticSearch

To download the Elasticsearch files, uncomment the first three lines of the cell below.

In [None]:
#! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
#! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
#! sudo chown -R daemon:daemon elasticsearch-7.9.2

launch_es()
import os
from subprocess import Popen, PIPE, STDOUT
es_server = Popen(['elasticsearch-7.9.2/bin/elasticsearch'],
                   stdout=PIPE, stderr=STDOUT,
                   preexec_fn=lambda: os.setuid(1)  # as daemon
                  )

# wait until ES has started
! sleep 30

## Creating the document store and writing supporting documents

In [None]:
# Connect to the local Elasticsearch instance; all documents live in the
# INDEX_KNOWLEDGE_BASE index.
document_store = ElasticsearchDocumentStore(
    host="localhost",
    username="",
    password="",
    index=INDEX_KNOWLEDGE_BASE,
)

# Indexing is optional so that later runs can reuse the documents already stored.
if PUSH_DOCUMENTS:
    document_store.write_documents(docs, batch_size=1000)

#document_store.delete_all_documents # Deleting documents if needed

## Creating the Retriever Module

In [None]:
# Sparse retriever that queries the Elasticsearch document store created above.
retriever = ElasticsearchRetriever(document_store=document_store)

# Search pipeline wrapping the retriever; it is run with a query and per-node params.
pipeline = DocumentSearchPipeline(retriever=retriever)


## Testing the retriever

In [None]:
# Smoke test: retrieve the five best-matching passages for one sample question.
question = "O que é o Pré-Sal ?"

result = pipeline.run(query=question, params={"Retriever": {"top_k": 5}})

# Display the raw pipeline output.
result

## Evaluating BM25 performance for multiple k values

This function checks for each question if the supporting text was one of the top k retrieved documents and generates the accuracy.

In [None]:
if DO_EVAL:

    def get_BM25_acc(questions, K_Values):
        """Compute the top-k retrieval accuracy for every k in ``K_Values``.

        Each question is sent through ``pipeline`` once, with ``top_k`` set to
        the largest requested k; the accuracy for a smaller k is derived from
        the first k retrieved documents.

        Args:
            questions: rows whose ``QUESTION_COLUMN`` cell is the query text and
                whose last element is the id of the supporting text.
            K_Values: re-iterable collection of k values (e.g. a list or range).

        Returns:
            A list with one accuracy per k, in the order of ``K_Values``: the
            fraction of questions whose supporting-text id appears among the
            ``idarticle`` values of the first k retrieved documents.
        """
        max_k = max(K_Values)

        retrieved_ids = []
        for line in questions:
            result = pipeline.run(query=line[QUESTION_COLUMN], params={"Retriever": {"top_k": max_k}})
            # Read the metadata from the Document objects directly. Parsing
            # their repr() with ast.literal_eval breaks as soon as a passage
            # contains "'}>" or the repr holds a non-literal value.
            retrieved_ids.append([int(doc.meta["idarticle"]) for doc in result["documents"]])

        gold_ids = [int(line[-1]) for line in questions]

        return [
            sum(gold in ids[:k] for gold, ids in zip(gold_ids, retrieved_ids)) / len(questions)
            for k in K_Values
        ]

    Ks = range(1,101)
    # Rows without a question cannot be queried, so they are dropped first.
    pira_test2 = pd.DataFrame(pira_test)
    test = pira_test2.dropna(subset=[pira_test2.columns[QUESTION_COLUMN]]).values.tolist()
    accs = get_BM25_acc(test, Ks)
    for k, acc in zip(Ks, accs):
        print("accuracy for K = " + str(k) + " -- is =" + str(acc))

    # Create the output directory first; to_csv does not create missing folders.
    eval_dir = os.path.dirname(PATH_SAVE_BM25_EVAL)
    if eval_dir:
        os.makedirs(eval_dir, exist_ok=True)

    df_accs = pd.DataFrame(accs)
    df_accs.to_csv(PATH_SAVE_BM25_EVAL)

## Saving Questions with Retrieved Passages for the Reader

In [None]:

def _retrieve_passages(rows):
    """Return, for each row, the texts of its top NUMBER_OF_PASSAGES retrieved documents."""
    passages_per_row = []
    for row in rows:
        result = pipeline.run(query=row[QUESTION_COLUMN], params={"Retriever": {"top_k": NUMBER_OF_PASSAGES}})
        # Use the Document objects directly instead of parsing their repr(),
        # which fails when a passage contains "'}>" or "<Document: ".
        passages_per_row.append([doc.content for doc in result["documents"]])
    return passages_per_row


def _reader_rows(rows, passages_per_row):
    """Build [question + "  context:" + passages, answer] pairs for one split."""
    return [
        [
            row[QUESTION_COLUMN] + "  context:" + "".join(" " + passage for passage in passages),
            row[ANSWER_COLUMN],
        ]
        for row, passages in zip(rows, passages_per_row)
    ]


if SAVE_QUESTIONS_WITH_PASSAGES:
    os.makedirs(PATH_SAVE_QUESTIONS_BM25, exist_ok=True)

    # Question and retrieved passages in one input string, one file per split.
    train_docs = _reader_rows(pira_train, _retrieve_passages(pira_train))
    train_df = pd.DataFrame(train_docs)
    train_df.to_csv(PATH_SAVE_QUESTIONS_BM25 + "train.csv")

    val_docs = _reader_rows(pira_val, _retrieve_passages(pira_val))
    val_df = pd.DataFrame(val_docs)
    val_df.to_csv(PATH_SAVE_QUESTIONS_BM25 + "val.csv")

    # The test passages are retrieved once and reused for both test files.
    test_passages = _retrieve_passages(pira_test)
    test_docs = _reader_rows(pira_test, test_passages)
    test_df = pd.DataFrame(test_docs)
    test_df.to_csv(PATH_SAVE_QUESTIONS_BM25 + "test.csv")

    # Extractive format: question, answer and the space-joined passages in
    # separate columns. An empty retrieval yields an empty context instead of
    # an IndexError.
    extractive_docs = [
        [row[QUESTION_COLUMN], row[ANSWER_COLUMN], " ".join(passages)]
        for row, passages in zip(pira_test, test_passages)
    ]
    extractive_df = pd.DataFrame(extractive_docs)
    extractive_df.to_csv(PATH_SAVE_QUESTIONS_BM25 + "extractive.csv")


In [None]:
# Preview the first exported test rows instead of dumping the whole list.
test_docs[:5]

In [None]:
# Preview the first raw test rows instead of dumping the whole split.
pira_test[:5]

In [None]:
# Answer cell of the first test row.
pira_test[0][ANSWER_COLUMN]