## The following link helped me create this notebook
https://github.com/psorianom/DPR/blob/master/dpr2hf/squad2dpr.py

### Note: this notebook was run locally

In [1]:
import requests

In [2]:
# Sanity-check that the local Elasticsearch node answers before indexing.
# The timeout keeps this cell from hanging forever when the server is down.
requests.get("http://localhost:9200/_cluster/health", timeout=10).json()

{'cluster_name': 'elasticsearch',
 'status': 'yellow',
 'timed_out': False,
 'number_of_nodes': 1,
 'number_of_data_nodes': 1,
 'active_primary_shards': 4,
 'active_shards': 4,
 'relocating_shards': 0,
 'initializing_shards': 0,
 'unassigned_shards': 3,
 'delayed_unassigned_shards': 0,
 'number_of_pending_tasks': 0,
 'number_of_in_flight_fetch': 0,
 'task_max_waiting_in_queue_millis': 0,
 'active_shards_percent_as_number': 57.14285714285714}

# Imports

In [1]:
from time import sleep

from typing import List, Dict, Iterator

from elasticsearch import Elasticsearch

from haystack.retriever.sparse import ElasticsearchRetriever
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore

from tqdm import tqdm
import json
import random
import re



In [2]:
import json

# Read the QuAC train file. The encoding is given explicitly so the result
# does not depend on the platform's default text encoding.
with open('./data/quacdata/train_v0.2.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [3]:
# The article records live under the top-level "data" key of the loaded JSON.
quac_data_ = data["data"]

In [4]:
# Dataset version tag handed to create_dpr_training_dataset further down.
quac_data_version = "0.2"

# Preparing data

In [5]:
def convert_quac_to_dicts(quac_data: List[dict]) -> List[dict]:
    """Flatten QuAC articles into the document dicts written to the document store.

    Args:
        quac_data: List of article dicts (not a single dict). Each article
            needs a "title" and a "paragraphs" list whose items carry a
            "context" string.

    Returns:
        One ``{"text": <context>, "meta": {"name": <name>}}`` dict per
        paragraph, in input order, where ``<name>`` is
        "<article title>_<paragraph index within the article>".
    """
    return [
        {"text": paragraph["context"], "meta": {"name": f"{article['title']}_{para_idx}"}}
        for article in quac_data
        for para_idx, paragraph in enumerate(article["paragraphs"])
    ]


In [6]:
# One document dict per QuAC paragraph, ready to be indexed.
documents = convert_quac_to_dicts(quac_data=quac_data_)

In [None]:
#documents

In [7]:
def launch_and_index_es(documents_dicts: List):
    """Index the given documents in Elasticsearch and hand back a retriever.

    The documents are written to the "document" index of the Elasticsearch
    instance on localhost, and an ElasticsearchRetriever built on that same
    document store is returned.
    """
    es_store = ElasticsearchDocumentStore(
        host="localhost",
        username="",
        password="",
        index="document",
    )
    es_store.write_documents(documents_dicts)
    return ElasticsearchRetriever(document_store=es_store)

In [8]:
# Index the paragraph documents and keep a retriever over them.
retriever = launch_and_index_es(documents_dicts=documents)

12/08/2021 15:54:54 - INFO - elasticsearch -   HEAD http://localhost:9200/ [status:200 request:0.004s]
12/08/2021 15:54:54 - INFO - elasticsearch -   HEAD http://localhost:9200/document [status:200 request:0.001s]
12/08/2021 15:54:54 - INFO - elasticsearch -   GET http://localhost:9200/document [status:200 request:0.000s]
12/08/2021 15:54:54 - INFO - elasticsearch -   PUT http://localhost:9200/document/_mapping [status:200 request:0.004s]
12/08/2021 15:54:54 - INFO - elasticsearch -   HEAD http://localhost:9200/label [status:200 request:0.001s]
12/08/2021 15:54:56 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.044s]
12/08/2021 15:54:57 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.065s]
12/08/2021 15:54:58 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.044s]
12/08/2021 15:54:59 - INFO - elasticsearch -   POST http://localhost:9200/

In [None]:
def prepare_es_retrieval(quac_data: List[dict]):
    """Convert QuAC articles to document dicts, index them, and return the retriever.

    Args:
        quac_data: List of QuAC article dicts (the structure
            ``convert_quac_to_dicts`` iterates over), not a single dict.

    Returns:
        The retriever produced by ``launch_and_index_es``.
    """
    documents = convert_quac_to_dicts(quac_data=quac_data)
    return launch_and_index_es(documents)

In [None]:
def get_hard_negative_context(
    retriever: ElasticsearchRetriever,
    question: str,
    answer: str,
    n_ctxs: int = 10,
    n_chars: int = 600,
):
    """Collect hard-negative contexts for a question.

    Queries the "document" index with the question for up to ``n_ctxs``
    documents and keeps those whose text does not contain the answer
    (case-insensitive substring check). Each kept document becomes a
    ``{"title": <meta name>, "text": <first n_chars characters>}`` dict,
    in retrieval order.
    """
    hard_negatives = []
    for candidate in retriever.retrieve(query=question, top_k=n_ctxs, index="document"):
        title, body = candidate.meta["name"], candidate.text
        # A document that mentions the answer is not a negative; leave it out.
        if answer.lower() not in body.lower():
            hard_negatives.append({"title": title, "text": body[:n_chars]})
    return hard_negatives

In [None]:
def create_dpr_training_dataset(quac_data: list,  quac_data_version: str):
    """Build DPR-formatted training examples from QuAC articles.

    All paragraphs are first indexed in Elasticsearch (via
    ``prepare_es_retrieval``). Then, for every question, the paragraph it
    belongs to becomes the positive context and hard negatives are retrieved
    with ``get_hard_negative_context``, filtered against the first answer.

    Args:
        quac_data: List of article dicts, each with a "title" and a
            "paragraphs" list; every paragraph has a "context" and a "qas"
            list of questions with "question" and "answers" entries.
        quac_data_version: Currently unused; kept so existing callers keep
            working.

    Returns:
        A list with one dict per question, holding the keys "question",
        "answers", "positive_ctxs", "negative_ctxs" (always empty) and
        "hard_negative_ctxs". Articles are visited in shuffled order.
    """
    retriever = prepare_es_retrieval(quac_data=quac_data)

    # Shuffle a copy: random.shuffle works in place and would otherwise
    # reorder the caller's list as a side effect.
    articles = list(quac_data)
    random.shuffle(articles)

    list_DPR = []

    for article in tqdm(articles, unit="article"):
        article_title = article["title"]
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]
            for question in paragraph["qas"]:
                answers = [a["text"] for a in question["answers"]]
                if answers:
                    hard_negative_ctxs = get_hard_negative_context(retriever=retriever,
                                                                   question=question["question"],
                                                                   answer=answers[0],
                                                                   n_ctxs=10)
                else:
                    # Without an answer there is nothing to filter retrieved
                    # documents against, so emit no hard negatives rather
                    # than failing on answers[0].
                    hard_negative_ctxs = []
                list_DPR.append({
                    "question": question["question"],
                    "answers": answers,
                    "positive_ctxs": [{"title": article_title, "text": context}],
                    "negative_ctxs": [],
                    "hard_negative_ctxs": hard_negative_ctxs
                })

    return list_DPR

In [None]:
# Index the corpus and build the DPR examples for every QuAC question.
dpr_data = create_dpr_training_dataset(
    quac_data=quac_data_,
    quac_data_version=quac_data_version,
)

In [None]:
# Show which fields the first generated example carries.
dpr_data[0].keys()

In [None]:
# Question text of the first example.
dpr_data[0]["question"]

In [None]:
# Answer texts of the first example.
dpr_data[0]["answers"]

In [None]:
# Positive context (the source paragraph) of the first example.
dpr_data[0]["positive_ctxs"]

In [None]:
# Retrieved hard negatives of the first example.
dpr_data[0]["hard_negative_ctxs"]

In [None]:
# Save the DPR-formatted examples to disk. The context manager closes the
# file even if serialization raises part-way through.
with open("DPR_formated.json", "w", encoding="utf-8") as dpr_file:
    json.dump(dpr_data, dpr_file)