## The following link helped me create this notebook
https://github.com/psorianom/DPR/blob/master/dpr2hf/squad2dpr.py

### Note: this notebook was run locally

In [1]:
import json

# Load the ShARC training file. JSON text is UTF-8, so decode it explicitly
# instead of relying on the platform's default encoding.
with open('./data/sharc data/sharc1-official/json/sharc_train.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [2]:
#data

In [3]:
# Inspect one raw record to see which fields each entry carries.
data[69]

{'utterance_id': '00c6a8465597d51df14e690bf72da377b2e2f68f',
 'tree_id': 'da6ff4de766c67f108c370a4f837c78b9889739c',
 'source_url': 'https://www.benefits.gov/benefits/benefit-details/347',
 'snippet': 'Eligible applicants may obtain direct loans for up to a maximum indebtedness of $300,000, and guaranteed loans for up to a maximum indebtedness of $1,392,000 (amount adjusted annually for inflation).',
 'question': 'Does this loan meet my needs?',
 'scenario': 'I doe not need a direct loan and needs to borrow $2,000,000. ',
 'answer': 'No',
 'history': [{'follow_up_question': 'Do you need a direct loan?',
   'follow_up_answer': 'No'},
  {'follow_up_question': 'Do you need to borrow more than $1,392,000?',
   'follow_up_answer': 'Yes'}],
 'evidence': []}

In [4]:
# Number of records loaded from the training file.
len(data)

21890

# Putting ShARC data in QuAC format

In [None]:
# Schema sketch of the QuAC-style layout built by the conversion cell below.
# The values are placeholder types, not data: that cell fills every leaf with
# a string, and 'answers' is a list of {'text', 'answer_start'} records.
quac_format = {
    'data': [{
        'paragraphs': [{
            'context': str,
            'qas': [{
                'followup': str,
                'yesno': str,
                'question': str,
                'answers': [{
                    'text': str,
                    'answer_start': str,
                }],
                'id': str,
                'orig_answer': str,
            }],
            'id': str,
        }],
        'section_title': str,
        'background': str,
        'title': str,
    }]
}

In [5]:
def _utterance_to_quac_article(utterance):
    """Wrap one record in a single-paragraph, single-question QuAC-style article.

    Only the 'snippet', 'question' and 'answer' fields are carried over; every
    other slot of the layout is filled with an empty string.
    """
    context = utterance['snippet']
    question = utterance['question']
    answer = utterance['answer']

    answer_record = {'text': answer, 'answer_start': ''}
    qa_record = {
        'followup': '',
        'yesno': '',
        'question': question,
        'answers': [answer_record],
        'id': '',
        'orig_answer': '',
    }
    paragraph_record = {'context': context, 'qas': [qa_record], 'id': ''}
    return {
        'paragraphs': [paragraph_record],
        'section_title': '',
        'background': '',
        'title': '',
    }


# One article per input record, in the same order as `data`.
quac_formated = [_utterance_to_quac_article(utterance) for utterance in data]

In [6]:
# Sanity check: the conversion emits one article per record, so this should equal len(data).
len(quac_formated)

21890

In [7]:
# Wrap the article list under the top-level 'data' key of the QuAC layout.
quac_formated_data = dict(data=quac_formated)

# Converting ShARC to DPR format

In [8]:
import requests

In [9]:
# Confirm the local Elasticsearch node is up before indexing anything.
# The timeout stops the cell from blocking forever if the node never answers.
requests.get("http://localhost:9200/_cluster/health", timeout=10).json()

{'cluster_name': 'elasticsearch',
 'status': 'yellow',
 'timed_out': False,
 'number_of_nodes': 1,
 'number_of_data_nodes': 1,
 'active_primary_shards': 4,
 'active_shards': 4,
 'relocating_shards': 0,
 'initializing_shards': 0,
 'unassigned_shards': 3,
 'delayed_unassigned_shards': 0,
 'number_of_pending_tasks': 0,
 'number_of_in_flight_fetch': 0,
 'task_max_waiting_in_queue_millis': 0,
 'active_shards_percent_as_number': 57.14285714285714}

In [10]:
from time import sleep

from typing import List, Dict, Iterator

from elasticsearch import Elasticsearch

from haystack.retriever.sparse import ElasticsearchRetriever
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore

from tqdm import tqdm
import json
import random
import re



In [11]:
def convert_quac_to_dicts(quac_data:dict):
    """Flatten QuAC-style articles into document dicts for the document store.

    `quac_data` is iterated as a sequence of articles (each with a "title" and
    a list of "paragraphs"), despite the `dict` annotation. Every paragraph
    becomes one {"text", "meta"} entry whose meta "name" is the article title
    joined to the paragraph's position by an underscore.
    """
    es_docs = []
    for entry in quac_data:
        title = entry["title"]
        es_docs.extend(
            {"text": para["context"], "meta": {"name": f"{title}_{position}"}}
            for position, para in enumerate(entry["paragraphs"])
        )
    return es_docs


In [12]:
def launch_and_index_es(documents_dicts: List):
    """Write the documents to the local Elasticsearch store and return a retriever.

    Connects to host "localhost" with empty credentials, writes
    `documents_dicts` into the "document" index, and wraps that store in an
    ElasticsearchRetriever.
    """
    es_store = ElasticsearchDocumentStore(
        host="localhost",
        username="",
        password="",
        index="document",
    )
    es_store.write_documents(documents_dicts)
    return ElasticsearchRetriever(document_store=es_store)

In [13]:
def prepare_es_retrieval(quac_data: Dict):
    """Index the paragraphs of `quac_data` and return a retriever over them.

    Passes `quac_data` through `convert_quac_to_dicts` and hands the resulting
    documents to `launch_and_index_es`, whose return value is returned as is.
    """
    return launch_and_index_es(convert_quac_to_dicts(quac_data=quac_data))

In [14]:
def get_hard_negative_context(retriever: "ElasticsearchRetriever", question: str, positive_ctxs: str,
                              n_ctxs: int = 10, n_chars: int = 600):
    """Retrieve hard-negative passages for a question.

    Queries the "document" index for the `n_ctxs` best matches to `question`
    and keeps the ones that do not contain the positive passage.

    Args:
        retriever: object whose `retrieve(query=..., top_k=..., index=...)`
            returns documents exposing `.text` and `.meta["name"]`.
        question: query text sent to the retriever.
        positive_ctxs: text of the positive passage (a single string, despite
            the plural name). Any retrieved document that contains it,
            compared case-insensitively, is discarded.
        n_ctxs: number of documents requested from the retriever, so at most
            this many negatives are returned.
        n_chars: each returned text is cut to this many characters.

    Returns:
        A list of {"title", "text"} dicts in retrieval order, with no text
        appearing more than once.
    """
    positive_lowered = positive_ctxs.lower()  # loop-invariant, computed once
    seen_texts = set()
    hard_negatives = []
    for doc in retriever.retrieve(query=question, top_k=n_ctxs, index="document"):
        if positive_lowered in doc.text.lower():
            continue
        snippet = doc.text[:n_chars]
        # Identical passages may be indexed more than once (nothing upstream
        # deduplicates them); emit each distinct text a single time.
        if snippet in seen_texts:
            continue
        seen_texts.add(snippet)
        hard_negatives.append({"title": doc.meta["name"], "text": snippet})

    return hard_negatives

In [15]:
def create_dpr_training_dataset(quac_data: list, quac_data_version: str):
    """Build DPR-formatted training records from QuAC-style articles.

    The articles are first handed to `prepare_es_retrieval` to obtain a
    retriever. Then, for every question, the paragraph it belongs to becomes
    the positive context and the passages returned by
    `get_hard_negative_context` for the question text become hard negatives.

    Args:
        quac_data: list of article dicts, each with a "title" and a list of
            "paragraphs" holding a "context" and its "qas". The list itself is
            not modified; a shuffled copy is iterated instead.
        quac_data_version: accepted for backward compatibility; not used.

    Returns:
        A list with one dict per question, with keys "question", "answers",
        "positive_ctxs", "negative_ctxs" (always empty) and
        "hard_negative_ctxs".
    """
    retriever = prepare_es_retrieval(quac_data=quac_data)

    # Shuffle a copy: shuffling in place would silently reorder the caller's
    # list. The order depends on the global `random` state, so call
    # random.seed(...) beforehand for a reproducible run.
    articles = list(quac_data)
    random.shuffle(articles)

    list_DPR = []
    for article in tqdm(articles, unit="article"):
        article_title = article["title"]
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                answers = [answer["text"] for answer in qa["answers"]]
                positive_ctxs = [{"title": article_title, "text": context}]
                hard_negative_ctxs = get_hard_negative_context(
                    retriever=retriever,
                    question=qa["question"],
                    positive_ctxs=context,
                    n_ctxs=10,
                )
                list_DPR.append({
                    "question": qa["question"],
                    "answers": answers,
                    "positive_ctxs": positive_ctxs,
                    "negative_ctxs": [],
                    "hard_negative_ctxs": hard_negative_ctxs,
                })

    return list_DPR

In [None]:
# Index the passages and build the DPR records; this needs the Elasticsearch
# node on localhost to be running. `quac_data_version` is a placeholder value:
# the function does not use it.
dpr_data= create_dpr_training_dataset(quac_data= quac_formated_data['data'], quac_data_version= 'bla')

In [None]:
# Inspect one generated DPR record (question, answers, positive and hard-negative contexts).
dpr_data[4]

In [None]:
# Number of DPR records produced (one per question).
len(dpr_data)

# Saving the DPR data to a file

In [None]:
# Write the DPR records to disk. The context manager closes the file even if
# serialization fails part-way through.
with open("DPR_formated_Sharc.json", "w", encoding="utf-8") as file:
    json.dump(dpr_data, file)