In [1]:
import getpass
import os

import pandas as pd
from datasets import Dataset
from langchain.schema import (
    SystemMessage,
    HumanMessage
)
from langchain_openai import ChatOpenAI
from opensearchpy import OpenSearch
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Keep an API key that is already exported in the environment; only ask for
# one (without echoing it or storing it in the notebook) when none is set.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

<h2>Infrastructure</h2>

In [4]:
# Chat model used to generate the answers. The key is taken from the
# OPENAI_API_KEY environment variable (set in the cell above) so that no
# secret has to be written into the notebook itself.
chat = ChatOpenAI(
    openai_api_key=os.environ["OPENAI_API_KEY"],
    model='gpt-3.5-turbo-0125'
)

# Sentence-embedding model used to embed queries for the k-NN retrieval.
embedding_model_name = 'intfloat/e5-base-v2'
embedding_model = SentenceTransformer(embedding_model_name)

In [5]:
# Initialize the connection to OpenSearch.
# Every setting can be overridden through an environment variable; the
# fallbacks are the values this notebook used originally (local node,
# admin/admin), so running it unchanged behaves as before.
host = os.environ.get("OPENSEARCH_HOST", "localhost")
port = int(os.environ.get("OPENSEARCH_PORT", "9200"))
auth = (
    os.environ.get("OPENSEARCH_USER", "admin"),
    os.environ.get("OPENSEARCH_PASSWORD", "admin"),
)

client = OpenSearch(
    hosts=[{'host': host, 'port': port}],
    http_auth=auth,
    use_ssl=True,
    # NOTE(review): TLS is on but the server certificate is not verified.
    # Acceptable for a local test node only - enable for any shared cluster.
    verify_certs=False,
    timeout=100,
)
# Check status: prints the cluster info returned by the node.
print(client.info())



{'name': 'opensearch-node1_med', 'cluster_name': 'opensearch-cluster', 'cluster_uuid': 'DbPN66O6QdGNwomjb1ufRQ', 'version': {'distribution': 'opensearch', 'number': '2.11.1', 'build_type': 'tar', 'build_hash': '6b1986e964d440be9137eba1413015c31c5a7752', 'build_date': '2023-11-29T21:43:10.135035992Z', 'build_snapshot': False, 'lucene_version': '9.7.0', 'minimum_wire_compatibility_version': '7.10.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'The OpenSearch Project: https://opensearch.org/'}


<h2>The pipeline IR: KNN</h2>

In [7]:
def pipeline_knn(query, k=3, index="med_data_strategy_3"):
    """Answer ``query`` from passages retrieved with a k-NN vector search.

    The query is embedded with the module-level ``embedding_model``, the
    nearest passages are fetched from OpenSearch through the module-level
    ``client``, and the module-level ``chat`` model is asked to answer the
    question given those passages.

    Args:
        query: The question to answer, as plain text.
        k: Number of passages to retrieve. Defaults to 3.
        index: Name of the OpenSearch index to search.

    Returns:
        A dict with two keys: ``"Answer"`` holds the text content of the
        model's reply and ``"Context"`` holds the list of retrieved passage
        texts, in the order OpenSearch returned them.
    """
    # Embed the query. The text is prefixed with "query: " before encoding.
    # NOTE(review): presumably the prefix the e5 model expects for search
    # queries - confirm the indexed passages were embedded to match.
    embedded_query = embedding_model.encode("query: " + query)

    # Retrieve data. The outer "vector" key is the name of the index field,
    # the inner one is the query embedding. "size" is set explicitly so the
    # number of hits is pinned to k, as in the BM25 pipeline.
    knn_search_body = {
        "size": k,
        "query": {
            "knn": {
                "vector": {
                    "vector": embedded_query,
                    "k": k,
                }
            }
        },
    }
    response = client.search(index=index, body=knn_search_body)

    # Collect the passage texts and build the numbered context string.
    contextList = [hit['_source']['text'] for hit in response['hits']['hits']]
    context = "".join(
        f""" Context {position}: {text}"""
        for position, text in enumerate(contextList)
    )

    # Use GPT 3.5 Turbo 0125 to generate the answer
    augmented_prompt = f"""Answer the question with the given context.
    Question: {query}
    {context}
    """
    messages = [
        SystemMessage(content="You are a friendly assistant that will answer questions"),
        HumanMessage(content=augmented_prompt),
    ]

    # invoke() replaces the deprecated direct call chat(messages).
    res = chat.invoke(messages)

    return {"Answer": res.content, "Context": contextList}





In [6]:
def pipeline_bm25(query, k=3, index="med_data_strategy_3"):
    """Answer ``query`` from passages retrieved with a full-text match query.

    The passages are fetched from OpenSearch through the module-level
    ``client`` by matching ``query`` against the ``text`` field, and the
    module-level ``chat`` model is asked to answer the question given those
    passages.

    Args:
        query: The question to answer, as plain text.
        k: Number of passages to retrieve. Defaults to 3.
        index: Name of the OpenSearch index to search.

    Returns:
        A dict with two keys: ``"Answer"`` holds the text content of the
        model's reply and ``"Context"`` holds the list of retrieved passage
        texts, in the order OpenSearch returned them.
    """
    # Retrieve data. Score explanations are not requested: nothing in this
    # function reads them, so they only made the response larger.
    text_search_body = {
        "size": k,
        "query": {
            "match": {
                "text": query
            }
        },
    }
    response = client.search(index=index, body=text_search_body)

    # Collect the passage texts and build the numbered context string.
    contextList = [hit['_source']['text'] for hit in response['hits']['hits']]
    context = "".join(
        f""" Context {position}: {text}"""
        for position, text in enumerate(contextList)
    )

    # Use GPT 3.5 Turbo 0125 to generate the answer.
    # The "Contexts:" label now sits directly above the contexts; it used to
    # be placed in front of the question line.
    augmented_prompt = f"""Answer the question with the given context.
    Question: {query}
    Contexts:
    {context}"""
    messages = [
        SystemMessage(content="You are a friendly assistant that will answer questions"),
        HumanMessage(content=augmented_prompt),
    ]

    # invoke() replaces the deprecated direct call chat(messages).
    res = chat.invoke(messages)

    return {"Answer": res.content, "Context": contextList}





<h2>Load question-answer pairs and their respective contexts from file</h2>

In [None]:
import json

# Parse the QA file straight from the open file handle.
with open("./data/QA_set.json", 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Flatten reports -> paragraphs -> QA entries into one record per question.
# Each record carries the paragraph text as its context:
# {"Context", "Question", "Answer"}
dataset = [
    {"Context": paragraph["text"], "Question": QA["Question"], "Answer": QA["Answer"]}
    for report in data["reports"]
    for paragraph in report["paragraphs"]
    for QA in paragraph["QA"]
]

In [34]:
# Build the evaluation set from the test-set CSV. `dataset` is rebuilt from
# scratch here, replacing whatever an earlier cell stored under that name.
N_QUESTIONS = 99  # use at most this many rows from the top of the file

df = pd.read_csv("./testset.csv")

# head() stops at the end of the frame, so a CSV with fewer than N_QUESTIONS
# rows yields a shorter dataset instead of failing on a missing row label.
df_top = df.head(N_QUESTIONS)
dataset = [
    {"Question": question, "Context": context, "Answer": answer}
    for question, context, answer in zip(
        df_top["question"], df_top["ground_truth_context"], df_top["ground_truth"]
    )
]

In [31]:
# Sanity check: show the entry at index 98, which is the last one when the
# dataset was built from the 99 CSV rows loaded above.
print(dataset[98])

{'Question': 'What is the connection between therapeutic angiogenesis and regeneration in the youngest organism?', 'Context': "['Intriguingly, the current trend is to consider regeneration as a prerogative of the youngest organism.\\nConsequentially, the embryonic and foetal models are attracting much attention for clinical translation into corrective modalities in the adulthood.\\nScientists seem to undervalue the lesson from Mother Nature, e.g. all humans are born young but very few achieve the goal of an exceptional healthy longevity.\\nEither natural experimentation is driven by a supreme intelligence or stochastic phenomena, one has to accept the evidence that healthy longevity is the fruit of an evolutionary process lasting million years.\\nIt is therefore extremely likely that results of this natural experimentation are more reliable and translatable than the intensive, but very short human investigation on mechanisms governing repair and regeneration.\\nWith this preamble in mi

Create Ragas dataset

In [35]:

def create_ragas_dataset(dataset, pipeline=None):
    """Run a RAG pipeline over every question and collect the results.

    Args:
        dataset: Iterable of dicts that each provide a ``"Question"`` and an
            ``"Answer"`` (the ground truth) entry.
        pipeline: Callable that takes the question text and returns a dict
            with ``"Answer"`` and ``"Context"`` entries, such as
            ``pipeline_bm25`` or ``pipeline_knn``. When omitted,
            ``pipeline_bm25`` is used, which was the only behaviour before
            this parameter existed.

    Returns:
        A ``Dataset`` with the columns ``question``, ``ground_truths``,
        ``answer`` and ``contexts``, one row per input question.
    """
    # Resolved at call time so the default does not have to exist when this
    # function is defined.
    if pipeline is None:
        pipeline = pipeline_bm25

    rag_rows = []
    for qa_pair in tqdm(dataset):
        # Here we call the pipeline to generate the answer
        result = pipeline(qa_pair["Question"])
        rag_rows.append({
            "question": qa_pair["Question"],
            # The ground truth is wrapped in a one-element list
            "ground_truths": [qa_pair["Answer"]],
            "answer": result["Answer"],
            # Copy so the row does not alias the pipeline's own list
            "contexts": list(result["Context"]),
        })

    rag_df = pd.DataFrame(rag_rows)
    return Dataset.from_pandas(rag_df)

In [36]:
# Generate an answer for every question in `dataset`. This is the slow step:
# the recorded run took about 7 minutes for 99 questions.
basic_qa_ragas_dataset = create_ragas_dataset(dataset)

100%|██████████| 99/99 [07:00<00:00,  4.24s/it]


In [37]:
# Display the dataset summary (column names and row count).
basic_qa_ragas_dataset

Dataset({
    features: ['question', 'ground_truths', 'answer', 'contexts'],
    num_rows: 99
})

In [38]:
# Store the generated answers as a Dataset on disk.
# NOTE(review): despite the ".parquet" suffix this goes through save_to_disk,
# which reports writing shards - presumably a dataset directory rather than a
# single parquet file, to be read back with load_from_disk; confirm and
# consider renaming the path.
basic_qa_ragas_dataset.save_to_disk("./ragasData_BM25.parquet")

Saving the dataset (1/1 shards): 100%|██████████| 99/99 [00:00<00:00, 3820.65 examples/s]
