In [1]:
import json
 
# Opening JSON file
f = open('all_text_ans.json')
 
# returns JSON object as 
# a dictionary
data = json.load(f)

In [2]:
for k,v in data.items():
    if len(v.split("<END>")) != 1001:
        print(k)

## BUILD VECTOR DATABASE


In [3]:
import os
os.chdir("mod_hyde")

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema import Document

def split_text_langchain(text):
    text_splitter = RecursiveCharacterTextSplitter(
        # Set a really small chunk size, just to show.
        # separators = [". "],
        chunk_size=2000,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False)

    splitted_text = text_splitter.split_text(text)
    return splitted_text

pubmed_documents = []

with open("data/pubmed_subset.txt","r") as f:
    pubmed_text = f.read()
pubmed_documents.extend([Document(page_content=pt,metadata={"source":"pubmed"})for pt in pubmed_text.split("<END>")[:-1]])

wikipedia_documents = []
with open("data/wikipedia_filtered.txt","r") as f:
    wiki_data = f.read()
wiki_data_list = wiki_data.split("-----------")[:-1]
wiki_data_list = [at.strip() for at in wiki_data_list]
all_text_list = []
for text in wiki_data_list:
    splitted_text = split_text_langchain(text)
    all_text_list.extend(splitted_text)
wikipedia_documents.extend([Document(page_content=wt,metadata={"source":"wiki"})for wt in all_text_list])

In [6]:
# dataset_name, decoding_type,format_or_not,rank, checkpoint
from langchain_community.retrievers import BM25Retriever

TOP_K = 2
pubmed_bm25_retriever = BM25Retriever.from_documents(pubmed_documents,k=TOP_K)
# pubmed_bm25_docs = pubmed_bm25_retriever.invoke(question.lower())

wiki_bm25_retriever = BM25Retriever.from_documents(wikipedia_documents,k=TOP_K)
# wiki_bm25_docs = wiki_bm25_retriever.invoke(question.lower())

In [7]:
def get_questions(path):
    with open(path,"r") as f:
        questions = f.readlines()
    return questions

In [8]:
pubmed_formatted_questions = get_questions("data/pubmed_formatted_qs.txt")
pubmed_questions = get_questions("data/pubmed_qs.txt")

wikipedia_formatted_question = get_questions("data/wikipedia_formatted_qs.txt")
wikipedia_questions = get_questions("data/wikipedia_qs.txt")
pubmed_checkpoints = ['561','1122','1683']
wikipedia_checkpoints = ['831','1662','2493']

In [9]:
def question_answer_pair(dataset_name:str,decoding_type:str,format_or_not:bool,checkpoint:str,rank:str):
    assert dataset_name in ['pubmed','wikipedia'], "Dataset name must be pubmed or wikipedia"
    assert decoding_type in ['greedy','nucleus'], "Decoding type must be nucleus or greedy"
    assert format_or_not in [True,False], "Format or not must be True or False"
    if dataset_name == 'pubmed':
        assert checkpoint in pubmed_checkpoints, "The valid checkpoints for Pubmed are ['561','1122','1683']"
    elif dataset_name == 'wikipedia':
        assert checkpoint in wikipedia_checkpoints, "The valid checkpoints for Wikipedia are ['831','1662','2493']"
    assert rank in ['8','32','128']
    for key,text in data.items():
        if format_or_not:
            if "formatted" in key:
                if decoding_type in key and checkpoint in key and f"_{rank}_" in key:
                    if dataset_name  == "pubmed":
                        answers_list = text.split("<END>")[:-1]
                        assert len(pubmed_formatted_questions) == len(answers_list), f"Pubmed: {len(pubmed_formatted_questions)} != {len(answers_list)}"
                        return zip(pubmed_formatted_questions,answers_list)
                    if dataset_name  == "wikipedia":
                        answers_list = text.split("<END>")[:-1]
                        assert len(wikipedia_formatted_question) == len(answers_list), f"Pubmed: {len(wikipedia_formatted_question)} != {len(answers_list)}"
                        return zip(wikipedia_formatted_question,answers_list)
        elif not format_or_not:
                if decoding_type in key and checkpoint in key and f"_{rank}_" in key:
                    if dataset_name  == "pubmed":
                        answers_list = text.split("<END>")[:-1]
                        assert len(pubmed_questions) == len(answers_list), f"Pubmed: {len(pubmed_questions)} != {len(answers_list)}"
                        return zip(pubmed_questions,answers_list)
                    if dataset_name  == "wikipedia":
                        answers_list = text.split("<END>")[:-1]
                        assert len(wikipedia_questions) == len(answers_list), f"Pubmed: {len(wikipedia_questions)} != {len(answers_list)}"
                        return zip(wikipedia_questions,answers_list)

In [10]:
zipped_elem = question_answer_pair("pubmed","nucleus",False,'1122','128')
for q,a in zipped_elem:
    print(q)
    print(a)
    pubmed_bm25_docs = pubmed_bm25_retriever.invoke(a.lower())
    wiki_bm25_docs = wiki_bm25_retriever.invoke(a)
    print(pubmed_bm25_docs)
    print(wiki_bm25_docs)
    break

Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?

We examined whether mitochondria can control the programmed cell death of leaf fibers, using microsurgical surgical techniques and tissue culture. We collected 107 fresh leaves from various plants of the lace plant, which were then cut, fixed and placed on a cover sheet. Sixty of the leaves were used in in situ cell death assay (in vitro) under standard circumstances and sixteen were used in live plant culture. Mitochondria were detected with the flow cytometry technique. The

[Document(page_content='Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the 

## PUBMED EVALUATION

In [63]:
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-3.5-turbo-0125", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-3.5-turbo-0125", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}

In [12]:
import dspy
from dotenv import load_dotenv

load_dotenv(override=True)
llm = dspy.OpenAI(model="gpt-3.5-turbo-0125",max_tokens = 1000)
dspy.settings.configure(lm=llm)

In [13]:
class PubMedGenerateAnswer(dspy.Signature):
    """Answer questions in detail based on the context."""
    
    context = dspy.InputField(prefix="Relevant facts: ",desc="may contain relevant facts")
    question = dspy.InputField(prefix="Question: ")
    answer = dspy.OutputField(prefix="Answer",desc="answer in detail from the provided context")
    result: bool = dspy.OutputField(prefix="Result: ",desc="Answer to the question from either yes, no or maybe.")

class WikipediaGenerateAnswer(dspy.Signature):
    """Answer questions in detail based on the context."""

    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField(desc="question that needs to be answered")
    answer = dspy.OutputField(desc="answer in detail from the provided context")

In [23]:
os.environ['DSP_CACHEBOOL'] = 'false'

In [48]:
class RAG:
    def __init__(self,dataset_name:str,decoding_type:str,format_or_not,checkpoint,rank) -> None:
        self.zipped_elem = question_answer_pair(dataset_name,decoding_type,format_or_not,checkpoint,rank)
        if dataset_name == "pubmed":
            self.retriever = pubmed_bm25_retriever
            self.LLManswer = dspy.Predict(PubMedGenerateAnswer)
        elif dataset_name == "wikipedia":
            self.retriever = wiki_bm25_retriever
            self.LLManswer = dspy.Predict(WikipediaGenerateAnswer)

    def __call__(self,*args,**kwargs):
        return self.forward(*args,**kwargs)
    
    def forward(self):
        
        for ques,mod_hyde_ans in self.zipped_elem:
            print(ques)
            print(mod_hyde_ans)
            bm25_docs = pubmed_bm25_retriever.invoke(mod_hyde_ans)
            context = " ".join([ctx.page_content for ctx in bm25_docs])
            final_answer = self.LLManswer(context=context,question=ques)
            i+=1
        return final_answer

In [49]:
rag = RAG("wikipedia","nucleus",True,'1662','32')

In [50]:
ans = rag()

airport is located in Maine, Sacramento International Airport or Knox County Regional Airport

. The airport is served by US Airways and United Airlines; it is used by 30 airlines worldwide and with an average in flight schedule of over 400 flights daily.
In 2003, the International Airport Authority of San Francisco voted to add a third runway to San Francisco International Airport with an extension, approximately 22 miles (35 km) long, to the existing runway I-510. The proposal was approved by the Board

Peter Hobbs founded the company that is based in what town in Manchester


?
He is best known for being a bass guitar player for the rock band The Clash in 1981. The Clash were formed in London in 1977, led by singer Billy Gibbons and guitarist Mick Jagger. They were known for their progressive rock sound and later, their punk and grunge-influenced sound. The Clash were among the pioneers of the British new wave music. They formed a quartet in 1980 that was



In [51]:
ans

Prediction(
    answer="I'm sorry, but the provided context does not contain any information related to Peter Hobbs founding a company based in a town in Manchester."
)