In [1]:
import json
 
# Opening JSON file
f = open('all_text_ans.json')
 
# returns JSON object as 
# a dictionary
data = json.load(f)

In [2]:
for k,v in data.items():
    if len(v.split("<END>")) != 1001:
        print(k)

## BUILD VECTOR DATABASE


In [3]:
import os
os.chdir("mod_hyde")

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema import Document

def split_text_langchain(text):
    text_splitter = RecursiveCharacterTextSplitter(
        # Set a really small chunk size, just to show.
        # separators = [". "],
        chunk_size=2000,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False)

    splitted_text = text_splitter.split_text(text)
    return splitted_text

pubmed_documents = []

with open("data/pubmed_subset.txt","r") as f:
    pubmed_text = f.read()
pubmed_documents.extend([Document(page_content=pt,metadata={"source":"pubmed"})for pt in pubmed_text.split("<END>")[:-1]])

wikipedia_documents = []
with open("data/wikipedia_filtered.txt","r") as f:
    wiki_data = f.read()
wiki_data_list = wiki_data.split("-----------")[:-1]
wiki_data_list = [at.strip() for at in wiki_data_list]
all_text_list = []
for text in wiki_data_list:
    splitted_text = split_text_langchain(text)
    all_text_list.extend(splitted_text)
wikipedia_documents.extend([Document(page_content=wt,metadata={"source":"wiki"})for wt in all_text_list])

In [5]:
# dataset_name, decoding_type,format_or_not,rank, checkpoint
from langchain_community.retrievers import BM25Retriever

TOP_K = 2
pubmed_bm25_retriever = BM25Retriever.from_documents(pubmed_documents,k=TOP_K)
# pubmed_bm25_docs = pubmed_bm25_retriever.invoke(question.lower())

wiki_bm25_retriever = BM25Retriever.from_documents(wikipedia_documents,k=TOP_K)
# wiki_bm25_docs = wiki_bm25_retriever.invoke(question.lower())

In [6]:
def get_questions(path):
    with open(path,"r") as f:
        questions = f.readlines()
    return questions

In [7]:
pubmed_formatted_questions = get_questions("data/pubmed_formatted_qs.txt")
pubmed_questions = get_questions("data/pubmed_qs.txt")

wikipedia_formatted_question = get_questions("data/wikipedia_formatted_qs.txt")
wikipedia_questions = get_questions("data/wikipedia_qs.txt")
pubmed_checkpoints = ['561','1122','1683']
wikipedia_checkpoints = ['831','1662','2493']

In [8]:
def question_answer_pair(dataset_name:str,decoding_type:str,format_or_not:bool,checkpoint:str,rank:str):
    assert dataset_name in ['pubmed','wikipedia'], "Dataset name must be pubmed or wikipedia"
    assert decoding_type in ['greedy','nucleus'], "Decoding type must be nucleus or greedy"
    assert format_or_not in [True,False], "Format or not must be True or False"
    if dataset_name == 'pubmed':
        assert checkpoint in pubmed_checkpoints, "The valid checkpoints for Pubmed are ['561','1122','1683']"
    elif dataset_name == 'wikipedia':
        assert checkpoint in wikipedia_checkpoints, "The valid checkpoints for Wikipedia are ['831','1662','2493']"
    assert rank in ['8','32','128']
    for key,text in data.items():
        if format_or_not:
            if "formatted" in key:
                if decoding_type in key and checkpoint in key and f"_{rank}_" in key:
                    if dataset_name  == "pubmed":
                        answers_list = text.split("<END>")[:-1]
                        assert len(pubmed_formatted_questions) == len(answers_list), f"Pubmed: {len(pubmed_formatted_questions)} != {len(answers_list)}"
                        return zip(pubmed_formatted_questions,answers_list)
                    if dataset_name  == "wikipedia":
                        answers_list = text.split("<END>")[:-1]
                        assert len(wikipedia_formatted_question) == len(answers_list), f"Pubmed: {len(wikipedia_formatted_question)} != {len(answers_list)}"
                        return zip(wikipedia_formatted_question,answers_list)
        elif not format_or_not:
                if decoding_type in key and checkpoint in key and f"_{rank}_" in key:
                    if dataset_name  == "pubmed":
                        answers_list = text.split("<END>")[:-1]
                        assert len(pubmed_questions) == len(answers_list), f"Pubmed: {len(pubmed_questions)} != {len(answers_list)}"
                        return zip(pubmed_questions,answers_list)
                    if dataset_name  == "wikipedia":
                        answers_list = text.split("<END>")[:-1]
                        assert len(wikipedia_questions) == len(answers_list), f"Pubmed: {len(wikipedia_questions)} != {len(answers_list)}"
                        return zip(wikipedia_questions,answers_list)

In [9]:
zipped_elem = question_answer_pair("pubmed","nucleus",False,'1122','128')
for q,a in zipped_elem:
    print(q)
    print(a)
    pubmed_bm25_docs = pubmed_bm25_retriever.invoke(a.lower())
    wiki_bm25_docs = wiki_bm25_retriever.invoke(a)
    print(pubmed_bm25_docs)
    print(wiki_bm25_docs)
    break

Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?

We examined whether mitochondria can control the programmed cell death of leaf fibers, using microsurgical surgical techniques and tissue culture. We collected 107 fresh leaves from various plants of the lace plant, which were then cut, fixed and placed on a cover sheet. Sixty of the leaves were used in in situ cell death assay (in vitro) under standard circumstances and sixteen were used in live plant culture. Mitochondria were detected with the flow cytometry technique. The

[Document(page_content='Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the 

## PUBMED EVALUATION

In [63]:
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-3.5-turbo-0125", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-3.5-turbo-0125", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}

In [11]:
import dspy
from dotenv import load_dotenv

load_dotenv(override=True)
llm = dspy.OpenAI(model="gpt-3.5-turbo-0125",max_tokens = 1000)
dspy.settings.configure(lm=llm)

## COLLECT ANSWERS

#### PUBMED ANSWERS

In [23]:
from datasets import load_dataset

pubmed_data = load_dataset("qiaojin/PubMedQA","pqa_labeled")

In [25]:
pubmed_long_answer = []
pubmed_final_decision = []

for pubmed_train_data in pubmed_data['train']:
    pubmed_long_answer.append(pubmed_train_data['long_answer'])
    pubmed_final_decision.append(pubmed_train_data['final_decision'])

In [27]:
len(pubmed_long_answer),len(pubmed_final_decision)

(1000, 1000)

## WIKIPEDIA ANSWERS

In [28]:
import pandas as pd

wiki_df = pd.read_csv("data/wikipedia_data.csv")
wiki_df['len_count'] = wiki_df.apply(lambda row: len(str(row.text).split(" ")),axis=1)
def gte_100_words(row):
    if row['len_count'] >= 100:
        return True
    else:
        return False

wiki_df = wiki_df[wiki_df.apply(gte_100_words,axis=1)]

In [29]:
hotpot_qa = load_dataset("hotpot_qa","distractor")
shuffled_dataset = hotpot_qa['train'].shuffle(42)
subset_data = shuffled_dataset[0:5000]

Downloading builder script: 100%|██████████| 6.42k/6.42k [00:00<00:00, 18.8MB/s]
Downloading readme: 100%|██████████| 9.19k/9.19k [00:00<00:00, 19.0MB/s]
Downloading data: 100%|██████████| 566M/566M [02:24<00:00, 3.91MB/s]
Downloading data: 100%|██████████| 46.3M/46.3M [00:12<00:00, 3.76MB/s]
Downloading data files: 100%|██████████| 2/2 [02:38<00:00, 79.24s/it] 
Generating train split: 100%|██████████| 90447/90447 [00:24<00:00, 3728.53 examples/s]
Generating validation split: 100%|██████████| 7405/7405 [00:01<00:00, 4859.92 examples/s]
  table = cls._concat_blocks(blocks, axis=0)


In [38]:
subset_data.keys()

dict_keys(['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context'])

In [39]:
val_subset = subset_data['supporting_facts']
titles = []
for sd in val_subset:
    titles.append(sd['title'])
val_questions = subset_data['question']
val_answers = subset_data['answer']
val_context = subset_data['context']

In [40]:
val_wiki_list = list(wiki_df['title'].values)
question_not_there = []
question_ther = []
for idx,title in enumerate(titles):
    for t in title:
        if t not in val_wiki_list:
            question_not_there.append(idx)
            continue

In [42]:
question_there_idx = [id for id in range(len(val_questions)) if id not in question_not_there]

In [44]:
wikipedia_answers_list = []
wikipedia_context_list = []
for t in question_there_idx[:1000]:
    wikipedia_answers_list.append(val_answers[t])
    wikipedia_context_list.append(val_context[t])

## EVALUATION

In [51]:
os.environ['DSP_CACHEBOOL'] = 'false'

In [87]:
class PubMedGenerateAnswer(dspy.Signature):
    """Answer questions in detail based on the context. If the context does not answer the question, then say that you don't know"""
    
    context = dspy.InputField(prefix="Relevant facts: ",desc="may contain relevant facts")
    question = dspy.InputField(prefix="Question: ")
    answer = dspy.OutputField(prefix="Answer",desc="answer in detail from the provided context")
    result: bool = dspy.OutputField(prefix="Result: ",desc="Answer to the question from either yes, no, maybe or don't know.")

class WikipediaGenerateAnswer(dspy.Signature):
    """Answer the factual questions based on the context.
    Don't output anything else, just output the answer.
    If the context does not answer the question, then say that you don't know"""

    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField(desc="question that needs to be answered")
    answer = dspy.OutputField(desc="answer from the provided context")

In [88]:
class RAG:
    def __init__(self,dataset_name:str,decoding_type:str,format_or_not,checkpoint,rank) -> None:
        self.zipped_elem = question_answer_pair(dataset_name,decoding_type,format_or_not,checkpoint,rank)
        if dataset_name == "pubmed":
            self.retriever = pubmed_bm25_retriever
            self.LLManswer = dspy.Predict(PubMedGenerateAnswer)
            self.question_list = pubmed_questions
            self.answers_list = pubmed_final_decision
        elif dataset_name == "wikipedia":
            self.retriever = wiki_bm25_retriever
            self.LLManswer = dspy.Predict(WikipediaGenerateAnswer)
            self.question_list = wikipedia_questions
            self.answers_list = wikipedia_answers_list
        self.format_or_not = format_or_not
    def __call__(self,*args,**kwargs):
        return self.forward(*args,**kwargs)
    
    def forward(self):
        ques_idx = 0
        for ques,mod_hyde_ans in self.zipped_elem:
            # print(ques)
            # print(mod_hyde_ans)
            bm25_docs = self.retriever.invoke(mod_hyde_ans)
            context = " ".join([ctx.page_content for ctx in bm25_docs])
            if self.format_or_not:
                ques = self.question_list[ques_idx]
            final_answer = self.LLManswer(context=context,question=ques)
            print(f"\033[92m{self.answers_list[ques_idx]}\033[0m")
            print(f"\033[91m{final_answer.result}\033[0m")
            print('-'*10)
            ques_idx+=1
        return final_answer

rag = RAG("pubmed","greedy",True,'561','32')
ans = rag()

[92myes[0m
[91mYes[0m
----------
[92mno[0m
[91mYes[0m
----------


KeyboardInterrupt: 

In [89]:
llm.inspect_history(n=5)




Answer questions in detail based on the context. If the context does not answer the question, then say that you don't know

---

Follow the following format.

Relevant facts: may contain relevant facts

Question: ${question}

Result: Answer to the question from either yes, no or maybe.

Answer answer in detail from the provided context

---

Relevant facts:

Assessment of visual acuity depends on the optotypes used for measurement. The ability to recognize different optotypes differs even if their critical details appear under the same visual angle. Since optotypes are evaluated on individuals with good visual acuity and without eye disorders, differences in the lower visual acuity range cannot be excluded. In this study, visual acuity measured with the Snellen E was compared to the Landolt C acuity. 100 patients (age 8 - 90 years, median 60.5 years) with various eye disorders, among them 39 with amblyopia due to strabismus, and 13 healthy volunteers were tested. Charts with the Sne

"\n\n\nAnswer questions in detail based on the context. If the context does not answer the question, then say that you don't know\n\n---\n\nFollow the following format.\n\nRelevant facts: may contain relevant facts\n\nQuestion: ${question}\n\nResult: Answer to the question from either yes, no or maybe.\n\nAnswer answer in detail from the provided context\n\n---\n\nRelevant facts:\n\nAssessment of visual acuity depends on the optotypes used for measurement. The ability to recognize different optotypes differs even if their critical details appear under the same visual angle. Since optotypes are evaluated on individuals with good visual acuity and without eye disorders, differences in the lower visual acuity range cannot be excluded. In this study, visual acuity measured with the Snellen E was compared to the Landolt C acuity. 100 patients (age 8 - 90 years, median 60.5 years) with various eye disorders, among them 39 with amblyopia due to strabismus, and 13 healthy volunteers were teste

In [79]:
import ast
d = {"duration": 0.889585018157959, "input_args": {"**": "{'stringify_request': '{\"temperature\": 0.0, \"max_tokens\": 1000, \"top_p\": 1, \"frequency_penalty\": 0, \"presence_penalty\": 0, \"n\": 1, \"model\": \"gpt-3.5-turbo-0125\", \"messages\": [{\"role\": \"user\", \"content\": \"Answer the factual questions based on the context.\\\\n    Don\\'t output anything else, just output the answer.\\\\n    If the context does not answer the question, then say that you don\\'t know\\\\n\\\\n---\\\\n\\\\nFollow the following format.\\\\n\\\\nContext: may contain relevant facts\\\\nQuestion: question that needs to be answered\\\\nAnswer: answer from the provided context\\\\n\\\\n---\\\\n\\\\nContext:\\\\nWith the increasing numbers of differing species in England plant breeders began to immediately develop hybrids to develop more desirable garden plants. The first recorded experiments date to 1825 as F. arborescens \\\\u03a7 F. macrostemma and F. arborescens X F. coccinea where the quality of the resultant plants was unrecorded.\\\\nBetween 1835 and 1850 there was a tremendous influx to England of both hybrids and varieties, the majority of which have been lost.\\\\nIn 1848 Felix Porcher published the second edition of his book Le Fuchsia son Histoire et sa Culture. This described 520 cultivars. In 1871 in later editions of M. Porchers book reference is made to James Lye who was to become famous as a breeder of fuchsias in England. In 1883 the first book of English fuchsias was published.\\\\nBetween 1900 and 1914 many of the famous cultivated varieties were produced which were grown extensively for Covent Garden market by many growers just outside London. During the period between the world wars, fuchsia-growing slowed as efforts were made toward crop production until after 1949, when plant and hybrid production resumed on a large scale. While still in her teens, Hinton became a household name as the author of The Outsiders, her first and most popular novel, set in Oklahoma in the 1960s. She began writing it in 1965. The book was inspired by two rival gangs at her school, Will Rogers High School, the Greasers and the Socs, and her desire to empathize with the Greasers by writing from their point of view. She wrote the novel when she was 16 and it was published in 1967. Since then, the book has sold more than 14 million copies. In 2017, Viking Press stated the book sells over 500,000 copies a year.\\\\nHinton\\'s publisher suggested she use her initials instead of her feminine given names so that the very first male book reviewers would not dismiss the novel because its author was female. After the success of The Outsiders, Hinton chose to continue writing and publishing using her initials because she did not want to lose what she had made famous and to allow her to keep her private and public lives separate.\\\\nIn interviews, Hinton has said that she is a private person and an introvert who no longer does public appearances. She enjoys reading (Jane Austen, Mary Renault, and F. Scott Fitzgerald), taking classes at the local university, and horseback riding. Hinton also revealed to Vulture that she enjoys writing fan fiction.\\\\nShe resides in Tulsa, Oklahoma, with her husband David Inhofe, a software engineer she met in her freshman biology class at college. He is a cousin of former Oklahoma Senator Jim Inhofe.\\\\nQuestion: Were both Gabriela Mistral and G. K. Chesterton authors?\\\\nAnswer:\"}]}'}"}, "time": 1715190153.1438334}
s = ast.literal_eval(ast.literal_eval(d['input_args']["**"])['stringify_request'])['messages']

In [94]:
ast.literal_eval(ast.literal_eval(d['input_args']["**"])['stringify_request'])

{'temperature': 0.0,
 'max_tokens': 1000,
 'top_p': 1,
 'frequency_penalty': 0,
 'presence_penalty': 0,
 'n': 1,
 'model': 'gpt-3.5-turbo-0125',
 'messages': [{'role': 'user',
   'content': "Answer the factual questions based on the context.\n    Don't output anything else, just output the answer.\n    If the context does not answer the question, then say that you don't know\n\n---\n\nFollow the following format.\n\nContext: may contain relevant facts\nQuestion: question that needs to be answered\nAnswer: answer from the provided context\n\n---\n\nContext:\nWith the increasing numbers of differing species in England plant breeders began to immediately develop hybrids to develop more desirable garden plants. The first recorded experiments date to 1825 as F. arborescens Χ F. macrostemma and F. arborescens X F. coccinea where the quality of the resultant plants was unrecorded.\nBetween 1835 and 1850 there was a tremendous influx to England of both hybrids and varieties, the majority of wh

In [None]:
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-3.5-turbo-0125", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}


## BATCH FILE FOR PUBMED

In [None]:
pubmed_prompt = '''
Answer questions in detail based on the context. If the context does not answer the question, then say that you don't know

---

Follow the following format.

Relevant facts: may contain relevant facts

Question: ${question}

Result: Answer to the question from either yes, no or maybe.

Answer answer in detail from the provided context

---

Relevant facts:


Question: ${question}


'''


In [104]:
from typing import List
from tqdm import tqdm
pubmed_prompt = '''
Answer the factual questions based on the context. Don't output anything else, just output the answer.
If the context does not answer the question, then say that you don't know
You will be provided with relevant facts, and you will output a json object containing the following information:

{
    answer: string // answer to the question from the provided context   
    result: string // answer to the question from either yes, no, maybe or don't know based on the context
}

'''
wikipedia_prompt = '''
Answer the factual questions based on the context.
Don't output anything else, just output the answer.
If the context does not answer the question, then say that you don't know
You will be provided with relevant facts, and you will output a json object containing the following information:

{
    answer: string // answer to the question from the provided context   
}

'''

# Creating an array of json tasks

def get_tasks(dataset_name,decoding_type,format_or_not,checkpoint,rank):
    zipped_elem = question_answer_pair(dataset_name,decoding_type,format_or_not,checkpoint,rank)
    if format_or_not:
        task_name = f"{dataset_name}-{checkpoint}-{rank}-true"
    else:
        task_name = f"{dataset_name}-{checkpoint}-{rank}-false"
    tasks = []
    if dataset_name == "pubmed":
        retriever = pubmed_bm25_retriever
        question_list = pubmed_questions
        system_prompt = pubmed_prompt
    elif dataset_name == "wikipedia":
        retriever = wiki_bm25_retriever
        system_prompt = wikipedia_prompt
        question_list = wikipedia_questions
    index = 0
    pbar = tqdm(total=len(question_list),desc=f"For {task_name}")
    for question,mod_hyde_ans in zipped_elem:
        bm25_docs = retriever.invoke(mod_hyde_ans)
        if format_or_not:
            question = question_list[index]
        context = " ".join([ctx.page_content for ctx in bm25_docs])
        
        task = {
            "custom_id": f"task-{task_name}-{index}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                # This is what you would have in your Chat Completions API call
                "model": "gpt-3.5-turbo",
                "temperature": 0.0,
                "max_tokens": 1000,
                "messages": [
                    {
                        "role": "system",
                        "content": system_prompt
                    },
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": question
                            },
                            {
                                "type": "text",
                                "text": context
                            }
                        ],
                    }
                ]            
            }
        }
        
        tasks.append(task)
        index+=1
        pbar.update(1)
    return tasks

In [105]:
tasks = get_tasks("pubmed","greedy",True,'561','32')

For pubmed-561-32-true: 100%|██████████| 1000/1000 [01:50<00:00,  9.03it/s]


In [106]:
tasks[0]

{'custom_id': 'task-pubmed-561-32-true-0',
 'method': 'POST',
 'url': '/v1/chat/completions',
 'body': {'model': 'gpt-3.5-turbo',
  'temperature': 0.0,
  'max_tokens': 1000,
  'messages': [{'role': 'system',
    'content': "\nAnswer the factual questions based on the context. Don't output anything else, just output the answer.\nIf the context does not answer the question, then say that you don't know\nYou will be provided with relevant facts, and you will output a json object containing the following information:\n\n{\n    answer: string // answer to the question from the provided context   \n    result: string // answer to the question from either yes, no, maybe or don't know based on the context\n}\n\n"},
   {'role': 'user',
    'content': [{'type': 'text',
      'text': 'Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?\n'},
     {'type': 'text',
      'text': 'Programmed cell death (PCD) is the regulated death of cells within an organism. Th

In [107]:
with open("batch.jsonl", 'w') as file:
    for obj in tasks:
        file.write(json.dumps(obj) + '\n')