In [1]:
import json
import pprint
import pandas as pd
import numpy as np

## medQA loading data

In [2]:
file_path = "./Datasets/MedQA/data_clean/questions/Taiwan/tw_translated_jsonl/en/train-2en.jsonl"
with open(file_path, 'r', encoding='utf-8') as f:
    medqa_taiwan_train = [json.loads(line) for line in f]
# print(medqa_taiwan_train)


In [5]:
len(medqa_taiwan_train)

11298

In [3]:
medqa_taiwan_train_pd = pd.DataFrame(medqa_taiwan_train)

In [4]:
medqa_taiwan_train_pd.drop(columns=['meta_info', 'answer_idx','options'], inplace=True)

In [5]:
medqa_taiwan_train_pd.rename(columns={'answer':'context'}, inplace=True)

In [6]:
file_path = "./Datasets/MedQA/data_clean/questions/US/train.jsonl"
with open(file_path, 'r', encoding='utf-8') as f:
    medqa_us_train = [json.loads(line) for line in f]
# print(medqa_us_train)

In [7]:
medqa_us_train_pd = pd.DataFrame(medqa_us_train)
medqa_us_train_pd.drop(columns=['meta_info', 'answer_idx','options'], inplace=True)
medqa_us_train_pd.rename(columns={'answer':'context'}, inplace=True)

In [8]:
medqa_train_pd = pd.concat([medqa_taiwan_train_pd, medqa_us_train_pd], ignore_index=True)

In [9]:
medqa_train_pd.head()

Unnamed: 0,question,context
0,After the reaction physiology Which is not bed...,"Ligamentous laxity, increased ductility"
1,"Humans, vitamin D3 is converted to 25-hydroxyc...",liver
2,Esotropia eye when doing alternate cover test ...,Abduction (Abduction)
3,"Suppose there is a popular city Influenza A, 1...",0.2
4,Healthcare nuclear emergency physician is noti...,Emergency physicians should be according to th...


## PubMedQA loading data

In [7]:
file_path = "./Datasets/PubMedQA/ori_pqaa.json"
with open(file_path, 'r') as file:
    pmqaa = json.load(file)

print(type(pmqaa))


<class 'dict'>


In [11]:
pmqaa_keys = np.array(list(pmqaa.keys()))

In [12]:
pmqaa_values = np.array(list(pmqaa.values()))

In [13]:
pmqaa_pd = pd.DataFrame(list(pmqaa_values))

In [14]:
pmqaa_pd.drop(columns=['LABELS', 'LONG_ANSWER','MESHES', 'final_decision'], inplace=True)

In [15]:
pmqaa_pd.rename(columns={'QUESTION':'question', 'CONTEXTS':'context'}, inplace=True)

In [16]:
pmqaa_pd['context'] = pmqaa_pd['context'].apply(lambda x: x[0])

In [17]:
file_path = "./Datasets/PubMedQA/ori_pqau.json"
with open(file_path, 'r') as file:
    pmqau = json.load(file)

print(type(pmqau))

<class 'dict'>


In [18]:
pmqau_keys = np.array(list(pmqau.keys()))
pmqau_values = np.array(list(pmqau.values()))
pmqau_pd = pd.DataFrame(list(pmqau_values))

In [19]:
pmqau_pd.drop(columns=['LABELS', 'LONG_ANSWER','MESHES'], inplace=True)
pmqau_pd.rename(columns={'QUESTION':'question', 'CONTEXTS':'context'}, inplace=True)
pmqau_pd.drop(columns=['YEAR'], inplace=True)

In [20]:
pmqau_pd['context'] = pmqau_pd['context'].apply(lambda x: x[0])

In [21]:
pmqau_pd.head()

Unnamed: 0,question,context
0,Is naturopathy as effective as conventional th...,Although the use of alternative medicine in th...
1,Can randomised trials rely on existing electro...,"To estimate the feasibility, utility and resou..."
2,Is laparoscopic radical prostatectomy better t...,To compare morbidity in two groups of patients...
3,Does bacterial gastroenteritis predispose peop...,Irritable bowel syndrome (IBS) might develop a...
4,Is early colonoscopy after admission for acute...,Urgent colonoscopy has been proposed for the d...


In [22]:
pmqaa_pd.head()

Unnamed: 0,question,context
0,Are group 2 innate lymphoid cells ( ILC2s ) in...,Chronic rhinosinusitis (CRS) is a heterogeneou...
1,Does vagus nerve contribute to the development...,Phosphatidylethanolamine N-methyltransferase (...
2,Does psammaplin A induce Sirtuin 1-dependent a...,Psammaplin A (PsA) is a natural product isolat...
3,Is methylation of the FGFR2 gene associated wi...,This study examined links between DNA methylat...
4,Do tumor-infiltrating immune cell profiles and...,Tumor microenvironment immunity is associated ...


In [23]:
pmqa_train_pd = pd.concat([pmqaa_pd, pmqau_pd], ignore_index=True)

In [24]:
pmqa_train_pd.count()

question    272518
context     272518
dtype: int64

In [25]:
final_dataset = pd.concat([medqa_train_pd, pmqa_train_pd], ignore_index=True)

In [26]:
final_dataset.count()

question    293994
context     293994
dtype: int64

## MMLU(Anantomy)

In [27]:
mmlu_anatomy = {'dev': 'data/dev-00000-of-00001-388e2a72c67c7ddd.parquet', 'test': 'data/test-00000-of-00001-77adf027269dc115.parquet'}
df_test_mmlu_anatomy = pd.read_parquet("hf://datasets/joey234/mmlu-anatomy/" + mmlu_anatomy["test"])


  from .autonotebook import tqdm as notebook_tqdm


In [28]:
df_test_mmlu_anatomy.head()

Unnamed: 0,question,choices,answer,negate_openai_prompt,neg_question,fewshot_context,fewshot_context_neg
0,A lesion causing compression of the facial ner...,"[paralysis of the facial muscles., paralysis o...",0,{'content': 'Given a text in the form of a sho...,A lesion causing compression of the facial ner...,What is not the embryological origin of the hy...,What is not the embryological origin of the hy...
1,"A ""dished face"" profile is often associated with",[a protruding mandible due to reactivation of ...,1,{'content': 'Given a text in the form of a sho...,"A ""dished face"" profile is NOT often associate...",What is not the embryological origin of the hy...,What is not the embryological origin of the hy...
2,Which of the following best describes the stru...,"[Bladder, Kidney, Ureter, Urethra]",0,{'content': 'Given a text in the form of a sho...,Which of the following does not best describe ...,What is not the embryological origin of the hy...,What is not the embryological origin of the hy...
3,Which of the following structures is derived f...,"[Motor neurons, Skeletal muscles, Melanocytes,...",2,{'content': 'Given a text in the form of a sho...,Which of the following structures is not deriv...,What is not the embryological origin of the hy...,What is not the embryological origin of the hy...
4,Which of the following describes the cluster o...,"[Afferent arteriole, Glomerulus, Loop of Henle...",1,{'content': 'Given a text in the form of a sho...,Which of the following does not describe the c...,What is not the embryological origin of the hy...,What is not the embryological origin of the hy...


In [29]:
df_test_mmlu_anatomy['answer'] = df_test_mmlu_anatomy.apply(lambda row: row['choices'][row['answer']], axis=1)

In [30]:
df_test_mmlu_anatomy.drop(columns=['choices', 'negate_openai_prompt', 'neg_question', 'fewshot_context', 'fewshot_context_neg'], inplace=True)

In [31]:
df_test_mmlu_anatomy.rename(columns={'answer':'context'}, inplace=True)

## MMLU(Clinical Knowledge)

In [32]:
import pandas as pd

df_mmlu_clin_knowledge = {'validation': 'dev.json', 'test': 'test.json'}
df_mmlu_test_clin_knowledge = pd.read_json("hf://datasets/brucewlee1/mmlu-clinical-knowledge/" + df_mmlu_clin_knowledge["test"], lines=True)


In [33]:
df_mmlu_test_clin_knowledge.drop(columns=['options', 'correct_options', 'correct_options_idx'], inplace=True)

In [34]:
df_mmlu_test_clin_knowledge.rename(columns={'correct_options_literal':'context', 'centerpiece':'question'}, inplace=True)

In [35]:
df_mmlu_test_clin_knowledge['context'] = df_mmlu_test_clin_knowledge['context'].apply(lambda x: x[0])

In [36]:
df_mmlu_test_clin_knowledge.head()

Unnamed: 0,question,context
0,The key attribute in successful marathon runni...,stamina.
1,Which of the following is the commonest cause ...,Alzheimer's disease.
2,Which of the following is NOT a symptom of ana...,Bradycardia.
3,In what situation are closed pouches applied?,The patient has a colostomy.
4,With an increasing number of sprints the:,relative contribution of aerobic metabolism in...


## MMLU(College Biology)

In [37]:
import pandas as pd

mmlu_college_bio = {'validation': 'dev.json', 'test': 'test.json'}
mmlu_test_college_bio = pd.read_json("hf://datasets/brucewlee1/mmlu-college-biology/" + mmlu_college_bio["test"], lines=True)

In [38]:
mmlu_test_college_bio.drop(columns=['options', 'correct_options', 'correct_options_idx'], inplace=True)
mmlu_test_college_bio.rename(columns={'correct_options_literal':'context', 'centerpiece':'question'}, inplace=True)
mmlu_test_college_bio['context'] = mmlu_test_college_bio['context'].apply(lambda x: x[0])

## MMLU(College Medicine)

In [39]:
import pandas as pd

mmlu_college_med = {'validation': 'dev.json', 'test': 'test.json'}
mmlu_test_college_med = pd.read_json("hf://datasets/brucewlee1/mmlu-college-medicine/" + mmlu_college_med["test"], lines=True)

In [40]:
mmlu_test_college_med.drop(columns=['options', 'correct_options', 'correct_options_idx'], inplace=True)
mmlu_test_college_med.rename(columns={'correct_options_literal':'context', 'centerpiece':'question'}, inplace=True)
mmlu_test_college_med['context'] = mmlu_test_college_med['context'].apply(lambda x: x[0])

## MMLU(Medical Genetics)

In [41]:
import pandas as pd

mmlu_medgen = {'validation': 'dev.json', 'test': 'test.json'}
mmlu_test_medgen = pd.read_json("hf://datasets/brucewlee1/mmlu-medical-genetics/" + mmlu_medgen["test"], lines=True)

In [42]:
mmlu_test_medgen.drop(columns=['options', 'correct_options', 'correct_options_idx'], inplace=True)
mmlu_test_medgen.rename(columns={'correct_options_literal':'context', 'centerpiece':'question'}, inplace=True)
mmlu_test_medgen['context'] = mmlu_test_medgen['context'].apply(lambda x: x[0])

## MMLU(Professional Medicines)

In [43]:
import pandas as pd

mmlu_profmed = {'validation': 'dev.json', 'test': 'test.json'}
mmlu_test_profmed = pd.read_json("hf://datasets/brucewlee1/mmlu-professional-medicine/" + mmlu_profmed["test"], lines=True)

In [44]:
mmlu_test_profmed.drop(columns=['options', 'correct_options', 'correct_options_idx'], inplace=True)
mmlu_test_profmed.rename(columns={'correct_options_literal':'context', 'centerpiece':'question'}, inplace=True)
mmlu_test_profmed['context'] = mmlu_test_profmed['context'].apply(lambda x: x[0])

# Merge All MMLUs

In [45]:
mmlu_final_dataset = pd.concat([df_test_mmlu_anatomy,df_mmlu_test_clin_knowledge, mmlu_test_college_bio, mmlu_test_college_med, mmlu_test_medgen, mmlu_test_profmed], ignore_index=True)

In [46]:
mmlu_final_dataset.count()

question    1084
context     1084
dtype: int64

In [47]:
final_dataset = pd.concat([mmlu_final_dataset,final_dataset], ignore_index=True)

In [48]:
final_dataset.count()

question    295078
context     295078
dtype: int64

In [49]:
from IPython.display import clear_output

In [50]:
import uuid
import chromadb
chroma_client  = chromadb.PersistentClient("vectorstore_medllm")
collection = chroma_client.get_or_create_collection(name = "medllm")
if not collection.count():
    for _, row in final_dataset.iterrows():
        print(_)
        collection.add(documents=row['question'], metadatas={"context": row["context"]}, ids=[str(uuid.uuid4())])
        clear_output(wait=True)
# all-MiniLM-L6-v2

99


: 

In [50]:
import os
import time
from IPython.display import clear_output

In [63]:
for _, row in final_dataset.iterrows():
    print(_)
    clear_output(wait=True)

6603


KeyboardInterrupt: 

## Testing part

In [2]:
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import pickle

  from tqdm.autonotebook import tqdm, trange


In [3]:
model = SentenceTransformer('all-MiniLM-L6-v2')



In [4]:
import pprint

In [5]:
index = faiss.read_index('./Medllm_vector/faiss_index.index')
with open('./Medllm_vector/embeddings.pkl', 'rb') as f:
    embeddings_np = pickle.load(f)
df = pd.read_pickle('./Medllm_vector/dataframe.pkl')

query = "What happen in Rabies?"
query_embedding = model.encode(query, convert_to_tensor=True)
instruction = ""

query_embedding_np = query_embedding.cpu().detach().numpy().reshape(1, -1)

k = 3
D, I = index.search(query_embedding_np, k)  

# Display results
for i in range(k):
    pprint.pprint(f"Nearest neighbor {i + 1}:")
    pprint.pprint(f"Question: {df.iloc[I[0][i]]['question']}")
    pprint.pprint(f"Context: {df.iloc[I[0][i]]['context']}")
    pprint.pprint(f"Distance: {D[0][i]}")

'Nearest neighbor 1:'
'Question: Rabies exposure in international travelers: do we miss the target?'
('Context: Little data exist about the spatial distribution of the risk for '
 'travelers of being injured by a potentially rabid animal.')
'Distance: 0.6254146099090576'
'Nearest neighbor 2:'
('Question: Does the changing rate of suspected rabies bite after begin to act '
 'animal shelter in erzurum city?')
('Context: We aimed to evaluate the relationship between establishing an '
 'animal shelter in Erzurum and the number of suspected rabies bites between '
 'the years 2005 and 2012.')
'Distance: 0.6738754510879517'
'Nearest neighbor 3:'
('Question: A patient comes into the hospital after being bit by a dog who he '
 'stated was “acting crazy”. The wound is open and bleeding. Animal control '
 'captured the dog and said that it was foaming at the mouth and extremely '
 'aggressive. Suspecting a rabies infection, the patient is given a serum that '
 'contains rabies antibodies that wer

In [6]:
from langchain_groq import ChatGroq
llm = ChatGroq(
    model_name="llama-3.1-70b-versatile",
    temperature=0,
    groq_api_key = "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"
)


In [7]:
from langchain_core.prompts import PromptTemplate

prompt_extract = PromptTemplate.from_template(
        """
        ### Give the Following Question asked by user:
        {query}
        ### INSTRUCTION:
        {instruction}
        and the context of the document is: {document}: {context}
        and the another context of the document is: {document2}: {context2}
        ### (NO PREAMBLE):    
        # Provide the valid response in one para to the user's question, based on the context of the document and the instruction given and don't say about less context given.
        """
)

chain_extract = prompt_extract | llm 
res = chain_extract.invoke(input={'instruction':instruction, 'query':query, 'context':df.iloc[I[0][0]]['context'], 'document':df.iloc[I[0][0]]['question'], 'document2':df.iloc[I[0][1]]['question'], 'context2':df.iloc[I[0][1]]['context'],})
pprint.pprint(res.content)

('In rabies, exposure to the virus occurs through the bite of a potentially '
 'infected animal, which can lead to serious health consequences if left '
 'untreated. The risk of being injured by a rabid animal varies by location, '
 'and travelers may be at risk of exposure in certain areas. Establishing '
 'animal shelters can help reduce the number of suspected rabies bites by '
 'managing the animal population and providing a safe environment for animals.')


In [40]:
print(res.content)

The presence of a specific microorganism in water can indicate remote contamination. Among the options provided, Clostridium perfringens is a spore-forming bacterium that can survive in the environment for extended periods and is often used as an indicator of fecal contamination in water sources. Its presence in water suggests that the water may have been contaminated with fecal matter from a remote source.

(A) Streptococci
