In [1]:
import pandas as pd
from llama_index.core import Document, SimpleDirectoryReader, VectorStoreIndex, StorageContext
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.settings import Settings
from llama_index.core.chat_engine import CondenseQuestionChatEngine
from llama_index.core.node_parser import SentenceSplitter
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.llms.llama_cpp import LlamaCPP
from llama_cpp import Llama
from sentence_transformers import SentenceTransformer
from llama_index.embeddings.huggingface import HuggingFaceEmbedding    
import faiss
import os
import json
import re
from Bio import Entrez
import time
import xmltodict
from pathlib import Path
from ragas import evaluate
from ragas.metrics import answer_correctness, faithfulness, context_recall, AnswerCorrectness
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


### Initial setup

In [None]:
training_dataset_path = r"E:\RAG_Models\BioASQ-training13b\training13b.json"
abstract_cache = Path(r".\Abstracts")
email = "an80@illinois.edu"

In [21]:
abstract_cache.mkdir(exist_ok=True)

In [14]:
Entrez.email = email

In [68]:
with open(training_dataset_path,"r") as file:
    bioasq_json_data = json.load(file)

In [119]:
document_urls = []
questions = []
ground_truth = []
for data in bioasq_json_data['questions'][:100]:
   document_urls.extend(data.get('documents'))
   question = data.get("body")
   questions.append(question)
   ground_truth.append(data.get("ideal_answer"))

In [120]:
print(f'Checking for duplicate documents. Original number of documents: {len(document_urls)}')
document_urls = list(set(document_urls))
print(f'Number of documents after filtering: {len(document_urls)}')

Checking for duplicate documents. Original number of documents: 1142
Number of documents after filtering: 1136


In [121]:
print(f'Number of questions: {len(questions)}')
print(f'Number of ground truths: {len(ground_truth)}')

Number of questions: 100
Number of ground truths: 100


In [123]:
bioasq_df = pd.DataFrame(columns=['Question','Ground Truth'])

In [124]:
bioasq_df['Question'] = questions
bioasq_df['Ground Truth'] = ground_truth

In [125]:
bioasq_df.head()

Unnamed: 0,Question,Ground Truth
0,Is Hirschsprung disease a mendelian or a multi...,"[Coding sequence mutations in RET, GDNF, EDNRB..."
1,List signaling molecules (ligands) that intera...,[The 7 known EGFR ligands are: epidermal grow...
2,Is the protein Papilin secreted?,"[Yes, papilin is a secreted protein]"
3,Are long non coding RNAs spliced?,[Long non coding RNAs appear to be spliced thr...
4,Is RANKL secreted from the cells?,[Receptor activator of nuclear factor κB ligan...


In [126]:
bioasq_df.to_csv("bioasq_ground_truth.csv", index=False)

In [10]:
PMID_RE = re.compile(r"/pubmed/(\d+)")

In [11]:
def pmid_from_url(url: str) -> str:
    m = PMID_RE.search(url)
    return m.group(1) if m else None

In [12]:
pmids = [pmid_from_url(doc) for doc in document_urls]

In [None]:
def batch_fetch_pmids(pmids, batch = 200):
    """Return {pmid: {'title','abstract','year'}}; caches and skips empty abstracts."""
    out = {}
    for i in range(0, len(pmids), batch):
        chunk = pmids[i:i + batch]
        need = [p for p in chunk if not (abstract_cache / f"{p}.json").exists()]
        if need:
            raw_xml = Entrez.efetch(
                db="pubmed",
                id=",".join(need),
                rettype="abstract",
                retmode="xml"
            ).read()
            xml = xmltodict.parse(raw_xml)
            for art in xml["PubmedArticleSet"]["PubmedArticle"]:
                pmid = art["MedlineCitation"]["PMID"]["#text"]
                art_info = art["MedlineCitation"]["Article"]
                title = art_info.get("ArticleTitle", "")

                # ---- robust abstract extraction ----
                abs_raw = art_info.get("Abstract", {}).get("AbstractText", [])
                if isinstance(abs_raw, list):
                    parts = [x.get("#text", "") if isinstance(x, dict) else str(x) for x in abs_raw]
                    abstract = " ".join(parts).strip()
                else:
                    abstract = str(abs_raw).strip()

                if not abstract:   # skip if empty
                    continue

                year = art_info["Journal"]["JournalIssue"]["PubDate"].get("Year", "Unknown")

                meta = {"title": title, "abstract": abstract, "year": year}
                (abstract_cache / f"{pmid}.json").write_text(json.dumps(meta))

        # load all cached (existing + newly saved)
        for pmid in chunk:
            fp = abstract_cache / f"{pmid}.json"
            if fp.exists():
                meta = json.loads(fp.read_text())
                if meta.get("abstract", "").strip():
                    out[pmid] = meta
        time.sleep(0.4)  # politeness
    return out

In [24]:
data_map = batch_fetch_pmids(pmids)

In [30]:
source_directory = r".\Abstracts"
out_csv = "pubmed_abstracts.csv"

In [42]:
abstracts_df = pd.DataFrame(columns=["title", "abstract", "year"])

In [43]:
for file in os.listdir(source_directory):
    file_path = os.path.join(source_directory,file)
    with open(file_path, 'r') as f:
        try:
            json_data = json.loads(f.read())
            title = json_data.get('title')   
            abstract = json_data.get('abstract')
            year = json_data.get('year')
            abstracts_df.loc[len(abstracts_df)] = [title, abstract, year]     
        except Exception as e:
            print(f"Error occurred while processing file {file}: {e}")

### Testing the model - No need to execute previous cells - Start by loading the csv files directly

In [2]:
def dataframe_to_documents(df):
    docs = []
    for _, row in df.iterrows():
        text = row["abstract"]
        metadata = {
            "title": row.get("title", ""),
            "year": row.get("year", ""),
        }
        docs.append(Document(text=text, metadata=metadata))
    return docs

In [48]:
target_directory = os.path.join(os.getcwd(), out_csv)
abstracts_df.to_csv(target_directory, index=False)

In [3]:
bioasq_df = pd.read_csv('bioasq_ground_truth.csv')

In [4]:
abstracts_df = pd.read_csv('pubmed_abstracts.csv')

In [5]:
abstracts_df = abstracts_df.dropna(subset='abstract')

In [6]:
documents = dataframe_to_documents(abstracts_df)

In [7]:
model_directory = r"E:\RAG_Models"
model_name = "phi-2-orange.Q4_K_M.gguf"

model_path = model_directory+"\\"+model_name

In [8]:
llm = LlamaCPP(model_path = model_path, temperature = 0.2, max_new_tokens = 256, context_window = 2048, verbose=True)

llama_model_loader: loaded meta data with 21 key-value pairs and 325 tensors from E:\RAG_Models\phi-2-orange.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = phi2
llama_model_loader: - kv   1:                               general.name str              = Phi2
llama_model_loader: - kv   2:                        phi2.context_length u32              = 2048
llama_model_loader: - kv   3:                      phi2.embedding_length u32              = 2560
llama_model_loader: - kv   4:                   phi2.feed_forward_length u32              = 10240
llama_model_loader: - kv   5:                           phi2.block_count u32              = 32
llama_model_loader: - kv   6:                  phi2.attention.head_count u32              = 32
llama_model_loader: - kv   7:               phi2.attention.head_count_kv u32  

In [9]:
Settings.llm = llm

In [10]:
Settings.embed_model = HuggingFaceEmbedding(model_name = "pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb")

In [11]:
text_splitter = SentenceSplitter(chunk_size=512, chunk_overlap=50)
Settings.node_parser = text_splitter

In [12]:
dimensions = len(Settings.embed_model.get_text_embedding("sample text"))

faiss_index = faiss.IndexFlatL2(dimensions)
faiss_db = FaissVectorStore(faiss_index = faiss_index)
storage_context = StorageContext.from_defaults(vector_store = faiss_db)

In [13]:
index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)

In [14]:
memory = ChatMemoryBuffer.from_defaults(token_limit = 600)
query_engine = index.as_query_engine(similarity_top_k = 3)

In [15]:
chat_engine = CondenseQuestionChatEngine.from_defaults(
    query_engine = query_engine,
    memory = memory
)

In [16]:
sample_questions = bioasq_df['Question'][:3]
sample_gt = bioasq_df['Ground Truth'][:3]

In [17]:
def get_answer_context(query):
    response = chat_engine.chat(query)
    chunks = [n.node.get_content() for n in response.source_nodes]
    context_str = "\n".join(chunks)
    
    return pd.Series({"Response": response.response, "Context": context_str})

In [18]:
test_df = bioasq_df.iloc[:5,:]

In [141]:
test_df[['Response','Context']] = test_df['Question'].apply(lambda row: get_answer_context(row))

llama_perf_context_print:        load time =   34594.90 ms
llama_perf_context_print: prompt eval time =   34580.35 ms /  1108 tokens (   31.21 ms per token,    32.04 tokens per second)
llama_perf_context_print:        eval time =   30203.08 ms /   229 runs   (  131.89 ms per token,     7.58 tokens per second)
llama_perf_context_print:       total time =   65189.10 ms /  1337 tokens
llama_perf_context_print:        load time =   34594.90 ms
llama_perf_context_print: prompt eval time =    9036.01 ms /   325 tokens (   27.80 ms per token,    35.97 tokens per second)
llama_perf_context_print:        eval time =   30680.14 ms /   255 runs   (  120.31 ms per token,     8.31 tokens per second)
llama_perf_context_print:       total time =   40062.83 ms /   580 tokens
llama_perf_context_print:        load time =   34594.90 ms
llama_perf_context_print: prompt eval time =   42594.90 ms /  1559 tokens (   27.32 ms per token,    36.60 tokens per second)
llama_perf_context_print:        eval time = 

In [142]:
test_df

Unnamed: 0,Question,Ground Truth,Response,Context
0,Is Hirschsprung disease a mendelian or a multi...,"[Coding sequence mutations in RET, GDNF, EDNRB...",\nHirschsprung disease is a multifactorial dis...,Hirschsprung's disease (HSCR) is a fairly freq...
1,List signaling molecules (ligands) that intera...,[The 7 known EGFR ligands are: epidermal grow...,\nThe signaling molecules that interact with t...,The epidermal growth factor receptor (EGFR) is...
2,Is the protein Papilin secreted?,"[Yes, papilin is a secreted protein]","\nNo, Papilin does not belong to the family of...",A sulfated glycoprotein was isolated from the ...
3,Are long non coding RNAs spliced?,[Long non coding RNAs appear to be spliced thr...,"\n\nNo, long non-coding RNAs (lncRNAs) do not ...",Long non-coding RNAs (lncRNAs) resemble protei...
4,Is RANKL secreted from the cells?,[Receptor activator of nuclear factor κB ligan...,\nRANKL (RANK ligand) is a transmembrane prote...,Receptor activator of nuclear factor-kappaB-li...


In [153]:
def clean_responses(response):
    response = response.replace("\n","")
    return response

In [154]:
test_df.loc[:,'Response'] = test_df['Response'].apply(lambda row: clean_responses(row)) 

In [172]:
rag_df = test_df.rename(columns={
    "Question": "question",
    "Context": "retrieved_contexts",
    "Response": "answer",
    "Ground Truth": "ground_truth"
})

In [173]:
rag_df["ground_truth"] = rag_df["ground_truth"].apply(
    lambda x: " ".join(x) if isinstance(x, (list, tuple)) else str(x)
)

In [None]:
rag_df["retrieved_contexts"] = rag_df["retrieved_contexts"].apply(lambda x: [x]) 

In [178]:
rag_df.to_csv('BioASQ_Test_Responses.csv')

In [18]:
rag_df = pd.read_csv('BioASQ_Test_Responses.csv')

In [19]:
import ast

In [20]:
def to_list(val):
    # if it's already a list, pass through; if it's a str, parse it
    if isinstance(val, list):
        return val
    return ast.literal_eval(val) 

In [21]:
rag_df["retrieved_contexts"] = rag_df["retrieved_contexts"].apply(to_list)

In [22]:
rag_df[["question", "retrieved_contexts", "answer", "ground_truth"]]

Unnamed: 0,question,retrieved_contexts,answer,ground_truth
0,Is Hirschsprung disease a mendelian or a multi...,[Hirschsprung's disease (HSCR) is a fairly fre...,Hirschsprung disease is a multifactorial disor...,"Coding sequence mutations in RET, GDNF, EDNRB,..."
1,List signaling molecules (ligands) that intera...,[The epidermal growth factor receptor (EGFR) i...,The signaling molecules that interact with the...,The 7 known EGFR ligands are: epidermal growt...
2,Is the protein Papilin secreted?,[A sulfated glycoprotein was isolated from the...,"No, Papilin does not belong to the family of s...","Yes, papilin is a secreted protein"
3,Are long non coding RNAs spliced?,[Long non-coding RNAs (lncRNAs) resemble prote...,"No, long non-coding RNAs (lncRNAs) do not unde...",Long non coding RNAs appear to be spliced thro...
4,Is RANKL secreted from the cells?,[Receptor activator of nuclear factor-kappaB-l...,RANKL (RANK ligand) is a transmembrane protein...,Receptor activator of nuclear factor κB ligand...


In [23]:
rag_ds = Dataset.from_pandas(
    rag_df[["question", "retrieved_contexts", "answer", "ground_truth"]]
)

In [24]:
rag_df.loc[0,'retrieved_contexts']

["Hirschsprung's disease (HSCR) is a fairly frequent cause of intestinal obstruction in children. It is characterized as a sex-linked heterogonous disorder with variable severity and incomplete penetrance giving rise to a variable pattern of inheritance. Although Hirschsprung's disease occurs as an isolated phenotype in at least 70% of cases, it is not infrequently associated with a number of congenital abnormalities and associated syndromes, demonstrating a spectrum of congenital anomalies. Certain of these syndromic phenotypes have been linked to distinct genetic sites, indicating underlying genetic associations of the disease and probable gene-gene interaction, in its pathogenesis. These associations with HSCR include Down's syndrome and other chromosomal anomalies, Waardenburg syndrome and other Dominant sensorineural deafness, the Congenital Central Hypoventilation and Mowat-Wilson and other brain-related syndromes, as well as the MEN2 and other tumour associations. A number of ot

In [25]:
from ragas.llms import LlamaIndexLLMWrapper

In [26]:
evaluator_llm = LlamaIndexLLMWrapper(Settings.llm)

In [29]:
custom_prompt = """
Evaluate the correctness of the answer based on the ground truth. Return a JSON object with 'score' (0.0-1.0) and 'reason'.
Examples of valid outputs:
{{"score": 0.2, "reason": "Explanation..."}}

Question: {question}
Ground Truth: {ground_truth}
Answer: {answer}
"""

In [27]:
answer_correctness = AnswerCorrectness(llm = evaluator_llm)

In [33]:
answer_correctness.__dict__

{'_required_columns': {<MetricType.SINGLE_TURN: 'single_turn'>: {'reference',
   'response',
   'user_input'}},
 'name': 'answer_correctness',
 'embeddings': HuggingFaceEmbedding(model_name='pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb', embed_batch_size=10, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x000001F08E1A9310>, num_workers=None, max_length=100, normalize=True, query_instruction=None, text_instruction=None, cache_folder=None, show_progress_bar=False),
 'llm': LlamaIndexLLMWrapper(llm=LlamaCPP(...)),
 'output_type': None,
 'correctness_prompt': CorrectnessClassifier(instruction=
 Evaluate the correctness of the answer based on the ground truth. Return a JSON object with 'score' (0.0-1.0) and 'reason'.
 Examples of valid outputs:
 {{"score": 0.2, "reason": "Explanation..."}}
 
 Question: {question}
 Ground Truth: {ground_truth}
 Answer: {answer}
 , examples=[(QuestionAnswerGroundTruth(question='What powers the sun and what is its prima

In [31]:
answer_correctness.correctness_prompt.instruction = custom_prompt

In [32]:
answer_correctness.embeddings = Settings.embed_model

In [33]:
result = evaluate(rag_ds,metrics=[answer_correctness])

Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]llama_perf_context_print:        load time =   22496.49 ms
llama_perf_context_print: prompt eval time =   22485.76 ms /   626 tokens (   35.92 ms per token,    27.84 tokens per second)
llama_perf_context_print:        eval time =   30093.42 ms /   255 runs   (  118.01 ms per token,     8.47 tokens per second)
llama_perf_context_print:       total time =   53231.61 ms /   881 tokens
Llama.generate: 396 prefix-match hit, remaining 107 prompt tokens to eval
llama_perf_context_print:        load time =   22496.49 ms
llama_perf_context_print: prompt eval time =    3064.49 ms /   107 tokens (   28.64 ms per token,    34.92 tokens per second)
llama_perf_context_print:        eval time =   53704.47 ms /   255 runs   (  210.61 ms per token,     4.75 tokens per second)
llama_perf_context_print:       total time =   57776.24 ms /   362 tokens
llama_perf_context_print:        load time =   22496.49 ms
llama_perf_context_print: prompt eval time =   9

In [34]:
print(result)

{'answer_correctness': nan}
