# Building a RAG application from scratch

Let's start by loading the environment variables we need to use.

In [37]:
%pip install langchain langchain_core langchain_community jq pypdf bs4 pandas numpy matplotlib seaborn chromadb pinecone-client ragas

Collecting protobuf (from onnxruntime>=1.14.1->chromadb)
  Using cached protobuf-4.25.3-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Using cached protobuf-4.25.3-cp37-abi3-manylinux2014_x86_64.whl (294 kB)
Installing collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 5.27.0
    Uninstalling protobuf-5.27.0:
      Successfully uninstalled protobuf-5.27.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
grpcio-tools 1.64.0 requires protobuf<6.0dev,>=5.26.1, but you have protobuf 4.25.3 which is incompatible.[0m[31m
[0mSuccessfully installed protobuf-4.25.3
Note: you may need to restart the kernel to use updated packages.


### Necessary run

In [38]:
import os

For this example, we'll use a simple `StrOutputParser` to extract the answer as a string.

In [39]:
from langchain_core.output_parsers import StrOutputParser

parser = StrOutputParser()

In [40]:
from langchain_community.embeddings import OllamaEmbeddings

embeddings = OllamaEmbeddings(model="nomic-embed-text")

We want to provide the model with some context and the question. [Prompt templates](https://python.langchain.com/docs/modules/model_io/prompts/quick_start) are a simple way to define and reuse prompts.

In [41]:
from langchain.prompts import ChatPromptTemplate

template = """
You are an AI assistant, trained to provide understandable and accurate information about pharmacogenomics and drugs.
You will base your responses on the context and information provided. Output both your answer and a score of how confident you are,
 and also cite the references. Also provide the source of the chunks of the documents used for response.
If the information related to the question is not in the context and or in the information provided in the prompt, 
you will say 'I don't know'.
You are not a healthcare provider and you will not provide medical care or make assumptions about treatment.


Context: {context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

### Vectorstore

Let's start by loading the transcription in memory:

In [42]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import JSONLoader
from langchain_community.document_loaders.csv_loader import CSVLoader


folder_path = "/home/dhanushb/Wellytics/RAG_data/all_files"
jsondata = []
csvdata = []
pdfdocs = []
for filename in os.listdir(folder_path):
    if filename.endswith(".pdf"):
        file_path = os.path.join(folder_path, filename)
        loader = PyPDFLoader(file_path)
        doc = loader.load()
        pdfdocs.extend(doc)
    elif filename.endswith(".csv"):
        file_path = os.path.join(folder_path, filename)
        loader = CSVLoader(file_path)
        data = loader.load()
        csvdata.extend(data)
    elif filename.endswith(".json"):
        file_path = os.path.join(folder_path, filename)
        loader = JSONLoader(file_path, jq_schema=".",json_lines=False,text_content=False)
        data = loader.load()
        jsondata.extend(data)

In [43]:
for doc in pdfdocs:
    doc.page_content = doc.page_content.replace('\t', ' ')
pdfdocs

[Document(page_content='HLA-B*5701: abacavir\n2356\n48% of the HLA-B*5701-positive patients develop a severe and potentially life-threatening hypersensitivity reaction to abacavir.\nAbacavir is contra-indicated for HLA-B*5701-positive patients.\navoid abacavir\nLiterature:\n1. \nSousa-Pinto B et al. Pharmacogenetics of abacavir hypersensitivity: a systematic review and meta-analysis of the association with HLA-B*57:01. J Allergy Clin Immunol\n2015;136:1092-4.e3.\n2. \nTangamornsuksan W et al. Association of HLA-B*5701 genotypes and abacavir-induced hypersensitivity reaction: a systematic review and meta-analysis. J Pharm Pharm\nSci 2015;18:68-76.\n3. \nCargnin S et al. Diagnostic accuracy of HLA-B*57:01 screening for the prediction of abacavir hypersensitivity and clinical utility of the test: a meta-analytic review.\nPharmacogenomics 2014;15:963-76.\n4. \nSaag M et al. High sensitivity of human leukocyte antigen-b*5701 as a marker for immunologically confirmed abacavir hypersensitivit

In [44]:
import pandas as pd
import numpy as np

links = pd.read_csv("/home/dhanushb/Wellytics/RAG_data/Sources.csv")

websites = links[links["html"].notna()]["html"].to_list()

from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader

webdocs = []

for website in websites:
    loader = WebBaseLoader(website)
    doc = loader.load()
    webdocs.extend(doc)

webdocs

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xfa in position 31: invalid start byte

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=1000)
documents = text_splitter.split_documents(pdfdocs)
jsondocs = text_splitter.split_documents(jsondata)
#webdocuments = text_splitter.split_documents(webdocs[:8])

documents += jsondocs + csvdata 
#documents2 = documents + webdocuments

For our specific application, let's use 1000 characters instead:

In [None]:
from langchain_community.vectorstores import DocArrayInMemorySearch

#vectorstore = DocArrayInMemorySearch.from_documents(documents, embedding=embeddings)

from langchain_community.vectorstores import Qdrant

#vectorstore = Qdrant.from_documents(documents, embedding=embeddings)

from langchain_community.vectorstores import Chroma

#vectorstore = Chroma.from_documents(documents, embedding=embeddings)

In [63]:
import os
from dotenv import load_dotenv

load_dotenv()

True

In [None]:
from langchain_community.vectorstores import Pinecone

vectorstore = Pinecone.from_documents(documents, embedding=embeddings, index_name="rag-data")

In [None]:
from langchain_pinecone import PineconeVectorStore

vectorstore = PineconeVectorStore(embedding=embeddings, index_name="rag-data")

In [None]:
retriever = vectorstore.as_retriever()

We can get a retriever directly from the vector store we created before: 

We can create a map with the two inputs by using the [`RunnableParallel`](https://python.langchain.com/docs/expression_language/how_to/map) and [`RunnablePassthrough`](https://python.langchain.com/docs/expression_language/how_to/passthrough) classes. This will allow us to pass the context and question to the prompt as a map with the keys "context" and "question."

In [None]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

setup = RunnableParallel(context=retriever, question=RunnablePassthrough())

In [None]:
import numpy as np
import pandas as pd
Questions = pd.read_csv("/home/dhanushb/Wellytics/RAG_data/Questions.csv")
Questions

Unnamed: 0,Questions,Expected_response,Expected_source
0,"As part of my liver transplant, I take tacroli...",For a CYP3A5 extensive metabolizer. Increase s...,https://www.fda.gov/drugs/science-and-research...
1,"I'm Indian, and my family has a history of epi...",As per CPIC HLA-B*15:02 might be observed in 2...,https://www.pharmgkb.org/haplotype/PA165954769...
2,Allopurinol is what I take for gout. For my se...,There can be moderate interaction while consid...,https://insilico-cyp.charite.de/SuperCYPsPred/...
3,"Is it true that while taking clopidogrel, I sh...","Indeed, you should refrain from drinking grape...","Bailey DG, Malcolm J, Arnold O, Spence JD. Gra..."
4,Dizziness is one of the adverse reactions that...,"Dizziness drawiness,extreme thirst,fast, shall...",https://www.fda.gov/drugs/science-and-research...
...,...,...,...
167,What are the pharmacogenomic panel advised to ...,"G6PD, CYP2D6, UGT1A1",
168,I have a 7 year old patient with HLA-B1502 pos...,Sodium Valproate,
169,How does CYP2C9 affect anticoagulants mode of ...,,https://www.pharmgkb.org/disease/PA166123298/l...
170,Which are the antidiabetic medications influen...,,https://www.pharmgkb.org/chemical/PA164712711/...


In [None]:
"""
quests = Questions[Questions["Expected_response"].notna()]
quests.to_csv("/home/dhanushb/Wellytics/RAG_data/results/quests.csv")
quests
"""

Unnamed: 0,Questions,Expected_response,Expected_source
0,"As part of my liver transplant, I take tacroli...",For a CYP3A5 extensive metabolizer. Increase s...,https://www.fda.gov/drugs/science-and-research...
1,"I'm Indian, and my family has a history of epi...",As per CPIC HLA-B*15:02 might be observed in 2...,https://www.pharmgkb.org/haplotype/PA165954769...
2,Allopurinol is what I take for gout. For my se...,There can be moderate interaction while consid...,https://insilico-cyp.charite.de/SuperCYPsPred/...
3,"Is it true that while taking clopidogrel, I sh...","Indeed, you should refrain from drinking grape...","Bailey DG, Malcolm J, Arnold O, Spence JD. Gra..."
4,Dizziness is one of the adverse reactions that...,"Dizziness drawiness,extreme thirst,fast, shall...",https://www.fda.gov/drugs/science-and-research...
...,...,...,...
149,My patient has CYP2D6 poor metabolism and does...,CPIC guideline for ondansetron and CYP2D6 has ...,
166,What are side effects associated wtih DPYD poo...,5-FU has many adverse drug effects when used s...,
167,What are the pharmacogenomic panel advised to ...,"G6PD, CYP2D6, UGT1A1",
168,I have a 7 year old patient with HLA-B1502 pos...,Sodium Valproate,


In [None]:
from langchain_community.llms import Ollama

MODELS = ["mistral", "gemma", "llama2", "llama3"] # also mixtral

questions = quests["Questions"].to_list()[96:]

resps = pd.read_csv("/home/dhanushb/Wellytics/RAG_data/results/resps.csv")

for question in questions:
    resp_dict = {}
    for MODEL in MODELS:
        model = Ollama(model=MODEL)
        chain = setup | prompt | model | parser
        response = chain.invoke(question)
        context = [docs.page_content for docs in retriever.invoke(question)]
        # Add response and context to the dataframe immediately after they are generated
        resp_dict[MODEL+"_resp"] = [response]
        resp_dict[MODEL+"_cont"] = [context]
    resps_row = pd.DataFrame(resp_dict)
    resps = pd.concat([resps, resps_row], ignore_index=True)
    resps.to_csv("/home/dhanushb/Wellytics/RAG_data/results/resps.csv")

In [None]:
from datasets import Dataset
from ragas import evaluate
from ragas.metrics.critique import harmfulness
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    context_entity_recall,
    answer_similarity,
    answer_correctness,
)

# To dict
data = {
    "question": quests["Questions"].to_list(),
    "answer": quests["mistral_resp"].to_list(),
    "contexts": quests["context"].to_list(),
    "ground_truth": quests["Expected_response"].to_list()
}

# Convert dict to dataset
dataset = Dataset.from_dict(data)

result = evaluate(
    dataset = dataset, 
    metrics=[
        context_entity_recall,
        context_precision,
        context_recall,
        faithfulness,
        answer_relevancy,
        answer_similarity,
        answer_correctness,
        harmfulness
    ],
)
df = result.to_pandas()

df


ArrowTypeError: Expected bytes, got a 'float' object

In [None]:
"""
na_quests = Questions[Questions["Expected_response"].isna()]
na_quests.to_csv("/home/dhanushb/Wellytics/RAG_data/results/na_quests.csv")
na_quests
"""

Unnamed: 0,Questions,Expected_response,Expected_source
39,I have a family history of depression and rece...,,https://www.pharmgkb.org/genotypeResults?q=%7B...
47,I have read that metformin leads to severe kid...,,https://www.pharmgkb.org/chemical/PA450395/ove...
48,I drink socially. I recently underwent pharmac...,,
52,I have seizures from the age of 5 years. I am ...,,
54,I have allergic rhinitis and take Reswas. Phar...,,
61,I am a 62 year old female with history of 6 ye...,,
62,Why is tramadol banned in sports?,,
63,What are the antiseizure medications permissib...,,
64,Can pharmacogenomics predict response to homeo...,,
65,Is blackcurrant extract natural antidiabetic a...,,


In [None]:
questions = na_quests["Questions"].to_list()

cont = []
for question in questions:
    cont.append([docs.page_content for docs in retriever.invoke(question)])
na_quests["Context"] = cont

na_quests.to_csv("/home/dhanushb/Wellytics/RAG_data/results/na_quests.csv")

na_quests

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  na_quests["Context"] = cont


Unnamed: 0,Questions,Expected_response,Expected_source,Context
39,I have a family history of depression and rece...,,https://www.pharmgkb.org/genotypeResults?q=%7B...,[CYP2D6 PM: imipramine\n1544\nThe risk of side...
47,I have read that metformin leads to severe kid...,,https://www.pharmgkb.org/chemical/PA450395/ove...,[4. \nBatty JA et al. An investigation of CYP2...
48,I drink socially. I recently underwent pharmac...,,,[It is not known whether dose reduction in adv...
52,I have seizures from the age of 5 years. I am ...,,,[Drug: escitalopram\nGene1: CYP2C19\nPhenotype...
54,I have allergic rhinitis and take Reswas. Phar...,,,[Drug: omeprazole\nGene1: CYP2C19\nPhenotype1:...
61,I am a 62 year old female with history of 6 ye...,,,"[{""guideline"": {""id"": ""PA166262261"", ""name"": ""..."
62,Why is tramadol banned in sports?,,,[2007;55:122-30.\n9. \nSlanar O et al. Miotic ...
63,What are the antiseizure medications permissib...,,,[Drug: lansoprazole\nGene1: CYP2C19\nPhenotype...
64,Can pharmacogenomics predict response to homeo...,,,[Literature:\n1. \nAmmar H et al. Clinical and...
65,Is blackcurrant extract natural antidiabetic a...,,,[Genetic variation may lead to a decrease in t...


In [None]:
from langchain_community.llms import Ollama

MODELS = ["mistral", "gemma", "llama2", "llama3"]

n = len(questions)
for MODEL in MODELS:
    model = Ollama(model=MODEL)
    chain = setup | prompt | model | parser
    resp = []
    i = 1
    for question in questions:
        resp.append(chain.invoke(question))
        na_quests[MODEL + "_resp"] = resp + [np.nan] * (n - i)
        na_quests.to_csv("/home/dhanushb/Wellytics/RAG_data/results/na_quests.csv")
        i += 1
na_quests

ProtocolError: Failed to connect; did you specify the correct index name?

### Reference for Evaluation 
https://docs.ragas.io/en/stable/concepts/metrics/faithfulness.html
https://towardsdatascience.com/evaluating-rag-applications-with-ragas-81d67b0ee31a

# Evaluation

In [None]:
questions = quests["Questions"].to_list()[:4]
questions

['As part of my liver transplant, I take tacrolimus. My doctor recently informed me that I had a high chance of graft rejection and performed a pharmacogenetic test to determine whether my dosage needs to be adjusted. What does it indicate that I have CYP3A5 extensive metabolizer, according to my test results that I received today?',
 "I'm Indian, and my family has a history of epilepsy. My doctor is worried about the seizure event I experienced last week. My results from the genetic test she recommended show that I have an HLA-B*15:02 variation. Will this affect how I use anticonvulsants? What other options are there? Does my ethnicity play a role? Tell me where you found the information.",
 "Allopurinol is what I take for gout. For my seizures, my doctor has recommended phenytoin. I'm concerned that there might be a connection between the drugs I take for seizures and my gout. What can I do?",
 'Is it true that while taking clopidogrel, I should avoid grapefruit or grapefruit juice? 

In [None]:
ground_truths = quests["Expected_response"].to_list()[:4]
ground_truths

['For a CYP3A5 extensive metabolizer. Increase starting dose 1.5 to 2 times recommended starting dose,as lower probability of achieving target concentrations and may result in higher rejection risk. Measure drug concentrations and adjust dosage based on trough whole blood tacrolimus concentrations. ',
 'As per CPIC HLA-B*15:02 might be observed in 2.6% of central and south asian population.Here ethnicity might not play a major role wrt to the gene concerned,but HLA-B*15:02 variation,will affect the use of anticonvulsants Avoid usage of phenytoin,fosphenytoin,carbamazepine and oxcarbazepine incase of HLA-B*15:02 Positive/Heterozygous',
 'There can be moderate interaction while considering CYP2C9 with respect to allopurinol-phenytoin interaction which can be substrate-inhibitor /induced, whereas while considering HLA-B*15:02 both are contraindicated for HLA-B*15:02-Positive as it increases risk of drug-induced Stevens-Johnson syndrome/toxic epidermal necrolysis(SJS/TEN)',
 "Indeed, you s

In [None]:
from datasets import Dataset

answers = []
contexts = []

model = Ollama(model="mistral")
chain = setup | prompt | model | parser

# Inference
for query in questions:
  answers.append(chain.invoke(query))
  contexts.append([docs.page_content for docs in retriever.invoke(query)])


  warn_deprecated(


In [None]:
# To dict
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truth": ground_truths
}

# Convert dict to dataset
dataset = Dataset.from_dict(data)

"""
from ragas.metrics import ContextRelevancy
context_relevancy = ContextRelevancy()

results = context_relevancy.score(dataset)
results
"""

'\nfrom ragas.metrics import ContextRelevancy\ncontext_relevancy = ContextRelevancy()\n\nresults = context_relevancy.score(dataset)\nresults\n'

In [None]:
from ragas import evaluate
from ragas.metrics.critique import harmfulness
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    context_entity_recall,
    answer_similarity,
    answer_correctness,
)

result = evaluate(
    dataset = dataset, 
    metrics=[
        context_precision,
        context_recall,
        faithfulness,
        answer_relevancy,
        context_entity_recall,
        answer_similarity,
        answer_correctness,
        harmfulness
    ],
)

df = result.to_pandas()
df

Evaluating:   0%|          | 0/32 [00:00<?, ?it/s]

Unnamed: 0,question,answer,contexts,ground_truth,context_precision,context_recall,faithfulness,answer_relevancy,context_entity_recall,answer_similarity,answer_correctness,harmfulness
0,"As part of my liver transplant, I take tacroli...","Based on your test results, you are an extens...",[recipients. Pharmacogenet Genomics 2008;18:33...,For a CYP3A5 extensive metabolizer. Increase s...,1.0,1.0,0.428571,0.872951,0.25,0.909565,0.727391,0
1,"I'm Indian, and my family has a history of epi...",The information provided suggests that geneti...,"[{""citations"": [{""id"": 15071835, ""title"": ""Rec...",As per CPIC HLA-B*15:02 might be observed in 2...,1.0,0.666667,0.923077,0.858193,0.4,0.880687,0.897601,0
2,Allopurinol is what I take for gout. For my se...,The information provided discusses the impact...,[that a stronger inhibition of the uric acid p...,There can be moderate interaction while consid...,1.0,0.0,1.0,0.0,0.571429,0.892649,0.223169,0
3,"Is it true that while taking clopidogrel, I sh...","Answer: While taking clopidogrel, it is gener...",[Drug: clopidogrel\nGene1: CYP2C19\nPhenotype1...,"Indeed, you should refrain from drinking grape...",1.0,0.5,0.0,0.891083,0.166667,0.912578,0.790659,0


### Evaluation

In [58]:
na_quests = pd.read_csv("/home/dhanushb/Wellytics/RAG_data/results/na_quests.csv")
#na_quests = na_quests.drop(["Unnamed: 0.1"],axis=1)
na_quests

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Questions,Expected_response,Expected_source,mistral_resp,gemma_resp,llama2_resp,llama3_resp
0,0,39,I have a family history of depression and rece...,,https://www.pharmgkb.org/genotypeResults?q=%7B...,Based on your CYP2D6 genotype result (*1x≥3/*...,**Implications of having CYP2D6 *1x≥3/*2x2 Ult...,Based on the information provided in the table...,Based on the provided information about pharma...
1,1,47,I have read that metformin leads to severe kid...,,https://www.pharmgkb.org/chemical/PA450395/ove...,Pharmacogenomics can provide insights into ho...,The provided text does not contain any informa...,Pharmacogenomics is the study of how genetic v...,A very interesting question!\n\nMetformin is a...
2,2,48,I drink socially. I recently underwent pharmac...,,,It seems that the information provided is abo...,The provided text does not contain any informa...,Based on the information provided in the DPWG ...,A great question!\n\nAs a friendly AI assistan...
3,3,52,I have seizures from the age of 5 years. I am ...,,,"Yes, pharmacogenomic testing could potentiall...",The provided text indicates that individuals w...,"Response: Yes, pharmacogenomic testing can hel...","A great question!\n\nIn this case, we're deali..."
4,4,54,I have allergic rhinitis and take Reswas. Phar...,,,Based on your pharmacogenomic test results (C...,The provided text does not contain any informa...,Based on the information provided in the pharm...,Thank you for reaching out with your concern! ...
5,5,61,I am a 62 year old female with history of 6 ye...,,,The information provided discusses genetic va...,The provided text does not contain any informa...,Based on the information provided in the DPWG ...,"Based on the information provided, we can anal..."
6,6,62,Why is tramadol banned in sports?,,,"Tramadol is not necessarily banned in sports,...",The provided text does not contain any informa...,Tramadol is a synthetic opioid medication that...,"Based on the provided information, tramadol is..."
7,7,63,What are the antiseizure medications permissib...,,,The provided data does not contain specific i...,The provided text does not contain any informa...,The list of antiseizure medications that are p...,I can provide information on pharmacogenomics ...
8,8,64,Can pharmacogenomics predict response to homeo...,,,Pharmacogenomics primarily studies the role o...,The provided text does not contain any informa...,Answer: There is no scientific evidence to sup...,"Based on the provided information, I'd like to..."
9,9,65,Is blackcurrant extract natural antidiabetic a...,,,The text provided does not contain informatio...,The provided text does not contain any informa...,Based on the information provided in the liter...,"Hello!\n\nAs a pharmacogenomics AI assistant, ..."


In [57]:
na_quests.to_csv("/home/dhanushb/Wellytics/RAG_data/results/na_quests.csv")

In [46]:
quests = pd.read_csv("/home/dhanushb/Wellytics/RAG_data/results/quests_with_resps.csv")
quests

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Questions,Expected_response,Expected_source,context,mistral_resp,gemma_resp,llama2_resp,llama3_resp
0,0,0,"As part of my liver transplant, I take tacroli...",For a CYP3A5 extensive metabolizer. Increase s...,https://www.fda.gov/drugs/science-and-research...,['recipients. Pharmacogenet Genomics 2008;18:3...,"Based on your test results, if you are an ext...",The provided text does not contain any informa...,Based on the information provided in the docum...,According to your pharmacogenetic test results...
1,1,1,"I'm Indian, and my family has a history of epi...",As per CPIC HLA-B*15:02 might be observed in 2...,https://www.pharmgkb.org/haplotype/PA165954769...,"['{""citations"": [{""id"": 15071835, ""title"": ""Re...",The genetic variant HLA-B*15:02 that you have...,The provided text does not contain any informa...,The information you provided is from a scienti...,I'm an AI assistant trained to provide underst...
2,2,2,Allopurinol is what I take for gout. For my se...,There can be moderate interaction while consid...,https://insilico-cyp.charite.de/SuperCYPsPred/...,['that a stronger inhibition of the uric acid ...,Answer: Phenytoin and allopurinol are two dis...,The provided text does not contain any informa...,It is understandable to have concerns about po...,A great question!\n\nAs an AI assistant traine...
3,3,3,"Is it true that while taking clopidogrel, I sh...","Indeed, you should refrain from drinking grape...","Bailey DG, Malcolm J, Arnold O, Spence JD. Gra...",['Drug: clopidogrel\nGene1: CYP2C19\nPhenotype...,"While taking clopidogrel, it is generally rec...",The provided text does not contain information...,"Yes, it is true that clopidogrel can interact ...",A pharmacogenomics enthusiast!\n\nAccording to...
4,4,4,Dizziness is one of the adverse reactions that...,"Dizziness drawiness,extreme thirst,fast, shall...",https://www.fda.gov/drugs/science-and-research...,"['1. \nGanoci L et al. ABCB1, ABCG2 and CYP2D6...","Based on the literature provided, there is ev...",The provided text does not contain any informa...,The information provided in the response is ba...,I can help you understand the relationship bet...
...,...,...,...,...,...,...,...,...,...,...
129,129,149,My patient has CYP2D6 poor metabolism and does...,CPIC guideline for ondansetron and CYP2D6 has ...,,"[""Drug: sertraline\nGene1: CYP2C19\nPhenotype1...","Based on the provided data, there are no spec...",The provided text does not contain any informa...,The guidelines for prescribing sertraline for ...,The guidelines provided do not specifically ad...
130,130,166,What are side effects associated wtih DPYD poo...,5-FU has many adverse drug effects when used s...,,['Drug: lornoxicam\nGene1: CYP2C9\nPhenotype1:...,The HLAB system is not directly involved in p...,The provided text does not contain any informa...,HLA-B and CYP2C9 genetic polymorphisms are imp...,According to the provided pharmacogenomics inf...
131,131,167,What are the pharmacogenomic panel advised to ...,"G6PD, CYP2D6, UGT1A1",,"['{""citations"": [{""id"": 15143242, ""title"": ""Cl...",The given text suggests that for a patient wi...,The provided text suggests that due to the pat...,Based on the information provided in the liter...,"Based on the provided information, for a patie..."
132,132,168,I have a 7 year old patient with HLA-B1502 pos...,Sodium Valproate,,['9\n. \nKristensen MH et al. Variants in the ...,The side effects associated with DPD (dihydro...,Patients with DPYD poor metabolism of fluorour...,"Based on the provided document, the side effec...","According to the provided document, individual..."


In [47]:
quests = quests.drop(["Unnamed: 0.1"], axis=1)
quests

Unnamed: 0.1,Unnamed: 0,Questions,Expected_response,Expected_source,context,mistral_resp,gemma_resp,llama2_resp,llama3_resp
0,0,"As part of my liver transplant, I take tacroli...",For a CYP3A5 extensive metabolizer. Increase s...,https://www.fda.gov/drugs/science-and-research...,['recipients. Pharmacogenet Genomics 2008;18:3...,"Based on your test results, if you are an ext...",The provided text does not contain any informa...,Based on the information provided in the docum...,According to your pharmacogenetic test results...
1,1,"I'm Indian, and my family has a history of epi...",As per CPIC HLA-B*15:02 might be observed in 2...,https://www.pharmgkb.org/haplotype/PA165954769...,"['{""citations"": [{""id"": 15071835, ""title"": ""Re...",The genetic variant HLA-B*15:02 that you have...,The provided text does not contain any informa...,The information you provided is from a scienti...,I'm an AI assistant trained to provide underst...
2,2,Allopurinol is what I take for gout. For my se...,There can be moderate interaction while consid...,https://insilico-cyp.charite.de/SuperCYPsPred/...,['that a stronger inhibition of the uric acid ...,Answer: Phenytoin and allopurinol are two dis...,The provided text does not contain any informa...,It is understandable to have concerns about po...,A great question!\n\nAs an AI assistant traine...
3,3,"Is it true that while taking clopidogrel, I sh...","Indeed, you should refrain from drinking grape...","Bailey DG, Malcolm J, Arnold O, Spence JD. Gra...",['Drug: clopidogrel\nGene1: CYP2C19\nPhenotype...,"While taking clopidogrel, it is generally rec...",The provided text does not contain information...,"Yes, it is true that clopidogrel can interact ...",A pharmacogenomics enthusiast!\n\nAccording to...
4,4,Dizziness is one of the adverse reactions that...,"Dizziness drawiness,extreme thirst,fast, shall...",https://www.fda.gov/drugs/science-and-research...,"['1. \nGanoci L et al. ABCB1, ABCG2 and CYP2D6...","Based on the literature provided, there is ev...",The provided text does not contain any informa...,The information provided in the response is ba...,I can help you understand the relationship bet...
...,...,...,...,...,...,...,...,...,...
129,149,My patient has CYP2D6 poor metabolism and does...,CPIC guideline for ondansetron and CYP2D6 has ...,,"[""Drug: sertraline\nGene1: CYP2C19\nPhenotype1...","Based on the provided data, there are no spec...",The provided text does not contain any informa...,The guidelines for prescribing sertraline for ...,The guidelines provided do not specifically ad...
130,166,What are side effects associated wtih DPYD poo...,5-FU has many adverse drug effects when used s...,,['Drug: lornoxicam\nGene1: CYP2C9\nPhenotype1:...,The HLAB system is not directly involved in p...,The provided text does not contain any informa...,HLA-B and CYP2C9 genetic polymorphisms are imp...,According to the provided pharmacogenomics inf...
131,167,What are the pharmacogenomic panel advised to ...,"G6PD, CYP2D6, UGT1A1",,"['{""citations"": [{""id"": 15143242, ""title"": ""Cl...",The given text suggests that for a patient wi...,The provided text suggests that due to the pat...,Based on the information provided in the liter...,"Based on the provided information, for a patie..."
132,168,I have a 7 year old patient with HLA-B1502 pos...,Sodium Valproate,,['9\n. \nKristensen MH et al. Variants in the ...,The side effects associated with DPD (dihydro...,Patients with DPYD poor metabolism of fluorour...,"Based on the provided document, the side effec...","According to the provided document, individual..."


In [64]:
from datasets import Dataset
from ragas import evaluate
from ragas.metrics.critique import harmfulness
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    context_entity_recall,
    answer_similarity,
    answer_correctness,
)

# To dict
data = {
    "question": list(map(str, quests["Questions"].to_list())),
    "answer": list(map(str, quests["mistral_resp"].to_list())),
    "contexts": [[str(context)] for context in quests["context"].to_list()],
    "ground_truth": list(map(str, quests["Expected_response"].to_list()))
}

# Convert dict to dataset
dataset = Dataset.from_dict(data)

result = evaluate(
    dataset = dataset, 
    metrics=[
        context_entity_recall,
        context_precision,
        context_recall,
        faithfulness,
        answer_relevancy,
        answer_similarity,
        answer_correctness,
        harmfulness
    ],
)
df = result.to_pandas()
df

Evaluating:   0%|          | 0/1072 [00:00<?, ?it/s]

Exception in thread Thread-90:
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/home/dhanushb/Wellytics/AIdemo/lib/python3.10/site-packages/ragas/executor.py", line 96, in run
    results = self.loop.run_until_complete(self._aresults())
  File "/usr/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
    return future.result()
  File "/home/dhanushb/Wellytics/AIdemo/lib/python3.10/site-packages/ragas/executor.py", line 84, in _aresults
    raise e
  File "/home/dhanushb/Wellytics/AIdemo/lib/python3.10/site-packages/ragas/executor.py", line 79, in _aresults
    r = await future
  File "/usr/lib/python3.10/asyncio/tasks.py", line 571, in _wait_for_one
    return f.result()  # May raise f.exception().
  File "/home/dhanushb/Wellytics/AIdemo/lib/python3.10/site-packages/ragas/executor.py", line 38, in sema_coro
    return await coro
  File "/home/dhanushb/Wellytics/AIdemo/lib/python3.10/

ExceptionInRunner: The runner thread which was running the jobs raised an exeception. Read the traceback above to debug it. You can also pass `raise_exceptions=False` incase you want to show only a warning message instead.