# Create the chatbot

In [103]:
import os
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
     

In [104]:
from urllib.request import urlretrieve

In [105]:
# URL of the file to download
url = "https://aclanthology.org/D19-1259.pdf"

# Directory to save the file
save_directory = "Biomedical Research"

# Ensure the directory exists
os.makedirs(save_directory, exist_ok=True)

# Construct the file path
file_name = url.rpartition("/")[2]
file_path = os.path.join(save_directory, file_name)

# Download the file
urlretrieve(url, file_path)

print(f"File downloaded and saved to {file_path}")


File downloaded and saved to Biomedical Research\D19-1259.pdf


In [106]:
loader = PyPDFDirectoryLoader("./Biomedical Research/")

## Load Data

In [107]:
data = loader.load()

In [108]:
data[0]

Document(metadata={'source': 'Biomedical Research\\D19-1259.pdf', 'page': 0}, page_content='Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing\nand the 9th International Joint Conference on Natural Language Processing, pages 2567–2577,\nHong Kong, China, November 3–7, 2019.c⃝2019 Association for Computational Linguistics\n2567\nPubMedQA: A Dataset for Biomedical Research Question Answering\nQiao Jin\nUniversity of Pittsburgh\nqiao.jin@pitt.edu\nBhuwan Dhingra\nCarnegie Mellon University\nbdhingra@cs.cmu.edu\nZhengping Liu\nUniversity of Pittsburgh\nzliu@pitt.edu\nWilliam W. Cohen\nGoogle AI\nwcohen@google.com\nXinghua Lu\nUniversity of Pittsburgh\nxinghua@pitt.edu\nAbstract\nWe introduce PubMedQA, a novel biomedi-\ncal question answering (QA) dataset collected\nfrom PubMed abstracts. The task of Pub-\nMedQA is to answer research questions with\nyes/no/maybe (e.g.: Do preoperative statins\nreduce atrial ﬁbrillation after coronary artery\nbypass grafti

In [109]:
print(f'you have {len(data)} document(s) in your data')
print(f'there are {len(data[0].page_content)} characters in your document')

you have 11 document(s) in your data
there are 4397 characters in your document


## Chunk your data up into smaller documents

In [131]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size= 700, chunk_overlap=50)
texts = text_splitter.split_documents(data)

In [132]:
print(f'Now you have {len(texts)} documents')

Now you have 69 documents


In [133]:
texts[0]

Document(metadata={'source': 'Biomedical Research\\D19-1259.pdf', 'page': 0}, page_content='Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing\nand the 9th International Joint Conference on Natural Language Processing, pages 2567–2577,\nHong Kong, China, November 3–7, 2019.c⃝2019 Association for Computational Linguistics\n2567\nPubMedQA: A Dataset for Biomedical Research Question Answering\nQiao Jin\nUniversity of Pittsburgh\nqiao.jin@pitt.edu\nBhuwan Dhingra\nCarnegie Mellon University\nbdhingra@cs.cmu.edu\nZhengping Liu\nUniversity of Pittsburgh\nzliu@pitt.edu\nWilliam W. Cohen\nGoogle AI\nwcohen@google.com\nXinghua Lu\nUniversity of Pittsburgh\nxinghua@pitt.edu\nAbstract\nWe introduce PubMedQA, a novel biomedi-\ncal question answering (QA) dataset collected')

## Create embeddings of your documents to get ready for semantic search

In [134]:
embeddings = HuggingFaceBgeEmbeddings(
    model_name= "sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs = {'device' : 'cpu'},
    encode_kwargs = {'normalize_embeddings' : True}
)


In [135]:
import getpass
import os

from pinecone import Pinecone

if not os.getenv("pcsk_3uJx1J_9EmxrzDCFnF4af1GmiB1Q2HATKJv7pKWrq8E3JfvibkbSEXPVCiDPGXUqD8Fqpd"):
    os.environ["pcsk_3uJx1J_9EmxrzDCFnF4af1GmiB1Q2HATKJv7pKWrq8E3JfvibkbSEXPVCiDPGXUqD8Fqpd"] = getpass.getpass("Enter your Pinecone API key: ")

pinecone_api_key = os.environ.get("pcsk_3uJx1J_9EmxrzDCFnF4af1GmiB1Q2HATKJv7pKWrq8E3JfvibkbSEXPVCiDPGXUqD8Fqpd")

pc = Pinecone(api_key=pinecone_api_key)

In [136]:
index_name = "chatbot"  

index = pc.Index(index_name)

## Choose the Vector Database 
#### in order to generate the embeddings and save them

In [137]:
from langchain_pinecone import PineconeVectorStore

vector_store = PineconeVectorStore(index=index, embedding=embeddings)

In [138]:
from uuid import uuid4
uuids = [str(uuid4()) for _ in range(len(texts))]

vector_store.add_documents(documents=texts, ids=uuids)

['d5856aa8-52f4-4a62-a4f4-8f95b8cf7cdd',
 'b19c3d9a-d9c8-4c2d-8f0a-56d64ef1dc24',
 'af2a1b88-7ce5-4321-9b13-4840dd678144',
 '25d5ecc4-06b3-41dc-a08f-f2f26e2003e7',
 '133277f8-0aa4-459b-a88b-b9765b2b61b7',
 '69e0e450-d106-4a0e-bcf8-787349417b40',
 '305eeaf8-d2ef-4d5d-baf1-3df40b0fdb96',
 'b9f32d3f-adf6-4bdf-b35e-cc1605e3d6e8',
 '7e34b80c-a611-4d96-90d2-23dd0529199e',
 '3d9748ce-6c12-4598-9c79-aac3161d9426',
 '85c4c725-ab62-4c69-9be3-b9a0d54b7be3',
 '604d8d00-4dca-4f5c-9e83-d4ff6b4dc7a9',
 '556a6f04-8b35-45b7-af33-7be94d0da222',
 '738b6c71-7e05-4cdd-ba45-88b3e4b0fc84',
 'e8b455ed-e5b0-460d-a8a7-0844e9f3cde9',
 '88456c73-d1d3-425e-9b41-65542484332a',
 '0a9fc7cf-743f-4c29-9894-cffcc5d0e1fd',
 '8a439ed8-f4c8-4039-a5d6-9fadea7a9a6f',
 'af7000d0-2b66-4246-85c3-2cc8400b9d90',
 'c4489ea7-01cf-4be6-afde-ebf8f1b3d75e',
 '9c76eb3d-a20e-4c7b-abb0-ff1b33099748',
 '566be4c2-786b-4c51-90c5-31673822b621',
 '08e9e347-aa58-4d50-89ff-cb22e9a2c146',
 '951b2dfa-e59c-4205-9ac9-d6eeea961062',
 '7baf223f-b97f-

In [139]:
results = vector_store.similarity_search(
    "Do preoperative statins reduce atrial ﬁbrillation after coronary artery bypass grafting?",
    k=2
)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* ing is to build intelligent systems that can reason
and infer over natural language. The question an-
swering (QA) task, in which models learn how to
answer questions, is often used as a benchmark for
quantitatively measuring the reasoning and infer-
ring abilities of such intelligent systems.
While many large-scale annotated general do-
main QA datasets have been introduced (Ra-
jpurkar et al., 2016; Lai et al., 2017; Ko ˇcisk`y
Question:
Do preoperative statins reduce atrial ﬁbrillation after
coronary artery bypass grafting?
Context:
(Objective) Recent studies have demonstrated that statins
have pleiotropic effects, including anti-inﬂammatory ef-
fects and atrial ﬁbrillation (AF) preventive effects [...]
(Methods) 221 patients underwent CABG in our hospital
from 2004 to 2007. 14 patients with preoperative AF and
4 patients with concomitant valve surgery [...]
(Results) The overall incidence of postoperative AF was
26%. Postoperative AF was signiﬁcantly lower in the [{'page': 0.0, '

In [140]:
print(results[0].page_content)

ing is to build intelligent systems that can reason
and infer over natural language. The question an-
swering (QA) task, in which models learn how to
answer questions, is often used as a benchmark for
quantitatively measuring the reasoning and infer-
ring abilities of such intelligent systems.
While many large-scale annotated general do-
main QA datasets have been introduced (Ra-
jpurkar et al., 2016; Lai et al., 2017; Ko ˇcisk`y
Question:
Do preoperative statins reduce atrial ﬁbrillation after
coronary artery bypass grafting?
Context:
(Objective) Recent studies have demonstrated that statins
have pleiotropic effects, including anti-inﬂammatory ef-
fects and atrial ﬁbrillation (AF) preventive effects [...]
(Methods) 221 patients underwent CABG in our hospital
from 2004 to 2007. 14 patients with preoperative AF and
4 patients with concomitant valve surgery [...]
(Results) The overall incidence of postoperative AF was
26%. Postoperative AF was signiﬁcantly lower in the


In [141]:
retriever = vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 1, "score_threshold": 0.5},
)

query = "Advancements in CRISPR technology have opened new avenues for targeted gene editing, offering promising potential for treating genetic disorders."
retriever.invoke(query)

[Document(id='485f6536-e41f-4f56-bb7e-9af909cc458a', metadata={'page': 4.0, 'source': 'Biomedical Research\\D19-1259.pdf'}, page_content='We ﬁne-tune BioBERT (Lee et al., 2019) on Pub-\nMedQA as a baseline. BioBERT is initialized\nwith BERT (Devlin et al., 2018) and further pre-\ntrained on PubMed abstracts and PMC 7 articles.\nExpectedly, it vastly outperforms BERT in vari-\nous biomedical NLP tasks. We denote the original\ntransformer weights of BioBERT as θ0.\nWhile ﬁne-tuning, we feed PubMedQA ques-\ntions and contexts (or long answers), separated\n7https://www.ncbi.nlm.nih.gov/pmc/\nby the special [SEP] token, to BioBERT. The\nyes/no/maybe labels are predicted using the spe-\ncial [CLS] embedding using a softmax function.\nCross-entropy loss of predicted and true label dis-\ntribution is denoted as LQA.')]

## LLM

In [142]:
import os

# Set the HuggingFace API token as an environment variable
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"

# Retrieve the environment variable to verify
hugging_face_api_key = os.environ["HUGGINGFACEHUB_API_TOKEN"]

# Validate the token (simple check) 
if hugging_face_api_key is None or not hugging_face_api_key.startswith("hf_"): 
    raise ValueError("Invalid HuggingFace API token.")

In [143]:
from langchain_community.llms import HuggingFaceHub

hf = HuggingFaceHub(
    repo_id='mistralai/Mistral-7B-v0.1',
    model_kwargs = {"temperature" : 0.1 , "max_length" : 500}
)

In [144]:
query

'Advancements in CRISPR technology have opened new avenues for targeted gene editing, offering promising potential for treating genetic disorders.'

In [145]:
output = hf.invoke(query)
print(output)

Advancements in CRISPR technology have opened new avenues for targeted gene editing, offering promising potential for treating genetic disorders. However, the use of CRISPR in humans is still in its early stages, and there are several ethical and legal considerations that must be addressed before it can be widely adopted.

One of the main ethical concerns surrounding the use of CRISPR in humans is the potential for unintended consequences. While CRISPR has the potential to correct genetic mutations that cause diseases, there is a risk that the technology could also introduce new mutations or alter the function of other genes.


In [146]:
prompt_template = """Use the following pieces of context to answer the question at the end. Please follow the following rules:
1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
2. If you find the answer, write the answer in a concise way with five sentences maximum.

{context}

Question: {question}

Helpful Answer:
"""

PROMPT = PromptTemplate(
 template=prompt_template, input_variables=["context", "question"]
)
     

In [147]:
retrievalQA = RetrievalQA.from_chain_type(
    llm = hf, 
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs = {"prompt" : PROMPT}
)

### USE THE RAG !!! 

### Interact with Chatbot

### Create a function to interact with your chatbot:

In [148]:
result = retrievalQA.invoke({"query" : query})
print(result)

{'query': 'Advancements in CRISPR technology have opened new avenues for targeted gene editing, offering promising potential for treating genetic disorders.', 'result': 'Use the following pieces of context to answer the question at the end. Please follow the following rules:\n1. If you don\'t know the answer, don\'t try to make up an answer. Just say "I can\'t find the final answer but you may want to check the following links".\n2. If you find the answer, write the answer in a concise way with five sentences maximum.\n\nWe ﬁne-tune BioBERT (Lee et al., 2019) on Pub-\nMedQA as a baseline. BioBERT is initialized\nwith BERT (Devlin et al., 2018) and further pre-\ntrained on PubMed abstracts and PMC 7 articles.\nExpectedly, it vastly outperforms BERT in vari-\nous biomedical NLP tasks. We denote the original\ntransformer weights of BioBERT as θ0.\nWhile ﬁne-tuning, we feed PubMedQA ques-\ntions and contexts (or long answers), separated\n7https://www.ncbi.nlm.nih.gov/pmc/\nby the special [

In [149]:
print(result.keys())

dict_keys(['query', 'result', 'source_documents'])


In [150]:
relevant_docs = result['source_documents']
print(f'There are {len(relevant_docs)} documents retrieved which are relevant to the query.')
print("*" * 100)
for i, doc in enumerate(relevant_docs):
    print(f"Relevant Document #{i+1}:\nSource file: {doc.metadata['source']}, Page: {doc.metadata['page']}\nContent: {doc.page_content}")
    print("-"*100)
    print(f'There are {len(relevant_docs)} documents retrieved which are relevant to the query.')

There are 1 documents retrieved which are relevant to the query.
****************************************************************************************************
Relevant Document #1:
Source file: Biomedical Research\D19-1259.pdf, Page: 4.0
Content: We ﬁne-tune BioBERT (Lee et al., 2019) on Pub-
MedQA as a baseline. BioBERT is initialized
with BERT (Devlin et al., 2018) and further pre-
trained on PubMed abstracts and PMC 7 articles.
Expectedly, it vastly outperforms BERT in vari-
ous biomedical NLP tasks. We denote the original
transformer weights of BioBERT as θ0.
While ﬁne-tuning, we feed PubMedQA ques-
tions and contexts (or long answers), separated
7https://www.ncbi.nlm.nih.gov/pmc/
by the special [SEP] token, to BioBERT. The
yes/no/maybe labels are predicted using the spe-
cial [CLS] embedding using a softmax function.
Cross-entropy loss of predicted and true label dis-
tribution is denoted as LQA.
------------------------------------------------------------------------------

In [151]:
# Function to chat with your model
def chat_with_model(question):
    response = retrievalQA.invoke({"query": question})
    return response

# Example interaction
user_question = "What are the methods used in the study?"
print(chat_with_model(user_question))


{'query': 'What are the methods used in the study?', 'result': 'Use the following pieces of context to answer the question at the end. Please follow the following rules:\n1. If you don\'t know the answer, don\'t try to make up an answer. Just say "I can\'t find the final answer but you may want to check the following links".\n2. If you find the answer, write the answer in a concise way with five sentences maximum.\n\nin Fig. 3. Nearly all instances are human studies\nand they cover a wide variety of topics, including\nretrospective, prospective, and cohort studies, dif-\nferent age groups, and healthcare-related subjects\nlike treatment outcome, prognosis and risk factors\nof diseases.\n6https://www.nlm.nih.gov/mesh\nQuestion TypeReasoning TypeNumber interpretationalreadyin context?\nFigure 4: Proportional relationships between corre-\nsponded question types, reasoning types, and whether\nthe text interpretations of numbers exist in contexts.\nQuestion and Reasoning Types: We sampled\n