# Install These Packages

In [2]:
# !pip install langchain
# !pip install chromadb
# !pip install pypdf
# !pip install pytest
# !pip install accelerate
# !pip install -U bitsandbytes
# %pip install -qU langchain-google-vertexai
#!pip install sacremoses
#!pip install -qU langchain-community faiss-cpu

# General Imports

In [3]:
import torch
import numpy as np

# Google Cloud Vertex AI Setup

In [4]:
from google.oauth2 import service_account
import vertexai
import json

In [5]:
# Read the service-account key (a JSON file) from disk into a plain dict.
SERVICE_ACCOUNT_KEY_PATH = './data/probable-life-441114-n1-d2f8fa3aef61.json'
with open(SERVICE_ACCOUNT_KEY_PATH) as source:
    info = json.load(source)

# Build Google credentials from the parsed key.
vertex_cred = service_account.Credentials.from_service_account_info(info)

# Initialise the Vertex AI SDK with this project, region and credentials.
PROJECT_ID = "probable-life-441114-n1"
REGION = "asia-south1"
vertexai.init(
    project=PROJECT_ID,
    location=REGION,
    credentials=vertex_cred,
)

# Load and Process Data

In [6]:
from langchain.schema.document import Document
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [7]:
DATA_PATH = "data"

# Load every PDF found in the data directory
def load_data(DATA_PATH):
    """Load the PDFs under ``DATA_PATH`` with ``PyPDFDirectoryLoader``.

    Args:
        DATA_PATH: directory handed to ``PyPDFDirectoryLoader``.

    Returns:
        Whatever the loader's ``load()`` call returns.
    """
    return PyPDFDirectoryLoader(DATA_PATH).load()

# call the function
data1 = load_data(DATA_PATH)  # expected: a list of langchain_core Document objects (it is passed to split_documents below) — TODO confirm

In [8]:
# Now split the data into chunks
def split_into_chunks(data, chunk_size=500, chunk_overlap=100):
    """Split documents into overlapping chunks.

    Args:
        data: documents to split; passed straight to ``split_documents``.
        chunk_size: maximum size of a chunk, measured with ``len``.
            Defaults to 500, the value this notebook has always used.
        chunk_overlap: how much consecutive chunks overlap, measured the
            same way. Defaults to 100.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents(data)``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,  # separators are plain text, not regex patterns
    )
    return splitter.split_documents(data)

chunks = split_into_chunks(data = data1)

In [9]:
chunks[0]

Document(metadata={'source': 'data/Guyton and Hall textbook of medical physiology 14th Edition Emedicodiary.pdf', 'page': 1}, page_content='NOTE TO INSTRUCTORS\nContact your Elsevier Sales Representative for teaching \nresources, including slides and image banks, for Guyton and \nHall Textbook of  Medical Physiology, 14e, or request these \nsupporting materials at:\nhttp://evolve.elsevier.com/Hall/physiology/')

# Load Embedding Model

In [31]:
# Using Langchain official docs
from langchain_google_vertexai import VertexAIEmbeddings

# Initialize a specific version of the embeddings model
embeddings = VertexAIEmbeddings(model_name="textembedding-gecko@003")

# VectorDB setup

## FAISS (Facebook AI Similarity Search)

In [20]:
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
import faiss

In [29]:
# Embed a throwaway query once to learn the embedding vector length,
# then size the flat L2 index to match.
embedding_dim = len(embeddings.embed_query("hello world"))
index = faiss.IndexFlatL2(embedding_dim)

# Start from an empty store: a fresh docstore and an empty index-to-id mapping.
vector_db = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [30]:
# Add items to VectorStore

from uuid import uuid4

# One random UUID string per chunk, passed as that chunk's id.
uuids = [str(uuid4()) for _ in chunks]
vector_db.add_documents(documents=chunks, ids=uuids)

['486a2b7b-7ee4-4881-a5a8-da42739cf4c4',
 '3c995e69-4eac-47b5-a584-b8807f9fc357',
 '638b4f38-ed4c-4605-aa5d-0661f51e4df4',
 '6b7e29d6-d306-47da-9500-6840cd429dd8',
 '9431a6f3-d0a4-43dc-be5d-c70d8cc9d692',
 'd522b507-d62e-40bd-a900-08fe8bcd0513',
 '19c6c47a-f53d-4fea-bb2b-a27194879a3e',
 '063ef586-dea1-4886-900f-07d55492f6a5',
 'ea100459-0d87-4a72-bea4-a10e08e1882b',
 '4e19fa27-5056-4c93-80d9-65edf49c8bca',
 'e7e70ec5-19fb-4845-ab48-3717f3d4b8df',
 'eb8873bf-a510-4216-8130-9ff7e8bf8479',
 '2c0c78d8-7e19-4860-b3dc-8fcbd895d9d7',
 '507801cf-c71a-4b8d-a2dc-9f9dbc7f3bcb',
 'aa93e0e7-abbc-4094-9e02-387de9574d9f',
 '91ff6453-d333-4f75-a56d-2dba6fc8d775',
 'e1131d3a-95e2-4042-9e66-10c2ecf6e8db',
 '2108935f-ce58-40db-b511-ab5dbc175a00',
 '1a8a4e1e-520c-4acb-b1f1-b9ca57e2993d',
 '03961d40-3431-4744-ae5f-ece22b628f1a',
 'b820ba03-1555-452d-9407-6ea2012ae2ea',
 'b098a470-0763-4cc2-8ea1-1293f6b54d96',
 'd7ea8830-54a0-4225-ba8f-c328c17628ff',
 '2ecb054a-ef54-42bb-b086-45e01fd82347',
 '51eaa18f-dc05-

In [None]:
# Query vector store
# The search needs a concrete question, and similarity_search requires the query text.
query_text = "How does pulmonary emphysema affect electrocardiographic potentials?"
result = vector_db.similarity_search(query=query_text)

## Using InMemoryDB

In [8]:
from langchain_core.vectorstores import InMemoryVectorStore

# Pull the raw text and the metadata out of every chunk; both lists stay
# aligned by position.
texts = [doc.page_content for doc in chunks]
metadatas = [doc.metadata for doc in chunks]

# Build an in-memory vector store from the texts using the embedding model.
vectorstore = InMemoryVectorStore.from_texts(
    texts=texts,
    embedding=embeddings,
    metadatas=metadatas,
)

# Retriever

In [12]:
from langchain.prompts import ChatPromptTemplate

In [None]:

"""
Questions to test:

Who and when had written the first edition of Medical Physiology?
Why does ventricular fibrillation often occur without atrial fibrillation?
How does pulmonary emphysema affect electrocardiographic potentials?

"""

In [151]:
# Put your query here...

query_text = "How does pulmonary emphysema affect electrocardiographic potentials?"

## Use this in case of InMemoryVectorStore

In [13]:
# Wrap the vector store as a retriever configured with k=5
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

# Fetch the documents most similar to the query (a list)
retrieved_documents = retriever.invoke(query_text)

# Show the text of the first retrieved document
best_match = retrieved_documents[0]
print(best_match.page_content)

vii
The first edition of the Textbook of Medical Physiology was 
written by Arthur C. Guyton almost 65 years ago. Unlike 
most major medical textbooks, which often have 20 or 
more authors, the first eight editions of the Textbook of 
Medical Physiology were written entirely by Dr. Guyton. 
He had a gift for communicating complex ideas in a clear 
and interesting manner that made studying physiology 
fun. He wrote the book to help students learn physiology,


In [None]:
# Build the prompt context: the page_content of every retrieved document,
# joined with newlines into one string.
context = "\n".join(doc.page_content for doc in retrieved_documents)

## Use this in case of FAISS

In [152]:
# Similarity search directly against the FAISS store
result = vector_db.similarity_search(query=query_text)
print(result[0])
# Join the text of all hits into one newline-separated context string
context = "\n".join(hit.page_content for hit in result)
# define retriever
retriever = vector_db.as_retriever()

page_content='Pulmonary emphysema  can decrease the electrocar-
diographic potentials, but for a different reason than 
that of pericardial effusion. In persons with pulmonary 
emphysema, conduction of electrical current through the 
lungs is depressed considerably because of an excessive 
quantity of air in the lungs. Also, the chest cavity enlarges, 
and the lungs tend to envelop the heart to a greater extent 
than normal. Therefore, the lungs act as an insulator to' metadata={'source': 'data/Guyton and Hall textbook of medical physiology 14th Edition Emedicodiary.pdf', 'page': 144}


In [173]:
result

[Document(id='8631ee9a-13ae-4dfb-a1c1-c7d3be499da5', metadata={'source': 'data/Guyton and Hall textbook of medical physiology 14th Edition Emedicodiary.pdf', 'page': 144}, page_content='Pulmonary emphysema  can decrease the electrocar-\ndiographic potentials, but for a different reason than \nthat of pericardial effusion. In persons with pulmonary \nemphysema, conduction of electrical current through the \nlungs is depressed considerably because of an excessive \nquantity of air in the lungs. Also, the chest cavity enlarges, \nand the lungs tend to envelop the heart to a greater extent \nthan normal. Therefore, the lungs act as an insulator to'),
 Document(id='9c37ed8b-3ce5-4d22-b9bd-9db90e7a356f', metadata={'source': 'data/Guyton and Hall textbook of medical physiology 14th Edition Emedicodiary.pdf', 'page': 144}, page_content='part of the heart to another through the pericardial fluid. \nThus, this effusion effectively “short- circuits” the electrical \npotentials generated by the he

### Play here

In [34]:
len(result)  # the 4 best matches have been fetched

4

In [35]:
# Only print the content
result[0].page_content

'vii\nThe first edition of the Textbook of Medical Physiology was \nwritten by Arthur C. Guyton almost 65 years ago. Unlike \nmost major medical textbooks, which often have 20 or \nmore authors, the first eight editions of the Textbook of \nMedical Physiology were written entirely by Dr. Guyton. \nHe had a gift for communicating complex ideas in a clear \nand interesting manner that made studying physiology \nfun. He wrote the book to help students learn physiology,'

In [38]:
# Same search, but each hit comes back paired with its score.
# Score closer to zero means better result
docs_with_scores = vector_db.similarity_search_with_score(query=query_text)
# Show the first two hits, separated by blank lines
print(docs_with_scores[0], end="\n\n\n")
print(docs_with_scores[1])

(Document(id='4e19fa27-5056-4c93-80d9-65edf49c8bca', metadata={'source': 'data/Guyton and Hall textbook of medical physiology 14th Edition Emedicodiary.pdf', 'page': 5}, page_content='vii\nThe first edition of the Textbook of Medical Physiology was \nwritten by Arthur C. Guyton almost 65 years ago. Unlike \nmost major medical textbooks, which often have 20 or \nmore authors, the first eight editions of the Textbook of \nMedical Physiology were written entirely by Dr. Guyton. \nHe had a gift for communicating complex ideas in a clear \nand interesting manner that made studying physiology \nfun. He wrote the book to help students learn physiology,'), 0.4532048)


(Document(id='351558b6-339c-4985-990f-eb7092abdc2a', metadata={'source': 'data/Guyton and Hall textbook of medical physiology 14th Edition Emedicodiary.pdf', 'page': 6}, page_content='Onderlinde, Rebecca Gruliow, and the entire Elsevier \nteam for continued editorial and production excellence.\nFinally, we thank the many readers

# Prompt Template

In [86]:
# Prompt skeleton with two placeholders: {context} (the retrieved text) and
# {query_text} (the user's question). It ends with "Answer:" so the model's
# completion follows directly after it.
PROMPT_TEMPLATE = """
You are a biomedical expert. Based on the information provided below, answer the question concisely.

Information: {context}

Question: {query_text}

Answer:
"""


# Generate

In [None]:

# Put into prompt
#prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
#Format the Prompt: Insert the context and query into your PROMPT_TEMPLATE to create the final prompt
#prompt = prompt_template.format(context=context, query_text=query_text)

In [40]:
import os

from huggingface_hub import login

# Never keep an access token in the notebook, not even inside a comment:
# anyone who can read the file can use it. A token that has been written
# into a shared notebook should be revoked and replaced.
# Read the token from the HF_TOKEN environment variable when it is set;
# with token=None, login() prompts for the key as before.
login(token=os.environ.get("HF_TOKEN"))

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [144]:
# Load the BioGPT tokenizer and causal-LM model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BioGptForCausalLM

tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")

# Alternative loader: AutoModelForCausalLM.from_pretrained("microsoft/biogpt")
model = BioGptForCausalLM.from_pretrained(
    "microsoft/biogpt",
    torch_dtype=torch.float16,
    attn_implementation="sdpa",
)

In [174]:
# Experiment: reload BioGPT with sampling enabled for generation
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BioGptForCausalLM

tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")
# model = AutoModelForCausalLM.from_pretrained("microsoft/biogpt")
model = BioGptForCausalLM.from_pretrained("microsoft/biogpt",
                                          attn_implementation="sdpa",
                                          torch_dtype=torch.float16,
                                         )

# temperature / do_sample are generation settings, not model-loading options.
# Passing them to from_pretrained routes them into the model config, where
# they may be ignored or rejected depending on the transformers version.
# Set them on the generation config so generation actually samples.
model.generation_config.do_sample = True
model.generation_config.temperature = 0.9

In [175]:
# Load model directly
# from transformers import AutoModel
# model = AutoModel.from_pretrained("allenai/biomed_roberta_base")
# tokenizer = AutoTokenizer.from_pretrained("allenai/biomed_roberta_base")

In [176]:
# Create a pipeline for text generation
from transformers import pipeline

pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0,  # presumably the first GPU — TODO confirm a CUDA device is available
    max_new_tokens=500,
)

In [177]:
# qa_pipe = pipeline("text-generation", model="BioGptForCausalLM", device=0)

In [178]:
from langchain.prompts import PromptTemplate

# Wrap PROMPT_TEMPLATE and declare its two placeholders so they can be
# filled in when the chain is invoked.
prompt_template = PromptTemplate(
    template=PROMPT_TEMPLATE,
    input_variables=["context", "query_text"],
)

In [179]:
from langchain.chains import LLMChain
from langchain.llms import HuggingFacePipeline

# Expose the transformers pipeline as a LangChain LLM
llm = HuggingFacePipeline(pipeline=pipe)

# Combine the prompt template and the LLM into one chain
# (alternative kept for reference: chain = llm | prompt_template)
chain = LLMChain(llm=llm, prompt=prompt_template)

In [180]:
# Run the chain, supplying values for the template's two placeholders
response = chain.invoke({"context": context, "query_text": query_text})

In [181]:
import re

# The generated text is read from the "text" key of the chain's response.
# Fall back to an empty string so re.search never receives None.
ans = response.get("text") or ""

# Clear values left over from an earlier run so a failed parse cannot be
# mistaken for a fresh answer.
question = answer = None

# Split the text at the "Question: ... / Answer: ..." markers that the prompt
# template ends with. DOTALL lets both groups span several lines.
matches = re.search(r"Question:\s*(.*?)\n\nAnswer:\s*(.*)", ans, re.DOTALL)

if matches:
    question = matches.group(1).strip()
    answer = matches.group(2).strip()
    print(f"Question: {question}")
    print(f"Answer: {answer}")
else:
    # Report the failure instead of printing nothing, and show the raw output.
    print("Could not find a 'Question: ... Answer: ...' section in the model output:")
    print(ans)

Question: How does pulmonary emphysema affect electrocardiographic potentials?
Answer: Pulmonary emphysema has the potential to affect electrocardiograms mainly through the pericardial effusion.


# Evaluation