In [1]:
%pwd

'/Users/nambp/Nambp/CodeLearning/Personal Project/Medical-Chatbot/research'

In [2]:
import os 
os.chdir("/Users/nambp/Nambp/CodeLearning/Personal Project/Medical-Chatbot")

In [3]:
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def load_pdf_files(data):
    loader = DirectoryLoader(data, glob="**/*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents

In [5]:
extracted_data = load_pdf_files("data")

In [6]:
len(extracted_data)

637

In [7]:
from typing import List
from langchain.schema import Document

def filer_to_minimal_docs(docs: List[Document]) -> List[Document]:
    minimal_docs = []
    for doc in docs:
        minimal_doc = Document(page_content=doc.page_content)
        minimal_docs.append(minimal_doc)
    return minimal_docs

In [8]:
minimal_docs = filer_to_minimal_docs(extracted_data)

In [9]:
def text_split(minimal_docs):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=20,
    )
    texts_chunk = text_splitter.split_documents(minimal_docs)
    return texts_chunk

In [10]:
texts_chunk = text_split(minimal_docs)
len(texts_chunk)

5859

In [11]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

embeddings = download_embeddings()

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [12]:
vector = embeddings.embed_query("This is a sample document to be embedded.")

In [13]:
vector

[-0.032749857753515244,
 0.09272836148738861,
 -0.036308299750089645,
 0.06517162919044495,
 0.07537245750427246,
 0.026154080405831337,
 -0.10475226491689682,
 0.06296800076961517,
 0.016258101910352707,
 0.026199115440249443,
 0.009630465880036354,
 0.07525002211332321,
 0.04392101243138313,
 -0.010794930160045624,
 -0.026725249364972115,
 0.031363554298877716,
 -0.005026628263294697,
 -0.00013609055895358324,
 0.028000880032777786,
 0.05784617364406586,
 0.00802901852875948,
 0.07039354741573334,
 0.08912189304828644,
 -0.0663127526640892,
 0.02782600000500679,
 -0.007900525815784931,
 -0.055543504655361176,
 0.052380915731191635,
 0.07469726353883743,
 -0.016713766381144524,
 0.036889832466840744,
 0.011710748076438904,
 0.12367403507232666,
 0.02090667188167572,
 0.10600762814283371,
 0.05791698396205902,
 0.019614890217781067,
 0.02679792232811451,
 -0.0005944397416897118,
 0.03156352415680885,
 -0.008420921862125397,
 -0.0761447623372078,
 0.008382583037018776,
 0.02543093264102

In [14]:
from dotenv import load_dotenv
import os
load_dotenv()

True

In [15]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [16]:
from pinecone import Pinecone
pinecone_api_key = PINECONE_API_KEY

pc = Pinecone(api_key=pinecone_api_key)

In [17]:
pc

<pinecone.pinecone.Pinecone at 0x13d001db0>

In [18]:
from pinecone import ServerlessSpec

index_name = "medical-chatbot"

if not pc.has_index(index_name):
    pc.create_index(
        name=index_name, 
        dimension=384, 
        metric= "cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

index = pc.Index(index_name)

In [19]:
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=texts_chunk,
    embedding=embeddings,
    index_name=index_name,
)

In [20]:
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings,
)

In [21]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [22]:
retrieved_docs = docsearch.similarity_search("What is diabetes?")
retrieved_docs

[Document(id='f3fc72a3-d408-4626-a54b-7e83fa8d5739', metadata={}, page_content='begin to fall. A person with diabetes mellitus either does\nnot make enough insulin, or makes insulin that does not\nwork properly. The result is blood sugar that remains\nhigh, a condition called hyperglycemia.\nDiabetes must be diagnosed as early as possible. If\nleft untreated, it can damage or cause failure of the eyes,\nkidneys, nerves, heart, blood vessels, and other body\norgans. Hypoglycemia, or low blood sugar, may also be\ndiscovered through blood sugar testing. Hypoglycemia is'),
 Document(id='d1435686-ae79-4c54-8c94-7cb87f504bbe', metadata={}, page_content='begin to fall. A person with diabetes mellitus either does\nnot make enough insulin, or makes insulin that does not\nwork properly. The result is blood sugar that remains\nhigh, a condition called hyperglycemia.\nDiabetes must be diagnosed as early as possible. If\nleft untreated, it can damage or cause failure of the eyes,\nkidneys, nerves, 

In [23]:
from langchain_openai import ChatOpenAI

chatModel = ChatOpenAI(model_name="gpt-4o")


In [24]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents.stuff import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [25]:
system_prompt = (
    "You are an Medical assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [26]:
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [27]:
response = rag_chain.invoke({"input": "What is diabetes?"})
print(response["answer"])

Diabetes mellitus is a condition where the body either does not produce enough insulin or the insulin produced does not work properly, resulting in high blood sugar levels, known as hyperglycemia. If untreated, diabetes can lead to damage or failure of various organs, including the eyes, kidneys, nerves, heart, and blood vessels. Identifying and managing diabetes early is crucial to preventing complications.
