In [1]:
import os
os.chdir("../")
%pwd

'd:\\ML_Projects\\medical-chatbot\\Medical-Chatbot-using-LLM-RAG-langchain-pinecone-AWS'

In [2]:
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
def load_pdf_file(data):
    """Load the PDF files of a directory as documents.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``; only files
            matching the ``*.pdf`` glob are considered.

    Returns:
        The result of ``DirectoryLoader.load()``, with each matched file read
        through ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [4]:
# Load every PDF from the "data" directory (resolved against the current working directory).
extracted_data = load_pdf_file("data")

In [6]:
# Sanity check: number of items the loader returned.
len(extracted_data)

637

In [7]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """Rebuild each document with only its ``"source"`` metadata entry.

    Args:
        docs: Documents whose ``page_content`` and ``metadata`` are read.

    Returns:
        A new list, in the same order, of fresh ``Document`` objects carrying
        the original ``page_content`` and a metadata dict with the single key
        ``"source"`` (``None`` when the original metadata has no such key).
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]
    

In [8]:
# Drop all metadata except "source" before the documents are chunked.
minimal_docs = filter_to_minimal_docs(extracted_data)

In [10]:
def text_split(minimal_docs, chunk_size=500, chunk_overlap=20):
    """Split documents into smaller, overlapping chunks.

    Args:
        minimal_docs: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
            Defaults to 500.
        chunk_overlap: Number of characters shared by consecutive chunks.
            Defaults to 20.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_documents(minimal_docs)

In [11]:
# Chunk the metadata-trimmed documents; these chunks are what gets embedded and indexed.
texts_chunks = text_split(minimal_docs)

In [15]:
from langchain.embeddings import HuggingFaceEmbeddings
def download_embeddings():
    """Create the embedding model used for indexing and querying.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [16]:
# Instantiate the embedding model once; it is reused for both indexing and retrieval below.
embedding = download_embeddings()

  embeddings = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [82]:
from dotenv import load_dotenv
import os
# Populate the process environment from a .env file so the API keys read with
# os.getenv() in the next cell are available.
load_dotenv()

True

In [83]:
import warnings

# Read the credentials from the environment. os.getenv() returns None for a
# variable that is not set.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
COHERE_API_KEY = os.getenv("COHERE_API_KEY")

# The values are NOT written back into os.environ: os.getenv() reads from
# os.environ, so re-assigning a present value is a no-op, while assigning None
# for an absent one raises "TypeError: str expected, not NoneType" and aborts
# the cell with a confusing message. Report what is missing instead.
_missing_keys = [
    name
    for name, value in (
        ("PINECONE_API_KEY", PINECONE_API_KEY),
        ("OPENAI_API_KEY", OPENAI_API_KEY),
        ("HUGGINGFACEHUB_API_TOKEN", HUGGINGFACEHUB_API_TOKEN),
        ("COHERE_API_KEY", COHERE_API_KEY),
    )
    if value is None
]
if _missing_keys:
    warnings.warn(
        "Environment variables not set: " + ", ".join(_missing_keys)
        + ". Cells that need them will fail; add them to your .env file."
    )


In [84]:
from pinecone import Pinecone
# Key read from the environment in the previous cell.
pinecone_api = PINECONE_API_KEY

# Pinecone client used below to check for, create and open the index.
pc = Pinecone(api_key=pinecone_api)


In [24]:
from pinecone import ServerlessSpec

index_name = "medical-chatbot"
# Create the index only when it does not exist yet, so re-running this cell
# does not attempt a second creation.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        # NOTE(review): must equal the vector size produced by the embedding
        # model (all-MiniLM-L6-v2 is expected to emit 384-dim vectors - confirm).
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        )
    )
# Handle to the index for direct Pinecone operations.
index = pc.Index(index_name)

In [25]:
from langchain_pinecone import PineconeVectorStore

# Uploading is not idempotent: no ids are supplied, so every run stores the
# chunks again as new vectors and a re-run would duplicate the whole corpus.
# Embed and upload only while the index is still empty; otherwise just attach
# to what is already there.
# NOTE: index stats can lag briefly behind recent writes, and a partially
# filled index counts as "not empty" - clear it manually to force a re-upload.
if index.describe_index_stats().total_vector_count == 0:
    doc_search = PineconeVectorStore.from_documents(
        documents=texts_chunks,
        index_name=index_name,
        embedding=embedding,
    )
else:
    doc_search = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embedding,
    )

In [26]:
# If the index already exists and is populated, attach to it instead of
# embedding and uploading the documents again.
from langchain_pinecone import PineconeVectorStore
doc_search = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embedding
)

In [27]:
# Example: add further documents to the existing index.
# NOTE(review): this stores a placeholder sentence in the medical index, and
# each run of the cell appears to add another copy (a new id is returned) -
# confirm whether this demo document should stay in the index.
dswith = Document(
    page_content="This is a new document to be added to the index.",
    metadata={"source": "new_document.pdf"}
)
doc_search.add_documents(documents=[dswith])

['5b37ff50-0e33-425d-8277-0b52a639e5db']

In [50]:
# Similarity-search retriever over the vector store; "k": 3 asks for three results per query.
retriever = doc_search.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [51]:
# Smoke-test the retriever on a sample question and display the returned chunks.
retrived_docs = retriever.invoke("What is the treatment for diabetes?")
retrived_docs

[Document(id='94bfe345-0bf1-4274-882b-c0381cdab4d3', metadata={'source': 'data\\Medical_book.pdf'}, page_content='with a physician or pharmacist before combining tri-\ncyclic antidepressants with any other prescription or non-\nprescription (over-the-counter) medicine.\nNancy Ross-Flanigan\nAntidiabetic drugs\nDefinition\nAntidiabetic drugs are medicines that help control\nblood sugar levels in people with diabetes mellitus\n(sugar diabetes).\nPurpose\nDiabetes may be divided into type I and type II, for-\nmerly termed juvenile onset or insulin-dependent, and\nGALE ENCYCLOPEDIA OF MEDICINE 2 261\nAntidiabetic drugs'),
 Document(id='e3104559-9982-40f3-aebb-225c41ad287c', metadata={'source': 'data\\Medical_book.pdf'}, page_content='Antidiabetic drugs\nGEM - 0001 to 0432 - A  10/22/03 1:42 PM  Page 261'),
 Document(id='2090d8db-b8b1-4e0e-84ac-834ff4bf3007', metadata={'source': 'data\\Medical_book.pdf'}, page_content='National Diabetes Information Clearinghouse. 1 Information\nWay, Bethesd

In [85]:
!pip install langchain-cohere

Collecting langchain-cohere
  Downloading langchain_cohere-0.4.5-py3-none-any.whl.metadata (6.6 kB)
Collecting cohere<6.0,>=5.12.0 (from langchain-cohere)
  Downloading cohere-5.17.0-py3-none-any.whl.metadata (3.4 kB)
Collecting types-pyyaml<7.0.0.0,>=6.0.12.20240917 (from langchain-cohere)
  Downloading types_pyyaml-6.0.12.20250809-py3-none-any.whl.metadata (1.7 kB)
Collecting fastavro<2.0.0,>=1.9.4 (from cohere<6.0,>=5.12.0->langchain-cohere)
  Downloading fastavro-1.12.0-cp310-cp310-win_amd64.whl.metadata (5.7 kB)
Collecting httpx-sse==0.4.0 (from cohere<6.0,>=5.12.0->langchain-cohere)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting types-requests<3.0.0,>=2.0.0 (from cohere<6.0,>=5.12.0->langchain-cohere)
  Downloading types_requests-2.32.4.20250809-py3-none-any.whl.metadata (2.0 kB)
Downloading langchain_cohere-0.4.5-py3-none-any.whl (42 kB)
Downloading cohere-5.17.0-py3-none-any.whl (295 kB)
Downloading httpx_sse-0.4.0-py3-none-any.whl (7.8 kB)
Download

In [None]:
from langchain_cohere import ChatCohere
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# temperature=0 keeps the answers as repeatable as the model allows.
llm = ChatCohere(model="command-r", temperature=0)

# Adjacent string literals are concatenated verbatim, so each sentence must
# carry its own trailing space; without it the prompt reads "tasks.Use ..." and
# "answer.Use ...".
system_prompt = (
    "You are a medical assistant for question answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, just say that you don't know, don't try to make up an answer. "
    "Use three sentences and keep the answer concise."
    "\n\n"
    "{context}"
)

# {context} is filled with the retrieved chunks, {input} with the user question.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

# Stuff the retrieved documents into the prompt, then put the retriever in
# front so that a single invoke() both retrieves and answers.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [89]:
# Ask a sample question; the chain's result is a mapping and the generated text is under "answer".
response = rag_chain.invoke({"input": "What is the Acne?"})
print(response['answer'])

Acne is a skin disease characterized by pimples, usually on the face, chest, and back. It occurs when skin pores become blocked with oil, dead skin cells, and bacteria. This disease is known as a common and treatable condition, affecting nearly 17 million people in the United States.
