In [4]:
import os
%pwd
os.chdir("../")

In [5]:
%pwd

'd:\\ML_Projects\\medical-chatbot\\Medical-Chatbot-using-LLM-RAG-langchain-pinecone-AWS'

In [6]:
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [7]:
def load_pdf_file(data):
    loader = DirectoryLoader(
        data,
        glob="*.pdf",
        loader_cls=PyPDFLoader
        )
    documents = loader.load()
    return documents

In [8]:
extracted_data = load_pdf_file("data")

In [9]:
len(extracted_data)

870

In [10]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    minimal_docs: List[Document] = []
    for doc in docs:
        src = doc.metadata.get("source")
        minimal_docs.append(
            Document(
                page_content=doc.page_content,
                metadata={
                    "source": src,
                }
            )
        )
    return minimal_docs
    

In [11]:
minimal_docs = filter_to_minimal_docs(extracted_data)

In [12]:
minimal_docs

[Document(metadata={'source': 'data\\Advances_in_Distributed_Computing_and_Artificial_Intelligence_Journal.pdf'}, page_content='Advances in Distributed Computing\nand Artificial Intelligence Journal\nDiscipline Artificial intelligence, distributed\ncomputing\nLanguage English\nEdited by Juan M. Corchado, Sigeru\nOmatu\nPublication details\nHistory 2012–present\nPublisher Ediciones Universidad de\nSalamanca (Spain)\nFrequency Continuous\nOpen access Yes\nLicense CC BY\nImpact factor 1.7 (2023)\nStandard abbreviations\nISO 4 Adv. Distrib. Comput. Artif.\nIntell. J.\nIndexing\nISSN 2255-2863 (https://www.worldca\nt.org/search?fq=x0:jrnl&q=n2:22\n55-2863) (print)\n2255-2863 (https://www.worldca\nt.org/search?fq=x0:jrnl&q=n2:22\n55-2863) (web)\nOCLC no. 862779541 (https://www.worldc\nat.org/oclc/862779541)\nLinks\nJournal homepage (https://adcaij.usal.es)\nOnline access to volumes (https://revistas.u\nsal.es/cinco/index.php/2255-2863/index)\nGuidelines for authors (https://adcaij.usal.e\ns/

In [13]:
#Split the documents into smaller chunks
def text_split(minimal_docs):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=20,
        length_function=len
    )
    texts_chunks = text_splitter.split_documents(minimal_docs)
    return texts_chunks

In [14]:
texts_chunks = text_split(minimal_docs)

In [15]:
from langchain.embeddings import HuggingFaceEmbeddings
def download_embeddings():
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    return embeddings

In [16]:
embedding = download_embeddings()

  embeddings = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


In [17]:
from dotenv import load_dotenv
import os
load_dotenv(override=True)

True

In [18]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
COHERE_API_KEY = os.getenv("COHERE_API_KEY")

os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACEHUB_API_TOKEN
os.environ["COHERE_API_KEY"] = COHERE_API_KEY


In [19]:
PINECONE_API_KEY

'pcsk_tFpQk_5LhSYVzfwcyn4XGBKs2XCMScdATagKFy9GYzfGwu8vmTfM5uPJPKUvjRdNxnHEn'

In [20]:
from pinecone import Pinecone
pinecone_api = PINECONE_API_KEY

pc = Pinecone(api_key=pinecone_api)


In [21]:
from pinecone import ServerlessSpec

index_name = "ai-chatbot"
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        )
    )
index = pc.Index(index_name)

In [22]:
from langchain_pinecone import PineconeVectorStore
doc_search = PineconeVectorStore.from_documents(
    documents=texts_chunks,
    index_name=index_name,
    embedding=embedding
)

In [23]:
#if i already have an index, i can use the following code to load it
from langchain_pinecone import PineconeVectorStore
doc_search = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embedding
)

In [24]:
#Add more data/documents to the existing index
dswith = Document(
    page_content="This is a new document to be added to the index.",
    metadata={"source": "new_document.pdf"}
)
doc_search.add_documents(documents=[dswith])

['3e281d48-670c-4f26-a2e9-58ba57000868']

In [25]:
retriever = doc_search.as_retriever(
    search_type="similarity",
    search_kwargs={
        "k": 3
    }
)

In [26]:
retrived_docs = retriever.invoke("What is AI?")
retrived_docs

[Document(id='559aad3f-4f20-4d73-884f-b80346600dc0', metadata={'source': 'data\\Hands-On Generative AI with Transformers and Diffusion.pdf'}, page_content="('TITLE  II \\n'\n 'PROHIBITED  ARTIFICIAL  INTELLIGENCE  \nPRACTICES  \\n'\n 'Article 5  \\n'\n '1. The following artificial intelligence"),
 Document(id='e1d376f0-483a-42ae-938f-60087de5139e', metadata={'source': 'data\\Generative_Adversarial_Neural_Networks_and_Deep_Le (1).pdf'}, page_content='results at doing science [2 -6]. AI system s are used as an \neffective mechanism in diverse scientific fields transforming \nconventional research practices and expediting discoveries. \nThe main advantage of AI is that it can outperform humans \nwhen it comes to processing large amounts of data, detecting \npatterns and abnormalities that human experts could never \nhave spotted.  \nFig. 1 demonstrates the integrated liaisons between key \nelements of AI. \nC'),
 Document(id='97198f01-ecf9-4975-814f-ea3dd96150b5', metadata={'source': 'dat

In [85]:
!pip install langchain-cohere

Collecting langchain-cohere
  Downloading langchain_cohere-0.4.5-py3-none-any.whl.metadata (6.6 kB)
Collecting cohere<6.0,>=5.12.0 (from langchain-cohere)
  Downloading cohere-5.17.0-py3-none-any.whl.metadata (3.4 kB)
Collecting types-pyyaml<7.0.0.0,>=6.0.12.20240917 (from langchain-cohere)
  Downloading types_pyyaml-6.0.12.20250809-py3-none-any.whl.metadata (1.7 kB)
Collecting fastavro<2.0.0,>=1.9.4 (from cohere<6.0,>=5.12.0->langchain-cohere)
  Downloading fastavro-1.12.0-cp310-cp310-win_amd64.whl.metadata (5.7 kB)
Collecting httpx-sse==0.4.0 (from cohere<6.0,>=5.12.0->langchain-cohere)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting types-requests<3.0.0,>=2.0.0 (from cohere<6.0,>=5.12.0->langchain-cohere)
  Downloading types_requests-2.32.4.20250809-py3-none-any.whl.metadata (2.0 kB)
Downloading langchain_cohere-0.4.5-py3-none-any.whl (42 kB)
Downloading cohere-5.17.0-py3-none-any.whl (295 kB)
Downloading httpx_sse-0.4.0-py3-none-any.whl (7.8 kB)
Download

In [27]:
from langchain_cohere import ChatCohere
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

llm = ChatCohere(model="command-r", temperature=0)

system_prompt = (
   "You are an medical assistant for question answering tasks."
   "Use the following pieces of retrieved context to answer "
   "the question. If you don't know the answer, just say that you don't know, don't try to make up an answer."
   "Use three sentences and keep the answer concise."
   "\n\n"
   "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [28]:
response = rag_chain.invoke({"input": "What is the AI?"})
print(response['answer'])

AI stands for Artificial Intelligence, which refers to the ability of machines or systems to mimic human intelligence and perform tasks that typically require cognitive functions. It encompasses a wide range of technologies that can process and analyze vast amounts of data, detect patterns, and make decisions or provide recommendations. AI has become an invaluable tool in various scientific fields, aiding researchers in their quest for new discoveries and advancements.
