In [20]:
from langchain.document_loaders import PyPDFLoader
from langchain_chroma import Chroma
from langchain.schema import Document
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
from langchain_core.prompts import PromptTemplate
import os


In [113]:

load_dotenv()


True

In [63]:
HF_KEY = os.getenv("HUGGINGFACEHUB_ACCESS_TOKEN")

In [22]:
#1. Load PDF document
loader = PyPDFLoader('/Users/anushkabansal/Desktop/RAG Chatbot Project/The_GALE_ENCYCLOPEDIA_of_MEDICINE_SECOND.pdf')
docs = loader.load()

In [23]:
#2. Split text into smaller chunks
splitter = RecursiveCharacterTextSplitter(
    chunk_size=900,
    chunk_overlap=50,
)
chunks = splitter.split_documents(docs)

In [25]:
#3. Create embeddings and store in Chroma vector database
embedder = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

vector_store = Chroma(
    embedding_function=embedder,
    persist_directory='my_chroma_db_files1',
    collection_name='sample'
)

vector_store.add_documents(chunks)

['73b57971-ae1a-41f5-a007-a046cb338396',
 '7c59aa6c-d96b-42e9-bb8b-fe1f35612bf6',
 'caad9b2d-034b-4fca-bf2e-b55701dc2f65',
 'fa98af0e-04a6-4600-8623-8a6717bc0eb8',
 '9fc1de5e-9788-4d45-b008-1d7cf713f583',
 '7982ee0a-aaa6-4e9c-a210-17ee5298db63',
 '1ffbacb1-eda6-4fef-b2ec-f7e02dfd101a',
 '6c4354e7-84c0-4347-b0cc-e706016c3d0e',
 '9481adbe-7f3d-4069-a832-3d2b142398fd',
 '1a9e1977-b5a4-48bd-acd1-f4076c37f2be',
 '74c60610-37da-410f-bd3e-2a7b2d0245d3',
 'dda218cc-b0ec-4fba-97eb-51e4e1c3d06a',
 '608eb353-14b2-4c31-8114-46f92502e1e3',
 'ba11af3b-55e8-4db4-b099-7bc6044eef83',
 'ac4b317c-d0ce-4240-a530-8d760b6fbcd0',
 '63072684-022e-440c-a94d-4e25558fa83c',
 '4d8ca04d-524f-4ec5-84a3-5cce0149c252',
 'f427e16b-5728-4d26-aea8-709616498f71',
 'fc166d55-2e75-487c-9ae2-cdb8170c121c',
 '59c181c4-72cf-474f-86ae-b317217ddac5',
 '7eb0df65-7f78-4ad6-a8d9-002af621fec9',
 'd9a90caf-9395-4f1a-bd93-a4ab2cba90ae',
 '98d2ac93-17fa-44b6-930b-adb5ee50765d',
 '30a71070-dd44-4600-b15f-6c3c7ce7b3bb',
 '57d74ad4-9488-

In [26]:
#4. Setup retriever with Multiquery

retriever = vector_store.as_retriever(
    search_type="mmr",                   # <-- This enables MMR
    search_kwargs={"k": 3, "lambda_mult": 0.75}  # k = top results, lambda_mult = relevance-diversity balance
)

In [None]:
#5. Creating a promt template

prompt = PromptTemplate(
    template="""
      You are a helpful assistant.
      Answer ONLY from the provided transcript context.
      If the context is insufficient, just say you don't know.

      {context}
      Question: {question}
    """,
    input_variables = ['context', 'question']
)

In [96]:
question="what is cancer?"
retrieved_docs    = retriever.invoke(question)

context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
context_text

'In a healthy individual, the immune system can rec-\nognize the neoplastic cells and destroy them before they\nget a chance to divide. However, some mutant cells may\nescape immune detection and survive to become tumors\nor cancers.\nTumors are of two types, benign or malignant. A\nbenign tumor is not considered cancer. It is slow growing,\ndoes not spread or invade surrounding tissue, and once it\nis removed, it doesn’t usually recur. A malignant tumor,\non the other hand, is cancer. It invades surrounding tissue\nand spreads to other parts of the body. If the cancer cells\nhave spread to the surrounding tissues, then, even after\nthe malignant tumor is removed, it generally recurs.\nA majority of cancers are caused by changes in the\ncell’s DNA because of damage due to the environment.\nEnvironmental factors that are responsible for causing\nthe initial mutation in the DNA are called carcinogens,\n\nCancer, by definition, is a disease of the genes. A\ngene is a small part of DNA, wh

In [97]:
final_prompt = prompt.invoke({"context": context_text, "question": question})

In [98]:
type(final_prompt)

langchain_core.prompt_values.StringPromptValue

In [114]:
#5. Setup LLM

from langchain_groq import ChatGroq

GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

llm = ChatGroq(
    model="llama-3.1-8b-instant",
    api_key=os.getenv("GROQ_API_KEY"),
    temperature=0.7,
    max_tokens=500
)

In [116]:
answer = llm.invoke("what is cancer")
print(answer.content)

Cancer is a group of diseases characterized by the uncontrolled growth and spread of abnormal cells in the body. It occurs when genetic mutations or changes occur in the DNA of cells, leading to their uncontrolled division and multiplication. Cancer cells can invade and destroy surrounding tissues, disrupt normal bodily functions, and eventually lead to death.

**Types of Cancer:**

There are over 100 different types of cancer, including:

1. **Carcinomas**: The most common type of cancer, which begins in epithelial cells (the lining of organs and glands).
2. **Sarcomas**: Cancer that begins in connective tissue, such as bone, cartilage, fat, or muscle.
3. **Leukemias**: Cancer of the blood and bone marrow.
4. **Lymphomas**: Cancer of the immune system, including Hodgkin's lymphoma and non-Hodgkin's lymphoma.
5. **Brain and spinal cord tumors**: Cancer that begins in the brain or spinal cord.

**Causes and Risk Factors:**

While the exact causes of cancer are still not fully understood

In [117]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

In [118]:
def format_docs(retrieved_docs):
  context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
  return context_text

In [119]:
parallel_chain = RunnableParallel({
    'context': retriever | RunnableLambda(format_docs),
    'question': RunnablePassthrough()
})

In [120]:
parser = StrOutputParser()

In [121]:
main_chain = parallel_chain | prompt | llm | parser

In [125]:
main_chain.invoke('what are the side effects of caffieine?')

'At recommended doses, caffeine can cause the following side effects: \n- restlessness\n- irritability\n- nervousness\n- shakiness\n- headache\n- light-headedness\n- sleeplessness\n- nausea\n- vomiting\n- upset stomach.\n\nAt higher than recommended doses, caffeine can cause more severe side effects, including:\n- excitement\n- agitation\n- anxiety\n- confusion\n- a sensation of light flashing before the eyes\n- unusual sensitivity to touch\n- unusual sensitivity of other senses\n- ringing in the ears\n- frequent urination\n- muscle twitches or tremors\n- heart arrhythmias\n- rapid heartbeat\n- flushing\n- convulsions.'