In [None]:
import os

# Move from the notebook's folder up to its parent directory, but only once
# per kernel session. A bare os.chdir("../") climbs one more level every time
# this cell is re-run, which silently breaks the relative "data" path that
# load_pdf_files("data") relies on later in the notebook.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()  # remembered so re-runs become no-ops
    os.chdir("../")

In [None]:
# Show the current working directory to confirm the chdir in the previous cell took effect.
%pwd

In [None]:
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
def load_pdf_files(data):
    """
    Extract text from the PDF files found under `data`.

    A DirectoryLoader is pointed at `data`, restricted to the "*.pdf" glob and
    told to open each match with PyPDFLoader. Whatever `load()` returns is
    handed back to the caller unchanged.
    """
    return DirectoryLoader(
        data,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
    ).load()

In [None]:
# Load all PDFs from the project's "data" folder (path is relative to the working directory).
extract_text = load_pdf_files("data")

In [None]:
# Preview only the first few loaded documents; displaying the whole list
# would dump every loaded document into the notebook output.
extract_text[:3]

In [None]:
# Number of Document objects returned by the loader.
len(extract_text)

In [None]:
from typing import List

# Works across LangChain versions
try:
    from langchain.schema import Document  # older path
except ImportError:
    from langchain_core.documents import Document  # newer path


def filter_to_minimal_doc(docs: List[Document]) -> List[Document]:
    """
    Strip every Document down to its text plus a small, fixed set of metadata.

    Metadata keys that are kept (each is omitted when its value is None):
      - 'source'
      - 'author'
      - 'page'  (read from the first key that is present among 'page',
                 'page_number', or 'page_number' nested inside a 'loc' dict)

    Returns a new list of newly built Document objects, in input order.
    """
    def _page_of(meta):
        # The first key that exists wins, even if the value stored under it is None.
        if "page" in meta:
            return meta["page"]
        if "page_number" in meta:
            return meta["page_number"]
        loc = meta.get("loc")
        if isinstance(loc, dict) and "page_number" in loc:
            return loc["page_number"]
        return None

    slimmed: List[Document] = []
    for original in docs:
        # Treat missing/empty metadata as an empty dict.
        meta = original.metadata or {}
        wanted = {
            "source": meta.get("source"),
            "author": meta.get("author"),
            "page": _page_of(meta),
        }
        # Leave out any field that could not be found.
        kept = {key: value for key, value in wanted.items() if value is not None}
        slimmed.append(Document(page_content=original.page_content, metadata=kept))
    return slimmed

In [None]:
# Reduce each loaded document to page_content plus source/author/page metadata.
minimal_docs = filter_to_minimal_doc(extract_text)

In [None]:
# Preview a few slimmed-down documents instead of dumping the entire list
# into the notebook output.
minimal_docs[:3]

In [None]:
def text_split(minimal_docs, chunk_size=500, chunk_overlap=20):
    """
    Split the documents into smaller chunks.

    Args:
        minimal_docs: Documents to split; passed straight to
            `RecursiveCharacterTextSplitter.split_documents`.
        chunk_size: `chunk_size` given to the splitter. Defaults to 500,
            the value that used to be hard-coded here.
        chunk_overlap: `chunk_overlap` given to the splitter. Defaults to
            20, the value that used to be hard-coded here.

    Returns:
        The chunks produced by `split_documents`.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(minimal_docs)

In [None]:
# Chunk the slimmed-down documents; these chunks are what gets embedded and indexed below.
texts_chunk = text_split(minimal_docs)

In [None]:
# Number of chunks produced by the splitter.
len(texts_chunk)

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    """
    Build and return the HuggingFaceEmbeddings object for the
    "sentence-transformers/all-MiniLM-L6-v2" model.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
    )

# Embedding model instance shared by the vector-store cells below.
embedding = download_embeddings()


In [None]:
# Display the embedding object's repr to inspect its configuration.
embedding

In [None]:
# Sanity check: embed a short sample string.
vector = embedding.embed_query("Hello world")

In [None]:
# Show only the first few components; the full embedding is a long list of
# floats that adds nothing to the narrative.
vector[:5]

In [None]:
# Embedding dimensionality; it should match the `dimension` (384) used when the
# Pinecone index is created later in this notebook.
print(len(vector))

In [None]:
from dotenv import load_dotenv
import os
# Load environment variables via python-dotenv so the API keys read in the
# next cell can be picked up with os.getenv.
load_dotenv()

In [None]:
def _require_env(name: str) -> str:
    """
    Return the value of environment variable `name`.

    Raises:
        RuntimeError: if the variable is unset or empty. Without this check
            a missing key would reach `os.environ[name] = None` below and
            fail with an unhelpful "str expected, not NoneType" TypeError.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name} is not set. "
            "Add it to your .env file or export it before running this cell."
        )
    return value


PINECONE_API_KEY = _require_env("PINECONE_API_KEY")
OPENAI_API_KEY = _require_env("OPENAI_API_KEY")

# Write the keys back into the process environment, as the original cell did.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [None]:
from pinecone import Pinecone
# Local alias for the key that was read from the environment in the previous cell.
pinecone_api_key = PINECONE_API_KEY

# Pinecone client; used below to check for and create the index.
pc = Pinecone(api_key=pinecone_api_key)

In [None]:
# Display the client object to confirm it was constructed.
pc

In [None]:
from pinecone import ServerlessSpec

# Name of the Pinecone index used by every vector-store cell below.
index_name = "medical-chatbot"

# Create the index only if it does not exist yet, so re-running this cell is safe.
if not pc.has_index(index_name):
    pc.create_index(
        name = index_name,
        dimension = 384,# Must equal the embedding length (see the len(vector) check earlier)
        metric="cosine", # Rank matches by cosine similarity
        spec = ServerlessSpec(cloud="aws", region="us-east-1")
    )

# Handle to the (new or pre-existing) index.
index = pc.Index(index_name)

In [None]:
from langchain_pinecone import PineconeVectorStore

# Embed each chunk and upsert it into the Pinecone index, but only when the
# index is still empty. Calling from_documents unconditionally re-embeds and
# re-uploads every chunk each time the notebook is re-run, which duplicates
# vectors in the index and wastes embedding time.
# NOTE(review): index stats may lag briefly behind a fresh upsert, so an
# immediate re-run could still see a count of 0 — confirm if that matters.
index_stats = index.describe_index_stats()
if index_stats.total_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(
        documents=texts_chunk,
        embedding=embedding,
        index_name=index_name,
    )
else:
    # Index already populated: just connect to it.
    docsearch = PineconeVectorStore.from_existing_index(
        embedding=embedding,
        index_name=index_name,
    )

In [None]:
# Load the existing index

from langchain_pinecone import PineconeVectorStore
# Connect to the index by name; no documents are passed here, only the
# embedding model and the index name.
docsearch = PineconeVectorStore.from_existing_index(
    embedding = embedding,
    index_name = index_name,
)


Add more data to the existing Pinecone index

In [None]:
# A single hand-written Document used to demonstrate adding data to the existing index.
# It carries only a 'source' metadata field (no author or page).
dswith = Document(
    page_content="This is a new document to add to the existing index.", 
    metadata={"source": "YouTube"}
)

In [None]:
# Upsert the extra document under a fixed id. Without an explicit id every
# re-run of this cell would insert another copy of the same document; with a
# stable id the existing record is overwritten instead.
docsearch.add_documents([dswith], ids=["manual-note-youtube-001"])

In [None]:
# Similarity-search retriever configured with k=3.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [None]:
# Smoke-test the retriever on a sample question and display what it returns.
retrieved_docs = retriever.invoke("What is Ayurveda?")
retrieved_docs

In [None]:
from langchain_openai import ChatOpenAI

# Chat model that generates the final answers in the RAG chain below.
# No API key is passed explicitly here.
# NOTE(review): presumably ChatOpenAI picks up OPENAI_API_KEY from the environment — confirm.
chatModel = ChatOpenAI(model_name="gpt-4o")

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [None]:
# System prompt for the RAG chain.
#
# "{context}" is a template variable: when the prompt is formatted, EVERY
# occurrence is replaced with the full text of the retrieved documents. It
# must therefore appear exactly once, in the final "Context:" section.
# The rules refer to it in plain words ("the provided context"); writing
# "{context}" inside them would paste the retrieved documents into the
# middle of each rule and multiply the prompt size.
system_prompt = (
    "**CRITICAL CORE DIRECTIVE:** You **MUST** base your entire answer **exclusively** on the text provided in the context. "
    "You **MUST NOT** use any outside knowledge, personal opinions, or information from your general training. "
    "Every factual statement you make **MUST** be derivable from the provided context."
    "\n\n"
    "You are 'AyuCare,' an empathetic and concise AI guide for Ayurvedic wellness. "
    "Your persona is warm, understanding, and supportive. "
    "Your primary goal is to be an **educational partner**, not a medical authority. "
    "You are an assistant for Ayurvedic **education**."
    "\n\n"
    "**SAFETY GUARDRAILS (NON-NEGOTIABLE):**\n"
    "1. **DO NOT DIAGNOSE:** You must **NEVER** say 'You have...' or 'This sounds like...' You must not diagnose.\n"
    "1b. **DO NOT PRESCRIBE FROM CASES:** When referencing a validated case, you are describing an *educational example*, not giving the user a prescription. Never tell the user 'You should do what this case did.'\n"
    "2. **ALWAYS REFER:** Every single conversation about a health concern **MUST** end with a referral to a certified practitioner.\n"
    "3. **STAY IN SCOPE:** If the user's question cannot be answered using *only* the provided context, you **MUST** state that you do not have that specific information in your knowledge base. **DO NOT** try to answer from general knowledge.\n"
    "4. **SRI LANKAN CONTEXT:** You can mention local Sri Lankan practices **ONLY IF** they are explicitly mentioned in the retrieved context."
    "\n\n"
    "**Your Interaction Flow:**\n"
    "1. **Be Concise:** Your initial answers should be brief and clear (2-3 sentences).\n"
    "2. **Acknowledge & Clarify:** When a user expresses a health concern (e.g., 'I have a headache'), be empathetic. (Empathy does not require a citation). Ask gentle, clarifying questions to understand their context.\n"
    "3. **Go Deeper on Request:** If the user asks for more detail (e.g., 'tell me more,' 'why?'), use the retrieved context to provide a more comprehensive explanation, citing as you go.\n"
    "4. **Cite Your Factual Information:** You **MUST** cite the source for any specific factual claim, principle, recipe, or quote you provide using information from the context's metadata. "
    "Conversational phrases do not require a citation. "
    "The citation **MUST** follow this exact format: "
    "**(Source: [Source Title], Author: [Author Name], Page: [Page Number])**. "
    "If the author or page is not available in the metadata, omit that part from the citation.\n"
    "5. **Use Validated Cases as Examples:** If the retrieved context includes an anonymized consultation case where the metadata field 'validated' is True, you can use it as a *general example* to illustrate an Ayurvedic principle. **DO NOT** present this as direct advice to the user. Cite it appropriately (e.g., '(Source: Validated Consultation Case)').\n"
    "6. **Suggest *Potential* Actions (Safely):** You can suggest general, low-risk, traditional actions *only if* they are explicitly mentioned in the retrieved context. This is educational advice, **NOT** a prescription. Ensure you cite the source for the suggestion as per rule 4.\n"
    "7. **THE CRITICAL RULE:** **Immediately after** suggesting any action or providing information related to a health concern, you **MUST** state in a new paragraph that this is **NOT** medical diagnosis, only educational advice, and that they **MUST** speak to a certified practitioner for a proper diagnosis.\n\n"
    "This is used in a medical context, so safety is paramount. "
    "Use the following pieces of retrieved context to answer the question. If you don't know the answer based *only* on the context, say that you "
    "don't know. Keep your answer concise."
    "\n\n"
    # The single slot where the retrieved documents are inserted.
    "Context:\n{context}"
)



# Chat prompt: the system instructions followed by the user's question,
# which is supplied through the "{input}" variable.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [None]:
# Combine the chat model and prompt into a documents chain, then put the retriever in front of it.
# NOTE(review): no `document_prompt` is passed to create_stuff_documents_chain, while the
# system prompt asks the model to cite source/author/page from the documents' metadata.
# Confirm that the default document formatting actually exposes that metadata to the model.
question_answer_chain = create_stuff_documents_chain(chatModel,prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [None]:
# Example health-concern query; the generated text is read from the 'answer' key of the response.
response = rag_chain.invoke({"input": "My ankle is swollen what do i do?"})
print(response['answer'])

In [None]:
# Example general-knowledge query. The question is spelled out in full
# ("principles" rather than the truncated "princ") because the text is used
# for retrieval as well as being shown to the model.
response = rag_chain.invoke({"input": "What are the main Ayurveda principles to live a healthy life?"})
print(response['answer'])