In [56]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [57]:
#pdf loader
def pdf_loader(data):
    """Load the PDF files matched by ``*.pdf`` under the directory ``data``.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns when each matched file is
        read with ``PyPDFLoader`` (presumably a list of LangChain documents).
    """
    return DirectoryLoader(
        data,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
    ).load()

In [60]:
# Read the PDFs stored in the local "data" directory (path is relative to the
# notebook's working directory).
extracted_data = pdf_loader(data="data")

In [61]:
from typing import List
from langchain_core.documents import Document

def filter_minimal_docs(docs: List[Document]) -> List[Document]:
    """Return slimmed-down copies of ``docs``.

    Each returned ``Document`` keeps the original ``page_content`` and a
    metadata dict holding only the ``"source"`` entry; every other metadata
    key is dropped. A document without a ``"source"`` key gets
    ``{"source": None}``.

    Args:
        docs: Documents to slim down. The inputs are not modified.

    Returns:
        A new list with one ``Document`` per input, in the same order.
    """
    return [
        Document(
            page_content=item.page_content,
            metadata={"source": item.metadata.get("source")},
        )
        for item in docs
    ]

In [62]:
# Keep only page content and the "source" metadata of every loaded document.
minimal_docs = filter_minimal_docs(docs=extracted_data)

In [63]:
# Split the documents into smaller chunks

def text_splits(minimal_docs, chunk_size=500, chunk_overlap=20):
    """Split documents into smaller, overlapping chunks.

    Args:
        minimal_docs: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Maximum chunk size given to the splitter. Defaults to
            500, the value this notebook has always used.
        chunk_overlap: Overlap between neighbouring chunks given to the
            splitter. Defaults to 20.

    Returns:
        The chunks produced by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(minimal_docs)

In [64]:
# Chunk the slimmed-down documents so each piece can be embedded on its own.
texts_chunk = text_splits(minimal_docs=minimal_docs)
# print(f"Number of chunks : {len(texts_chunk)}")

In [65]:
from langchain_community.embeddings import HuggingFaceEmbeddings

def download_embedding():
    """Build and return the HuggingFace embeddings object used by this notebook.

    The model is fixed to ``sentence-transformers/all-MiniLM-L6-v2``.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with that model name.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
    )
    
# Module-level embeddings object shared by the cells below.
embedding = download_embedding()

In [66]:
# Smoke test: embed one sample sentence.
# NOTE(review): the vector length should equal the `dimension` the Pinecone
# index is created with — confirm with len(vector).
vector = embedding.embed_query(
    "Hello How are you doing"
)

In [67]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file (presumably where the API keys live)
# before the os.getenv() lookups that follow.
load_dotenv()

# Read the API keys from the process environment.
# NOTE: the Pinecone key is looked up under "PINE_CONE_API", so that is the
# name the .env file / shell must define.
PINECONE_API_KEY = os.getenv("PINE_CONE_API")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Re-export each key under its canonical variable NAME so client libraries that
# read the environment can find it. The subscript must be the literal name:
# using the secret itself as the key would create an environment variable
# *named after the secret* and never set the intended one.
# os.environ only accepts str values, so a missing key (None) is skipped
# instead of raising TypeError here.
if OPENAI_API_KEY is not None:
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
if PINECONE_API_KEY is not None:
    os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

In [68]:
from pinecone import Pinecone
# Alias kept for any later cell that refers to the lower-case name.
pine_cone_api_key = PINECONE_API_KEY

# Pinecone client used for index management and lookups below.
pc = Pinecone(
    api_key=pine_cone_api_key,
)

In [69]:
from pinecone import ServerlessSpec

index_name = "medibot"

# Create the serverless index only when the account does not already have one
# with this name. The membership test must run against the names reported by
# Pinecone; testing the name against itself is always False, so the index
# would never be created.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Must equal the length of the vectors produced by the embedding model
        # (384 is assumed for all-MiniLM-L6-v2 — confirm with len(vector)).
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Handle to the (existing or just-created) index used by the vector store.
index = pc.Index(index_name)

In [None]:
from langchain_pinecone import PineconeVectorStore

# Wrap the Pinecone index in a LangChain vector store and upload every chunk.
# NOTE(review): no explicit ids are passed, so re-running this cell presumably
# inserts the same chunks again under new ids — run it once per corpus and
# confirm against the index stats.
docsearch = PineconeVectorStore(index=index, embedding=embedding)
docsearch.add_documents(documents=texts_chunk)

In [70]:
#Load existing index 
from langchain_pinecone import PineconeVectorStore
# Attach to the index without uploading anything: no documents are added here,
# the store only wraps `index` and uses `embedding` for queries.
docsearch = PineconeVectorStore(embedding=embedding, index=index)


In [None]:
# Add more data to the existing Pinecone index

# dswith = Document(
#     page_content="hello this is new book",
#     metadata = {"source":"youtube"}
# )

In [None]:
# docsearch.add_documents(documents=[dswith])

['6fad1bbe-d959-4a1c-8760-4d8de64c96e0']

In [71]:
# Similarity retriever configured with k=3 (three results requested per query).
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [72]:
# Sanity-check retrieval on its own before wiring it into the RAG chain.
retriever_docs = retriever.invoke(
    "what is acne?"
)

In [73]:
# from langchain_openai import ChatOpenAI

# chat_model = ChatOpenAI(model="gpt-4.1")
# from langchain_google_genai import ChatGoogleGenerativeAI
# chat_model = ChatGoogleGenerativeAI(
#     model="gemini-1.5-flash",
#     temperature=0,
#     google_api_key=OPENAI_API_KEY  # Get from https://aistudio.google.com/app/apikey
# )

from langchain_community.llms import Ollama
# from langchain_ollama import ChatOllama

# LLM used by the chain, created through the Ollama wrapper with the
# "llama3.2" model.
# NOTE(review): presumably needs a running Ollama server with that model
# pulled — confirm before running.
chat_model = Ollama(
    model="llama3.2",
)

In [74]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [75]:
# System message for the RAG chain. Adjacent string literals are concatenated
# with nothing in between, so every fragment except the last of a sentence run
# ends with an explicit space. "{context}" is the placeholder the
# stuff-documents chain fills with the retrieved text.
system_prompts = (
    "You are a Medical assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

# Two-message chat template: the system instructions (with the {context}
# slot) followed by the user's question supplied as {input}.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompts),
    ("human", "{input}"),
])

In [76]:
# Step 1: chain that runs the LLM with the prompt (the retrieved documents are
# expected to fill the prompt's {context} slot).
question_answer_chain = create_stuff_documents_chain(
    chat_model,
    prompt,
)

# Step 2: combine the retriever with the chain above into the full RAG pipeline.
rag_chain = create_retrieval_chain(
    retriever,
    question_answer_chain,
)

In [77]:
# Ask one question end to end and print only the generated answer text.
response = rag_chain.invoke(
    {"input": "what is Acromegaly and gigantism?"}
)
print(response["answer"])

I can answer that for you. Acromegaly is a disorder characterized by abnormal growth in bone and soft tissue due to an excessive release of a particular chemical from the pituitary gland, leading to various bodily disturbances. This condition results in increased height and other symptoms such as joint pain and hormonal imbalances.
