In [58]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [59]:
def load_pdf_files(data):
    """Load the PDF files in a directory as LangChain documents.

    Args:
        data: Directory path handed to ``DirectoryLoader``; files are
            selected with the ``*.pdf`` glob.

    Returns:
        The result of ``DirectoryLoader.load()``, with each matched file
        read through ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [60]:
# Load the PDFs from the relative "data" directory.
extracted_data = load_pdf_files("data")

In [61]:
from typing import List
from langchain_core.documents import Document
def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """Return copies of ``docs`` whose metadata holds only the ``source`` key.

    Args:
        docs: Documents to slim down.

    Returns:
        A new list, in input order, of ``Document`` objects carrying the same
        ``page_content`` and ``metadata={"source": ...}``. The source value is
        ``None`` when the input metadata has no ``"source"`` entry.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

In [64]:
# Keep only the text and the "source" metadata key of each loaded document.
minimal_docs = filter_to_minimal_docs(extracted_data)

In [65]:
def text_split(minimal_docs, chunk_size=500, chunk_overlap=200):
    """Split documents into overlapping chunks with a recursive character splitter.

    Args:
        minimal_docs: Documents to split; passed to ``split_documents``.
        chunk_size: Target chunk size, measured with ``len``. Defaults to 500,
            the value this notebook has always used.
        chunk_overlap: Amount of overlap between neighbouring chunks, in the
            same units as ``chunk_size``. Defaults to 200.

    Returns:
        The chunks returned by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return splitter.split_documents(minimal_docs)

In [66]:
# Chunk the slimmed-down documents; these chunks are what gets indexed later in the notebook.
text_chunks = text_split(minimal_docs)

In [67]:
from langchain_community.embeddings import HuggingFaceEmbeddings
def download_embeddings():
    """Build the HuggingFace embedding model used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

# Instantiate the embedding model once; the vector-store cells below reuse `embedding`.
embedding = download_embeddings()

In [68]:
from dotenv import load_dotenv
import os
# Load variables from a .env file into the process environment so the
# API keys can be read with os.getenv in the next cell.
load_dotenv()

True

In [69]:
# Validate the keys up front. Assigning a missing (None) value into
# os.environ fails with an opaque "TypeError: str expected, not NoneType",
# so report every missing or empty variable by name instead.
_REQUIRED_ENV_VARS = ("PINECONE_API_KEY", "GOOGLE_API_KEY")
_missing_env_vars = [name for name in _REQUIRED_ENV_VARS if not os.getenv(name)]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_env_vars)
        + ". Define them in your .env file or shell before running this notebook."
    )

# The values are already present in os.environ (that is where they are read
# from), so they do not need to be written back.
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"]

In [70]:
from pinecone import Pinecone
# Lower-case alias of PINECONE_API_KEY, used only to build the client below.
pinecone_api_key = PINECONE_API_KEY
# Client handle used by the next cell to check for and create the index.
pc = Pinecone(api_key=pinecone_api_key)

In [71]:
from pinecone import ServerlessSpec
index_name = "medical-chatbot"
# Create the index only when it does not exist yet, so re-running this cell
# does not attempt a second create.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        # NOTE(review): this must equal the embedding model's vector size.
        # all-MiniLM-L6-v2 is documented as 384-dimensional — update this
        # value if the model in download_embeddings changes.
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

In [75]:
from langchain_pinecone import PineconeVectorStore

# Embed each chunk and upsert the embeddings into the Pinecone index.
# NOTE(review): no ids are passed here, so re-running this cell presumably
# uploads the same chunks again — confirm before re-executing.
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    embedding=embedding,
    index_name=index_name
)

In [79]:
from langchain_pinecone import PineconeVectorStore
# Attach to the index by name instead of uploading: no documents are passed
# to this call. This rebinds `docsearch`, replacing the store object created
# by the from_documents call earlier in the notebook.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embedding
)

In [80]:
# Build one extra document, separate from the PDF-derived chunks, to add to the index.
# NOTE(review): the text names a person — confirm it is acceptable to store
# and share this before committing the notebook.
dswith = Document(
    page_content="Ajay pratap singh is a good boy.",
    metadata={"source": "personnel_notes.txt"}
)

In [81]:
# Add the extra document to the store; the call returns the list of ids it
# assigned (rendered as this cell's output).
# NOTE(review): presumably each re-run inserts another copy under a new id — confirm.
docsearch.add_documents(documents=[dswith])

['83350c44-659c-4103-b136-2d0da18c01d5']

In [82]:
# Similarity-search retriever over the store; k=3 requests three results per query.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [83]:
# Smoke-test retrieval with a sample medical question.
retrieved_docs = retriever.invoke("What is Acne?")
# Bare last expression so the notebook renders the retrieved documents.
retrieved_docs

[Document(id='7df450c1-5620-4be6-8131-0364b1d262a7', metadata={'source': 'data/Medical_book.pdf'}, page_content='Acne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when the\npores of the skin become clogged with oil, dead skin\ncells, and bacteria.\nDescription\nAcne vulgaris, the medical term for common acne, is\nthe most common skin disease. It affects nearly 17 million\npeople in the United States. While acne can arise at any\nage, it usually begins at puberty and worsens during ado-\nlescence. Nearly 85% of people develop acne at some time'),
 Document(id='641fab5c-7ae5-49af-849d-e32b5d95892a', metadata={'source': 'data/Medical_book.pdf'}, page_content='Acne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed. (Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 25'),
