In [None]:


from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
def load_pdf_files(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory to scan. Files are selected with the
            ``*.pdf`` glob.

    Returns:
        The result of ``DirectoryLoader.load()`` when every matched file is
        read with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

import os

# Directory that holds the source PDFs. The default is the location this
# notebook was originally written against; set MEDICAL_DATA_DIR to run the
# notebook on another machine without editing the code.
DATA_DIR = os.environ.get(
    "MEDICAL_DATA_DIR",
    "/Users/prashantagrawal/Prashant/AI/Projects/ Medical Chatbot with LLMs, LangChain, Pinecone, Flask & AWS /-Medical-Chatbot-with-LLMs-LangChain-Pinecone-Flask-AWS-/data",
)

extracted_data = load_pdf_files(DATA_DIR)

# Displayed as the cell output.
extracted_data


In [3]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """
    Build a stripped-down copy of every Document in ``docs``.

    Each returned Document keeps the original ``page_content`` and carries a
    metadata dict with a single ``'source'`` key, whose value is ``None`` when
    the input metadata has no such key. The input documents are not modified.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

    

In [4]:
# Reduce the loaded documents to page content plus a 'source'-only metadata dict.
minimal_docs = filter_to_minimal_docs(extracted_data)

In [None]:
# Display the filtered documents as the cell output.
# NOTE(review): this shows the whole list; consider a slice such as minimal_docs[:3].
minimal_docs




[Document(metadata={'source': 'data/Medical_book.pdf'}, page_content='')]

In [None]:

# Display the filtered documents as the cell output.
# NOTE(review): this shows the whole list; consider a slice such as minimal_docs[:3].
minimal_docs


In [6]:
# Split the documents into smaller chunks
def text_split(minimal_docs, chunk_size=500, chunk_overlap=20):
    """Split documents into smaller, overlapping chunks.

    Args:
        minimal_docs: Documents to split.
        chunk_size: Value forwarded to the splitter as ``chunk_size``.
            Defaults to 500, the value that used to be hard-coded here.
        chunk_overlap: Value forwarded to the splitter as ``chunk_overlap``.
            Defaults to 20, the value that used to be hard-coded here.

    Returns:
        The chunks produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(minimal_docs)

In [7]:
# Chunk the filtered documents and report how many chunks were produced.
texts_chunk = text_split(minimal_docs)
print(f"Number of chunks: {len(texts_chunk)}")

                    

Number of chunks: 5859


In [None]:
# Display the chunks as the cell output.
# NOTE(review): this shows every chunk; consider a slice such as texts_chunk[:3].
texts_chunk


In [9]:
import os
# Point HF_HOME at a folder relative to the current working directory.
# NOTE(review): this is set before the langchain_huggingface import below,
# presumably so the Hugging Face libraries see it when they load. Confirm
# before reordering these lines.
os.environ["HF_HOME"] = "./hf_cache"  # store in local folder

from langchain_huggingface import HuggingFaceEmbeddings

def download_embeddings():
    """Create the sentence-embedding model.

    Returns:
        A ``HuggingFaceEmbeddings`` instance for
        ``sentence-transformers/all-MiniLM-L6-v2``. Only the model name is
        passed; every other setting is left at the class default.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )


embedding = download_embeddings()


In [None]:
from sentence_transformers import SentenceTransformer
from langchain_huggingface import HuggingFaceEmbeddings

# Load the underlying transformer manually; in this cell it is only printed.
st_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Build LangChain's embedding object from the same model name. It receives
# the name only (not `st_model`), and this assignment rebinds `embedding`.
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Inspect the module structure of the sentence-transformers model.
print(st_model)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)


In [None]:
# Embed a sample query and display the resulting vector as a sanity check.
vector = embedding.embed_query("Hello world")
vector

In [None]:
from dotenv import load_dotenv
import os
# Populate the process environment, presumably from a local .env file, so the
# API keys read in the next cell are available.
load_dotenv()

True

In [13]:
def _require_env(name):
    """Return the value of the environment variable ``name``.

    Args:
        name: Name of the environment variable to read.

    Returns:
        The variable's value as a non-empty string.

    Raises:
        RuntimeError: If the variable is unset or empty. Without this check a
            missing key surfaces later as ``TypeError: str expected, not
            NoneType`` when the value is assigned to ``os.environ``.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name} is not set; add it to your .env "
            "file or export it before running this notebook."
        )
    return value


PINECONE_API_KEY = _require_env("PINECONE_API_KEY")
OPENAI_API_KEY = _require_env("OPENAI_API_KEY")

# Kept from the original cell. The values were just read from os.environ, so
# these assignments leave the environment unchanged.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [14]:
from pinecone import Pinecone 
# Alias of PINECONE_API_KEY; both names hold the same key.
pinecone_api_key = PINECONE_API_KEY

# Pinecone client authenticated with that key.
pc = Pinecone(api_key=pinecone_api_key)

In [None]:
# Display the client object to confirm it was created.
pc

<pinecone.pinecone.Pinecone at 0x11fce0b50>

In [15]:
from pinecone import ServerlessSpec 

# Name of the Pinecone index used by the rest of the notebook.
index_name = "medical-chatbot"

# Create the index only when it does not exist yet, so this cell can be
# re-run without calling create_index a second time.
if not pc.has_index(index_name):
    pc.create_index(
        name = index_name,
        dimension=384,  # Dimension of the embeddings
        metric= "cosine",  # Cosine similarity
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )


# Handle to the index, whether it already existed or was just created.
index = pc.Index(index_name)

In [16]:
from langchain_pinecone import PineconeVectorStore

# Build the vector store from the chunks: each chunk is embedded with
# `embedding` and stored in the index named by `index_name`.
# NOTE(review): re-running this cell presumably stores the same chunks again
# under new ids. Confirm before executing it more than once.
docsearch = PineconeVectorStore.from_documents(
    documents=texts_chunk,
    embedding=embedding,
    index_name=index_name
)




In [17]:
from langchain_pinecone import PineconeVectorStore
# Connect to the index that already exists. No documents are passed here;
# `embedding` is supplied for use by the store. This rebinds `docsearch`.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embedding
)

In [18]:
# Retriever over the vector store using similarity search with k=3
# (presumably the number of documents returned per query).
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})


In [19]:
# Try the retriever on a sample question and display what it returns.
retrieved_docs = retriever.invoke("What is Acne?")
retrieved_docs


[Document(id='37bda17f-1ba1-4736-87b2-423139e6c8d3', metadata={'source': '/Users/prashantagrawal/Prashant/AI/Projects/ Medical Chatbot with LLMs, LangChain, Pinecone, Flask & AWS /-Medical-Chatbot-with-LLMs-LangChain-Pinecone-Flask-AWS-/data/Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='a3992f17-6efd-48d5-89a1-c3d42f788ead', metadata={'source': '/Users/prashantagrawal/Prashant/AI/Projects/ Medical Chatbot with LLMs, LangChain, Pinecone, Flask & AWS /-Medical-Chatbot-with-LLMs-LangChain-Pinecone-Flask-AWS-/data/Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed. (Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 25'),
 Document(id='71fcf257-d5f3-4

In [20]:
from langchain_openai import ChatOpenAI

# Chat model used to generate answers; the model name is fixed to "gpt-4o".
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment — confirm.
chatModel = ChatOpenAI(model="gpt-4o")

In [21]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


In [22]:
# System message for the question-answering chain. The adjacent string
# literals are concatenated into one string; "{context}" is a template
# placeholder (presumably filled with the retrieved documents — confirm).
system_prompt = (
    "You are an Medical assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)


# Two-message chat template: the system message above, then the user's
# question through the "{input}" placeholder.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [None]:
from langchain_openai import ChatOpenAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# `chatModel`, `prompt` and `retriever` come from the cells above. This cell
# used to repeat the chat-model and prompt definitions verbatim; they are
# defined once, in their own cells, so the two copies cannot drift apart.

# Chain that passes the retrieved documents and the question to the chat
# model through `prompt`.
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)

# Full RAG pipeline: `retriever` feeds the answering chain.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

# Ask a sample question; the generated text is under the "answer" key.
response = rag_chain.invoke({"input": "what is Acromegaly and gigantism?"})
print(response["answer"])