In [20]:
import os
from langchain.document_loaders import PyPDFLoader, DirectoryLoader  # Loads documents from PDFs and directories
from langchain.text_splitter import RecursiveCharacterTextSplitter  # Splits text into smaller chunks for processing
from langchain.embeddings import HuggingFaceEmbeddings  # Embeds text into vectors
from langchain_google_genai import ChatGoogleGenerativeAI # Generates text
from langchain.chains import create_retrieval_chain # Creates a chain of processing steps
from langchain.chains.combine_documents import create_stuff_documents_chain # Creates a chain of processing steps
from langchain_core.prompts import ChatPromptTemplate # Creates a chain of processing steps

In [2]:
# Function to extract text data from all PDF files in a specified directory
def load_pdf_file(data):
    # Initialize a DirectoryLoader to scan the directory for PDF files
    loader = DirectoryLoader(
        data,               # Path to the directory containing PDF files
        glob="*.pdf",       # Pattern to match only PDF files
        loader_cls=PyPDFLoader  # Specify PyPDFLoader as the file loader
    )

    # Load all matching PDF files and extract their contents as documents
    documents = loader.load()

    return documents  # Return the extracted documents




In [3]:
extracted_data=load_pdf_file(data='/Users/sylviabhoke/Downloads/personal_repos folder/Gen-Ai-medical-chatbot/data')

In [4]:


# Function to split extracted text data into smaller chunks
def text_split(extracted_data):
    
    
    # Initialize a RecursiveCharacterTextSplitter to split text into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,  # Maximum size of each chunk (in characters)
        chunk_overlap=20  # Overlap between chunks to maintain context
    )
    
    # Split the extracted documents into text chunks
    text_chunks = text_splitter.split_documents(extracted_data)

    return text_chunks  # Return the list of text chunks

# Call the text_split function to process extracted data
text_chunks = text_split(extracted_data)

# Print the total number of text chunks generated
print("Length of Text Chunks:", len(text_chunks))


Length of Text Chunks: 5860


In [5]:


# Function to download pre-trained embeddings from Hugging Face
def download_hugging_face_embeddings():
    """
    Downloads the 'all-MiniLM-L6-v2' sentence-transformer model from Hugging Face 
    and initializes it for generating text embeddings.

    Returns:
        embeddings: An instance of HuggingFaceEmbeddings to generate vector embeddings.
    """

    # Load the sentence transformer model from Hugging Face for embedding generation
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

    return embeddings  # Return the initialized embeddings model

# Call the function to download and initialize the embeddings model
embeddings = download_hugging_face_embeddings()


  embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
  from .autonotebook import tqdm as notebook_tqdm


In [21]:
PINECONE_API_KEY=os.environ.get('PINECONE_API_KEY')
GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY')

In [22]:
from dotenv import load_dotenv
load_dotenv()

True

In [None]:

from pinecone import Pinecone,ServerlessSpec
import os  # Import os for handling environment variables

# Initialize the Pinecone client using the API key (ensure it's stored securely)
pc = Pinecone(api_key=PINECONE_API_KEY)

# Define the name of the index to be created
index_name = "medicalbot"

# Create a new Pinecone index with the specified configuration
pc.create_index(
    name=index_name,   # Name of the index
    dimension=384,     # Dimensionality of the vector embeddings (should match the model's output dimension)
    metric="cosine",   # Similarity metric to use (options: "cosine", "euclidean", "dotproduct")
    spec=ServerlessSpec(
        cloud="aws",    # Cloud provider where the index will be hosted
        region="us-east-1"  # Specific region for hosting the index
    ) 
)


In [10]:
# Import the PineconeVectorStore module from LangChain's Pinecone integration
from langchain_pinecone import PineconeVectorStore  

# Embed each text chunk and upsert the embeddings into the specified Pinecone index
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,  # The list of text chunks to be embedded
    index_name=index_name,  # The name of the Pinecone index where embeddings will be stored
    embedding=embeddings,   # The embedding function/model used to generate vector representations
)


In [12]:
 

# Load an existing Pinecone index instead of creating a new one
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,  # The name of the pre-existing Pinecone index to connect to
    embedding=embeddings    # The embedding model/function used for querying the stored vectors
)

# Display the loaded Pinecone index object
docsearch  


<langchain_pinecone.vectorstores.PineconeVectorStore at 0x116c06fe0>

In [13]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [14]:
#testing the retriever
retrieved_docs = retriever.invoke("What is Acne?")
retrieved_docs

[Document(id='fa393e7b-98d3-4246-bfe4-29d2a7129c85', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 'page_label': '40', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': '/Users/sylviabhoke/Downloads/personal_repos folder/Gen-Ai-medical-chatbot/data/Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='c72fbe4d-780e-4a8d-b1a1-4f2792e607c3', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 'page_label': '40', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': '/Users/sylviabhoke/Downloads/personal_repos folder/Gen-Ai-medical-chatbot/data/Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='80b3fccf-7c7e-42a9-a74

In [35]:
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",  # Use correct model name
    google_api_key=os.getenv("GEMINI_API_KEY"),  # Get key from .env
    temperature=0.4,
    max_output_tokens=500,
    transport="rest"  # <-- THIS is the magic fix!
)

In [36]:
# Create a prompt template for the chatbot
# The template includes a system prompt for providing context to the model
# and a human prompt for accepting user input

system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)


prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [37]:
# Create a question-answering chain using the provided language model (LLM) and prompt.
question_answer_chain = create_stuff_documents_chain(
    llm,   # The language model used for generating responses
    prompt # The prompt that guides how the model should process retrieved documents
)

# Create a Retrieval-Augmented Generation (RAG) chain
rag_chain = create_retrieval_chain(
    retriever,             # The retriever that fetches relevant documents from the knowledge base
    question_answer_chain  # The Q&A chain responsible for generating responses based on retrieved docs
)


In [38]:
# Invoke the RAG (Retrieval-Augmented Generation) chain with a user query
response = rag_chain.invoke({
    "input": "What is Acromegaly and Gigantism?"  # The query being asked
})

# Extract and print the answer from the response dictionary
print(response["answer"])  # Outputs the generated response based on retrieved documents


Acromegaly is a disorder caused by the abnormal release of a chemical from the pituitary gland.  This leads to increased bone and soft tissue growth, along with other bodily disturbances.  Gigantism is not explicitly defined in the provided text.


In [39]:
response = rag_chain.invoke({"input": "What is stats?"})
print(response["answer"])

I am sorry, but I do not know what "stats" refers to in this context.  More information is needed.
