In [1]:
# Smoke test: confirm the kernel executes code and shows output.
print("hello")

hello


In [2]:
# Show the kernel's current working directory (IPython line magic).
%pwd

'e:\\LLM\\Medical_ChatBot_Project\\research'

In [3]:
import os

# Step up from the notebook folder to the project root so that relative paths
# such as "Data/" resolve. The guard keeps this cell idempotent: an
# unconditional os.chdir("../") climbs one more level on every re-run.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
# Confirm the working directory after the chdir cell.
%pwd

'e:\\LLM\\Medical_ChatBot_Project'

In [5]:
import os

# Silence the Hugging Face Hub symlink warning (helpful on platforms such as
# Windows, where symlinks can cause issues).
os.environ.update(HF_HUB_DISABLE_SYMLINKS_WARNING="1")


In [6]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader # PDF loader to read and extract text from PDF documents
from langchain.text_splitter import RecursiveCharacterTextSplitter  # Text splitter to break documents into smaller chunks for processing


In [7]:
# 📄 Function to extract data from all PDF files in a directory
def load_pdf_file(data):
    """Load the PDF files of a directory into documents.

    Args:
        data: Path to the directory that is scanned with the ``*.pdf`` glob.

    Returns:
        The list of documents returned by ``DirectoryLoader.load()``, with
        ``PyPDFLoader`` used to parse each matched file.
    """
    # Configure the loader and read everything in a single expression
    return DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader).load()


In [8]:
# Load and extract data from all PDF files located in the "Data/" directory
extracted_data = load_pdf_file(data="Data/")

# View number of documents loaded
print(f"Number of documents loaded: {len(extracted_data)}")

# Preview of the first document (guarded: indexing [0] on an empty result
# would raise IndexError when the directory holds no PDFs)
if extracted_data:
    print("\nPreview of the first document:\n")
    print(extracted_data[0])
else:
    print("\nNo documents were loaded. Check that 'Data/' contains PDF files.")

# Safely print the 111th document (index 110), if it exists
if len(extracted_data) > 110:
    print(f"\nFirst 500 characters of the 111th document:\n{extracted_data[110].page_content[:500]}")
else:
    print(f"\nThere are only {len(extracted_data)} documents. Cannot access the 111th document.")

# Metadata of the first document (same empty-result guard as the preview)
if extracted_data:
    print("\nMetadata of the first document:")
    print(extracted_data[0].metadata)



Number of documents loaded: 637

Preview of the first document:

page_content='' metadata={'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'creator': 'PyPDF', 'creationdate': '2004-12-18T17:00:02-05:00', 'moddate': '2004-12-18T16:15:31-06:00', 'source': 'Data\\Medical_book.pdf', 'total_pages': 637, 'page': 0, 'page_label': '1'}

First 500 characters of the 111th document:
• having large blocks of time taken up by alcohol use
• choosing to drink at the expense of other important
tasks or activities
• drinking despite evidence of negative effects on one’s
health, relationships, education, or job
Alcohol abuse requires that one of the following four
criteria is met. Because of drinking, a person repeatedly:
• fails to live up to his or her most important responsibili-
ties
• physically endangers him or herself, or others (for
example, by drinking when driving)
• get

Metadata of the first document:
{'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'creator': 'PyPDF', 'creationdate': '2004-12-18T17:00:02-0

In [9]:
# Split the extracted PDF documents into smaller text chunks
def text_split(data=None, chunk_size=500, chunk_overlap=20):
    """
    Splits input documents into smaller chunks for processing by LLMs.

    Args:
        data (list | None): Documents loaded from PDFs. When omitted, the
            notebook-level ``extracted_data`` is looked up at call time.
        chunk_size (int): Maximum number of characters in a chunk.
        chunk_overlap (int): Number of overlapping characters between chunks.

    Returns:
        List of split text chunks.
    """
    # A default of ``data=extracted_data`` is evaluated once, when the def
    # cell runs: it fails on a fresh kernel if the data is not loaded yet and
    # silently keeps pointing at the old list after the data is reloaded.
    # Resolving the fallback here always picks up the current value.
    if data is None:
        data = extracted_data

    # Initialize the text splitter with the requested chunk size and overlap
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    # Split the documents into text chunks
    return text_splitter.split_documents(data)


In [10]:
# Chunk every loaded document using the default size and overlap settings
text_chunks = text_split(data=extracted_data)

# Report how many chunks the split produced
print(f"Length of Text Chunks is : {len(text_chunks)}")


Length of Text Chunks is : 5859


In [11]:
from langchain_huggingface import HuggingFaceEmbeddings

# Initialize the embedding model from Hugging Face
def download_huggingface_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Create the Hugging Face embedding model used to vectorise text.

    Args:
        model_name (str): Hugging Face model identifier. Defaults to the
            MiniLM model this notebook was written against. If a different
            model is chosen, its vector length must match the ``dimension``
            of the Pinecone index (384 in this notebook).

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with ``model_name``.
    """
    # Extra options such as model_kwargs={"device": "cuda"} can be added here for GPU usage
    embed_model = HuggingFaceEmbeddings(model_name=model_name)
    return embed_model


In [12]:
# Initialize the embedding model (kept in `embed_model` for the vector-store cells below)
embed_model = download_huggingface_embeddings()

# Sanity check: embed the first text chunk to learn the vector size the model produces
embedding_vector = embed_model.embed_query(text_chunks[0].page_content)

# Print the length of the embedding vector; it has to equal the `dimension`
# used when the Pinecone index is created (384)
print(f"Length of embedding vector: {len(embedding_vector)}")


  from .autonotebook import tqdm as notebook_tqdm


Length of embedding vector: 384


In [13]:
# query_result

In [14]:
from pinecone.grpc import PineconeGRPC  # Import Pinecone client for gRPC API
from pinecone import ServerlessSpec     # Import serverless deployment spec for Pinecone
import os                               # Import os to access environment variables

# Retrieve Pinecone API key from environment variable
# Name of the index (must be lowercase and can include hyphens)
index_name = "medical-chatbot"

# Read the Pinecone API key from the environment and build the gRPC client
api_key = os.getenv("PINECONE_API_KEY")
pc = PineconeGRPC(api_key=api_key)

# Create the index only when it is missing, so re-running the cell does not
# attempt to create a duplicate
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,    # vector length; must match the embedding model's output
        metric="cosine",  # similarity metric used for search
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),  # serverless deployment target
    )


In [15]:
# embed each chunk and upsert the embeddings into your Pinecone Index.

from langchain_pinecone import PineconeVectorStore

# Upsert only while the index is still empty. from_documents() embeds and
# inserts every chunk each time it runs, so re-running this cell against an
# already-populated index would add the whole corpus a second time.
# NOTE(review): index statistics may lag briefly behind a fresh upsert.
index_stats = pc.Index(index_name).describe_index_stats()

if index_stats.total_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(
        documents=text_chunks,
        index_name=index_name,
        embedding=embed_model
    )
else:
    # Vectors are already stored: attach to the existing index instead
    docsearch = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embed_model
    )

In [16]:
# Load the existing index
# Attach to the Pinecone index by name, using the same embedding model for queries

docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embed_model
)

In [17]:
# Display the vector store object to confirm it was created
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x1baac0c3940>

In [18]:
# Similarity retriever that returns the 3 closest chunks for each query
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

# Try the retriever on a sample medical question and display the hits
retrieved_docs = retriever.invoke("What is Bronchiectasis")
retrieved_docs

[Document(id='3b145818-8d34-4fb6-a80f-2d73b7210b13', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 609.0, 'page_label': '610', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2596\nBronchiectasis\nGEM -0433 to 0624 - B  10/22/03 6:09 PM  Page 596'),
 Document(id='d1338333-3e60-41c7-bd77-f98a4e80c4b4', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 610.0, 'page_label': '611', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='(Feb. 1995): 149+.\nNicotra, M. Brooke, et al. “Clinical, Pathophysiologic, and\nMicrobiologic Characterization of Bronchiectasis in an\nAging Cohort.” Chest 108 (Oct. 1995): 955+.\nWeinberger, Steven E., and Ann Giudici Fettner. “Disease in\nDisguise: Bronchie

In [19]:
from langchain_mistralai import ChatMistralAI

# Chat model used to generate the answers. Other model names (mistral-small,
# mistral-medium, etc.) can be substituted here.
# NOTE(review): no API key is passed explicitly — presumably it is read from
# the environment; confirm before running on a fresh machine.
llm = ChatMistralAI(model="mistral-tiny", temperature=0.4, max_tokens=500)


In [63]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# Alternative system prompt, disabled by wrapping it in a bare string literal:
# the string is never assigned, so it has no effect on the chain built below.
'''
system_prompt = (
    "You are a highly accurate and rule-following medical assistant.\n"
    "You will ONLY answer questions using the provided context.\n"
    "If the answer is NOT found in the context, reply with exactly:\n"
    "\"I'm sorry, the information is not available.\"\n"
    "Do not paraphrase, do not reword, do not explain further.\n"
    "Do not use prior knowledge.\n"
    "Answer strictly based on the context, using no more than 3 medically accurate sentences.\n\n"
    "Context:\n{context}"
)
'''
# System instructions: answer from the retrieved context only, otherwise
# reply with the fixed refusal sentence. {context} is filled in by the chain.
system_prompt = (
    "You are a highly accurate and rule-following medical assistant.\n"
    "If the answer isn't in the context, reply with exactly:\n"
    "I am not aware!.\n"
    "and Do not paraphrase Do not print a single word further.\n"
    "Stick to Context.\n"
    "Answer strictly based on the context, using no more than 3 medically accurate sentences.\n\n"
    "Context:\n{context}"
)

# Chat prompt: the system instructions followed by the user's question
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

# Chain that passes the retrieved documents to the LLM through the prompt
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Full RAG pipeline: retrieve documents for the question, then answer from them
rag_chain = create_retrieval_chain(retriever, question_answer_chain)


In [58]:
# Ask a medical question and print only the generated answer
# (the chain result is a dict; "answer" holds the model's reply)
response = rag_chain.invoke({"input": "What are Antiseptics"})
print(response["answer"])

Antiseptics are substances that inhibit the growth and development of microorganisms. They are used for wound surfaces after injury, skin preparation prior to injections or surgical procedures, oral hygiene, and disinfection of inanimate objects. Examples of commonly used antiseptics include benzalkonium chloride, chlorhexidine, iodine compounds, and alcohol.


In [65]:
# Ask a non-medical question: the system prompt instructs the model to reply
# with the fixed "I am not aware!." sentence when the context has no answer
response = rag_chain.invoke({"input": "What is BreakDance ?"})
print(response["answer"])

I am not aware!.


I am not aware! (This question is not related to the context provided.)
