In [20]:
print("Hello, World!")

Hello, World!


In [21]:
%pwd

'c:\\COLLEGE\\SEM X\\Frelancing_Work\\MyLearningsandProjectsAI\\Medical-Chatbot\\research'

In [22]:
import os
os.chdir("../")


In [23]:
%pwd

'c:\\COLLEGE\\SEM X\\Frelancing_Work\\MyLearningsandProjectsAI\\Medical-Chatbot'

In [24]:
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [25]:
#Extract text from PDF files 
def load_pdf_files(data):
    loader = DirectoryLoader(
        data,
        glob="*.pdf",
        loader_cls=PyPDFLoader
    )
    documents = loader.load()
    return documents

In [26]:
extracted_data = load_pdf_files("data")

In [27]:
#No. of pages in the pdf
len(extracted_data)

637

In [28]:
#Filter Operation
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs : List[Document]) -> List[Document]:
    """
    Given a list of document objects, return a new list of document objects containing only
    'source' in metadata and the original page content. 
    """
    minimal_docs: List[Document]= []
    for doc in docs:
        src = doc.metadata.get("source")
        minimal_docs.append(
            Document(
                page_content = doc.page_content,
                metadata = {"source": src}
            )
        )
    return minimal_docs
        
    

In [29]:
minimal_docs = filter_to_minimal_docs(extracted_data)
minimal_docs[0]

Document(metadata={'source': 'data\\Medical_book.pdf'}, page_content='')

In [30]:
minimal_docs[1]

Document(metadata={'source': 'data\\Medical_book.pdf'}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION')

In [31]:
#Split the document into smaller chunks
def text_split(minimal_docs : List[Document]) -> List[Document]:
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size =  500,
        chunk_overlap = 20
    )
    texts_chunks = text_splitter.split_documents(minimal_docs)
    return texts_chunks

In [32]:
text_chunk = text_split(minimal_docs)
print(f"Number of text chunks: {len(text_chunk)}")

Number of text chunks: 5859


In [33]:
import torch
from langchain.embeddings import HuggingFaceEmbeddings
def download_embeddings():
    """
    Download and return the hugging face embedding model.
    """
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embeddings = HuggingFaceEmbeddings(model_name=model_name,
                                       model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"})
    return embeddings
embeddings = download_embeddings()

  embeddings = HuggingFaceEmbeddings(model_name=model_name,


In [33]:
vector = embeddings.embed_query("Hello, how are you?")
print(vector)
print(len(vector))

[0.019096748903393745, 0.03446517512202263, 0.09162798523902893, 0.07016526907682419, -0.029946597293019295, -0.08419137448072433, 0.04581356421113014, 0.004958590492606163, -0.09189331531524658, 0.01740063913166523, -0.00881615187972784, -0.0006614578305743635, -0.02855696901679039, -0.021949712187051773, 0.05516669154167175, -0.049836501479148865, 0.08988095074892044, -0.08895706385374069, -0.11235623806715012, 0.03900053724646568, -0.06607074290513992, 0.02609514445066452, 0.03653070330619812, 0.06139037013053894, -0.05712487921118736, -0.05463935807347298, 0.03036552667617798, 0.03238753601908684, 0.012644710019230843, -0.1056857705116272, -0.05834552273154259, 0.06732939928770065, -0.04075591266155243, 0.006439837161451578, 0.005698689725250006, 0.05285317078232765, -0.0397753082215786, -0.11855248361825943, 0.0021161921322345734, -0.016692863777279854, 0.0283381175249815, -0.03743794187903404, -0.021371405571699142, -0.04147521033883095, 0.08497177809476852, -0.06869424879550934,

In [34]:
from dotenv import load_dotenv
import os
load_dotenv()

True

In [35]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

#Once we get them , we also need to set them as environment variables
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [36]:
from pinecone import Pinecone
pinecone_api_key = PINECONE_API_KEY

pc = Pinecone(api_key=pinecone_api_key)

In [46]:
pc

<pinecone.pinecone.Pinecone at 0x1dac2753380>

In [55]:
from pinecone import ServerlessSpec

index_name = "medical-chatbot"

if not pc.has_index(index_name):
    pc.create_index(
        name = index_name,
        dimension = 384, #Dimensions of the embeddings.
        metric = "cosine",
        spec = ServerlessSpec(cloud="aws", region="us-east-1")
    )
    
index = pc.Index(index_name)

In [None]:
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunk,
    embedding=embeddings,   
    index_name=index_name
)s

In [38]:
#Load Existing Index
index_name = "medical-chatbot"
from langchain_pinecone import PineconeVectorStore
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [62]:
#Add more data to existing pinecone index
dswith = Document(
    page_content = "New medical document content goes here.",
    metadata = {"source": "data\\Medical_book.pdf"}
    
)

In [63]:
docsearch.add_documents([dswith])

['808a914c-e42d-444c-8f5b-c4ced2bcf908']

In [39]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [65]:
retrieved_docs = retriever.invoke("What is hypertension?")

In [66]:
retrieved_docs

[Document(id='5ee4cd2b-c66f-4f52-8e81-c95ff739fa7f', metadata={'source': 'data\\Medical_book.pdf'}, page_content='(BPH), a condition that affects men and is characterized\nby an enlarged prostate gland.\nHigh blood pressure\nHigh blood pressure puts a strain on the heart and\nthe arteries. Over time, hypertension can damage the\nblood vessels to the point of causing stroke, heart fail-\nure or kidney failure. People with high blood pressure\nmay also be at higher risk for heart attacks. Controlling\nhigh blood pressure makes these problems less likely.\nAlpha blockers help lower blood pressure by causing'),
 Document(id='3bddc72c-3836-4b2b-8ba8-b12929807a98', metadata={'source': 'data\\Medical_book.pdf'}, page_content='heart and lungs.\n• Seek treatment for hypertension—High blood pressure\ncan be controlled through lifestyle changes—reducing\nsodium and fat, exercising, managing stress, quitting\nKEY TERMS\nArteriosclerosis —Hardening of the arteries. It\nincludes atherosclerosis, but

In [40]:
from langchain_openai import ChatOpenAI
chatModel = ChatOpenAI(
    model_name="gpt-4o",
    temperature=0
)

In [41]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


In [42]:
system_prompt = (
    "You are an Medical Assistant for question-answering task."
    "Use the following pieces o reterieved congtent to answer "
    "the question. If you dont know the answer, say you dont know."
    "Use three sentance maximum and keep the"
    "answer consise."
    "\n\n"
    "{context}"
)

In [43]:
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human","{input}"),
])

In [44]:
question_answer_chain= create_stuff_documents_chain(chatModel,prompt)
rag_chain = create_retrieval_chain(retriever,question_answer_chain)


In [46]:
response = rag_chain.invoke({"input":"What is Acromegaly and gigantism?"})
print(response["answer"])

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}