In [4]:
import os


def find_project_root(start, marker="data"):
    """Return the closest directory at or above ``start`` that contains ``marker``.

    Replaces a hard-coded, machine-specific absolute path so the notebook can
    be launched from the project root or from any sub-folder of it.

    Args:
        start: Directory from which the upward search begins.
        marker: Name of the sub-directory that identifies the project root.

    Returns:
        The absolute path of the first matching directory, or ``None`` when
        the filesystem root is reached without finding ``marker``.
    """
    current = os.path.abspath(start)
    while True:
        if os.path.isdir(os.path.join(current, marker)):
            return current
        parent = os.path.dirname(current)
        if parent == current:
            # dirname() of the filesystem root is the root itself: nothing left to try.
            return None
        current = parent


# Check current directory
print(f"Current directory: {os.getcwd()}")
# Navigate to the project directory (the one that holds the data folder).
# Searching upward keeps this cell idempotent: once the working directory is
# the project root, re-running the cell leaves it unchanged.
project_root = find_project_root(os.getcwd())
if project_root is not None:
    os.chdir(project_root)
# When no root was found the directory is left as-is and the checks below
# report "Data folder exists: False".
print(f"New directory: {os.getcwd()}")
print(f"Data folder exists: {os.path.exists('./data')}")
if os.path.exists('./data'):
    print(f"Files in data folder: {os.listdir('./data')}")

Current directory: c:\Users\abmg2\Desktop\python_projects\Medical-Chatbot\research
New directory: c:\Users\abmg2\Desktop\python_projects\Medical-Chatbot
Data folder exists: True
Files in data folder: ['Medical_book.pdf']


In [5]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [6]:
#Extract Data From the PDF File
def load_pdf_file(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``; entries
            are selected with the ``*.pdf`` glob.

    Returns:
        The result of ``DirectoryLoader.load()``, with each matched file
        read through ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [7]:
import os
# Sanity check: the './data' path used by the loading cell is resolved against
# the current working directory, so confirm where the kernel is running.
print(os.getcwd())


c:\Users\abmg2\Desktop\python_projects\Medical-Chatbot


In [8]:
# Load the PDFs from ./data (relative to the current working directory).
extracted_data = load_pdf_file(data='./data')


In [19]:
#extracted_data

In [9]:
#Split the Data into Text Chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks.

    Args:
        extracted_data: Documents to split, as returned by ``load_pdf_file``.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 500, the value previously hard-coded.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 20, the value previously hard-coded.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [10]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks=text_split(extracted_data)
print("Length of Text Chunks", len(text_chunks))

Length of Text Chunks 5859


In [60]:
#text_chunks

In [11]:
from langchain.embeddings import HuggingFaceEmbeddings

In [12]:
#Download the Embeddings from Hugging Face
def download_hugging_face_embeddings():
    """Create the embedding model object used by the rest of the notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model name.
    """
    model_id = 'sentence-transformers/all-MiniLM-L6-v2'
    return HuggingFaceEmbeddings(model_name=model_id)

In [13]:
# Instantiate the embedding model once; the vector-store cells reuse this object.
embeddings = download_hugging_face_embeddings()

  embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')


In [14]:
# Smoke test: embed a short string and print the length of the returned vector.
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [66]:
# query_result

In [31]:
from dotenv import load_dotenv
# Load settings from a .env file (python-dotenv); the API keys read in the
# next cell are presumably supplied this way.
load_dotenv()

True

In [32]:
# Read the service credentials from the process environment.
# Either name is bound to None when the variable is not set.
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

In [33]:
import os

# Write the keys back into the process environment; presumably the Pinecone
# and OpenAI integrations pick them up from there, since no key is passed to
# them explicitly in this notebook.
# os.environ only accepts string values, so assigning a missing key (None)
# would fail with an opaque TypeError. Check first and name the variable
# that is missing instead.
for _key_name, _key_value in (
    ("PINECONE_API_KEY", PINECONE_API_KEY),
    ("OPENAI_API_KEY", OPENAI_API_KEY),
):
    if _key_value is None:
        raise RuntimeError(
            f"{_key_name} is not set; define it in the environment or in the .env file "
            "before running this cell."
        )
    os.environ[_key_name] = _key_value

In [18]:
# Import at first use: the only other import of PineconeVectorStore sits in a
# later cell, so on a fresh top-to-bottom run this cell would otherwise raise
# NameError.
from langchain_pinecone import PineconeVectorStore

# Embed each chunk and upsert the embeddings into the Pinecone index.
# NOTE(review): this sends every chunk to the index each time the cell runs;
# presumably it only needs to run once per index - confirm before re-running.
index_name = "medbot"
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings,
)

In [24]:
from langchain_pinecone import PineconeVectorStore
# Attach to the existing "medbot" index. No documents are passed to this call,
# so it is used to query what is already stored rather than to upload chunks.
index_name = "medbot"
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [26]:
# Display the vector store object to confirm it was created.
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x16365fa7350>

In [27]:
# Expose the vector store as a retriever using similarity search; k=3 is
# passed as the number of results requested per query.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [29]:
# Try the retriever on a sample question before wiring it into a chain.
retrieved_docs = retriever.invoke("What is Acne?")

In [30]:
# Inspect the returned Document objects (metadata and page_content).
retrieved_docs

[Document(id='bad6906b-8bf3-4179-82e2-6f31312db4ab', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 'page_label': '40', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='31f61852-772e-4872-838f-1437559773c1', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 38.0, 'page_label': '39', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed. (Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM 

In [34]:
from langchain_openai import OpenAI
# Language model used to generate answers. No API key is passed here, so it is
# presumably taken from OPENAI_API_KEY in the environment - confirm.
llm = OpenAI(temperature=0.4, max_tokens=500)

In [35]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


# System instructions for the answering model. The trailing "{context}"
# placeholder is left unformatted here so the prompt template can fill it later.
system_prompt = "".join(
    [
        "You are an assistant for question-answering tasks. ",
        "Use the following pieces of retrieved context to answer the question. ",
        "If you don't know the answer, say that you don't know. ",
        "Use three sentences maximum and keep the answer concise.",
        "\n\n",
        "{context}",
    ]
)


# Two-message chat template: the system instructions first, then the user's
# question substituted through the "{input}" placeholder.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

In [36]:
# Chain that combines the LLM with the prompt; the retrieved documents are
# expected to fill the prompt's {context} placeholder.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
# Full retrieval pipeline: the retriever feeds the question-answering chain.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [37]:
# Ask a question through the pipeline; the generated text is read from the
# "answer" key of the result.
response = rag_chain.invoke({"input": "what is Acromegaly and gigantism?"})
print(response["answer"])



Acromegaly and gigantism are disorders caused by the abnormal release of a chemical from the pituitary gland in the brain, resulting in increased growth in bone and soft tissue, as well as other disturbances throughout the body. This can lead to unusual height and is more commonly diagnosed in middle-aged individuals. The disorder is relatively rare, affecting approximately 50 out of every one million people.


In [38]:
# Probe the pipeline with a vague question.
# NOTE(review): the recorded answer describes a complete blood count rather
# than "stats", and the model did not reply that it doesn't know as the system
# prompt asks - worth checking what the retriever returned for this query.
response = rag_chain.invoke({"input": "What is stats?"})
print(response["answer"])


A complete blood count (CBC) is a series of tests that is often done as part of a routine physical examination and can provide valuable information about the blood and blood-forming tissues. It is used to evaluate the numbers, concentrations, and conditions of different types of blood cells. It is a useful screening and diagnostic test for conditions such as anemias, leukemias, and infections.
