## End-to-End Medical Chatbot

In [2]:
# Show the current working directory before changing it.
%pwd

'd:\\Projects\\WorkSpace\\GenAI-medical-Chatbot\\research'

In [8]:
import os

# Move from research/ up to the project root so that relative paths such as
# 'data/' resolve. The guard on the directory name keeps this cell idempotent:
# an unguarded os.chdir('../') climbs one level higher on every re-run.
if os.path.basename(os.getcwd()) == 'research':
    os.chdir('../')

In [4]:
# Confirm the working directory after the chdir above.
%pwd

'd:\\Projects\\WorkSpace\\GenAI-medical-Chatbot'

In [2]:
# Libraries importing

from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter ## Chunks operation
from langchain.embeddings import HuggingFaceEmbeddings
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load environment variables from a .env file (GOOGLE_API_KEY is read from the environment later on).
load_dotenv()

True

In [4]:
def load_pdf_file(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory to scan; files matching ``*.pdf`` are loaded.

    Returns:
        The documents returned by ``DirectoryLoader.load()``, with each
        matching file parsed by ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [9]:
# Load every PDF under data/ (path is relative to the current working directory).
document = load_pdf_file('data/')

In [10]:
# Preview the first 50 loaded documents.
# NOTE(review): a smaller slice would keep the saved notebook output short.
document[:50]

[Document(metadata={'source': 'data\\The-Gale-Encyclopedia-of-Medicine.pdf', 'page': 0}, page_content=''),
 Document(metadata={'source': 'data\\The-Gale-Encyclopedia-of-Medicine.pdf', 'page': 1}, page_content='The GALE\nENCYCLOPEDIA of\nMEDICINE\nTHIRD EDITION'),
 Document(metadata={'source': 'data\\The-Gale-Encyclopedia-of-Medicine.pdf', 'page': 2}, page_content='The GALE\nENCYCLOPEDIA of\nMEDICINE\nTHIRD EDITION\nVOLUME\n\x81\n1\nA-B\nJACQUELINE L. LONGE, PROJECT EDITOR\n'),
 Document(metadata={'source': 'data\\The-Gale-Encyclopedia-of-Medicine.pdf', 'page': 3}, page_content='The GALE\nENCYCLOPEDIA of\nMEDICINE\nTHIRD EDITION\nVOLUME\n\x81\n2\nC-F\nJACQUELINE L. LONGE, PROJECT EDITOR\n'),
 Document(metadata={'source': 'data\\The-Gale-Encyclopedia-of-Medicine.pdf', 'page': 4}, page_content='The GALE\nENCYCLOPEDIA of\nMEDICINE\nTHIRD EDITION\nVOLUME\n\x81\n3\nG-M\nJACQUELINE L. LONGE, PROJECT EDITOR\n'),
 Document(metadata={'source': 'data\\The-Gale-Encyclopedia-of-Medicine.pdf', 'page

In [11]:
def text_splitter(data, chunk_size=500, chunk_overlap=20):
    """Split documents into overlapping chunks ready for embedding.

    Args:
        data: Documents to split (for example the result of ``load_pdf_file``).
        chunk_size: Maximum chunk size passed to the splitter. Defaults to 500.
        chunk_overlap: Overlap between consecutive chunks passed to the
            splitter. Defaults to 20.

    Returns:
        The chunked documents produced by ``split_documents``.
    """
    # Use a local name distinct from the function name so the function is not
    # shadowed inside its own body.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(data)

In [12]:
# Split the loaded documents into chunks and preview the first 20.
data = text_splitter(document)
data[:20]

[Document(metadata={'source': 'data\\The-Gale-Encyclopedia-of-Medicine.pdf', 'page': 1}, page_content='The GALE\nENCYCLOPEDIA of\nMEDICINE\nTHIRD EDITION'),
 Document(metadata={'source': 'data\\The-Gale-Encyclopedia-of-Medicine.pdf', 'page': 2}, page_content='The GALE\nENCYCLOPEDIA of\nMEDICINE\nTHIRD EDITION\nVOLUME\n\x81\n1\nA-B\nJACQUELINE L. LONGE, PROJECT EDITOR'),
 Document(metadata={'source': 'data\\The-Gale-Encyclopedia-of-Medicine.pdf', 'page': 3}, page_content='The GALE\nENCYCLOPEDIA of\nMEDICINE\nTHIRD EDITION\nVOLUME\n\x81\n2\nC-F\nJACQUELINE L. LONGE, PROJECT EDITOR'),
 Document(metadata={'source': 'data\\The-Gale-Encyclopedia-of-Medicine.pdf', 'page': 4}, page_content='The GALE\nENCYCLOPEDIA of\nMEDICINE\nTHIRD EDITION\nVOLUME\n\x81\n3\nG-M\nJACQUELINE L. LONGE, PROJECT EDITOR'),
 Document(metadata={'source': 'data\\The-Gale-Encyclopedia-of-Medicine.pdf', 'page': 5}, page_content='The GALE\nENCYCLOPEDIA of\nMEDICINE\nTHIRD EDITION\nVOLUME\n\x81\n4\nN-S\nJACQUELINE L. LONG

In [13]:
len(data)  # total number of chunks produced by the splitter

39994

In [14]:
def embedding(model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Create the HuggingFace embedding model used by this notebook.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to ``sentence-transformers/all-MiniLM-L6-v2``.

    Returns:
        A ``HuggingFaceEmbeddings`` instance for the requested model.
    """
    # Return directly instead of binding a local that shadows the function name.
    return HuggingFaceEmbeddings(model_name=model_name)

In [15]:
# `embedding` starts out as the factory function and is rebound here to the
# model instance it returns. The guard keeps this cell re-runnable: without it,
# a second run would try to call the instance and raise TypeError.
if callable(embedding):
    embedding = embedding()

  embedding = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')


In [16]:
# Smoke test for the embedding model: embed a short query and report the
# length of the returned vector.
embed = embedding.embed_query("Hello World")
print("Embedding length:", len(embed))

Embedding length: 384


## Pinecone Setup

In [None]:
from langchain_pinecone import Pinecone

# Name of the Pinecone index that holds the chunk embeddings.
index_name = 'chatbotwebsite'
# Ingestion step: build the vector store from the chunked documents.
# NOTE(review): presumably this embeds and uploads every chunk in `data` each
# time it runs, so re-running may be slow and may duplicate vectors — confirm,
# and skip this cell once the index is populated.
# NOTE(review): Pinecone credentials are presumably read from the environment — confirm.
vectorstore = Pinecone.from_documents(
    documents=data,
    index_name = index_name,
    embedding=embedding
)

In [19]:
## Connect to the existing index rather than building it from the documents again
from langchain_pinecone import Pinecone
index_name = 'chatbotwebsite'

# The same `embedding` object used for Pinecone.from_documents is passed here.
docsearch = Pinecone.from_existing_index(
    index_name=index_name,
    embedding=embedding
)

In [20]:
# Display the vector store object to confirm it was created.
docsearch

<langchain_pinecone.vectorstores.Pinecone at 0x235d0c96690>

In [21]:
# Similarity-search retriever; k=3 is the number of results requested per query.
retriever = docsearch.as_retriever(
    search_type='similarity',
    search_kwargs={"k": 3},
)

## LLM setup: Google Gemini 1.5 Flash

In [22]:
# Read the API key explicitly and fail fast with a clear message if it is
# missing, instead of leaving the variable unused and failing later inside the
# first model call.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; add it to your .env file or environment."
    )

llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    temperature=0.3,
    max_tokens=500,
    google_api_key=GOOGLE_API_KEY,
)

## Chain and prompt setup

In [23]:
# Adjacent string literals are concatenated with no separator, so each fragment
# must carry its own trailing space or punctuation; otherwise the model sees
# run-together words such as "tasksUse" or "youdont".
system_prompt = (
    "You are a medical assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the answer concise."
    "\n\n{context}"
)

In [24]:
# Build the chat prompt through the documented `from_messages` factory.
# The system message carries the instructions and the {context} placeholder;
# the human message carries the user's {input}.
prompt = ChatPromptTemplate.from_messages(
    [
        ('system', system_prompt),
        ('human', "{input}"),
    ]
)

In [25]:
# Chain that combines documents with the prompt and sends them to the LLM.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
# Full RAG chain: the retriever is paired with the question-answering chain above.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [26]:
# Run the RAG chain on a sample question; the question is passed under the "input" key.
response = rag_chain.invoke({"input": "What is stats ?"})

# Print only the 'answer' entry of the response.
print(response['answer'])

The provided text focuses on the Stanford-Binet intelligence scale and its scoring.  It explains that the mean score is 100, with a standard deviation of 16, indicating how far a score deviates from the average.  A standard deviation of 16 means that a score of 116 is one standard deviation above the mean.

