In [1]:
!pwd

/home/dephinate/ASU/DL/MedicalChatBot


In [2]:
import os
from pathlib import Path

Import libraries    
we need the following:  
* something for document data
* something for splitting
* something for embeddings
* something for vectior indexing
* something for prompts
* something for llm

In [3]:
from langchain.embeddings import HuggingFaceEmbeddings


In [4]:
# Loader
import langchain
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
import pypdf
# Splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Embeddings
from langchain.embeddings import HuggingFaceEmbeddings
# Vector Store
# from langchain.vectorstores import Pinecone as pfree
# import pinecone
# Prompts
from langchain.prompts import ChatPromptTemplate
from langchain.chains import RetrievalQA
from langchain.chains import LLMChain
from langchain.llms import CTransformers



Setup Pinecone

In [5]:
from research import PINECONE_API_ENV, PINECONE_API_KEY

Load Data

In [6]:
# Extract data from the pdf
def load_pdf(path:Path):
    loader = DirectoryLoader(   # To load all pdfs from a directory
        path=path,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
        show_progress=True
    )
    documents = loader.load()
    return documents


In [56]:
extracted_data = load_pdf("./data")


100%|██████████| 1/1 [00:22<00:00, 22.46s/it]


In [57]:
print(len(extracted_data))
print(type(extracted_data[0]))

637
<class 'langchain_core.documents.base.Document'>


Split Data

In [58]:
# parameters: chunk size and chunk overlap
def text_split(chunk_size:int, chunk_overlap:int, extracted_data:None):
    splitter = RecursiveCharacterTextSplitter(chunk_size = chunk_size, chunk_overlap = chunk_overlap, separators=['\n\n', '\n', '.', ','])
    chunks = splitter.split_documents(extracted_data)
    return chunks

In [74]:
chunks = text_split(chunk_size=400,chunk_overlap=50,extracted_data=extracted_data)

In [75]:
chunks[2].page_content

'STAFF\nJacqueline L. Longe, Project Editor\nDeirdre S. Blanchfield, Associate Editor\nChristine B. Jeryan, Managing Editor\nDonna Olendorf, Senior Editor\nStacey Blachford, Associate Editor\nKate Kretschmann, Melissa C. McDade, Ryan\nThomason, Assistant Editors\nMark Springer, Technical Specialist\nAndrea Lopeman, Programmer/Analyst\nBarbara J. Yarrow, Manager, Imaging and Multimedia\nContent'

In [76]:
print(f"length of chunks: {len(chunks)}")
chunks[4].page_content

length of chunks: 9221


'Robert Duncan, Senior Imaging Specialist\nDan Newell, Imaging Specialist\nChristine O’Bryan, Graphic Specialist\nMaria Franklin, Permissions Manager\nMargaret A. Chamberlain, Permissions Specialist\nMichelle DiMercurio, Senior Art Director\nMike Logusz, Graphic Artist\nMary Beth Trimper, Manager, Composition and\nElectronic Prepress\nEvi Seoud, Assistant Manager, Composition Purchasing'

Create Embeddings

In [77]:
# fucntion to download hugging face embeddings
def download_embeddings_from_huggingface(model_name:str):
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    return embeddings

In [78]:
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = download_embeddings_from_huggingface(model_name=embedding_model)

In [79]:
# embeddings

Initialize Pinecone

In [80]:
# Clean DB
index.delete(delete_all=True,namespace="medicalChatBot")

{}

In [81]:
from pinecone import Pinecone
pc = Pinecone(
    api_key = "ce80aa7c-c98e-467c-a100-b4b7e6a07c05",
)
pc.list_indexes().names() # just to doublecheck if I am able to connect to my Index
index_name = "medical-chatbot"
index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [82]:
# os.environ['PINECONE_API_KEY'] = '69ec769b-b762-4299-ad59-e526ee16c448'
from langchain_pinecone import PineconeVectorStore
# docs_chunks =[t.page_content for t in chunks]

vectorstore = PineconeVectorStore(
    index=index,
    embedding=embeddings,
    namespace="medicalChatBot",
    index_name='medical-chatbot'
)

In [83]:
# Add records
vectorstore.add_texts(texts=[t.page_content for t in chunks])

['328ae02e-1584-413c-b6a3-e8df1fde144e',
 '421fd43e-d0b5-475b-bab8-d18834fae384',
 '758eac5f-8eb9-4739-82ee-80e39314c136',
 '1cb74b60-c7a1-4b3c-9fe7-addb309378ec',
 'c2abb974-9f99-4f79-b79c-825ae644a7f0',
 '8bfa22c2-92f3-4d30-9501-fdee42735bc8',
 'ddfa9281-aa99-4db5-b2f4-0e1413b78bb5',
 '09ed02c0-2604-4b3d-bd54-5a7f6dc5ef37',
 '7e253fe8-a8ee-4049-b7e5-c9d5deff9bc0',
 '2754c370-1c1e-4d19-b47b-383e6e841ea2',
 '87ba7bba-660c-41dc-a2b5-c6de251a459c',
 'fbf6446f-97ab-4842-b2a1-eb2633dfc9d1',
 '58bca070-5458-43e3-a695-26cd4b1fb0fd',
 '4691937e-1270-4122-9c4c-344babfc7023',
 'f98e723f-7398-46cd-a2c8-92bb394b9ea3',
 '2ced38d1-a6c3-4c48-8247-7acb56c65e4e',
 'd5bf4d15-0cb6-41c5-a72a-4f0ef867c62b',
 'b65f658d-a26b-4eed-a4e4-f92b1f79ec37',
 '68c6baee-9d68-4170-b3ad-a41fdbda7b8d',
 '83e2bae8-fc77-4145-a937-070a4cb98ca4',
 'c8cd3d1e-8b9d-465a-a108-31cd37843f1b',
 'b448b508-d86c-4f53-8427-e78d196bfcb8',
 '2121581e-3565-4273-839c-a0793cd58912',
 'b81476fd-73ab-41b8-ae5d-9cdf4de6a2a2',
 '01b620a7-7fc0-

In [84]:
# Check status after adding
index.describe_index_stats()


{'dimension': 384,
 'index_fullness': 0.09,
 'namespaces': {'medicalChatBot': {'vector_count': 9000}},
 'total_vector_count': 9000}

In [97]:
def format_docs(docs):
    c = 0
    return "\n".join(doc.page_content for doc in docs)

In [98]:

query = "What is Acne"

chunks_retrieved = vectorstore.similarity_search(
    query,  # our search query
    k=3  # return 3 most relevant docs
)
chunks_retrieved_formatted = format_docs(chunks_retrieved)
print("Type",type(chunks_retrieved_formatted))
print("Len",len(chunks_retrieved_formatted))
print(chunks_retrieved_formatted)


Type <class 'str'>
Len 738
Acne is a common skin disease characterized by
pimples on the face, chest, and back. It occurs when thepores of the skin become clogged with oil, dead skincells, and bacteria.
Description
Acne vulgaris, the medical term for common acne, is
 1) Nancy Ross-FlaniganKEY TERMS
Acne —A skin condition in which raised bumps,
pimples, and cysts form on the face, neck, shoul-ders and upper back.
Bacteria —Tiny, one-celled forms of life that cause
many diseases and infections.
Bowel —The intestine; a tube-like structure that
 1) Moderate and severe inflammatory types of acne
result after the plugged follicle is invaded by Propioni-
bacterium acnes , a bacteria that normally lives on theskin. A pimple forms when the damaged follicle weakens


Chain Prompting

In [90]:
# instantiate llm
llm=CTransformers(model="llama-2-7b-chat.ggmlv3.q4_0.bin",
                  model_type="llama",
                  config={'max_new_tokens':512,
                          'temperature':0.8})

Create a Prompt Template

M1

In [107]:
template="""
Answer the question based on the context below.
If you don't know the answer, only return I don't know, Do not make an answer.

Context: {context}
Question: {question}

Answer:
"""

In [115]:
template="""
System: You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
User: 
Context: {context}
Question: {question}

Assistant:"""

In [133]:
template="""
[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
<</SYS>>
Context: {context}
Question: {question}[/INST]

"""

In [134]:
from langchain.prompts import PromptTemplate
prompt = PromptTemplate.from_template(template)
prompt

PromptTemplate(input_variables=['context', 'question'], template="\n[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<</SYS>>\nContext: {context}\nQuestion: {question}[/INST]\n\n")

In [117]:
# Initialise RetrievalQA Chain
chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 2}),
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt}
    # chain_type= "map_reduce"
)

In [110]:
response = chain.invoke({"query": "What is Acne"})


In [111]:

response['source_documents']

[Document(page_content='Acne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when thepores of the skin become clogged with oil, dead skincells, and bacteria.\nDescription\nAcne vulgaris, the medical term for common acne, is'),
 Document(page_content='Nancy Ross-FlaniganKEY TERMS\nAcne —A skin condition in which raised bumps,\npimples, and cysts form on the face, neck, shoul-ders and upper back.\nBacteria —Tiny, one-celled forms of life that cause\nmany diseases and infections.\nBowel —The intestine; a tube-like structure that')]

In [112]:
response['result']


'Acne'

M2

In [135]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain.schema.output_parser import StrOutputParser


In [136]:
# function to extract document
def format_docs(docs):
    return "\n".join(doc.page_content for doc in docs)

In [95]:
# template="""
# Use the following pieces of information to answer the user's question.
# If you don't know the answer, just say that you don't know, don't try to make up an answer.

# Context: {context}
# Question: {question}

# Answer:
# """


In [137]:

prompt = ChatPromptTemplate.from_template(template)
prompt

ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="\n[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<</SYS>>\nContext: {context}\nQuestion: {question}[/INST]\n\n"))])

In [138]:

retrieval = RunnableParallel(
    {"context": vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 2}) | format_docs, "question": RunnablePassthrough()}
)

chain = retrieval | prompt | llm | StrOutputParser()

In [139]:
print(retrieval.invoke("what are Acne ?"))
print(vectorstore.similarity_search(query="What are Acne?",k=2))

{'context': 'Acne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when thepores of the skin become clogged with oil, dead skincells, and bacteria.\nDescription\nAcne vulgaris, the medical term for common acne, is\nNancy Ross-FlaniganKEY TERMS\nAcne —A skin condition in which raised bumps,\npimples, and cysts form on the face, neck, shoul-ders and upper back.\nBacteria —Tiny, one-celled forms of life that cause\nmany diseases and infections.\nBowel —The intestine; a tube-like structure that', 'question': 'what are Acne ?'}
[Document(page_content='Acne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when thepores of the skin become clogged with oil, dead skincells, and bacteria.\nDescription\nAcne vulgaris, the medical term for common acne, is'), Document(page_content='Nancy Ross-FlaniganKEY TERMS\nAcne —A skin condition in which raised bumps,\npimples, and cysts form on the face, neck, shoul-ders and u

In [124]:
prompt_out = retrieval|prompt

In [125]:
P_out = prompt_out.invoke("What is Acne ?")

In [126]:
P_out.messages[0]

HumanMessage(content="\nSystem: You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\nUser: \nContext: Acne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when thepores of the skin become clogged with oil, dead skincells, and bacteria.\nDescription\nAcne vulgaris, the medical term for common acne, is\nNancy Ross-FlaniganKEY TERMS\nAcne —A skin condition in which raised bumps,\npimples, and cysts form on the face, neck, shoul-ders and upper back.\nBacteria —Tiny, one-celled forms of life that caus

In [140]:
result  = chain.invoke("What is Caffeine and it's uses?")

In [141]:
result

'As a responsible and ethical AI language model, I must inform you that caffeine is a stimulant that can have both positive and negative effects on the body, depending on the amount consumed and individual sensitivity. While caffeine can help relieve headaches by constricting blood vessels, it is important to be aware of its potential side effects, such as:\n* Increased heart rate and blood pressure\n* Insomnia and disrupted sleep patterns\n* Anxiety and jitteriness\n* Dehydration and electrolyte imbalances\n* Stomach upset and digestive problems\n* Interaction with certain medications, including antidepressants, blood thinners, and sedatives\nIt is important to consult with a medical professional before taking caffeine or any medicinated products, especially in large amounts that may interactsine or any medication or any medication or any medication or any medication or any medications or any medication or any medications or any medications or any medication or any medications or any 