In [1]:
%pwd

'e:\\Project\\Medical_Chatbot\\research'

In [2]:
import os 
os.chdir("../")

In [3]:
%pwd

'e:\\Project\\Medical_Chatbot'

In [5]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [6]:
# extract text from pdf files
def load_pdf_files(source_data):
    loader = DirectoryLoader(
        source_data,
        glob="*.pdf",
        loader_cls=PyPDFLoader
    )

    documents = loader.load()
    return documents

In [10]:
extracted_data = load_pdf_files("source_data")

In [11]:
extracted_data

[Document(metadata={'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'creator': 'Adobe Acrobat 6.0', 'creationdate': '2006-10-16T20:19:33+02:00', 'moddate': '2006-10-16T22:03:45+02:00', 'source': 'source_data\\The GALE encyclopedia of medicine.pdf', 'total_pages': 4505, 'page': 0, 'page_label': 'i'}, page_content=''),
 Document(metadata={'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'creator': 'Adobe Acrobat 6.0', 'creationdate': '2006-10-16T20:19:33+02:00', 'moddate': '2006-10-16T22:03:45+02:00', 'source': 'source_data\\The GALE encyclopedia of medicine.pdf', 'total_pages': 4505, 'page': 1, 'page_label': 'ii'}, page_content='The GALE\nENCYCLOPEDIA of\nMEDICINE\nTHIRD EDITION'),
 Document(metadata={'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'creator': 'Adobe Acrobat 6.0', 'creationdate': '2006-10-16T20:19:33+02:00', 'moddate': '2006-10-16T22:03:45+02:00', 'source': 'source_data\\The GALE encyclopedia of medicine.pdf', 'total_pages': 4505, 'page': 2, 'page_label': 'iii-1'}, page_content='The GALE\nENCYCL

In [12]:
len(extracted_data)

4505

In [17]:
from typing import List
from langchain_core.documents import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """
    Given a list of Document objects, return a new list of Document objects
    containing only 'source' in metadata and the original page_content.
    """

    minimal_docs: List[Document] = []
    for doc in docs:
        src = doc.metadata.get("source")
        minimal_docs.append(
            Document(
                page_content=doc.page_content,
                metadata={"source": src}
            )
        )
    return minimal_docs


In [18]:
minimal_docs = filter_to_minimal_docs(extracted_data)

In [19]:
minimal_docs[:10]

[Document(metadata={'source': 'source_data\\The GALE encyclopedia of medicine.pdf'}, page_content=''),
 Document(metadata={'source': 'source_data\\The GALE encyclopedia of medicine.pdf'}, page_content='The GALE\nENCYCLOPEDIA of\nMEDICINE\nTHIRD EDITION'),
 Document(metadata={'source': 'source_data\\The GALE encyclopedia of medicine.pdf'}, page_content='The GALE\nENCYCLOPEDIA of\nMEDICINE\nTHIRD EDITION\nVOLUME\n\x81\n1\nA-B\nJACQUELINE L. LONGE, PROJECT EDITOR'),
 Document(metadata={'source': 'source_data\\The GALE encyclopedia of medicine.pdf'}, page_content='The GALE\nENCYCLOPEDIA of\nMEDICINE\nTHIRD EDITION\nVOLUME\n\x81\n2\nC-F\nJACQUELINE L. LONGE, PROJECT EDITOR'),
 Document(metadata={'source': 'source_data\\The GALE encyclopedia of medicine.pdf'}, page_content='The GALE\nENCYCLOPEDIA of\nMEDICINE\nTHIRD EDITION\nVOLUME\n\x81\n3\nG-M\nJACQUELINE L. LONGE, PROJECT EDITOR'),
 Document(metadata={'source': 'source_data\\The GALE encyclopedia of medicine.pdf'}, page_content='The GALE\

In [20]:
def text_split(minimal_docs):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=20
    )
    text_cunks = text_splitter.split_documents(minimal_docs)
    return text_cunks

In [22]:
text_chunk = text_split(minimal_docs)
print(f"number of chunks: {len(text_chunk)}")

number of chunks: 40000


In [23]:
text_chunk

[Document(metadata={'source': 'source_data\\The GALE encyclopedia of medicine.pdf'}, page_content='The GALE\nENCYCLOPEDIA of\nMEDICINE\nTHIRD EDITION'),
 Document(metadata={'source': 'source_data\\The GALE encyclopedia of medicine.pdf'}, page_content='The GALE\nENCYCLOPEDIA of\nMEDICINE\nTHIRD EDITION\nVOLUME\n\x81\n1\nA-B\nJACQUELINE L. LONGE, PROJECT EDITOR'),
 Document(metadata={'source': 'source_data\\The GALE encyclopedia of medicine.pdf'}, page_content='The GALE\nENCYCLOPEDIA of\nMEDICINE\nTHIRD EDITION\nVOLUME\n\x81\n2\nC-F\nJACQUELINE L. LONGE, PROJECT EDITOR'),
 Document(metadata={'source': 'source_data\\The GALE encyclopedia of medicine.pdf'}, page_content='The GALE\nENCYCLOPEDIA of\nMEDICINE\nTHIRD EDITION\nVOLUME\n\x81\n3\nG-M\nJACQUELINE L. LONGE, PROJECT EDITOR'),
 Document(metadata={'source': 'source_data\\The GALE encyclopedia of medicine.pdf'}, page_content='The GALE\nENCYCLOPEDIA of\nMEDICINE\nTHIRD EDITION\nVOLUME\n\x81\n4\nN-S\nJACQUELINE L. LONGE, PROJECT EDITOR'),

In [25]:
from langchain_huggingface import HuggingFaceEmbeddings

def download_embeddings():
    """
    Download and return the HuggingFace embeddings model.
    """

    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name
    )
    return embeddings

embedding = download_embeddings()

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [26]:
embedding

HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_encode_kwargs={}, multi_process=False, show_progress=False)

In [27]:
vector = embedding.embed_query("hello world")
vector

[-0.03447727486491203,
 0.03102317824959755,
 0.006734970025718212,
 0.026108985766768456,
 -0.03936202451586723,
 -0.16030244529247284,
 0.06692401319742203,
 -0.006441489793360233,
 -0.0474504791200161,
 0.014758856035768986,
 0.07087527960538864,
 0.05552763119339943,
 0.019193334504961967,
 -0.026251312345266342,
 -0.01010954286903143,
 -0.02694045566022396,
 0.022307461127638817,
 -0.022226648405194283,
 -0.14969263970851898,
 -0.017493007704615593,
 0.00767625542357564,
 0.05435224249958992,
 0.0032543970737606287,
 0.031725890934467316,
 -0.0846213847398758,
 -0.02940601296722889,
 0.05159561336040497,
 0.04812406003475189,
 -0.0033148222137242556,
 -0.058279167860746384,
 0.04196927323937416,
 0.022210685536265373,
 0.1281888335943222,
 -0.022338971495628357,
 -0.011656315997242928,
 0.06292839348316193,
 -0.032876335084438324,
 -0.09122604131698608,
 -0.031175347045063972,
 0.0526994913816452,
 0.04703482985496521,
 -0.08420311659574509,
 -0.030056199058890343,
 -0.02074483036

In [28]:
print("vector length: ", len(vector))

vector length:  384


In [29]:
from dotenv import load_dotenv
import os
load_dotenv()


True

In [30]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")

os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["MISTRAL_API_KEY"] = MISTRAL_API_KEY

In [31]:
from pinecone import Pinecone

pinecone_api_key = PINECONE_API_KEY

pc = Pinecone(api_key=pinecone_api_key)

In [32]:
pc

<pinecone.pinecone.Pinecone at 0x15c6d7c81d0>

In [33]:
from pinecone import ServerlessSpec 

index_name = "medical-chatbot"

if not pc.has_index(index_name):
    pc.create_index(
        name = index_name,
        dimension=384,  # Dimension of the embeddings
        metric= "cosine",  # Cosine similarity
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )


index = pc.Index(index_name)

In [None]:
# load to pinecone

from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunk,
    embedding=embedding,
    index_name=index_name
)

In [36]:
# loading from pinecone
from langchain_pinecone import PineconeVectorStore
# Embed each chunk and upsert the embeddings into your Pinecone index.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embedding
)

In [37]:
# retriever
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [38]:
retrieved_docs = retriever.invoke("What is Acne?")
retrieved_docs

[Document(id='6c239d32-1e4d-4a3c-8093-55e10caa48f4', metadata={'source': 'source_data\\The GALE encyclopedia of medicine.pdf'}, page_content='Researchers, Inc. Reproduced by permission.)\n26 GALE ENCYCLOPEDIA OF MEDICINE\nAcne'),
 Document(id='7784f1da-1342-4081-af64-f4fedf3bf989', metadata={'source': 'source_data\\The GALE encyclopedia of medicine.pdf'}, page_content='Sebaceous follicles— A structure found within the\nskin that houses the oil-producing glands and hair\nfollicles, where pimples form.\nSebum— An oily skin moisturizer produced by\nsebaceous glands.\nTretinoin— A drug that works by increasing the\nturnover (death and replacement) of skin cells.\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous glands\nbecome inflamed. (Photograph by Biophoto Associates, Photo'),
 Document(id='c9104c77-cd40-44ff-a44f-872f31823d72', metadata={'source': 'source_data\\The GALE encyclopedia of medicine.pdf'}, page_content='Pathol

In [39]:
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate, AIMessagePromptTemplate

In [40]:
system_prompt = (
    "You are an Medical assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

In [45]:
from langchain_mistralai import ChatMistralAI

In [46]:
def setup_llm():
    llm = ChatMistralAI(
        model_name="mistral-small-latest"
    )
    return llm

In [43]:
def setup_prompt():
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("human", "{input}")
        ]  
    )
    return prompt

In [44]:
from langchain_core.runnables import RunnablePassthrough, RunnableParallel, RunnableLambda

In [47]:
chain = (
    RunnableParallel(
        context = lambda x: retriever.invoke(x['input']),
        input = RunnablePassthrough()
    )
    | setup_prompt()
    | setup_llm()
    | RunnableLambda(lambda msg: {"result": msg.content})
)



In [48]:
response = chain.invoke({"input": "what is heart attack?"})

In [49]:
print(response)

{'result': 'A heart attack, also known as a myocardial infarction (MI), occurs when the blood supply to part of the heart muscle is severely reduced or stopped, leading to the death or damage of heart tissue. This is typically caused by a blockage in one or more coronary arteries, often due to atherosclerosis or a blood clot. Heart attacks are a leading cause of death and can be the first sign of underlying coronary artery disease.'}
