In [47]:
pip install -U pinecone


Collecting pinecone
  Downloading pinecone-8.0.0-py3-none-any.whl.metadata (11 kB)
Collecting pinecone-plugin-assistant<4.0.0,>=3.0.1 (from pinecone)
  Downloading pinecone_plugin_assistant-3.0.2-py3-none-any.whl.metadata (30 kB)
Downloading pinecone-8.0.0-py3-none-any.whl (745 kB)
   ---------------------------------------- 0.0/745.9 kB ? eta -:--:--
   ---------------------------- ----------- 524.3/745.9 kB 2.4 MB/s eta 0:00:01
   ---------------------------------------- 745.9/745.9 kB 1.7 MB/s  0:00:00
Downloading pinecone_plugin_assistant-3.0.2-py3-none-any.whl (280 kB)
Installing collected packages: pinecone-plugin-assistant, pinecone

  Attempting uninstall: pinecone-plugin-assistant

    Found existing installation: pinecone-plugin-assistant 1.8.0

    Uninstalling pinecone-plugin-assistant-1.8.0:

      Successfully uninstalled pinecone-plugin-assistant-1.8.0

   ---------------------------------------- 0/2 [pinecone-plugin-assistant]
   ----------------------------------------

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-pinecone 0.2.8 requires langchain-core<1.0.0,>=0.3.34, but you have langchain-core 1.2.8 which is incompatible.
langchain-pinecone 0.2.8 requires pinecone[asyncio]<8.0.0,>=6.0.0, but you have pinecone 8.0.0 which is incompatible.


In [8]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [9]:
# extract text from PDF

def load_pdf_files(data):
    loader = DirectoryLoader(
        data,
        glob="*.pdf",
        loader_cls=PyPDFLoader
    )
    documents = loader.load()
    return documents

In [10]:
extracted_data = load_pdf_files("../data")

In [11]:
len(extracted_data)

637

In [12]:
from typing import List
from langchain_core.documents import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """
    given a list of Document objects, return a new list of Document objects
    containing only "source" in metadata and the origunal page_content.
    """
    minimal_docs : List[Document] = []
    for doc in docs:
        src = doc.metadata.get("source")
        minimal_docs.append(
            Document(
                page_content=doc.page_content,
                metadata={"source": src}
            )
           
        )
       
    return minimal_docs

In [13]:
minimal_docs = filter_to_minimal_docs(extracted_data)

In [14]:
len(minimal_docs)

637

In [15]:
# split the text into smaller chunks
def text_split(minimal_docs): 
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=20,
        
    )
    text_chunks = text_splitter.split_documents(minimal_docs)
    return text_chunks

In [16]:
text_chunks = text_split(minimal_docs)
print(f"Number of text chunks: {len(text_chunks)}")

Number of text chunks: 5859


In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    """
    Download and return HuggingFace BGE embeddings model.
    """
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embeddings = HuggingFaceEmbeddings(model_name = model_name)
    
    return embeddings

embedding = download_embeddings()

In [18]:
embedding

HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_encode_kwargs={}, multi_process=False, show_progress=False)

In [19]:
vector = embedding.embed_query("This is a test embedding")
vector

[-3.9838621887611225e-05,
 -0.008389574475586414,
 0.017159003764390945,
 -0.009672391228377819,
 0.053906578570604324,
 0.03168688341975212,
 -0.012321753427386284,
 -0.03883596882224083,
 -0.0013938836054876447,
 -0.03625251725316048,
 0.02205701358616352,
 -0.0024881670251488686,
 0.04577365145087242,
 0.05786357820034027,
 -0.10501985251903534,
 0.001572072971612215,
 0.07229086011648178,
 0.011807010509073734,
 -0.04792192578315735,
 -0.001890188897959888,
 -0.04576699435710907,
 0.026637807488441467,
 0.04789052531123161,
 -0.052035700529813766,
 0.05804198235273361,
 -0.010897460393607616,
 -0.03865080326795578,
 0.07309256494045258,
 0.08525508642196655,
 -0.06628170609474182,
 0.11747560650110245,
 -0.018773114308714867,
 -0.0013456125743687153,
 0.07078934460878372,
 0.03344900533556938,
 0.008096154779195786,
 0.02106359228491783,
 0.03111531399190426,
 -0.02520374022424221,
 0.04926156625151634,
 0.022032734006643295,
 -0.04765099659562111,
 0.054290689527988434,
 0.0441928

In [20]:
print(len(vector))

384


In [1]:
from dotenv import load_dotenv
import os
load_dotenv()

True

In [2]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

In [4]:
from pinecone import Pinecone

pinecone_api_key = PINECONE_API_KEY
pc = Pinecone(api_key=pinecone_api_key)


In [5]:
pc

<pinecone.pinecone.Pinecone at 0x21976cbff40>

In [6]:
from pinecone import ServerlessSpec

index_name = "medi-assistant"

if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
index = pc.Index(index_name)

  from .autonotebook import tqdm as notebook_tqdm


In [21]:
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    embedding=embedding,    
    index_name=index_name
)
    
    

In [22]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [23]:
retrieved_docs = retriever.invoke("What is Acne?")
retrieved_docs

[Document(id='06755227-8fb6-4756-b4fd-c0d652a63e78', metadata={'source': '..\\data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='1c07bf29-be4d-4338-9638-2bbd22e69c92', metadata={'source': '..\\data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a womanâ€™s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed. (Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 25'),
 Document(id='95282fc8-6ea6-43f5-8600-c69281f83f27', metadata={'source': '..\\data\\Medical_book.pdf'}, page_content='Acidosis see Respiratory acidosis; Renal\ntubular acidosis; Metabolic acidosis\nAcne\nDefinition\nAcne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when the\npores of the skin bec

In [15]:
pip install groq

Collecting groq
  Downloading groq-1.0.0-py3-none-any.whl.metadata (16 kB)
Downloading groq-1.0.0-py3-none-any.whl (138 kB)
Installing collected packages: groq
Successfully installed groq-1.0.0
Note: you may need to restart the kernel to use updated packages.


In [24]:
from groq import Groq
from dotenv import load_dotenv
import os
load_dotenv()

client = Groq(api_key=os.getenv("GROQ_API_KEY"))

response = client.chat.completions.create(
    model="llama-3.1-8b-instant",
    messages=[
        {"role": "system", "content": "Say hello like a doctor."}
    ]
)

print(response.choices[0].message.content)

Hello, and welcome to our medical facility. I'm glad you're here. Now, let's get started with your examination, shall we?


In [25]:
def ask_medical_bot(question: str) -> str:
    docs = retriever.invoke(question)
    
    if not docs:
        return "I could not find relevant information in the provided documents."

    
    context = "\n\n".join([doc.page_content for doc in docs])
    
    prompt = f"""
    You are a medical expert AI assistant. 
    Use ONLY the context below to answer the question.
    If the answer is not present, say "I don't know".
    
    Context: {context}
    
    Question: {question}
    
    Answer: 
    (This is for educational purpose only. Consult a doctor.)
    """
    
    response = client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,
        max_tokens=400
    )
    
    return response.choices[0].message.content

In [27]:
print(ask_medical_bot("What is acne?"))

Acne is a common skin disease characterized by pimples on the face, chest, and back. It occurs when the pores of the skin become clogged with oil, dead skin cells, and bacteria.
