In [1]:
import os 
# Work from the project root so later relative paths resolve consistently.
# The location can be overridden with the MEDICAL_CHATBOT_DIR environment
# variable; the default is the original author's local checkout, so existing
# behaviour is unchanged when the variable is not set.
os.chdir(os.environ.get("MEDICAL_CHATBOT_DIR", r"C:\Users\gamin\Documents\Medical Chatbot\Medical-Chatbot"))

In [2]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def load_pdf_files(data):
    """Load every PDF found under ``data``.

    Args:
        data: Directory handed to ``DirectoryLoader``; files matching the
            glob ``**/*.pdf`` are picked up.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns, with each matching file
        parsed by ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="**/*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [4]:
# Load every PDF under the project's Data folder. The absolute path is built
# from the current working directory (set to the project root by the first
# cell) instead of repeating the machine-specific path a second time.
extracted_data = load_pdf_files(os.path.join(os.getcwd(), "Data"))

In [5]:
from typing import List
from langchain.schema import Document

def filter_to_mini_docs(docs: List[Document]) -> List[Document]:
    """Return new documents that keep only the ``source`` metadata entry.

    Args:
        docs: Documents whose ``page_content`` and ``metadata`` are read.

    Returns:
        A new list with one ``Document`` per input, carrying the same
        ``page_content`` and a metadata dict of the form ``{"source": ...}``.
        The value is ``None`` when the input metadata has no ``"source"`` key.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

In [6]:
# Reduce each loaded document's metadata to just its "source" entry.
mini_docs = filter_to_mini_docs(extracted_data)

In [7]:
def text_splitter(mini_docs, chunk_size=500, chunk_overlap=20):
    """Split documents into overlapping chunks.

    Args:
        mini_docs: Documents passed to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Maximum chunk length as measured by ``len``.
            Defaults to 500, the value previously hard-coded.
        chunk_overlap: Overlap between consecutive chunks, measured the same
            way. Defaults to 20, the value previously hard-coded.

    Returns:
        The chunked documents produced by the splitter.
    """
    # The splitter instance has its own name so it does not shadow this
    # function's name inside the body.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return splitter.split_documents(mini_docs)

In [8]:
# Split the trimmed documents into chunks and report how many were produced.
text_chunk = text_splitter(mini_docs)
print(len(text_chunk))

5859


In [35]:
from langchain.embeddings import HuggingFaceEmbeddings
def download_embeddings():
    """Build the embedding model used by the rest of the notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    model_id = "sentence-transformers/all-MiniLM-L6-v2"
    return HuggingFaceEmbeddings(model_name=model_id)

In [10]:
# Instantiate the embedding model once; the same object is passed to the
# Pinecone vector store cells below.
embedding = download_embeddings()

  embeddings = HuggingFaceEmbeddings(model_name=model_name,)


In [36]:
# Sanity check: embed a short string to confirm the model is usable.
vector = embedding.embed_query("Hello world")

In [37]:
# Display the raw embedding values.
# NOTE(review): len(vector) should equal the `dimension` passed to
# pc.create_index (384) - confirm.
vector

[0.015196096152067184,
 -0.02257070504128933,
 0.00854706484824419,
 -0.0741705670952797,
 0.003836418269202113,
 0.0027135638520121574,
 -0.03126787021756172,
 0.04463401436805725,
 0.04405521601438522,
 -0.007871180772781372,
 -0.025200799107551575,
 -0.03336653858423233,
 0.014427904039621353,
 0.04653818905353546,
 0.008555042557418346,
 -0.016145769506692886,
 0.007405790034681559,
 -0.01901242695748806,
 -0.1147262379527092,
 -0.018157614395022392,
 0.12635935842990875,
 0.02970292419195175,
 0.02528098225593567,
 -0.03421787545084953,
 -0.040999673306941986,
 0.006617343053221703,
 0.01027061976492405,
 0.022362234070897102,
 0.004436317831277847,
 -0.12730959057807922,
 -0.016149301081895828,
 -0.02038012631237507,
 0.04721212759613991,
 0.011579934507608414,
 0.0681871622800827,
 0.007298614829778671,
 -0.017852991819381714,
 0.0407821387052536,
 -0.010269471444189548,
 0.023757051676511765,
 0.010602937079966068,
 -0.02858443558216095,
 0.008159700781106949,
 -0.0151805421337

In [12]:
from dotenv import load_dotenv
import os
# Load variables from a local .env file into the process environment so the
# API keys can be read with os.getenv below.
load_dotenv()

True

In [13]:
# Read the API keys from the environment. os.getenv returns None when a
# variable is not set, so these may be None if the .env file is incomplete.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
AI21_API_KEY = os.getenv("AI21_API_KEY")

In [14]:
# Re-export the keys, presumably so the Pinecone vector store and the AI21
# chat model (both constructed below without an explicit key) can read them
# from the environment.
# os.getenv returns None for a missing variable, and assigning None into
# os.environ raises an opaque TypeError, so fail early with a clear message.
for _key_name, _key_value in (
    ("PINECONE_API_KEY", PINECONE_API_KEY),
    ("AI21_API_KEY", AI21_API_KEY),
):
    if not _key_value:
        raise RuntimeError(
            f"{_key_name} is not set; define it in your .env file or environment."
        )
    os.environ[_key_name] = _key_value

In [15]:
from pinecone import Pinecone
# Create the Pinecone client using the key read from the environment.
pinecone_api_key = PINECONE_API_KEY
pc = Pinecone(api_key=pinecone_api_key)

In [16]:
from pinecone import ServerlessSpec
index_name = "medical-chatbot"

# list_indexes() yields index descriptions rather than plain strings, so a
# membership test against it never matches a name and create_index would be
# called even when the index already exists. Compare against .names() instead.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Must equal the embedding size; all-MiniLM-L6-v2 produces
        # 384-dimensional vectors.
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )


In [17]:
# Handle to the index; it is not referenced again in the cells shown here.
index = pc.Index(index_name)


In [18]:
from langchain_pinecone import PineconeVectorStore

# Embed every chunk and upload it to the Pinecone index.
# NOTE(review): this runs on every execution of the cell; re-running it
# presumably uploads all chunks again (the retrieved documents carry
# UUID-style ids), so skip this cell once the index is populated and rely on
# from_existing_index instead - confirm.
docsearch = PineconeVectorStore.from_documents(
    documents = text_chunk,
    embedding = embedding,
    index_name = index_name)

In [19]:
from langchain_pinecone import PineconeVectorStore

# Connect to the existing index by name. This rebinds `docsearch`, replacing
# the object created by the from_documents cell.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embedding
)

In [21]:
# Similarity-search retriever configured with k=3 (three results per query).
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [22]:
# Smoke-test retrieval with a sample question.
retrieved_docs = retriever.invoke("What is Acne?")

In [23]:
# Inspect the retrieved chunks and their source metadata.
retrieved_docs

[Document(id='ba464577-499b-4ea6-8659-8df5c8235d32', metadata={'source': 'C:\\Users\\gamin\\Documents\\Medical Chatbot\\Medical-Chatbot\\Data\\Medical_book.pdf'}, page_content='Acidosis see Respiratory acidosis; Renal\ntubular acidosis; Metabolic acidosis\nAcne\nDefinition\nAcne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when the\npores of the skin become clogged with oil, dead skin\ncells, and bacteria.\nDescription\nAcne vulgaris, the medical term for common acne, is\nthe most common skin disease. It affects nearly 17 million\npeople in the United States. While acne can arise at any'),
 Document(id='4fa1a38c-84b3-4245-84ee-4eaa97080788', metadata={'source': 'C:\\Users\\gamin\\Documents\\Medical Chatbot\\Medical-Chatbot\\Data\\Medical_book.pdf'}, page_content='of the brain. Make sure the physician knows if tetracy-\ncline is being used to treat acne or another infection.\nNancy Ross-Flanigan\nKEY TERMS\nAcne—A skin condition in which rai

In [26]:
from langchain_ai21 import ChatAI21

# Chat model used to generate answers. No API key is passed here; presumably
# it is taken from the AI21_API_KEY environment variable set earlier - confirm.
chatModel = ChatAI21(model="jamba-large")

In [24]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [25]:
# System instructions for the assistant. The "{context}" placeholder is a
# template variable; it is expected to be filled with the retrieved documents
# by the stuff-documents chain built below.
system_prompt = (
    "You are a helpful medical assistant. Use the following context to answer the question.\n"
    "If you don't know the answer, just say you don't know. Do not try to make up an answer.\n"
    "Answer Concisely."
    "\n\n"
    "{context}"
)

# Two-message chat template: the system instructions above, followed by the
# user's question supplied through the "{input}" variable.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("user", "{input}")
])

In [27]:
# Combine the chat model and prompt into a question-answering chain, then wrap
# it with the retriever to form the full retrieval chain.
question_answering_chain = create_stuff_documents_chain(chatModel, prompt)
rag_chain = create_retrieval_chain(retriever, question_answering_chain)

In [31]:
# Run the full pipeline on a sample question; the question goes in under the
# "input" key, matching the "{input}" variable of the prompt.
response = rag_chain.invoke({"input" :"What is Acne?"})

In [34]:
# The generated text is stored under the "answer" key of the response.
print(response["answer"])

Acne is a common skin condition that occurs when hair follicles become clogged with oil, dead skin cells, and bacteria. It often results in pimples, blackheads, or whiteheads, primarily on the face, chest, and back.
