In [1]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [7]:
from langchain.document_loaders import PyPDFLoader
import os

def load_pdf_files_corrected(data_path):
    all_documents = []
    total_pages = 0
    
    # Get all PDF files
    pdf_files = [f for f in os.listdir(data_path) if f.endswith('.pdf')]
    
    for pdf_file in pdf_files:
        print(f"Processing: {pdf_file}")
        loader = PyPDFLoader(os.path.join(data_path, pdf_file))
        documents = loader.load()
        all_documents.extend(documents)
        print(f"Pages in {pdf_file}: {len(documents)}")
        total_pages += len(documents)
    
    print(f"Total pages across all PDFs: {total_pages}")
    return all_documents

# Use this instead
documents = load_pdf_files_corrected('Data/')
print(f"Final document count: {len(documents)}")

Processing: The-Gale-Encyclopedia-of-Medicine-3rd-Edition-staibabussalamsula.pdf
Pages in The-Gale-Encyclopedia-of-Medicine-3rd-Edition-staibabussalamsula.pdf: 4505
Total pages across all PDFs: 4505
Final document count: 4505


In [4]:
print(len(documents))

4505


In [9]:
def splitter(text):
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_documents(text)
    return chunks

text = splitter(documents)
len(text)

23444

In [20]:
import os 
from dotenv import load_dotenv
from pinecone import Pinecone as PineconeClient, ServerlessSpec

load_dotenv()
api_key = os.getenv("PINECONE_API_KEY")

index_name = "medibot"

pc = PineconeClient(api_key=api_key)

pc.create_index(
    name=index_name,
    dimension=384,  
    metric="cosine",
    spec=ServerlessSpec(
        cloud='aws',
        region='us-east-1'
    )
)

index = pc.Index(index_name)

PineconeApiException: (409)
Reason: Conflict
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'access-control-allow-origin': '*', 'vary': 'origin,access-control-request-method,access-control-request-headers', 'access-control-expose-headers': '*', 'x-pinecone-api-version': '2025-04', 'x-cloud-trace-context': '5aae4bae7c913b28b8fa9745cfe6a6d1', 'date': 'Sun, 24 Aug 2025 09:09:06 GMT', 'server': 'Google Frontend', 'Content-Length': '85', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"error":{"code":"ALREADY_EXISTS","message":"Resource  already exists"},"status":409}


### Step 1.3 & 1.4 Embedding and Vector Store

In [16]:
from langchain_huggingface import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model='sentence-transformers/all-MiniLM-L6-v2')

In [23]:
from langchain_pinecone import PineconeVectorStore
vector_store = PineconeVectorStore.from_documents(
    documents=text,
    embedding=embeddings,
    index_name=index_name
)

## Step 2 Retrieval

In [24]:
retriever = vector_store.as_retriever(search_type='similarity', search_kawgs={'k' : 4})

In [25]:
retriever.invoke('Headaches')

[Document(id='a9e9f4d0-e84b-41b4-9864-b53caa4d4b91', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6.0', 'moddate': '2006-10-16T22:03:45+02:00', 'page': 940.0, 'page_label': '911', 'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'source': 'Data/The-Gale-Encyclopedia-of-Medicine-3rd-Edition-staibabussalamsula.pdf', 'total_pages': 4505.0}, page_content='Migraine headache— An intense throbbing pain that\noccurs on one or both sides of the head. The head-\nache is usually accompanied by other symptoms,\nsuch as nausea, vomiting, and aversion to light.\nProphylactic— Referring to treatment that prevents\nsymptoms from occurring.\nTension-type headache— A dull pain that seems to\nexert pressure on the head; the most common form\nof headache.\nGALE ENCYCLOPEDIA OF MEDICINE 911\nCoagulation disorders'),
 Document(id='a8ab5215-dd9d-4c63-92f2-2b66b90e8653', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6.0', 'moddate': '2006-10-16T22:

## Step 3 Augmentation

In [28]:
from langchain_core.prompts import PromptTemplate

prompt = PromptTemplate(
    template = '''You are a helpful assistant. Respond to the user queries only using the following context provided,
    context: {context}, query: {query}
    If there is not specific answer then respond I dont know.
    ''',
    input_variables=['context','query']
)

query = 'What causes cardiac arrest?'
relevant_docs = retriever.invoke(query)


In [30]:
context_text = "\n\n".join(doc.page_content for doc in relevant_docs)
context_text

'University of Washington School of Medicine. <http://\ndepts.washington.edu/learncpr/>.\nL. Fleming Fallon, Jr., MD, DrPH\nKEY TERMS\nCardiac arrest— Temporary or permanent cessa-\ntion of the heartbeat.\nCardiopulmonary— Relating to the heart and the\nlungs.\nDefibrillation— A procedure to stop the type of\nirregular heart beat called ventricular fibrillation,\nusually by using electric shock.\nResuscitation— Bringing a person back to life after\nan apparent death or in cases of impending death.\nVentricular fibrillation— An irregular heartbeat\nwhere the heart beats very fast but ineffectively.\nVentricular fibrillation is fatal if not quickly\ncorrected.\nGALE ENCYCLOPEDIA OF MEDICINE 741\nCardiopulmonary resuscitation (CPR)\n\nby itself, so the condition requires immediate interven-\ntion. Ventricular tachycardiacan also lead to sudden\ncardiac death. The risk for SCD is higher for anyone\nwith heart disease.\nWhen the heart stops beating effectively and the\nbrain is being depriv

In [31]:
template = prompt.invoke({'context': context_text, 'query': query})

## Step 4 Generation

In [47]:
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
load_dotenv()
llm = HuggingFaceEndpoint(
    repo_id='openai/gpt-oss-20b',
    task='text-generation',
    huggingfacehub_api_token=os.getenv("HUGGINGFACE_ACCESS_TOKEN")
)

model = ChatHuggingFace(llm=llm)

In [48]:
result = model.invoke(template)

print(result.content)

Cardiac arrest occurs when the heart suddenly stops beating effectively enough to deliver blood to the brain.  
The most common causes cited in the context are:

- **Severe arrhythmias** – especially ventricular fibrillation or ventricular tachycardia, which can abruptly interrupt the heart’s rhythm.  
- **Heart disease** – conditions that weaken or damage the heart muscle (e.g., coronary artery disease, a heart attack, myocarditis, or other structural problems).  
- **Heart‑attack‑related** changes – the electrical disturbances that arise during and after a myocardial infarction.  

These factors can lead to the temporary or permanent cessation of the heartbeat, i.e., cardiac arrest.


## Making chain

In [50]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

In [61]:
def format_docs(retrieved_docs):
  context = "\n\n".join(doc.page_content for doc in retrieved_docs)
  return context

In [62]:
parallel_chain = RunnableParallel({
    'context': retriever | RunnableLambda(format_docs),
    'query' : RunnablePassthrough()
})

In [63]:
parser = StrOutputParser()

main_chain = parallel_chain | prompt | model | parser

In [64]:
main_chain.invoke('What causes cardiac arrest?')

'Cardiac arrest is caused when the heart’s normal rhythm stops working effectively.  In the information provided, the main causes highlighted are:\n\n* **Arrhythmias** – abnormal heart rhythms, especially ventricular fibrillation or ventricular tachycardia, that make the heart beat too fast, too irregularly, or cease to pump blood.\n* **Sudden loss of effective heart beat** – often due to a critical arrhythmia, especially in people with underlying heart disease.\n\nThus, the most common trigger for cardiac arrest mentioned here is a severe arrhythmia, such as ventricular fibrillation.'