In [None]:
!pip install langchain-community langchain-groq python-dotenv pypdf fastembed faiss-cpu langchain-experimental


Collecting langchain-community
  Downloading langchain_community-0.3.19-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-groq
  Downloading langchain_groq-0.2.5-py3-none-any.whl.metadata (2.6 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting pypdf
  Downloading pypdf-5.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting fastembed
  Downloading fastembed-0.6.0-py3-none-any.whl.metadata (9.9 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting langchain-experimental
  Downloading langchain_experimental-0.3.4-py3-none-any.whl.metadata (1.7 kB)
Collecting langchain-core<1.0.0,>=0.3.41 (from langchain-community)
  Downloading langchain_core-0.3.43-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain<1.0.0,>=0.3.20 (from langchain-community)
  Downloading langchain-0.3.20-py3-none-any.whl.metadata (7.7 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langch

In [None]:
# Colab-only step: ask the user for the document to index.
# The keys of `uploaded` are the uploaded filenames; the next cell uses the
# first key to locate the PDF on disk.
from google.colab import files
uploaded = files.upload()

Saving RP35_Right-to-health-and-medicines_EN.pdf to RP35_Right-to-health-and-medicines_EN.pdf


In [None]:
import os

from langchain_community.document_loaders import PyPDFLoader

# Fail with a clear message if the upload was cancelled, instead of an opaque
# IndexError on an empty mapping.
if not uploaded:
    raise RuntimeError("No file was uploaded; re-run the upload cell and select a PDF.")

# Only the first uploaded file is indexed; any additional uploads are ignored.
pdf_filename = next(iter(uploaded))

# Resolve against the current working directory (where the upload was saved)
# rather than hard-coding "/content", so the cell survives a changed cwd.
# NOTE(review): `uploaded` is keyed by the original filename; if Colab saved a
# re-upload under a de-duplicated name (e.g. "name (1).pdf"), this path would
# point at the older copy — confirm when re-uploading the same file.
loader = PyPDFLoader(os.path.abspath(pdf_filename))
data = loader.load()

print(f"Loaded {len(data)} pages of the document.")


Loaded 72 pages of the document.


In [None]:
import gc

# CharacterTextSplitter stays imported in case other cells still reference it.
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.embeddings import FastEmbedEmbeddings
from langchain_community.vectorstores import FAISS

# Smaller chunk size for less memory load.
# RecursiveCharacterTextSplitter is used because CharacterTextSplitter only
# cuts on its separator ("\n\n" by default): any passage without that separator
# is kept as one oversized chunk, so chunk_size=200 would not be enforced.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
chunks = text_splitter.split_documents(data)

# A PDF with no extractable text yields no chunks; without this check `db`
# would stay None and the retriever line would fail with an AttributeError.
if not chunks:
    raise ValueError("No text could be extracted from the document; cannot build the vector store.")

# Efficient Embedding Model
embeddings = FastEmbedEmbeddings(model_name="BAAI/bge-small-en")

# Incremental index creation: embed and add the chunks in small batches so
# only one batch of embeddings is being produced at a time.
batch_size = 50
db = None

for i in range(0, len(chunks), batch_size):
    batch = chunks[i:i + batch_size]

    if db is None:
        # The first batch creates the index; later batches extend it.
        db = FAISS.from_documents(batch, embeddings)
    else:
        db.add_documents(batch)

    # Force garbage collection to free memory between batches.
    del batch
    gc.collect()

# Similarity search returning the 4 closest chunks per query.
retriever = db.as_retriever(search_type='similarity', search_kwargs={'k': 4})

print("✅ FAISS Vector Store Created Successfully with Optimizations")


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model_optimized.onnx:   0%|          | 0.00/133M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

✅ FAISS Vector Store Created Successfully with Optimizations


In [None]:
import os
from getpass import getpass

from langchain_groq import ChatGroq

# Never store the API key in the notebook: anyone with access to the file (or
# its version history) can use it. The key that was previously hard-coded here
# must be treated as compromised and revoked.
# Use GROQ_API_KEY from the environment when present; otherwise prompt for it
# without echoing the value into the cell output.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass("Enter your Groq API key: ")

llm = ChatGroq(model_name="llama3-70b-8192")


In [None]:
from langchain.chains import RetrievalQA

# Baseline question-answering chain over the FAISS retriever. No custom prompt
# is supplied here, so the library's default prompt is used.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type='stuff',
    return_source_documents=True,
)

# Smoke-test question; the answer text is stored under the 'result' key.
query = "What are the common symptoms of diabetes?"
result = qa.invoke(query)
print(result['result'])


This text does not mention diabetes or its symptoms. It appears to be a discussion about access to medicines, intellectual property rights, and public health policies, particularly in developing countries. If you have a question about diabetes, I'd be happy to try and help you find the answer, but it's not related to this text.


In [None]:
from langchain.prompts import PromptTemplate

# Instruction prompt that restricts the model to the retrieved context and
# tells it to admit when the context does not contain the answer.
prompt_template = """
You are an AI healthcare assistant with access to medical documents.
Answer the question only based on the given context. If you don't know, say 'I don't know.'
Provide short, accurate responses.

Context: {context}
Question: {question}"""

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Rebuild the QA chain so it uses the custom prompt instead of the default one.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type='stuff',
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True,
)

# Ask a question the document is not expected to cover.
query = "What are the early symptoms of Alzheimer's disease?"
result = qa.invoke(query)
print(result['result'])


I don't know. The provided context is a research paper on public health, innovation, and intellectual property, and it does not mention Alzheimer's disease or its symptoms.


In [None]:
import pandas as pd

# Target schema for the extracted rows.
bp_columns = ["Patient ID", "Name", "Blood Pressure", "Other Details"]

# Split every page into its individual text lines.
rows = [page.page_content.split('\n') for page in data]

# Keep only lines that mention blood pressure and turn each into one record.
structured_data = []

for page in rows:
    for line in page:
        if "Blood Pressure" in line or "BP" in line:
            # Split into at most len(bp_columns) fields so that extra tokens
            # are folded into the last column, then pad short lines with None.
            # Without this, any line that does not have exactly four
            # whitespace-separated tokens makes the DataFrame constructor
            # raise a column-count error.
            # NOTE(review): a plain whitespace split cannot keep multi-word
            # names together — confirm the real record layout before relying
            # on the column assignment.
            fields = line.strip().split(maxsplit=len(bp_columns) - 1)
            fields += [None] * (len(bp_columns) - len(fields))
            structured_data.append(fields)

# Create DataFrame (empty, but with the expected columns, when nothing matched)
df = pd.DataFrame(structured_data, columns=bp_columns)

# Verify DataFrame
print(df.head())


Empty DataFrame
Columns: [Patient ID, Name, Blood Pressure, Other Details]
Index: []


In [None]:
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent

# SECURITY: allow_dangerous_code=True lets the agent execute LLM-generated
# Python inside this kernel. Only run it in a disposable/sandboxed runtime.
# The agent receives a copy of the frame so that code it executes (for example
# dtype casts assigned back to a column) cannot silently change the notebook's
# own `df` between cells.
agent = create_pandas_dataframe_agent(llm, df.copy(), verbose=True, allow_dangerous_code=True)

query = "What is the average blood pressure level in the dataset?"

# With no extracted rows there is nothing to average: skip the LLM round-trips
# instead of letting the agent reason over an empty column and report NaN.
if df.empty:
    print("No blood-pressure rows were extracted from the document, so there is nothing to average.")
else:
    result = agent.invoke(query)
    print(result['output'])




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: To find the average blood pressure level, I need to access the 'Blood Pressure' column of the dataframe.

Action: python_repl_ast
Action Input: df['Blood Pressure'][0m[36;1m[1;3mSeries([], Name: Blood Pressure, dtype: object)[0m[32;1m[1;3mIt looks like the 'Blood Pressure' column is of object type, which means it's likely a string. We need to convert it to a numeric type to calculate the average.

Action: python_repl_ast
Action Input: df['Blood Pressure'].dtype[0m[36;1m[1;3mobject[0m[32;1m[1;3mLet's continue!

Action: python_repl_ast
Action Input: df['Blood Pressure'].str.contains('[a-zA-Z]').any()[0m[36;1m[1;3mFalse[0m[32;1m[1;3mAction: python_repl_ast
Action Input: df['Blood Pressure'] = df['Blood Pressure'].astype(float)[0m[36;1m[1;3m[0m[32;1m[1;3mAction: python_repl_ast
Action Input: df['Blood Pressure'].mean()[0m[36;1m[1;3mnan[0m[32;1m[1;3mThought: It looks like the mean is returnin

In [None]:
!pip install torch==2.5.1+cu124 --extra-index-url https://download.pytorch.org/whl/cu124
!pip install markupsafe==2.1.1
!pip install gradio


Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu124
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.5.1+cu124)
  Downloading https://download.pytorch.org/whl/cu124/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (24.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m63.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.5.1+cu124)
  Downloading https://download.pytorch.org/whl/cu124/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (883 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.5.1+cu124)
  Downloading https://download.pytorch.org/whl/cu124/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (13.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [None]:
import os
from getpass import getpass

import gradio as gr
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_groq import ChatGroq

# Initialize LLM
# The API key must not live in the notebook: anyone who can read the file can
# use it, and this app is exposed through a public share link. The key that
# was previously hard-coded here must be treated as compromised and revoked.
# Use GROQ_API_KEY from the environment when present; otherwise prompt for it
# without echoing the value.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass("Enter your Groq API key: ")
llm = ChatGroq(model_name="llama3-70b-8192")

# Prompt Template for Clear and Accurate Responses
prompt_template = """
You are an AI healthcare assistant with access to medical documents.
Answer the question only based on the given context. If you don't know, say 'I don't know.'
Provide short, accurate responses.

Context: {context}
Question: {question}
"""

prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

# Create the QA Chain
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=retriever,  # Use the FAISS retriever created earlier
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt}
)

# Gradio Chatbot Function
def chat_with_bot(query):
    """Answer a user question with the retrieval QA chain.

    Args:
        query: Question text submitted through the chat UI.

    Returns:
        The answer string stored under the 'result' key of the chain's
        response, or a short request for input when the query is empty or
        contains only whitespace.
    """
    # Skip the retrieval and LLM call when there is nothing to ask.
    if not query or not query.strip():
        return "Please enter a question."

    result = qa.invoke(query)
    return result['result']

# Gradio UI: one text input, one text output, answered by chat_with_bot.
interface = gr.Interface(
    title="🩺 AI Healthcare Assistant",
    description="Ask me anything about healthcare and medical conditions. I'll provide accurate, verified information from medical documents.",
    fn=chat_with_bot,
    inputs="text",
    outputs="text",
)

# Launch the chatbot. share=True also publishes the app on a temporary public
# URL, so anyone with the link can send questions through this runtime.
interface.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://33ba66688144c61479.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


