In [None]:
# Change directory to your project root if needed.
# The root can be overridden with the MEDICAL_CHATBOT_ROOT environment variable so
# the notebook is not tied to a single machine; when the variable is unset the
# original hard-coded location is used, so existing behaviour is unchanged.
import os

PROJECT_ROOT = os.environ.get("MEDICAL_CHATBOT_ROOT", "c:\\Users\\Anii\\Medical-Chatbot")

# Later cells use paths relative to this directory (e.g. 'Data/').
# os.chdir raises if the directory does not exist, exactly as before.
os.chdir(PROJECT_ROOT)

# Last expression of the cell: displays the working directory now in effect
# (plain-Python equivalent of the %pwd magic).
os.getcwd()

In [None]:
# 1. Load PDF documents
from langchain.document_loaders import PyPDFLoader, DirectoryLoader

def load_pdf_file(data):
    """Load the PDF files found in a directory.

    Args:
        data: Directory path handed to DirectoryLoader; files are selected
            with the glob pattern "*.pdf" and parsed with PyPDFLoader.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for that directory
        (a collection of loaded documents).
    """
    return DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader).load()

# Read the PDFs from the relative 'Data/' folder (resolved against the current
# working directory) and report how many document objects came back.
# NOTE(review): PyPDFLoader presumably yields one document per page, so this
# count is likely pages rather than files — confirm.
extracted_data = load_pdf_file('Data/')
print(f"Loaded {len(extracted_data)} documents")

In [None]:
# 2. Split into chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter

def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller, slightly overlapping chunks.

    Args:
        extracted_data: Documents to split (as returned by ``load_pdf_file``).
        chunk_size: Maximum chunk size passed to RecursiveCharacterTextSplitter.
            Defaults to 500, the value this notebook has always used.
        chunk_overlap: Overlap between consecutive chunks passed to the
            splitter. Defaults to 20, the value this notebook has always used.

    Returns:
        The chunked documents produced by ``split_documents``.
    """
    # The sizes are parameters (not literals) so chunking can be tuned per call
    # without editing the function; the defaults keep existing calls identical.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)

# Chunk the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data=extracted_data)
print(f"Created {len(text_chunks)} chunks")

In [None]:
# 3. Load Hugging Face embeddings (free & local)
from langchain.embeddings import HuggingFaceEmbeddings

def download_hugging_face_embeddings():
    """Build the embedding model used for both indexing and querying.

    Returns:
        A HuggingFaceEmbeddings instance configured with the
        'sentence-transformers/all-MiniLM-L6-v2' model.
    """
    embedder = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2',
    )
    return embedder

embeddings = download_hugging_face_embeddings()

# Smoke test: embed one sample query and print the vector length. This should
# agree with the dimension the Pinecone index was created with.
query_embedding = embeddings.embed_query("What is the purpose of the study?")
embedding_length = len(query_embedding)
print(f"Embedding vector length: {embedding_length}")

In [None]:
# 4. Load environment variables and connect to Pinecone
from dotenv import load_dotenv
load_dotenv()  # copies values from a local .env file into the process environment

import pinecone
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
# Only consumed by the legacy (v2) client below; kept defined either way so any
# code that reads this name continues to work.
PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT", "us-east1-gcp")

# Fail here with a clear message instead of an opaque client/auth error later.
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to your .env file or environment."
    )

if hasattr(pinecone, "Pinecone"):
    # Pinecone client v3+ removed pinecone.init(); the client is an object now.
    # langchain_pinecone (used in a later cell) requires this newer client.
    pc = pinecone.Pinecone(api_key=PINECONE_API_KEY)
else:
    # Legacy v2 client: module-level initialisation, as the notebook did originally.
    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)

index_name = "medical-chatbot"


In [None]:
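# 5. Pinecone index — creation and upload are left commented out because the index already exists.
# pc = pinecone.Index(index_name)
# To create the index or upload the chunks, uncomment the lines below; they use
# PineconeVectorStore, so run the import in the next cell first.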
# if index_name not in pinecone.list_indexes():
#     pinecone.create_index(name=index_name, dimension=384, metric="cosine")
# docsearch = PineconeVectorStore.from_documents(
#     documents=text_chunks,
#     index_name=index_name,
#     embedding=embeddings
# )

In [None]:
from langchain_pinecone import PineconeVectorStore

# Attach to the already-populated Pinecone index, using the same embedding
# model for queries.
docsearch = PineconeVectorStore.from_existing_index(index_name=index_name, embedding=embeddings)

# Retriever settings: plain similarity search with k=3.
retriever_search_kwargs = {"k": 3}
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs=retriever_search_kwargs,
)


In [None]:
# 6. Configure the Google Generative AI SDK for local testing.
# The key is read from the GOOGLE_API_KEY environment variable; os.getenv
# returns None when it is unset, and that None is passed through unchanged.
import google.generativeai as genai
google_api_key = os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=google_api_key)

from langchain.llms.base import LLM
from typing import Optional, List

class GeminiLLM(LLM):
    """LangChain LLM wrapper around the Google Generative AI (Gemini) SDK.

    Attributes are declared as class-level fields because LangChain's ``LLM``
    base class is a pydantic model: assigning undeclared attributes inside
    ``__init__`` (without calling ``super().__init__``) fails at construction.

    Args:
        model: Gemini model id passed to ``genai.GenerativeModel``.
            NOTE(review): model ids are retired over time — confirm the default
            is still served (e.g. via ``genai.list_models()``).
        temperature: Sampling temperature sent with every request.
        max_tokens: Upper bound on generated tokens (``max_output_tokens``).
        **kwargs: Forwarded to the LangChain base class (callbacks, tags, ...).
    """

    model: str = "gemini-2.0-flash"
    temperature: float = 0.4
    max_tokens: int = 512

    def __init__(self, model: str = "gemini-2.0-flash", temperature: float = 0.4, max_tokens: int = 512, **kwargs):
        # Explicit __init__ kept so positional construction still works; the
        # values are handed to the pydantic base class for validation/storage.
        super().__init__(model=model, temperature=temperature, max_tokens=max_tokens, **kwargs)

    @property
    def _llm_type(self) -> str:
        """Identifier LangChain uses to label this LLM implementation."""
        return "gemini"

    def _call(self, prompt: str, stop: Optional[List[str]] = None, run_manager=None, **kwargs) -> str:
        """Send ``prompt`` to the model and return the generated text.

        Args:
            prompt: Fully rendered prompt string.
            stop: Optional stop sequences; forwarded as ``stop_sequences``.
            run_manager: Accepted for compatibility with the base-class call
                signature; not used here.
            **kwargs: Accepted for compatibility; ignored.

        Returns:
            The text of the model's response.
        """
        generation_config = {
            "temperature": self.temperature,
            "max_output_tokens": self.max_tokens,
        }
        if stop:
            generation_config["stop_sequences"] = list(stop)

        # genai.chat is a plain function (there is no genai.chat.create), and the
        # PaLM chat endpoint it targeted has been retired; use the Gemini API.
        response = genai.GenerativeModel(self.model).generate_content(
            prompt,
            generation_config=generation_config,
        )
        # NOTE(review): accessing .text presumably raises when the API returns no
        # text candidate (e.g. a blocked response) — confirm whether to handle that.
        return response.text

llm = GeminiLLM()

In [None]:
# 7. Create RAG chain with prompt
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System instructions for the answering step. The {context} placeholder is
# where the retrieved documents are inserted; {input} below is the user question.
system_prompt = "".join([
    "You are an assistant for question-answering tasks. ",
    "Use the following pieces of retrieved context to answer the question. ",
    "If you don't know the answer, just say that you don't know, don't make up an answer. ",
    "Answer concisely and accurately based on the context provided.\n\n{context}",
])

chat_messages = [
    ("system", system_prompt),
    ("human", "{input}"),
]
prompt = ChatPromptTemplate.from_messages(chat_messages)

# Answering chain (LLM + prompt), then wrapped with the retriever so a single
# invoke() performs retrieval followed by answer generation.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)


In [None]:
# 8. Ask one sample question end to end.
response = rag_chain.invoke({"input": "What is Acne?"})

# Prefer the "answer" key; fall back to "output_text" when "answer" is
# missing or empty.
answer_text = response.get("answer") or response.get("output_text")
print("Answer:", answer_text)