In [19]:
%pip install google-generativeai pypdf2 faiss-cpu sentence-transformers numpy


Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp313-cp313-win_amd64.whl.metadata (5.2 kB)
Collecting sentence-transformers
  Using cached sentence_transformers-5.1.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Using cached transformers-4.55.4-py3-none-any.whl.metadata (41 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.8.0-cp313-cp313-win_amd64.whl.metadata (30 kB)
Collecting safetensors>=0.4.3 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Using cached safetensors-0.6.2-cp38-abi3-win_amd64.whl.metadata (4.1 kB)
Downloading faiss_cpu-1.12.0-cp313-cp313-win_amd64.whl (18.2 MB)
   ---------------------------------------- 0.0/18.2 MB ? eta -:--:--
    --------------------------------------- 0.3/18.2 MB ? eta -:--:--
   -- ------------------------------------- 1.3/18.2 MB 5.4 MB/s eta 0:00:04
   ----- ---------------------------------- 2.6/18.2 MB 5.5 MB/s eta 0:00:03
   --------

In [None]:
# Imported here as well so this cell runs on a fresh kernel regardless of
# whether the main import cell has been executed yet.
import getpass
import os

import google.generativeai as genai

# Never store the key in the notebook: read it from the environment, and fall
# back to an interactive prompt (input is not echoed or saved in the output).
API_KEY = os.environ.get("GOOGLE_API_KEY") or getpass.getpass("Gemini API key: ")
genai.configure(api_key=API_KEY)

In [22]:
import google.generativeai as genai
import PyPDF2
import textwrap
import os
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

# ---------- SETUP ----------
# Gemini credentials are configured in the dedicated API-key cell.

# Local SentenceTransformers model used to embed both the document chunks and the queries.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# Ask the model for its vector width instead of hard-coding it, so the index
# stays consistent if the model name above is changed.
embedding_size = embedding_model.get_sentence_embedding_dimension()

# Create a flat L2-distance FAISS index sized to the embedding vectors.
# Row i of the index corresponds to document_store[i] and metadata_store[i].
index = faiss.IndexFlatL2(embedding_size)
document_store = []  # To store the actual text chunks
metadata_store = []  # To store metadata about each chunk

# ---------- INGEST DOCS ----------
def ingest_pdf(pdf_path, chunk_width=1000):
    """Read a PDF, split each page into text chunks, and add them to the index.

    Each page's text is wrapped into chunks of at most ``chunk_width``
    characters. Chunks are embedded with ``embedding_model`` and appended to
    the module-level ``index``, ``document_store`` and ``metadata_store`` so
    that row ``i`` of the index matches entry ``i`` of both stores.

    Args:
        pdf_path: Path to the PDF file to ingest.
        chunk_width: Maximum number of characters per chunk (default 1000).

    Returns:
        None. Prints a message when the file is missing, when no text could
        be extracted, or when chunks were added.
    """
    if not os.path.exists(pdf_path):
        print(f"File {pdf_path} not found!")
        return

    reader = PyPDF2.PdfReader(pdf_path)
    documents = []
    metadatas = []

    for page_num, page in enumerate(reader.pages):
        # Pages without a text layer may yield no text; treat that as empty
        # instead of failing on .strip().
        text = page.extract_text() or ""
        if not text.strip():
            continue

        # textwrap collapses whitespace and breaks on word boundaries only,
        # so words are never split but sentences may be.
        chunks = textwrap.wrap(
            text,
            width=chunk_width,
            break_long_words=False,
            break_on_hyphens=False,
        )
        for idx, chunk in enumerate(chunks):
            documents.append(chunk)
            metadatas.append({"source": pdf_path, "page": page_num, "chunk": idx})

    if not documents:
        print(f"No extractable text found in {pdf_path}")
        return

    # Generate embeddings for all chunks in one batch.
    embeddings = embedding_model.encode(documents)

    # The index is fed float32 vectors. The stores are extended only after the
    # index accepted them, which keeps positions aligned. These module-level
    # objects are mutated in place, so no `global` declaration is needed.
    index.add(np.asarray(embeddings, dtype='float32'))
    document_store.extend(documents)
    metadata_store.extend(metadatas)

    print(f"Added {len(documents)} chunks from {pdf_path}")

import json

ingest_pdf("mentalpolicy.pdf")

# Save the FAISS index for later use.
faiss.write_index(index, "document_index.faiss")

# The index only holds vectors; its row ids are positions in document_store /
# metadata_store. Persist those alongside it so a reloaded index can be
# mapped back to the chunk texts and their sources.
with open("document_store.json", "w", encoding="utf-8") as store_file:
    json.dump(
        {"documents": document_store, "metadatas": metadata_store},
        store_file,
        ensure_ascii=False,
    )

# ---------- QUERY ----------
def rag_query(query, k=3):
    """Answer ``query`` with Gemini, grounded on the closest indexed chunks.

    The query is embedded with ``embedding_model``, the ``k`` nearest chunks
    are looked up in the module-level FAISS ``index``, and their texts are
    passed as context to the ``gemini-2.5-flash`` model.

    Args:
        query: The question to answer.
        k: Number of nearest chunks to retrieve (default 3).

    Returns:
        The model's answer text. Any failure is reported as a string starting
        with ``"Error: "`` instead of being raised, so callers can print the
        result directly.
    """
    try:
        # Generate embedding for the query
        query_embedding = embedding_model.encode([query])

        # Search in FAISS index
        _distances, neighbor_ids = index.search(
            np.array(query_embedding).astype('float32'), k=k
        )

        # FAISS pads the result with -1 when fewer than k vectors are indexed.
        # A -1 would silently select the LAST chunk via negative indexing, so
        # keep only ids that point at a stored chunk.
        top_docs = [
            document_store[i]
            for i in neighbor_ids[0]
            if 0 <= i < len(document_store)
        ]
        if not top_docs:
            return "Error: no indexed documents available to answer the query"

        context = "\n".join(top_docs)
        prompt = f"Answer the query based on the following context:\n\nContext:\n{context}\n\nQuery: {query}"

        model = genai.GenerativeModel('gemini-2.5-flash')
        response = model.generate_content(prompt)
        return response.text
    except Exception as e:
        # Deliberate best-effort: surface the failure as text rather than
        # aborting the notebook cell.
        return f"Error: {str(e)}"

# Test: ask about a passage that appears in the ingested policy document.
print(rag_query("When you receive services from City-Hospital, you have the right to:"))

Added 221 chunks from mentalpolicy.pdf
When you receive services from City-Hospital, you have the right to:
*   Receive high-quality service
*   Be treated with respect and courtesy
*   Have your information kept private and confidential except as described in City-Hospital privacy statement
*   Be listened to and have staff work with you to make a plan to address your concerns and needs
*   Receive service in offices that are safe, clean and accessible
*   Get information and support to help you make decisions to improve your situation
*   Be served without discrimination
*   Discuss your service with staff to identify if it is working for you and express
