In [1]:
pip install google-generativeai faiss-cpu sentence-transformers PyPDF2 python-docx 


Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp312-cp312-win_amd64.whl.metadata (5.1 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading faiss_cpu-1.11.0.post1-cp312-cp312-win_amd64.whl (14.9 MB)
   ---------------------------------------- 0.0/14.9 MB ? eta -:--:--
   ------------ --------------------------- 4.7/14.9 MB 28.6 MB/s eta 0:00:01
   ----------------------- ---------------- 8.9/14.9 MB 26.4 MB/s eta 0:00:01
   ---------------------------------------  14.7/14.9 MB 25.6 MB/s eta 0:00:01
   ---------------------------------------- 14.9/14.9 MB 24.0 MB/s eta 0:00:00
Downloading sentence_transformers-5.0.0-py3-none-any.whl (470 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2, faiss-cpu, sentence-transformers
Successfully installed PyPDF2-3.0.1 faiss-cpu-1.11.0.post1 sente



In [5]:
pip install tf-keras

Collecting tf-kerasNote: you may need to restart the kernel to use updated packages.

  Downloading tf_keras-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Downloading tf_keras-2.19.0-py3-none-any.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ---------------------------------------- 1.7/1.7 MB 18.8 MB/s eta 0:00:00
Installing collected packages: tf-keras
Successfully installed tf-keras-2.19.0




In [1]:
import os
import faiss
import numpy as np
from typing import List, Dict, Any
import google.generativeai as genai
from sentence_transformers import SentenceTransformer
import PyPDF2
from docx import Document
import re
import pickle

class DocumentProcessor:
    """Handles PDF and DOCX document processing"""
    
    @staticmethod
    def extract_text_from_pdf(file_path: str) -> str:
        """Extract text from PDF file"""
        text = ""
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page in pdf_reader.pages:
                    text += page.extract_text() + "\n"
        except Exception as e:
            print(f"Error reading PDF: {e}")
        return text
    
    @staticmethod
    def extract_text_from_docx(file_path: str) -> str:
        """Extract text from DOCX file"""
        text = ""
        try:
            doc = Document(file_path)
            for paragraph in doc.paragraphs:
                text += paragraph.text + "\n"
        except Exception as e:
            print(f"Error reading DOCX: {e}")
        return text
    
    @staticmethod
    def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
        """Split text into overlapping chunks"""
        # Clean the text
        text = re.sub(r'\s+', ' ', text.strip())
        
        chunks = []
        start = 0
        
        while start < len(text):
            end = start + chunk_size
            
            if end < len(text):
                sentence_ends = ['.', '!', '?']
                for i in range(end, max(start + chunk_size // 2, end - 100), -1):
                    if text[i] in sentence_ends and i + 1 < len(text) and text[i + 1] == ' ':
                        end = i + 1
                        break
            
            chunk = text[start:end].strip()
            if chunk:
                chunks.append(chunk)
            
            start = end - overlap
            
        return chunks

class RAGSystem:
    """Main RAG system class"""
    
    def __init__(self, gemini_api_key: str, embedding_model: str = "all-MiniLM-L6-v2"):
        """
        Initialize RAG system
        
        Args:
            gemini_api_key: Google Gemini API key
            embedding_model: Sentence transformer model for embeddings
        """
        genai.configure(api_key=gemini_api_key)
        self.gemini_model = genai.GenerativeModel('gemini-2.0-flash')
        
        self.embedding_model = SentenceTransformer(embedding_model)
        self.embedding_dim = self.embedding_model.get_sentence_embedding_dimension()
        
        self.index = faiss.IndexFlatIP(self.embedding_dim)  # Inner product for cosine similarity
        
        self.documents = []
        self.metadata = []
        
        self.doc_processor = DocumentProcessor()
        
    def add_document(self, file_path: str) -> None:
        """
        Add a document to the knowledge base
        
        Args:
            file_path: Path to PDF or DOCX file
        """
        file_ext = file_path.lower().split('.')[-1]
        
        # Extract text based on file type
        if file_ext == 'pdf':
            text = self.doc_processor.extract_text_from_pdf(file_path)
        elif file_ext == 'docx':
            text = self.doc_processor.extract_text_from_docx(file_path)
        else:
            raise ValueError(f"Unsupported file type: {file_ext}")
        
        if not text.strip():
            raise ValueError(f"No text extracted from {file_path}")
        
        # Chunk the text
        chunks = self.doc_processor.chunk_text(text)
        
        # Generate embeddings
        embeddings = self.embedding_model.encode(chunks)
        
        # Normalize embeddings for cosine similarity
        embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
        
        # Add to FAISS index
        self.index.add(embeddings.astype('float32'))
        
        # Store documents and metadata
        for i, chunk in enumerate(chunks):
            self.documents.append(chunk)
            self.metadata.append({
                'source': file_path,
                'chunk_id': len(self.documents),
                'text': chunk
            })
        
        print(f"Added {len(chunks)} chunks from {file_path}")
    
    def search_similar_documents(self, query: str, k: int = 5) -> List[Dict[str, Any]]:
        """
        Search for similar documents using vector similarity
        
        Args:
            query: Search query
            k: Number of similar documents to return
            
        Returns:
            List of similar document chunks with metadata
        """
        if self.index.ntotal == 0:
            return []
        
        query_embedding = self.embedding_model.encode([query])
        query_embedding = query_embedding / np.linalg.norm(query_embedding, axis=1, keepdims=True)
        
        scores, indices = self.index.search(query_embedding.astype('float32'), k)
        
        results = []
        for score, idx in zip(scores[0], indices[0]):
            if idx < len(self.documents):
                results.append({
                    'text': self.documents[idx],
                    'score': float(score),
                    'metadata': self.metadata[idx]
                })
        
        return results
    
    def generate_answer(self, query: str, context_chunks: List[str]) -> str:
        """
        Generate answer using Gemini with retrieved context
        
        Args:
            query: User question
            context_chunks: Retrieved relevant text chunks
            
        Returns:
            Generated answer
        """
        # Combine context chunks
        context = "\n\n".join(context_chunks)
        
        # Create prompt
        prompt = f"""
Based on the following context, please answer the question. If the answer cannot be found in the context, please say so.

Context:
{context}

Question: {query}

Answer:
"""
        
        try:
            response = self.gemini_model.generate_content(prompt)
            return response.text
        except Exception as e:
            return f"Error generating response: {e}"
    
    def query(self, question: str, k: int = 5) -> Dict[str, Any]:
        """
        Main query method that retrieves relevant documents and generates answer
        
        Args:
            question: User question
            k: Number of documents to retrieve
            
        Returns:
            Dictionary containing answer, sources, and retrieved documents
        """
        # Retrieve similar documents
        similar_docs = self.search_similar_documents(question, k)
        
        if not similar_docs:
            return {
                'answer': "No relevant documents found in the knowledge base.",
                'sources': [],
                'retrieved_docs': []
            }
        
        # Extract text chunks for context
        context_chunks = [doc['text'] for doc in similar_docs]
        
        # Generate answer
        answer = self.generate_answer(question, context_chunks)
        
        # Extract unique sources
        sources = list(set([doc['metadata']['source'] for doc in similar_docs]))
        
        return {
            'answer': answer,
            'sources': sources,
            'retrieved_docs': similar_docs
        }
    
    def save_index(self, save_path: str) -> None:
        """Save the FAISS index and metadata"""
        # Save FAISS index
        faiss.write_index(self.index, f"{save_path}.faiss")
        
        # Save documents and metadata
        with open(f"{save_path}_data.pkl", 'wb') as f:
            pickle.dump({
                'documents': self.documents,
                'metadata': self.metadata
            }, f)
        
        print(f"Index saved to {save_path}")
    
    def load_index(self, load_path: str) -> None:
        """Load a previously saved FAISS index and metadata"""
        # Load FAISS index
        self.index = faiss.read_index(f"{load_path}.faiss")
        
        with open(f"{load_path}_data.pkl", 'rb') as f:
            data = pickle.load(f)
            self.documents = data['documents']
            self.metadata = data['metadata']
        
        print(f"Index loaded from {load_path}")

def main():
    GEMINI_API_KEY = "API-KEY"  
    rag = RAGSystem(GEMINI_API_KEY)
    
    try:
        rag.add_document(r"C:\Users\Bhumi\Downloads\How-to-stop-overthinking-Kindle-Book.pdf")
        # rag.add_document("document2.docx")/
        
        rag.save_index("my_rag_index")
        
        print("\n=== RAG System Ready ===")
        print("Ask questions about your documents (type 'quit' to exit):")
        
        while True:
            question = input("\nQuestion: ").strip()
            
            if question.lower() in ['quit', 'exit', 'q']:
                break
            
            if not question:
                continue
            
            # Get answer
            result = rag.query(question)
            
            print(f"\nAnswer: {result['answer']}")
            print(f"\nSources: {', '.join(result['sources'])}")
            
            # Optionally show retrieved documents
            show_docs = input("\nShow retrieved documents? (y/n): ").lower() == 'y'
            if show_docs:
                print("\nRetrieved Documents:")
                for i, doc in enumerate(result['retrieved_docs'], 1):
                    print(f"\n{i}. (Score: {doc['score']:.3f})")
                    print(f"Text: {doc['text'][:200]}...")
    
    except Exception as e:
        print(f"Error: {e}")

if __name__ == "__main__":
    main()




modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Added 37 chunks from C:\Users\Bhumi\Downloads\How-to-stop-overthinking-Kindle-Book.pdf
Index saved to my_rag_index

=== RAG System Ready ===
Ask questions about your documents (type 'quit' to exit):



Question:  Explain Junk in, Junk out



Answer: Based on the context, "Junk in, Junk out" is a concept related to the overabundance and unmanageability of information, as indicated by the quote from Neil Postman: "Information has become a form of garbage. We don’t know what to do with it, have no control over it; don’t know how to get rid of it.” It also references keeping "the garbage out of your mind" and not letting "tiny things become toxic".


Sources: C:\Users\Bhumi\Downloads\How-to-stop-overthinking-Kindle-Book.pdf



Show retrieved documents? (y/n):  Why

Question:  y



Answer: I am sorry, but the answer to the question cannot be found in the context.


Sources: C:\Users\Bhumi\Downloads\How-to-stop-overthinking-Kindle-Book.pdf



Show retrieved documents? (y/n):  DO NOT LET THE ‘TINY THINGS BECOME TOXIC’ 

Question:  DO NOT LET THE ‘TINY THINGS BECOME TOXIC’ 



Answer: 12 


Sources: C:\Users\Bhumi\Downloads\How-to-stop-overthinking-Kindle-Book.pdf



Show retrieved documents? (y/n):  DO NOT LET THE ‘TINY THINGS BECOME TOXIC’ 

Question:  DO NOT LET THE ‘TINY THINGS BECOME TOXIC’ , explain this



Answer: The provided text does not explicitly explain "DO NOT LET THE 'TINY THINGS BECOME TOXIC'". However, the surrounding content suggests it refers to the importance of managing one's thoughts and information intake, preventing negative or trivial matters from becoming overwhelming or detrimental. The phrase "Junk in, Junk Out" and the quote from Neil Postman about information overload further support this interpretation.


Sources: C:\Users\Bhumi\Downloads\How-to-stop-overthinking-Kindle-Book.pdf



Show retrieved documents? (y/n):  Summarise the concepts chapter wise

Question:  Summarise the concepts chapter wise



Answer: Here's a chapter-wise summary of the concepts discussed in the provided text:

*   **Chapter 25:** Introduces the concept that "Thought is the Mother of intention".
*   **Chapter 26:** Presents a "FOUNDATIONAL PRINCIPLE" related to a "beautiful mind and a successful life".
*   **Chapter 27:** Continues exploring "Foundational principles of a beautiful mind and a successful life."
*   **Chapter 35:** Focuses on "Stop Overthinking in just three minutes."
*   **Chapter 36:** Highlights the idea that "YOUR MIND IS YOUR LIFE MENTOR."
*   **Chapter 37:** Further discusses "Stop Overthinking in just three minutes."


Sources: C:\Users\Bhumi\Downloads\How-to-stop-overthinking-Kindle-Book.pdf



Show retrieved documents? (y/n):  y



Retrieved Documents:

1. (Score: 0.210)
Text:            D       >)       @ Thought is the Mother of intention 25  +=--,-!,# + #,...

2. (Score: 0.207)
Text:   &      1&            % &    & %  ; 6 & -!? 6@ )!((...

3. (Score: 0.195)
Text:  E  %      & )*  +  , -, #-+ !(+    ! +  -    %= 26  FOUNDATIONAL PRINCIPLE...

4. (Score: 0.193)
Text:             >     AB    &    @         %       ...

5. (Score: 0.161)
Text:  6  4# ( - &  !   # ++-' &&! ++ -  & #


Question:  Summarise the concepts chapter wise from 1



Answer: I am sorry, but I cannot summarise the concepts chapter wise from 1 as the context does not contain chapter information.

Sources: C:\Users\Bhumi\Downloads\How-to-stop-overthinking-Kindle-Book.pdf



Show retrieved documents? (y/n):  n

Question:  who is Dr. Saloni Singh



Answer: Dr. Saloni Singh is the author of the book which provides proven ways to relieve anxiety, stress, confusion and tap into the power of a calm and clear mind.


Sources: C:\Users\Bhumi\Downloads\How-to-stop-overthinking-Kindle-Book.pdf



Show retrieved documents? (y/n):  n

Question:  who is Neil Postman



Answer: Neil Postman is mentioned in the context as the author of the quote: “Information has become a form of garbage. We don’t know what to do with it, have no control over it; don’t know how to get rid of it.”


Sources: C:\Users\Bhumi\Downloads\How-to-stop-overthinking-Kindle-Book.pdf



Show retrieved documents? (y/n):  n

Question:  exit
