In [2]:
# Install required packages
!pip install -q transformers torch sentence-transformers faiss-cpu pypdf2 langchain openai python-dotenv chromadb python-docx google-generativeai anthropic


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m103.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m76.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m37.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.3 MB/s[0m eta [36m0

In [3]:
# Import necessary libraries
import os
import re
import json
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple, Optional, Union
from dataclasses import dataclass
import warnings
warnings.filterwarnings('ignore')

# Core libraries
import torch
from transformers import AutoTokenizer, AutoModel, pipeline, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
import faiss

# Document processing
import PyPDF2
from docx import Document
from io import BytesIO

# Google Drive integration
from google.colab import drive, files
import zipfile

# Vector database
import chromadb
from chromadb.config import Settings

# LLM APIs
import openai
from dotenv import load_dotenv
try:
    import google.generativeai as genai
except ImportError:
    print("Google Generative AI not available")

# Display and visualization
from IPython.display import display, Markdown, HTML
import matplotlib.pyplot as plt

print("✅ All packages installed successfully!")

✅ All packages installed successfully!


In [4]:
# Load environment variables
load_dotenv()

def setup_llm_credentials():
    """Setup LLM API credentials"""
    print("\n🔑 LLM SETUP OPTIONS:")
    print("=" * 50)
    print("1. OpenAI GPT (requires API key)")
    print("2. Google Gemini (requires API key)")
    print("3. Hugging Face Models (free, runs locally)")
    print("4. Manual API key input")

    choice = input("\nSelect LLM option (1/2/3/4): ").strip()

    credentials = {}

    if choice == "1":
        api_key = os.getenv("OPENAI_API_KEY") or input("Enter OpenAI API Key: ").strip()
        if api_key:
            credentials["openai_key"] = api_key
            print("✅ OpenAI credentials configured")

    elif choice == "2":
        api_key = os.getenv("GOOGLE_API_KEY") or input("Enter Google API Key: ").strip()
        if api_key:
            credentials["google_key"] = api_key
            print("✅ Google Gemini credentials configured")

    elif choice == "3":
        print("✅ Will use Hugging Face models (no API key required)")
        credentials["use_hf"] = True

    elif choice == "4":
        print("Available APIs:")
        print("- OpenAI: Enter 'openai:YOUR_KEY'")
        print("- Google: Enter 'google:YOUR_KEY'")
        print("- Hugging Face: Enter 'hf' for free models")

        manual_input = input("Enter credentials: ").strip()
        if manual_input.startswith("openai:"):
            credentials["openai_key"] = manual_input.split(":", 1)[1]
        elif manual_input.startswith("google:"):
            credentials["google_key"] = manual_input.split(":", 1)[1]
        elif manual_input.lower() == "hf":
            credentials["use_hf"] = True

    return credentials

In [11]:
llm_credentials = setup_llm_credentials()

def mount_drive():
    """Mount Google Drive"""
    try:
        drive.mount('/content/drive')
        print("✅ Google Drive mounted successfully!")
        return True
    except Exception as e:
        print(f"❌ Error mounting Google Drive: {e}")
        return False

def upload_files():
    """Upload files directly to Colab"""
    print("📁 Please upload your documents (PDF, DOCX, TXT files)")
    uploaded = files.upload()

    # Create documents directory
    os.makedirs("documents", exist_ok=True)

    # Move uploaded files to documents directory
    for filename in uploaded.keys():
        os.rename(filename, f"documents/{filename}")
        print(f"✅ Moved {filename} to documents/")

    return list(uploaded.keys())

def list_drive_files(drive_path="/content/drive/MyDrive"):
    """List files in Google Drive"""
    try:
        files_found = []
        for root, dirs, files in os.walk(drive_path):
            for file in files:
                if file.lower().endswith(('.pdf', '.docx', '.txt', '.doc')):
                    full_path = os.path.join(root, file)
                    rel_path = os.path.relpath(full_path, drive_path)
                    files_found.append((file, full_path, rel_path))
        return files_found
    except Exception as e:
        print(f"❌ Error listing drive files: {e}")
        return []

def copy_from_drive(file_paths):
    """Copy selected files from Google Drive to local documents folder"""
    os.makedirs("documents", exist_ok=True)
    copied_files = []

    for file_path in file_paths:
        try:
            filename = os.path.basename(file_path)
            destination = f"documents/{filename}"

            # Copy file
            with open(file_path, 'rb') as src, open(destination, 'wb') as dst:
                dst.write(src.read())

            copied_files.append(filename)
            print(f"✅ Copied: {filename}")

        except Exception as e:
            print(f"❌ Error copying {file_path}: {e}")

    return copied_files


🔑 LLM SETUP OPTIONS:
1. OpenAI GPT (requires API key)
2. Google Gemini (requires API key)
3. Hugging Face Models (free, runs locally)
4. Manual API key input

Select LLM option (1/2/3/4): 3
✅ Will use Hugging Face models (no API key required)


In [12]:
# Document Upload Interface
print("📂 DOCUMENT UPLOAD OPTIONS:")
print("1. Mount Google Drive and select files")
print("2. Upload files directly")
print("3. Skip (if documents already in documents/ folder)")

choice = input("\nEnter your choice (1/2/3): ").strip()

uploaded_files = []

if choice == "1":
    if mount_drive():
        print("\n📁 Scanning Google Drive for documents...")
        drive_files = list_drive_files()

        if drive_files:
            print(f"\n📋 Found {len(drive_files)} documents in Google Drive:")
            for i, (filename, full_path, rel_path) in enumerate(drive_files, 1):
                print(f"{i}. {filename} ({rel_path})")

            # Let user select files
            selection = input(f"\nEnter file numbers to use (e.g., 1,3,5) or 'all' for all files: ").strip()

            if selection.lower() == 'all':
                selected_paths = [full_path for _, full_path, _ in drive_files]
            else:
                try:
                    indices = [int(x.strip()) - 1 for x in selection.split(',')]
                    selected_paths = [drive_files[i][1] for i in indices if 0 <= i < len(drive_files)]
                except:
                    print("❌ Invalid selection. Using all files.")
                    selected_paths = [full_path for _, full_path, _ in drive_files]

            uploaded_files = copy_from_drive(selected_paths)
        else:
            print("❌ No documents found in Google Drive")

elif choice == "2":
    uploaded_files = upload_files()

elif choice == "3":
    # Check if documents folder exists and has files
    if os.path.exists("documents"):
        uploaded_files = [f for f in os.listdir("documents")
                         if f.lower().endswith(('.pdf', '.docx', '.txt', '.doc'))]
        if uploaded_files:
            print(f"✅ Found {len(uploaded_files)} documents in documents/ folder")
        else:
            print("❌ No documents found in documents/ folder")
    else:
        print("❌ documents/ folder not found")

else:
    print("❌ Invalid choice")

if not uploaded_files:
    print("⚠️ No documents available. Please upload documents first.")
else:
    print(f"\n🎉 Ready to process {len(uploaded_files)} documents!")
    for file in uploaded_files:
        print(f"  📄 {file}")

@dataclass
class DocumentChunk:
    """Represents a chunk of text from a document"""
    text: str
    source: str
    page_number: int
    chunk_id: str
    metadata: Dict

class DocumentProcessor:
    """Handles document loading, cleaning, and chunking"""

    def __init__(self, chunk_size: int = 1000, overlap: int = 200):
        self.chunk_size = chunk_size
        self.overlap = overlap
        self.supported_formats = ['.pdf', '.docx', '.txt', '.doc']

    def extract_text_from_pdf(self, pdf_path: str) -> List[Tuple[str, int]]:
        """Extract text from PDF file"""
        text_pages = []

        try:
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)

                for page_num, page in enumerate(pdf_reader.pages):
                    text = page.extract_text()
                    if text.strip():
                        text_pages.append((text, page_num + 1))

        except Exception as e:
            print(f"❌ Error reading PDF {pdf_path}: {str(e)}")

        return text_pages

    def extract_text_from_docx(self, docx_path: str) -> List[Tuple[str, int]]:
        """Extract text from DOCX file"""
        try:
            doc = Document(docx_path)
            text_pages = []
            current_text = ""

            for paragraph in doc.paragraphs:
                current_text += paragraph.text + "\n"

            if current_text.strip():
                text_pages.append((current_text, 1))

            return text_pages

        except Exception as e:
            print(f"❌ Error reading DOCX {docx_path}: {str(e)}")
            return []

    def extract_text_from_txt(self, txt_path: str) -> List[Tuple[str, int]]:
        """Extract text from TXT file"""
        try:
            with open(txt_path, 'r', encoding='utf-8') as file:
                text = file.read()
                return [(text, 1)] if text.strip() else []
        except UnicodeDecodeError:
            try:
                with open(txt_path, 'r', encoding='latin-1') as file:
                    text = file.read()
                    return [(text, 1)] if text.strip() else []
            except Exception as e:
                print(f"❌ Error reading TXT {txt_path}: {str(e)}")
                return []
        except Exception as e:
            print(f"❌ Error reading TXT {txt_path}: {str(e)}")
            return []

    def extract_text_from_file(self, file_path: str) -> List[Tuple[str, int]]:
        """Extract text from any supported file format"""
        file_ext = os.path.splitext(file_path)[1].lower()

        if file_ext == '.pdf':
            return self.extract_text_from_pdf(file_path)
        elif file_ext in ['.docx', '.doc']:
            return self.extract_text_from_docx(file_path)
        elif file_ext == '.txt':
            return self.extract_text_from_txt(file_path)
        else:
            print(f"❌ Unsupported file format: {file_ext}")
            return []

    def clean_text(self, text: str) -> str:
        """Clean and normalize text"""
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text)
        # Remove special characters but keep punctuation
        text = re.sub(r'[^\w\s.,!?;:()\-\'\""]', '', text)
        # Fix common PDF extraction issues
        text = re.sub(r'(\w)-\s+(\w)', r'\1\2', text)  # Fix hyphenated words

        return text.strip()

    def create_chunks(self, text: str, source: str, page_number: int) -> List[DocumentChunk]:
        """Create overlapping chunks from text"""
        chunks = []
        words = text.split()

        if len(words) < 10:  # Skip very short texts
            return chunks

        for i in range(0, len(words), self.chunk_size - self.overlap):
            chunk_words = words[i:i + self.chunk_size]
            chunk_text = ' '.join(chunk_words)

            if len(chunk_text.strip()) > 50:  # Only keep substantial chunks
                chunk_id = f"{source}_page_{page_number}_chunk_{len(chunks)}"

                chunk = DocumentChunk(
                    text=chunk_text,
                    source=source,
                    page_number=page_number,
                    chunk_id=chunk_id,
                    metadata={
                        'word_count': len(chunk_words),
                        'char_count': len(chunk_text),
                        'file_type': os.path.splitext(source)[1]
                    }
                )
                chunks.append(chunk)

        return chunks

    def process_documents(self, documents_folder: str = "documents") -> List[DocumentChunk]:
        """Process all documents in folder and return chunks"""
        all_chunks = []

        if not os.path.exists(documents_folder):
            print(f"❌ Documents folder '{documents_folder}' not found")
            return all_chunks

        files = [f for f in os.listdir(documents_folder)
                if os.path.splitext(f)[1].lower() in self.supported_formats]

        if not files:
            print(f"❌ No supported documents found in '{documents_folder}'")
            return all_chunks

        print(f"🔄 Processing {len(files)} documents...")

        for filename in files:
            print(f"📄 Processing: {filename}")

            file_path = os.path.join(documents_folder, filename)

            # Extract text from file
            text_pages = self.extract_text_from_file(file_path)

            if not text_pages:
                print(f"⚠️ No text extracted from {filename}")
                continue

            # Process each page
            doc_chunks = 0
            for text, page_num in text_pages:
                cleaned_text = self.clean_text(text)
                if cleaned_text:
                    chunks = self.create_chunks(cleaned_text, filename, page_num)
                    all_chunks.extend(chunks)
                    doc_chunks += len(chunks)

            print(f"✅ Created {doc_chunks} chunks from {filename}")

        return all_chunks

📂 DOCUMENT UPLOAD OPTIONS:
1. Mount Google Drive and select files
2. Upload files directly
3. Skip (if documents already in documents/ folder)

Enter your choice (1/2/3): 2
📁 Please upload your documents (PDF, DOCX, TXT files)


Saving 2005.11401v4.pdf to 2005.11401v4.pdf
Saving 1706.03762v7.pdf to 1706.03762v7.pdf
Saving 2005.14165v4.pdf to 2005.14165v4.pdf
✅ Moved 2005.11401v4.pdf to documents/
✅ Moved 1706.03762v7.pdf to documents/
✅ Moved 2005.14165v4.pdf to documents/

🎉 Ready to process 3 documents!
  📄 2005.11401v4.pdf
  📄 1706.03762v7.pdf
  📄 2005.14165v4.pdf


In [13]:
# Process documents
print("\n🔄 Processing your documents...")
processor = DocumentProcessor(chunk_size=800, overlap=150)
document_chunks = processor.process_documents("documents")

if document_chunks:
    print(f"\n📊 Processing Summary:")
    print(f"Total chunks created: {len(document_chunks)}")

    # Group by source
    source_counts = {}
    for chunk in document_chunks:
        source_counts[chunk.source] = source_counts.get(chunk.source, 0) + 1

    for source, count in source_counts.items():
        print(f"  📄 {source}: {count} chunks")
else:
    print("❌ No chunks created. Please check your documents.")

class EmbeddingModel:
    """Handles text embeddings using sentence transformers"""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        print(f"🔄 Loading embedding model: {model_name}")
        self.model = SentenceTransformer(model_name)
        self.dimension = self.model.get_sentence_embedding_dimension()
        print(f"✅ Embedding model loaded. Dimension: {self.dimension}")

    def encode(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
        """Encode texts to embeddings"""
        return self.model.encode(texts, batch_size=batch_size, show_progress_bar=True)

    def encode_single(self, text: str) -> np.ndarray:
        """Encode single text to embedding"""
        return self.model.encode([text])[0]

class VectorStore:
    """FAISS-based vector store for efficient similarity search"""

    def __init__(self, embedding_model: EmbeddingModel):
        self.embedding_model = embedding_model
        self.index = None
        self.chunks = []
        self.embeddings = None

    def build_index(self, chunks: List[DocumentChunk]) -> None:
        """Build FAISS index from document chunks"""
        if not chunks:
            print("❌ No chunks provided for indexing")
            return

        print("🔄 Building vector index...")

        self.chunks = chunks
        texts = [chunk.text for chunk in chunks]

        # Generate embeddings
        print("🔄 Generating embeddings...")
        self.embeddings = self.embedding_model.encode(texts)

        # Build FAISS index
        self.index = faiss.IndexFlatIP(self.embedding_model.dimension)  # Inner product for cosine similarity

        # Normalize embeddings for cosine similarity
        faiss.normalize_L2(self.embeddings)
        self.index.add(self.embeddings)

        print(f"✅ Vector index built with {len(chunks)} chunks")

    def search(self, query: str, k: int = 5) -> List[Tuple[DocumentChunk, float]]:
        """Search for most relevant chunks"""
        if self.index is None:
            raise ValueError("Index not built. Call build_index first.")

        # Encode query
        query_embedding = self.embedding_model.encode_single(query).reshape(1, -1)
        faiss.normalize_L2(query_embedding)

        # Search
        scores, indices = self.index.search(query_embedding, k)

        results = []
        for i, (score, idx) in enumerate(zip(scores[0], indices[0])):
            if idx < len(self.chunks):
                results.append((self.chunks[idx], float(score)))

        return results


🔄 Processing your documents...
🔄 Processing 3 documents...
📄 Processing: 2005.11401v4.pdf
✅ Created 22 chunks from 2005.11401v4.pdf
📄 Processing: 2005.14165v4.pdf
✅ Created 93 chunks from 2005.14165v4.pdf
📄 Processing: 1706.03762v7.pdf
✅ Created 15 chunks from 1706.03762v7.pdf

📊 Processing Summary:
Total chunks created: 130
  📄 2005.11401v4.pdf: 22 chunks
  📄 2005.14165v4.pdf: 93 chunks
  📄 1706.03762v7.pdf: 15 chunks


In [14]:
# Initialize embedding model and vector store (only if we have chunks)
if document_chunks:
    embedding_model = EmbeddingModel("all-MiniLM-L6-v2")
    vector_store = VectorStore(embedding_model)
    vector_store.build_index(document_chunks)
    print("✅ Vector store ready!")
else:
    print("⚠️ Skipping vector store creation - no document chunks available")

class LLMInterface:
    """Unified interface for different LLM providers"""

    def __init__(self, credentials: Dict):
        self.credentials = credentials
        self.llm_type = None
        self.model = None
        self.tokenizer = None

        self._setup_llm()

    def _setup_llm(self):
        """Setup the LLM based on available credentials"""

        if "openai_key" in self.credentials:
            self.llm_type = "openai"
            openai.api_key = self.credentials["openai_key"]
            print("✅ OpenAI LLM configured")

        elif "google_key" in self.credentials:
            self.llm_type = "google"
            try:
                genai.configure(api_key=self.credentials["google_key"])
                self.model = genai.GenerativeModel('gemini-pro')
                print("✅ Google Gemini LLM configured")
            except Exception as e:
                print(f"❌ Error configuring Google Gemini: {e}")
                self._setup_huggingface_fallback()

        elif "use_hf" in self.credentials:
            self._setup_huggingface()

        else:
            print("⚠️ No LLM credentials provided, using Hugging Face fallback")
            self._setup_huggingface_fallback()

    def _setup_huggingface(self):
        """Setup Hugging Face model"""
        self.llm_type = "huggingface"

        print("🔄 Loading Hugging Face model...")
        print("Choose a model:")
        print("1. microsoft/DialoGPT-medium (Conversational)")
        print("2. google/flan-t5-base (Instruction following)")
        print("3. distilgpt2 (Lightweight)")

        model_choice = input("Select model (1/2/3) [default: 2]: ").strip() or "2"

        model_map = {
            "1": "microsoft/DialoGPT-medium",
            "2": "google/flan-t5-base",
            "3": "distilgpt2"
        }

        model_name = model_map.get(model_choice, "google/flan-t5-base")

        try:
            if "flan-t5" in model_name:
                from transformers import T5ForConditionalGeneration, T5Tokenizer
                self.model = T5ForConditionalGeneration.from_pretrained(model_name)
                self.tokenizer = T5Tokenizer.from_pretrained(model_name)
                self.model_name = model_name
            else:
                self.model = AutoModelForCausalLM.from_pretrained(model_name)
                self.tokenizer = AutoTokenizer.from_pretrained(model_name)
                self.model_name = model_name

            print(f"✅ Hugging Face model loaded: {model_name}")

        except Exception as e:
            print(f"❌ Error loading {model_name}: {e}")
            self._setup_huggingface_fallback()

    def _setup_huggingface_fallback(self):
        """Fallback to a simple HF model"""
        self.llm_type = "huggingface"
        model_name = "distilgpt2"

        try:
            self.model = AutoModelForCausalLM.from_pretrained(model_name)
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.model_name = model_name

            # Add pad token if not present
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

            print(f"✅ Fallback Hugging Face model loaded: {model_name}")

        except Exception as e:
            print(f"❌ Critical error loading fallback model: {e}")
            raise

    def generate_response(self, prompt: str, max_length: int = 500) -> str:
        """Generate response using the configured LLM"""

        try:
            if self.llm_type == "openai":
                return self._generate_openai(prompt, max_length)
            elif self.llm_type == "google":
                return self._generate_google(prompt, max_length)
            elif self.llm_type == "huggingface":
                return self._generate_huggingface(prompt, max_length)
            else:
                return "Error: No LLM configured"

        except Exception as e:
            print(f"❌ Error generating response: {e}")
            return f"Error generating response: {str(e)}"

    def _generate_openai(self, prompt: str, max_length: int) -> str:
        """Generate response using OpenAI"""
        try:
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": prompt}],
                max_tokens=max_length,
                temperature=0.7
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            print(f"OpenAI API error: {e}")
            return "Error with OpenAI API. Please check your API key and quota."

    def _generate_google(self, prompt: str, max_length: int) -> str:
        """Generate response using Google Gemini"""
        try:
            response = self.model.generate_content(prompt)
            return response.text.strip()
        except Exception as e:
            print(f"Google Gemini error: {e}")
            return "Error with Google Gemini API. Please check your API key."

    def _generate_huggingface(self, prompt: str, max_length: int) -> str:
        """Generate response using Hugging Face model"""
        try:
            if "flan-t5" in self.model_name:
                # T5 models expect specific input format
                inputs = self.tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)

                with torch.no_grad():
                    outputs = self.model.generate(
                        inputs,
                        max_length=max_length,
                        num_return_sequences=1,
                        temperature=0.7,
                        do_sample=True,
                        pad_token_id=self.tokenizer.pad_token_id
                    )

                response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
                return response.strip()

            else:
                # GPT-style models
                inputs = self.tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)

                with torch.no_grad():
                    outputs = self.model.generate(
                        inputs,
                        max_length=len(inputs[0]) + max_length,
                        num_return_sequences=1,
                        temperature=0.7,
                        do_sample=True,
                        pad_token_id=self.tokenizer.pad_token_id
                    )

                # Extract only the new generated text
                response = self.tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
                return response.strip()

        except Exception as e:
            print(f"Hugging Face model error: {e}")
            return "Error generating response with local model."

class EnhancedAnswerGenerator:
    """Enhanced answer generator using proper LLM"""

    def __init__(self, llm_interface: LLMInterface):
        self.llm = llm_interface
        print("✅ Enhanced answer generator ready!")

    def generate_answer(self, query: str, context_chunks: List[DocumentChunk], max_length: int = 400) -> str:
        """Generate answer based on query and retrieved context using LLM"""

        if not context_chunks:
            return "No relevant context found to answer this question."

        # Prepare context from top chunks
        context_texts = []
        for i, chunk in enumerate(context_chunks[:3], 1):  # Use top 3 chunks
            source_name = os.path.splitext(chunk.source)[0]
            # Limit chunk text to avoid token limits
            chunk_text = chunk.text[:500] + "..." if len(chunk.text) > 500 else chunk.text
            context_texts.append(f"Source {i} ({source_name}):\n{chunk_text}")

        context = "\n\n".join(context_texts)

        # Create a well-structured prompt
        prompt = f"""Based on the following document excerpts, provide a comprehensive and accurate answer to the question. Use information from the sources provided and cite them when relevant.

CONTEXT:
{context}

QUESTION: {query}

INSTRUCTIONS:
- Provide a detailed and informative answer
- Use only information from the provided context
- If the context doesn't contain enough information, state this clearly
- Be concise but comprehensive
- Cite sources when making specific claims

ANSWER:"""

        # Generate answer using the LLM
        answer = self.llm.generate_response(prompt, max_length)

        # Clean up the answer
        answer = self._clean_answer(answer)

        return answer if answer else self._fallback_answer(query, context_chunks)

    def _clean_answer(self, answer: str) -> str:
        """Clean and format the generated answer"""
        # Remove common artifacts
        answer = re.sub(r'^(Answer:|ANSWER:)\s*', '', answer, flags=re.IGNORECASE)
        answer = re.sub(r'\n+', ' ', answer)  # Replace multiple newlines with space
        answer = re.sub(r'\s+', ' ', answer)  # Replace multiple spaces with single space

        # Remove incomplete sentences at the end
        sentences = re.split(r'[.!?]+', answer)
        if len(sentences) > 1 and len(sentences[-1].strip()) < 10:
            answer = '.'.join(sentences[:-1]) + '.'

        return answer.strip()

    def _fallback_answer(self, query: str, context_chunks: List[DocumentChunk]) -> str:
        """Fallback answer generation using simple extraction"""
        print("🔄 Using fallback answer generation...")

        # Simple keyword-based extraction
        query_words = set(query.lower().split())
        relevant_sentences = []

        for chunk in context_chunks[:2]:
            sentences = re.split(r'[.!?]+', chunk.text)
            for sentence in sentences:
                if len(sentence.strip()) > 20:
                    sentence_words = set(sentence.lower().split())
                    overlap = len(query_words.intersection(sentence_words))

                    if overlap > 0:
                        relevant_sentences.append((sentence.strip(), overlap))

        # Sort by relevance and take top sentences
        relevant_sentences.sort(key=lambda x: x[1], reverse=True)

        if relevant_sentences:
            answer_parts = [sent[0] for sent in relevant_sentences[:3]]
            return ". ".join(answer_parts) + "."
        else:
            return "Based on the available documents, I cannot provide a specific answer to this question. The documents may not contain relevant information about this topic."


🔄 Loading embedding model: all-MiniLM-L6-v2
✅ Embedding model loaded. Dimension: 384
🔄 Building vector index...
🔄 Generating embeddings...


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

✅ Vector index built with 130 chunks
✅ Vector store ready!


In [15]:
# Initialize LLM and enhanced answer generator
if document_chunks:
    print("\n🔄 Initializing LLM interface...")
    llm_interface = LLMInterface(llm_credentials)
    answer_generator = EnhancedAnswerGenerator(llm_interface)
    print("✅ Enhanced answer generator ready!")

class EnhancedRAGSystem:
    """Complete RAG system with enhanced LLM integration"""

    def __init__(self, vector_store: VectorStore, answer_generator: EnhancedAnswerGenerator):
        self.vector_store = vector_store
        self.answer_generator = answer_generator

    def answer_question(self, query: str, k: int = 5, include_metadata: bool = True) -> Dict:
        """Answer a question using enhanced RAG approach"""

        # Step 1: Retrieve relevant chunks
        print(f"🔍 Searching for relevant information...")
        retrieved_chunks = self.vector_store.search(query, k=k)

        if not retrieved_chunks:
            return {
                'query': query,
                'answer': "No relevant information found in the documents to answer this question."
            }

        # Unzip the retrieved chunks and scores
        chunks, scores = zip(*retrieved_chunks)

        # Step 2: Generate answer using LLM
        print(f"🧠 Generating answer...")
        answer = self.answer_generator.generate_answer(query, chunks)

        # Prepare response with metadata if requested
        result = {
            'query': query,
            'answer': answer,
            'relevant_chunks': [
                {
                    'text': chunk.text,
                    'source': chunk.source,
                    'page_number': chunk.page_number,
                    'chunk_id': chunk.chunk_id,
                    'score': float(scores[i]),
                    'metadata': chunk.metadata
                }
                for i, chunk in enumerate(chunks)
            ]
        }

        return result


🔄 Initializing LLM interface...
🔄 Loading Hugging Face model...
Choose a model:
1. microsoft/DialoGPT-medium (Conversational)
2. google/flan-t5-base (Instruction following)
3. distilgpt2 (Lightweight)
Select model (1/2/3) [default: 2]: 1


config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/863M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/863M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

✅ Hugging Face model loaded: microsoft/DialoGPT-medium
✅ Enhanced answer generator ready!
✅ Enhanced answer generator ready!


In [17]:
pip install reportlab

Collecting reportlab
  Downloading reportlab-4.4.1-py3-none-any.whl.metadata (1.8 kB)
Downloading reportlab-4.4.1-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: reportlab
Successfully installed reportlab-4.4.1


In [21]:
# # Example usage:
# if document_chunks:
#     rag_system = EnhancedRAGSystem(vector_store, answer_generator)

#     def ask_question():
#         """Interactive function to ask questions"""
#         print("\n\n❓ Ask a question about your documents:")
#         while True:
#             try:
#                 query = input("Your question (type 'exit' to quit): ").strip()
#                 if query.lower() in ['exit', 'quit']:
#                     break
#                 if not query:
#                     continue

#                 result = rag_system.answer_question(query)

#                 print("\n🤖 Answer:")
#                 print(result['answer'])

#                 print("\n📚 Relevant Document Excerpts:")
#                 for i, chunk in enumerate(result['relevant_chunks'][:3], 1):
#                     print(f"\n{i}. Source: {chunk['source']} (Page {chunk['page_number']})")
#                     print(f"   Score: {chunk['score']:.2f}")
#                     print(f"   Text: {chunk['text'][:500]}...")  # Truncate long text

#             except KeyboardInterrupt:
#                 print("\nExiting...")
#                 break
#             except Exception as e:
#                 print(f"Error: {e}")

#     # Start interactive Q&A session
#     ask_question()
# else:
#     print("❌ Cannot initialize RAG system - no documents processed")



from reportlab.lib.pagesizes import letter, A4
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from reportlab.lib.colors import HexColor
from datetime import datetime
import os

class QASession:
    """Class to store Q&A session data"""
    def __init__(self):
        self.qa_pairs = []
        self.session_start = datetime.now()

    def add_qa_pair(self, question, answer, relevant_chunks):
        """Add a question-answer pair to the session"""
        self.qa_pairs.append({
            'question': question,
            'answer': answer,
            'relevant_chunks': relevant_chunks,
            'timestamp': datetime.now()
        })

    def generate_pdf(self, filename=None):
        """Generate PDF from Q&A session"""
        if not filename:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"qa_session_{timestamp}.pdf"

        # Create PDF document
        doc = SimpleDocTemplate(filename, pagesize=A4)

        # Define styles
        styles = getSampleStyleSheet()
        title_style = ParagraphStyle(
            'CustomTitle',
            parent=styles['Heading1'],
            fontSize=16,
            spaceAfter=30,
            textColor=HexColor('#2E86AB'),
            alignment=1  # Center alignment
        )

        question_style = ParagraphStyle(
            'Question',
            parent=styles['Heading2'],
            fontSize=12,
            spaceAfter=12,
            textColor=HexColor('#A23B72'),
            leftIndent=0
        )

        answer_style = ParagraphStyle(
            'Answer',
            parent=styles['Normal'],
            fontSize=10,
            spaceAfter=15,
            leftIndent=20,
            rightIndent=20
        )

        source_style = ParagraphStyle(
            'Source',
            parent=styles['Normal'],
            fontSize=8,
            spaceAfter=8,
            leftIndent=40,
            textColor=HexColor('#666666'),
            italic=True
        )

        # Build PDF content
        story = []

        # Title
        story.append(Paragraph("Q&A Session Report", title_style))
        story.append(Spacer(1, 20))

        # Session info
        session_info = f"<b>Session Date:</b> {self.session_start.strftime('%B %d, %Y at %I:%M %p')}<br/>"
        session_info += f"<b>Total Questions:</b> {len(self.qa_pairs)}<br/>"
        session_info += f"<b>Duration:</b> {(datetime.now() - self.session_start).seconds // 60} minutes"

        story.append(Paragraph(session_info, styles['Normal']))
        story.append(Spacer(1, 30))

        # Q&A pairs
        for i, qa in enumerate(self.qa_pairs, 1):
            # Question
            question_text = f"<b>Question {i}:</b> {qa['question']}"
            story.append(Paragraph(question_text, question_style))

            # Answer
            answer_text = f"<b>Answer:</b><br/>{qa['answer']}"
            story.append(Paragraph(answer_text, answer_style))

            # Relevant sources
            if qa['relevant_chunks']:
                story.append(Paragraph("<b>Relevant Sources:</b>", answer_style))
                for j, chunk in enumerate(qa['relevant_chunks'][:3], 1):
                    source_text = f"{j}. <b>Source:</b> {chunk['source']} (Page {chunk['page_number']}) - Score: {chunk['score']:.2f}"
                    story.append(Paragraph(source_text, source_style))

                    # Truncate long text for PDF
                    chunk_text = chunk['text'][:300] + "..." if len(chunk['text']) > 300 else chunk['text']
                    story.append(Paragraph(f"<i>Excerpt:</i> {chunk_text}", source_style))
                    story.append(Spacer(1, 5))

            story.append(Spacer(1, 20))

            # Add page break after every 2 questions (optional)
            if i % 2 == 0 and i < len(self.qa_pairs):
                story.append(PageBreak())

        # Generate PDF
        doc.build(story)
        return filename

# Example usage with PDF export:
if document_chunks:
    rag_system = EnhancedRAGSystem(vector_store, answer_generator)
    qa_session = QASession()  # Initialize session tracker

    def ask_question():
        """Interactive function to ask questions with PDF export"""
        print("\n\n❓ Ask a question about your documents:")
        print("💡 Tip: All your questions and answers will be saved to PDF when you exit")

        while True:
            try:
                query = input("\nYour question (type 'exit' to quit and download PDF): ").strip()

                if query.lower() in ['exit', 'quit']:
                    # Generate and download PDF before exiting
                    if qa_session.qa_pairs:
                        print("\n📄 Generating PDF report...")
                        try:
                            pdf_filename = qa_session.generate_pdf()
                            print(f"✅ PDF report saved as: {pdf_filename}")
                            print(f"📁 Location: {os.path.abspath(pdf_filename)}")
                        except Exception as e:
                            print(f"❌ Error generating PDF: {e}")
                            # Fallback: save as text file
                            txt_filename = f"qa_session_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
                            with open(txt_filename, 'w', encoding='utf-8') as f:
                                f.write(f"Q&A Session Report\n")
                                f.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
                                f.write("=" * 50 + "\n\n")

                                for i, qa in enumerate(qa_session.qa_pairs, 1):
                                    f.write(f"Question {i}: {qa['question']}\n")
                                    f.write(f"Answer: {qa['answer']}\n")
                                    f.write(f"Timestamp: {qa['timestamp'].strftime('%Y-%m-%d %H:%M:%S')}\n")
                                    f.write("-" * 30 + "\n\n")

                            print(f"📝 Fallback: Text report saved as: {txt_filename}")
                    else:
                        print("📭 No questions asked in this session")

                    print("👋 Thank you for using the RAG system!")
                    break

                if not query:
                    continue

                # Get answer from RAG system
                result = rag_system.answer_question(query)

                # Store Q&A pair
                qa_session.add_qa_pair(query, result['answer'], result['relevant_chunks'])

                # Display results
                print(f"\n🤖 Answer:")
                print(result['answer'])

                print(f"\n📚 Relevant Document Excerpts:")
                for i, chunk in enumerate(result['relevant_chunks'][:3], 1):
                    print(f"\n{i}. Source: {chunk['source']} (Page {chunk['page_number']})")
                    print(f"   Score: {chunk['score']:.2f}")
                    print(f"   Text: {chunk['text'][:500]}...")  # Truncate long text

                print(f"\n📊 Session Stats: {len(qa_session.qa_pairs)} questions asked")

            except KeyboardInterrupt:
                print("\n\n⚠️  Interrupted! Generating PDF before exit...")
                if qa_session.qa_pairs:
                    try:
                        pdf_filename = qa_session.generate_pdf()
                        print(f"✅ PDF report saved as: {pdf_filename}")
                    except Exception as e:
                        print(f"❌ Error generating PDF: {e}")
                break
            except Exception as e:
                print(f"❌ Error: {e}")

    # Start interactive Q&A session
    ask_question()
else:
    print("❌ Cannot initialize RAG system - no documents processed")

# Optional: Function to customize PDF generation
def generate_custom_pdf(qa_session, filename=None, include_sources=True, max_sources=3):
    """
    Generate a customized PDF report

    Args:
        qa_session: QASession object with Q&A data
        filename: Custom filename for the PDF
        include_sources: Whether to include source excerpts
        max_sources: Maximum number of sources to include per question
    """
    if not filename:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"custom_qa_report_{timestamp}.pdf"

    # Custom PDF generation logic here
    return qa_session.generate_pdf(filename)



❓ Ask a question about your documents:
💡 Tip: All your questions and answers will be saved to PDF when you exit

Your question (type 'exit' to quit and download PDF): What is the main difference between the RAG-Sequence and RAG-Token models proposed in the paper, and how does each approach impact the way retrieved documents are used during generation?


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


🔍 Searching for relevant information...
🧠 Generating answer...
🔄 Using fallback answer generation...

🤖 Answer:
Concretely, the top K documents are retrieved using the retriever, and then the generator produces a distribution for the next output token for each document, before marginalizing, and repeating the process with the following output token, Formally, we deﬁne: pRAG-Token (yjx)NY iX z2top-k(p(jx))p(zjx)p(yijx;z;y 1:i1) Finally, we note that RAG can be used for sequence classiﬁcation tasks by considering the target class as a target sequence of length one, in which case RAG-Sequence and RAG-Token are equivalent. Concretely, the top K documents are retrieved using the retriever, and the generator produces the output sequence probability for each document, which are then marginalized, pRAG-Sequence (yjx)X z2top-k(p(jx))p(zjx)p(yjx;z) X z2top-k(p(jx))p(zjx)NY ip(yijx;z;y 1:i1) RAG-Token Model In the RAG-Token model we can draw a different latent document for each target token and m