In [7]:
# Install required packages (run this first in your environment)
!pip install google-auth google-auth-oauthlib google-auth-httplib2 google-api-python-client
!pip install langchain-community langchain-chroma langchain-openai python-dotenv



In [31]:
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive')

# Cell 3: Imports
import glob
from typing import List
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import hub
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel
from dotenv import load_dotenv

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
# IMPORTANT: Replace this with your actual OpenAI API key
OPENAI_KEY = "sk-proj-...."  # Put your real key here

In [33]:
# Cell 5: Simple RAG Class for Colab
class ColabDriveRAG:
    def __init__(self, openai_api_key: str):
        self.openai_api_key = openai_api_key
        self.vectorstore = None
        self.retriever = None
        self.rag_chain = None

    def find_rag_folder(self):
        """Find the Rag folder in Google Drive"""
        # Common paths where the folder might be
        possible_paths = [
            "/content/drive/MyDrive/Rag",
            "/content/drive/My Drive/Rag",
            "/content/drive/MyDrive/rag",
            "/content/drive/My Drive/rag"
        ]

        print("Looking for Rag folder in Google Drive...")
        for path in possible_paths:
            if os.path.exists(path):
                print(f"✓ Found Rag folder at: {path}")
                return path

        # If not found in common locations, search more broadly
        print("Folder not found in common locations. Searching...")

        # Search in MyDrive root and one level deep
        search_paths = [
            "/content/drive/MyDrive",
            "/content/drive/My Drive"
        ]

        for search_path in search_paths:
            if os.path.exists(search_path):
                # Check root level
                for item in os.listdir(search_path):
                    item_path = os.path.join(search_path, item)
                    if os.path.isdir(item_path) and item.lower() == 'rag':
                        print(f"✓ Found Rag folder at: {item_path}")
                        return item_path

                # Check one level deeper
                for item in os.listdir(search_path):
                    item_path = os.path.join(search_path, item)
                    if os.path.isdir(item_path):
                        try:
                            for subitem in os.listdir(item_path):
                                subitem_path = os.path.join(item_path, subitem)
                                if os.path.isdir(subitem_path) and subitem.lower() == 'rag':
                                    print(f"✓ Found Rag folder at: {subitem_path}")
                                    return subitem_path
                        except PermissionError:
                            continue

        return None

    def list_all_folders(self):
        """List all folders in Google Drive to help user find their folder"""
        print("\n=== All folders in your Google Drive ===")
        search_paths = [
            "/content/drive/MyDrive",
            "/content/drive/My Drive"
        ]

        for search_path in search_paths:
            if os.path.exists(search_path):
                print(f"\nFolders in {search_path}:")
                try:
                    for item in os.listdir(search_path):
                        item_path = os.path.join(search_path, item)
                        if os.path.isdir(item_path):
                            print(f"  📁 {item}")
                except Exception as e:
                    print(f"  Error reading directory: {e}")

    def load_documents_from_path(self, folder_path: str):
        """Load PDF documents from the specified folder path"""
        if not os.path.exists(folder_path):
            raise Exception(f"Folder not found: {folder_path}")

        # Find all PDF files in the folder
        pdf_pattern = os.path.join(folder_path, "*.pdf")
        pdf_files = glob.glob(pdf_pattern)

        if not pdf_files:
            raise Exception(f"No PDF files found in folder: {folder_path}")

        print(f"Found {len(pdf_files)} PDF files:")
        for pdf_file in pdf_files:
            file_size = os.path.getsize(pdf_file) / (1024 * 1024)  # Convert to MB
            print(f"  - {os.path.basename(pdf_file)} ({file_size:.2f} MB)")

        # Load all PDF documents
        documents = []
        for pdf_file in pdf_files:
            print(f"\nProcessing: {os.path.basename(pdf_file)}")

            try:
                loader = PyPDFLoader(pdf_file, extract_images=True)
                file_documents = loader.load()

                # Add source information to metadata
                for doc in file_documents:
                    doc.metadata['source'] = os.path.basename(pdf_file)
                    doc.metadata['file_path'] = pdf_file

                documents.extend(file_documents)
                print(f"✓ Loaded {len(file_documents)} pages from {os.path.basename(pdf_file)}")

            except Exception as e:
                print(f"✗ Error loading {os.path.basename(pdf_file)}: {e}")
                continue

        print(f"\n✓ Total pages loaded: {len(documents)}")
        return documents

    def create_vector_store(self, documents):
        """Create vector store from documents"""
        if not documents:
            raise Exception("No documents provided for vector store creation")

        # Split documents into chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            add_start_index=True
        )
        all_splits = text_splitter.split_documents(documents)

        print(f"Total number of chunks after splitting: {len(all_splits)}")

        # Calculate chunk statistics
        chunk_sizes = [len(chunk.page_content) for chunk in all_splits]
        print(f"Average chunk size: {sum(chunk_sizes) / len(chunk_sizes):.0f} characters")
        print(f"Chunk size range: {min(chunk_sizes)} - {max(chunk_sizes)} characters")

        # Create vector store
        print("Creating embeddings... (this may take a moment)")
        self.vectorstore = Chroma.from_documents(
            documents=all_splits,
            embedding=OpenAIEmbeddings(api_key=self.openai_api_key)
        )

        print(f"✓ Vector store created with {self.vectorstore._collection.count()} embeddings")

        # Create retriever
        self.retriever = self.vectorstore.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 6}
        )

    def setup_rag_chain(self):
        """Setup the RAG chain for question answering"""
        if not self.retriever:
            raise Exception("Vector store not created. Call create_vector_store() first.")

        # Initialize LLM
        llm = ChatOpenAI(
            model="gpt-4",
            api_key=self.openai_api_key,
            temperature=0
        )

        # Get RAG prompt template
        prompt = hub.pull("rlm/rag-prompt")

        # Create RAG chain with sources
        self.rag_chain = RunnableParallel(
            {"context": self.retriever, "question": RunnablePassthrough()}
        ).assign(
            answer=prompt | llm | StrOutputParser()
        )

        print("✓ RAG chain setup complete!")

    def query(self, question: str):
        """Query the RAG system"""
        if not self.rag_chain:
            raise Exception("RAG chain not set up. Call setup_rag_chain() first.")

        result = self.rag_chain.invoke(question)

        # Format sources for display
        sources = []
        for i, doc in enumerate(result['context']):
            source_name = doc.metadata.get('source', 'Unknown')
            page = doc.metadata.get('page', 'Unknown')
            sources.append({
                'index': i + 1,
                'source': source_name,
                'page': page,
                'content_preview': doc.page_content[:200] + "..." if len(doc.page_content) > 200 else doc.page_content
            })

        return {
            'answer': result['answer'],
            'sources': sources
        }

In [38]:
# Cell 6: Main function
def main():
    # Validate OpenAI API key
    if not OPENAI_KEY or len(OPENAI_KEY) < 20 or not OPENAI_KEY.startswith('sk-'):
        print("❌ ERROR: Please set your OpenAI API key!")
        print("Edit the OPENAI_KEY variable with your actual key.")
        print("Your key should start with 'sk-' and be about 50+ characters long.")
        print(f"Current key length: {len(OPENAI_KEY) if OPENAI_KEY else 0}")
        return

    print(f"✅ OpenAI API key detected (length: {len(OPENAI_KEY)})")

    # Initialize RAG system
    rag_system = ColabDriveRAG(OPENAI_KEY)

    # Try to find the Rag folder automatically
    rag_folder_path = rag_system.find_rag_folder()

    if not rag_folder_path:
        print("❌ Could not find 'Rag' folder automatically.")
        print("\nLet me show you all the folders in your Google Drive:")
        rag_system.list_all_folders()

        print("\n" + "="*60)
        print("MANUAL SETUP REQUIRED")
        print("="*60)
        print("Please check the folder list above and:")
        print("1. Make sure you have a folder named 'Rag' (or 'rag') in your Google Drive")
        print("2. Put your PDF files in that folder")
        print("3. Or modify the code below to use the correct path")
        print("\nExample: If your folder is at '/content/drive/MyDrive/MyFolder/Rag'")
        print("Change the line below to:")
        print("  rag_folder_path = '/content/drive/MyDrive/MyFolder/Rag'")

        # You can manually set the path here if needed
        # rag_folder_path = "/content/drive/MyDrive/YourActualFolderName"
        return

    try:
        # Load documents
        print("\n📁 Loading documents from Google Drive...")
        documents = rag_system.load_documents_from_path(rag_folder_path)

        # Create vector store
        print("\n🔍 Creating vector store...")
        rag_system.create_vector_store(documents)

        # Setup RAG chain
        print("\n⚙️ Setting up RAG chain...")
        rag_system.setup_rag_chain()

        # Test with queries
        test_queries = [
            "What is the primary goal of the SELF-ROUTE method proposed by Zhuowan Li?",
            "Explain why the researchers believe RAG might still be useful despite the superior performance of long-context LLMs",
            "Compare the reranking techniques mentioned in the Wang paper. How do they impact the retrieval quality?",
            "What are the trade-offs involved when using different chunking strategies in RAG systems?",
            "How does multimodal retrieval enhance the capabilities of RAG?",
            "What were the key failure cases for RAG in handling long context retrievals, as noted by Zhuowan Li?",
            "Why does the Zhuowan paper claim that long-context LLMs outperformed RAG in most cases? What benefits does RAG still offer?",
            "Describe the metrics used to evaluate the different embedding models for RAG in Wang's paper",
            "Discuss the implications of using self-reflection in routing queries between RAG and long-context LLMs",
            "How does query rewriting contribute to the overall efficiency of RAG according to Wang's findings?",
            "Compare the cost-efficiency and performance trade-offs between RAG and long-context language models (LC) as discussed in the Wang and Zhuowan Li papers. How do these methods balance the ability to handle large volumes of text with computational demands?",
            "In terms of chunking methods in Wang's paper, what is the difference in performance between the best and second-best methods in Table 4?",
            "What are the best approaches for the retrieval and reranking modules according to Table 11 in Wang paper?"
        ]

        print("\n" + "="*80)
        print("🚀 TESTING RAG SYSTEM")
        print("="*80)

        for i, query in enumerate(test_queries, 1):
            print(f"\n{'-'*60}")
            print(f"📝 QUERY {i}: {query}")
            print(f"{'-'*60}")

            try:
                result = rag_system.query(query)
                print(f"\n💡 ANSWER: {result['answer']}")

                print(f"\n📚 SOURCES:")
                for source in result['sources']:
                    print(f"  [{source['index']}] {source['source']}, Page {source['page']}")

            except Exception as e:
                print(f"❌ ERROR: {e}")

    except Exception as e:
        print(f"❌ ERROR: {e}")

# Cell 7: Run the system
if __name__ == "__main__":
    main()

# Cell 8: Optional - Manual folder specification
# If the automatic detection doesn't work, uncomment and modify this:
"""
# Manual setup if automatic detection fails
rag_system = ColabDriveRAG(OPENAI_KEY)

# Replace this path with your actual folder path
manual_folder_path = "/content/drive/MyDrive/Rag"

try:
    documents = rag_system.load_documents_from_path(manual_folder_path)
    rag_system.create_vector_store(documents)
    rag_system.setup_rag_chain()

    # Test a single query
    result = rag_system.query("What is the main topic of these documents?")
    print(f"Answer: {result['answer']}")

except Exception as e:
    print(f"Error: {e}")
"""

✅ OpenAI API key detected (length: 164)
Looking for Rag folder in Google Drive...
Folder not found in common locations. Searching...
✓ Found Rag folder at: /content/drive/MyDrive/RAG

📁 Loading documents from Google Drive...
Found 2 PDF files:
  - Zhuowan_et_al.pdf (0.62 MB)
  - Wang_et_al.pdf (0.87 MB)

Processing: Zhuowan_et_al.pdf
✓ Loaded 12 pages from Zhuowan_et_al.pdf

Processing: Wang_et_al.pdf
✓ Loaded 22 pages from Wang_et_al.pdf

✓ Total pages loaded: 34

🔍 Creating vector store...
Total number of chunks after splitting: 165
Average chunk size: 879 characters
Chunk size range: 206 - 999 characters
Creating embeddings... (this may take a moment)
✓ Vector store created with 495 embeddings

⚙️ Setting up RAG chain...




✓ RAG chain setup complete!

🚀 TESTING RAG SYSTEM

------------------------------------------------------------
📝 QUERY 1: What is the primary goal of the SELF-ROUTE method proposed by Zhuowan Li?
------------------------------------------------------------





💡 ANSWER: The primary goal of the SELF-ROUTE method proposed by Zhuowan Li is to combine RAG (Retrieval-Augmented Generation) and LC (Long-Context) to reduce computational costs while maintaining a performance comparable to LC. The method utilizes LLM (Language Model) to route queries based on self-reflection, under the assumption that LLMs are well-calibrated in predicting whether a query is answerable given the provided context. The method consists of two steps: a RAG-and-Route step and a long-context prediction step.

📚 SOURCES:
  [1] Zhuowan_et_al.pdf, Page 3
  [2] Zhuowan_et_al.pdf, Page 3
  [3] Zhuowan_et_al.pdf, Page 3
  [4] Zhuowan_et_al.pdf, Page 4
  [5] Zhuowan_et_al.pdf, Page 4
  [6] Zhuowan_et_al.pdf, Page 4

------------------------------------------------------------
📝 QUERY 2: Explain why the researchers believe RAG might still be useful despite the superior performance of long-context LLMs
------------------------------------------------------------





💡 ANSWER: The researchers believe RAG might still be useful despite the superior performance of long-context LLMs because RAG has a significantly lower computational cost. RAG decreases the input length to LLMs, which leads to reduced costs, especially considering that LLM API pricing is typically based on the amount of information processed. Furthermore, RAG can help avoid the distraction of irrelevant information and save unnecessary attention computations.

📚 SOURCES:
  [1] Zhuowan_et_al.pdf, Page 0
  [2] Zhuowan_et_al.pdf, Page 0
  [3] Zhuowan_et_al.pdf, Page 0
  [4] Zhuowan_et_al.pdf, Page 0
  [5] Zhuowan_et_al.pdf, Page 0
  [6] Zhuowan_et_al.pdf, Page 0

------------------------------------------------------------
📝 QUERY 3: Compare the reranking techniques mentioned in the Wang paper. How do they impact the retrieval quality?
------------------------------------------------------------





💡 ANSWER: The Wang paper discusses two reranking techniques: DLM Reranking and TILDE Reranking. DLM Reranking uses deep language models to classify document relevancy to a query as "true" or "false", and documents are ranked based on the probability of the "true" token. This method can be time-intensive but offers better performance. On the other hand, TILDE Reranking focuses on query likelihoods, achieving efficiency by precomputing and storing the likelihood of query terms and ranking documents based on their sum. These techniques enhance the relevance of retrieved documents, ensuring the most pertinent information appears at the top.

📚 SOURCES:
  [1] Wang_et_al.pdf, Page 7
  [2] Wang_et_al.pdf, Page 7
  [3] Wang_et_al.pdf, Page 7
  [4] Wang_et_al.pdf, Page 2
  [5] Wang_et_al.pdf, Page 2
  [6] Wang_et_al.pdf, Page 2

------------------------------------------------------------
📝 QUERY 4: What are the trade-offs involved when using different chunking strategies in RAG systems?
-----




💡 ANSWER: The trade-offs involved when using different chunking strategies in RAG systems can include balancing performance and efficiency. Different strategies may be more suitable for different application scenarios, where efficiency might be prioritized over performance, or vice versa. However, the specific trade-offs can depend on the effectiveness and influence of the chunking techniques used within the chunking module.

📚 SOURCES:
  [1] Wang_et_al.pdf, Page 13
  [2] Wang_et_al.pdf, Page 13
  [3] Wang_et_al.pdf, Page 13
  [4] Wang_et_al.pdf, Page 1
  [5] Wang_et_al.pdf, Page 1
  [6] Wang_et_al.pdf, Page 1

------------------------------------------------------------
📝 QUERY 5: How does multimodal retrieval enhance the capabilities of RAG?
------------------------------------------------------------





💡 ANSWER: The integration of multimodal retrieval techniques enhances the capabilities of Retrieval-Augmented Generation (RAG) by substantially improving question-answering capabilities on visual inputs. It also speeds up the generation of multimodal content through a strategy of "retrieval as generation". This approach enhances both the performance and efficiency of RAG.

📚 SOURCES:
  [1] Wang_et_al.pdf, Page 2
  [2] Wang_et_al.pdf, Page 2
  [3] Wang_et_al.pdf, Page 2
  [4] Wang_et_al.pdf, Page 0
  [5] Wang_et_al.pdf, Page 0
  [6] Wang_et_al.pdf, Page 0

------------------------------------------------------------
📝 QUERY 6: What were the key failure cases for RAG in handling long context retrievals, as noted by Zhuowan Li?
------------------------------------------------------------





💡 ANSWER: The provided context does not specify the key failure cases for RAG in handling long context retrievals as noted by Zhuowan Li.

📚 SOURCES:
  [1] Zhuowan_et_al.pdf, Page 1
  [2] Zhuowan_et_al.pdf, Page 1
  [3] Zhuowan_et_al.pdf, Page 1
  [4] Zhuowan_et_al.pdf, Page 1
  [5] Zhuowan_et_al.pdf, Page 1
  [6] Zhuowan_et_al.pdf, Page 1

------------------------------------------------------------
📝 QUERY 7: Why does the Zhuowan paper claim that long-context LLMs outperformed RAG in most cases? What benefits does RAG still offer?
------------------------------------------------------------





💡 ANSWER: The Zhuowan paper claims that long-context LLMs outperformed RAG in most cases because recent advancements in LLMs have led to stronger long-context understanding capabilities. When sufficiently resourced, LC consistently outperforms RAG in almost all settings. However, RAG still offers the benefit of significantly lower computational cost, as it decreases the input length to LLMs, leading to reduced costs.

📚 SOURCES:
  [1] Zhuowan_et_al.pdf, Page 0
  [2] Zhuowan_et_al.pdf, Page 0
  [3] Zhuowan_et_al.pdf, Page 0
  [4] Zhuowan_et_al.pdf, Page 0
  [5] Zhuowan_et_al.pdf, Page 0
  [6] Zhuowan_et_al.pdf, Page 0

------------------------------------------------------------
📝 QUERY 8: Describe the metrics used to evaluate the different embedding models for RAG in Wang's paper
------------------------------------------------------------





💡 ANSWER: The provided context does not contain information about the specific metrics used to evaluate the different embedding models for RAG in Wang's paper.

📚 SOURCES:
  [1] Zhuowan_et_al.pdf, Page 1
  [2] Zhuowan_et_al.pdf, Page 1
  [3] Zhuowan_et_al.pdf, Page 1
  [4] Wang_et_al.pdf, Page 13
  [5] Wang_et_al.pdf, Page 13
  [6] Wang_et_al.pdf, Page 13

------------------------------------------------------------
📝 QUERY 9: Discuss the implications of using self-reflection in routing queries between RAG and long-context LLMs
------------------------------------------------------------





💡 ANSWER: Using self-reflection in routing queries between RAG and long-context LLMs effectively combines the strengths of both, achieving performance comparable to LC but at a significantly reduced cost. This approach leverages RAG for most queries, reserving the more computationally expensive LC for a subset of queries where it excels. This method, called SELF-ROUTE, uses the LLM itself to route queries based on self-reflection, under the assumption that LLMs are well-calibrated in predicting whether a query is answerable given the provided context.

📚 SOURCES:
  [1] Zhuowan_et_al.pdf, Page 5
  [2] Zhuowan_et_al.pdf, Page 5
  [3] Zhuowan_et_al.pdf, Page 5
  [4] Zhuowan_et_al.pdf, Page 3
  [5] Zhuowan_et_al.pdf, Page 3
  [6] Zhuowan_et_al.pdf, Page 3

------------------------------------------------------------
📝 QUERY 10: How does query rewriting contribute to the overall efficiency of RAG according to Wang's findings?
------------------------------------------------------------





💡 ANSWER: According to Wang's findings, query rewriting contributes to the overall efficiency of RAG by being used as a method for retrieving relevant documents for an input query. The rewritten queries are used for retrieval, which significantly impacts both the effectiveness and efficiency of RAG systems. This approach is one of the various methods that can be employed in the RAG workflow.

📚 SOURCES:
  [1] Zhuowan_et_al.pdf, Page 5
  [2] Zhuowan_et_al.pdf, Page 5
  [3] Zhuowan_et_al.pdf, Page 5
  [4] Wang_et_al.pdf, Page 1
  [5] Wang_et_al.pdf, Page 1
  [6] Wang_et_al.pdf, Page 1

------------------------------------------------------------
📝 QUERY 11: Compare the cost-efficiency and performance trade-offs between RAG and long-context language models (LC) as discussed in the Wang and Zhuowan Li papers. How do these methods balance the ability to handle large volumes of text with computational demands?
------------------------------------------------------------





💡 ANSWER: The Wang and Zhuowan Li papers found that long-context language models (LC) consistently outperform RAG in almost all settings when sufficiently resourced, indicating superior progress in long-context understanding. However, despite its suboptimal performance, RAG remains relevant due to its significantly lower computational cost. RAG significantly decreases the input length to LLMs, leading to reduced costs, which is beneficial when handling large volumes of text.

📚 SOURCES:
  [1] Zhuowan_et_al.pdf, Page 0
  [2] Zhuowan_et_al.pdf, Page 0
  [3] Zhuowan_et_al.pdf, Page 0
  [4] Zhuowan_et_al.pdf, Page 1
  [5] Zhuowan_et_al.pdf, Page 1
  [6] Zhuowan_et_al.pdf, Page 1

------------------------------------------------------------
📝 QUERY 12: In terms of chunking methods in Wang's paper, what is the difference in performance between the best and second-best methods in Table 4?
------------------------------------------------------------





💡 ANSWER: The provided context does not include specific information about the performance difference between the best and second-best chunking methods in Table 4 of Wang's paper.

📚 SOURCES:
  [1] Wang_et_al.pdf, Page 5
  [2] Wang_et_al.pdf, Page 5
  [3] Wang_et_al.pdf, Page 4
  [4] Wang_et_al.pdf, Page 4
  [5] Wang_et_al.pdf, Page 4
  [6] Wang_et_al.pdf, Page 4

------------------------------------------------------------
📝 QUERY 13: What are the best approaches for the retrieval and reranking modules according to Table 11 in Wang paper?
------------------------------------------------------------





💡 ANSWER: The context does not provide information on the best approaches for the retrieval and reranking modules according to Table 11 in the Wang paper.

📚 SOURCES:
  [1] Wang_et_al.pdf, Page 7
  [2] Wang_et_al.pdf, Page 7
  [3] Wang_et_al.pdf, Page 7
  [4] Wang_et_al.pdf, Page 2
  [5] Wang_et_al.pdf, Page 2
  [6] Wang_et_al.pdf, Page 2


'\n# Manual setup if automatic detection fails\nrag_system = ColabDriveRAG(OPENAI_KEY)\n\n# Replace this path with your actual folder path\nmanual_folder_path = "/content/drive/MyDrive/Rag"\n\ntry:\n    documents = rag_system.load_documents_from_path(manual_folder_path)\n    rag_system.create_vector_store(documents)\n    rag_system.setup_rag_chain()\n    \n    # Test a single query\n    result = rag_system.query("What is the main topic of these documents?")\n    print(f"Answer: {result[\'answer\']}")\n    \nexcept Exception as e:\n    print(f"Error: {e}")\n'