In [1]:
! pip3 install PyPDF2 transformers torch chromadb sentence-transformers

Defaulting to user installation because normal site-packages is not writeable
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting transformers
  Downloading transformers-4.48.2-py3-none-any.whl.metadata (44 kB)
Collecting torch
  Downloading torch-2.6.0-cp310-cp310-manylinux1_x86_64.whl.metadata (28 kB)
Collecting chromadb
  Downloading chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.17.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.24.0 (from transformers)
  Downloading huggingface_hub-0.28.1-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading toke

In [2]:
import os
import torch
import chromadb
import PyPDF2
! pip install groq
from groq import Groq
from typing import List
from sentence_transformers import SentenceTransformer

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class PDFRAGSystem:
    def __init__(self,
                 embedding_model='all-MiniLM-L6-v2',
                 groq_model='mixtral-8x7b-32768',
                 api_key=None):
        """
        Initialize the PDF-based RAG system for Colab

        Args:
            embedding_model (str): Hugging Face embedding model
            groq_model (str): Groq LLM model to use
            api_key (str): Groq API Key
        """
        # Initialize Groq Client
        from google.colab import userdata
        from getpass import getpass

        if api_key is None:
            try:
                # Option 1: Using Colab Secrets
                api_key = userdata.get('GROQ_API_KEY')
            except:
                # Option 2: Manual input
                api_key = getpass("Enter your Groq API Key: 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 ")

        self.groq_client = Groq(api_key=api_key)

        # Initialize Embedding Model
        self.embedding_model = SentenceTransformer(embedding_model)

        # Initialize ChromaDB Client with persistent storage
        self.chroma_client = chromadb.PersistentClient(path="./chroma_db")

        # Groq LLM Model
        self.groq_model = groq_model

    def upload_pdf(self, pdf_path: str) -> List[str]:
        """
        Upload PDF and extract text

        Args:
            pdf_path (str): Path to the PDF file

        Returns:
            List of text chunks from PDF
        """
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            text_chunks = []

            for page in pdf_reader.pages:
                # Split text into chunks (e.g., 500 characters)
                page_text = page.extract_text()
                chunks = [
                    page_text[i:i+500]
                    for i in range(0, len(page_text), 500)
                ]
                text_chunks.extend(chunks)

        return text_chunks

    def create_embeddings(self, text_chunks: List[str]) -> List[List[float]]:
        """
        Convert text chunks to embeddings

        Args:
            text_chunks (List[str]): List of text chunks

        Returns:
            List of embeddings
        """
        return [
            self.embedding_model.encode(chunk).tolist()
            for chunk in text_chunks
        ]

    def save_to_chromadb(self,
                          text_chunks: List[str],
                          embeddings: List[List[float]],
                          collection_name: str = None):
        """
        Save text chunks and embeddings to ChromaDB

        Args:
            text_chunks (List[str]): Text chunks
            embeddings (List[List[float]]): Embeddings for text chunks
            collection_name (str): Name of ChromaDB collection

        Returns:
            ChromaDB collection
        """
        # Generate a unique collection name if not provided
        if collection_name is None:
            collection_name = f"pdf_collection_{len(text_chunks)}"

        # Delete existing collection if it exists
        try:
            self.chroma_client.delete_collection(name=collection_name)
        except:
            pass

        # Create new collection
        collection = self.chroma_client.create_collection(name=collection_name)

        # Add chunks and embeddings
        for idx, (chunk, embedding) in enumerate(zip(text_chunks, embeddings)):
            collection.add(
                ids=[str(idx)],
                embeddings=embedding,
                documents=[chunk]
            )

        return collection

    def retrieve_context(self,
                         query: str,
                         collection,
                         top_k: int = 3):
        """
        Retrieve relevant context for a query

        Args:
            query (str): User query
            collection: ChromaDB collection
            top_k (int): Number of top results to retrieve

        Returns:
            List of most relevant text chunks
        """
        query_embedding = self.embedding_model.encode(query).tolist()

        results = collection.query(
            query_embeddings=[query_embedding],
            n_results=top_k
        )

        return results['documents'][0]

    def generate_response(self,
                          query: str,
                          context: List[str]) -> str:
        """
        Generate response using Groq LLM

        Args:
            query (str): User query
            context (List[str]): Retrieved context

        Returns:
            Generated response
        """
        # Construct context-aware prompt
        context_str = "\n".join(context)
        prompt = f"""
        Context: {context_str}

        Question: {query}

        Based on the context, provide a detailed and accurate answer.
        """

        chat_completion = self.groq_client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful AI assistant."},
                {"role": "user", "content": prompt}
            ],
            model=self.groq_model
        )

        return chat_completion.choices[0].message.content

In [4]:
# Colab Usage Example
def main():
    # Option 1: Manually upload PDF in Colab
    from google.colab import files
    print("Please upload your PDF file:")
    uploaded = files.upload()
    pdf_path = list(uploaded.keys())[0]  # Get the first uploaded file

    # Initialize RAG System
    rag_system = PDFRAGSystem()

    # Step 1: Upload PDF
    text_chunks = rag_system.upload_pdf(pdf_path)

    # Step 2: Create Embeddings
    embeddings = rag_system.create_embeddings(text_chunks)

    # Step 3: Save to ChromaDB (with automatic unique collection name)
    collection = rag_system.save_to_chromadb(text_chunks, embeddings)

    # Step 4: Interactive Retrieval and Generation
    while True:
        query = input("Ask a question (or type 'exit'): ")
        if query.lower() == 'exit':
            break

        # Retrieve context
        context = rag_system.retrieve_context(query, collection)

        # Generate response
        response = rag_system.generate_response(query, context)
        print("Response:", response)

# Uncomment to run
main()

ModuleNotFoundError: No module named 'google.colab'