# InsightPDF: A RAG-Based PDF Chatbot  
> A chatbot that lets you ask questions about any PDF using Groq's LLaMA-3 and ChromaDB for smart retrieval and answers.


In [10]:
# Importing Libraries

import os
from getpass import getpass
import PyPDF2
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma 
import chromadb 
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_groq import ChatGroq
from langchain.schema import Document
import warnings

# Suppress every warning category so library notices do not clutter the output.
warnings.filterwarnings('ignore')

# Module-level state shared by the functions below through `global` statements.
llm = None  # chat model; assigned in initialize_chatbot() from setup_groq_api()
vectorstore = None  # Chroma vector store; assigned in create_vectorstore()
conversation_chain = None  # retrieval chain; assigned in setup_conversation_chain()

In [None]:
# Groq API setup

def setup_groq_api():
    """Create the Groq-hosted chat model used to answer questions.

    The API key is taken from the ``GROQ_API_KEY`` environment variable. When
    that is unset or blank, the user is prompted for it with hidden input, so
    the secret never has to be written into the source.

    Returns:
        A configured ``ChatGroq`` instance, or ``None`` when no API key was
        supplied or the client could not be constructed.
    """
    api_key = os.environ.get("GROQ_API_KEY", "").strip()
    if not api_key:
        try:
            api_key = getpass("Enter your Groq API key: ").strip()
        except EOFError:
            # No interactive input available (e.g. a non-interactive run).
            api_key = ""
    if not api_key:
        print("Groq API key not provided. Set GROQ_API_KEY or enter it when prompted.")
        return None

    try:
        return ChatGroq(
            groq_api_key=api_key,
            model_name="llama-3.3-70b-versatile",
            temperature=0.1,  # low temperature for more deterministic answers
            max_tokens=1024,
        )
    except Exception as e:
        # Report and return None so the caller's `llm is None` check applies.
        print(f"Error setting up Groq API: {str(e)}")
        return None

In [None]:
#  Loading and Extracting text from PDF

def load_pdf(pdf_path):
    """Read a PDF and return its text as a single LangChain document.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A one-element list holding a ``Document`` whose ``page_content`` is
        the text of every page (each followed by a newline) and whose
        metadata stores ``pdf_path`` under ``"source"``. Returns ``None``
        when the file cannot be read or no text could be extracted.
    """
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Treat a page with no extractable text as empty, and join once
            # instead of growing a string page by page.
            text = "".join(
                (page.extract_text() or "") + "\n" for page in pdf_reader.pages
            )
    except Exception as e:
        print(f"Error loading PDF: {str(e)}")
        return None

    if not text.strip():
        # A document without text would produce no chunks to index later on.
        print("Error loading PDF: no extractable text found.")
        return None

    print(f"PDF loaded successfully! Extracted {len(text)} characters")
    return [Document(page_content=text, metadata={"source": pdf_path})]

In [13]:
# Creating Chroma vector store from documents

def create_vectorstore(documents):
    """Split documents into chunks, embed them, and index them in Chroma.

    The collection is rebuilt on every call. ``Chroma.from_documents`` adds
    to a collection that already exists in the persistent store, so without
    the reset each run would insert duplicate chunks and keep text from
    previously loaded PDFs.

    Args:
        documents: LangChain ``Document`` objects to index.

    Returns:
        The populated ``Chroma`` vector store (also assigned to the
        module-level ``vectorstore``), or ``None`` when the documents
        produce no chunks.
    """
    global vectorstore
    print("Creating vector embeddings with Chroma...")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,  # overlap keeps text that straddles a chunk boundary
        length_function=len
    )

    chunks = text_splitter.split_documents(documents)
    print(f"Created {len(chunks)} text chunks")
    if not chunks:
        print("No text chunks to index.")
        return None

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    # Directory used by Chroma for on-disk storage.
    persist_directory = "./chroma_db"
    os.makedirs(persist_directory, exist_ok=True)

    client = chromadb.PersistentClient(path=persist_directory)
    collection_name = "pdf_chatbot_collection"

    # Start from an empty collection so earlier runs do not leak into this one.
    try:
        client.delete_collection(name=collection_name)
    except Exception:
        # Nothing to delete on the first run; the exception type raised for a
        # missing collection differs between chromadb releases.
        pass

    vectorstore = Chroma.from_documents(
        chunks,
        embeddings,
        client=client,
        collection_name=collection_name
    )

    print(" Chroma vector store created successfully!")
    return vectorstore

In [14]:
# Retrieval chain setup

def setup_conversation_chain(current_llm, current_vectorstore):
    """Build the conversational retrieval chain over the vector store.

    Args:
        current_llm: Chat model used to generate answers.
        current_vectorstore: Vector store whose retriever supplies context.

    Returns:
        The ``ConversationalRetrievalChain`` (also assigned to the
        module-level ``conversation_chain``), or ``None`` when either
        argument is ``None``.
    """
    global conversation_chain
    print("🔗 Setting up conversation chain...")

    if current_llm is None or current_vectorstore is None:
        print(" LLM or Vectorstore not initialized. Please run setup_groq_api() and create_vectorstore() first.")
        return None

    memory = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True,
        # The chain returns source documents alongside the answer, so the
        # memory is told which output to record.
        output_key="answer"
    )

    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=current_llm,
        # Retrieve the 4 most similar chunks for each question.
        retriever=current_vectorstore.as_retriever(search_kwargs={"k": 4}),
        memory=memory,
        return_source_documents=True,
        verbose=False
    )

    print("Conversation chain setup complete!")
    return conversation_chain

In [15]:
# Initializing the complete chatbot.

def initialize_chatbot(pdf_path):
    """Run the full start-up sequence for chatting with one PDF.

    The steps run in order (model, PDF text, vector store, conversation
    chain) and the sequence stops at the first step that yields nothing.

    Args:
        pdf_path: Path of the PDF to load.

    Returns:
        ``True`` when every step succeeded, ``False`` otherwise.
    """
    global llm, vectorstore, conversation_chain

    def _abort(message):
        # Report the failed step and give the caller its False result.
        print(message)
        return False

    print("Initializing PDF Chatbot...")

    # 1. Language model
    llm = setup_groq_api()
    if llm is None:
        return _abort("Initialization failed: Groq API setup issue.")

    # 2. PDF text
    pdf_documents = load_pdf(pdf_path)
    if not pdf_documents:
        return _abort("Initialization failed: PDF loading issue.")

    # 3. Vector store
    vectorstore = create_vectorstore(pdf_documents)
    if vectorstore is None:
        return _abort("Initialization failed: Vector store creation issue.")

    # 4. Conversation chain
    conversation_chain = setup_conversation_chain(llm, vectorstore)
    if conversation_chain is None:
        return _abort("Initialization failed: Conversation chain setup issue.")

    for line in (
        "Chatbot initialized successfully!",
        "You can now start chatting with your PDF!",
    ):
        print(line)
    return True

In [16]:
# Chat History

def get_chat_history():
    """Return the conversation so far as a formatted string.

    Reads the module-level ``conversation_chain``.

    Returns:
        ``"No chat history available."`` when the chain has not been created
        or has no memory; otherwise a header followed by one
        ``"<role>: <content>"`` entry per stored message.
    """
    if conversation_chain is None or not conversation_chain.memory:
        return "No chat history available."

    parts = [" **Chat History:**\n\n"]
    for message in conversation_chain.memory.chat_memory.messages:
        # Every message that is not from the human is shown as the assistant.
        role = "🧑 User" if message.type == "human" else "🤖 Assistant"
        parts.append(f"{role}: {message.content}\n\n")

    return "".join(parts)

In [None]:
def chat_with_pdf(question):
    """Ask the chatbot a question and return a formatted answer with sources.

    Reads the module-level ``conversation_chain``.

    Args:
        question: The user's question about the loaded PDF.

    Returns:
        The model's answer followed by previews (at most 200 characters each)
        of up to two retrieved source chunks. A plain message is returned
        instead when the chatbot is not initialized, the question is blank,
        or answering raises an error.
    """
    if conversation_chain is None:
        return "Chatbot not initialized. Please run initialize_chatbot() first."

    # Do not spend an LLM call (or pollute the chat memory) on empty input.
    if not question or not question.strip():
        return "Please enter a question."

    try:
        # `invoke` replaces calling the chain object directly, which LangChain
        # has deprecated.
        response = conversation_chain.invoke({"question": question})

        answer = response['answer']
        source_docs = response.get('source_documents', [])

        formatted_response = f"🤖 **Answer:** {answer}\n\n"

        if source_docs:
            formatted_response += "**Sources:**\n"
            # Only the first two sources are shown to keep the reply short.
            for i, doc in enumerate(source_docs[:2], 1):
                content = doc.page_content
                preview = (content[:200] + "...") if len(content) > 200 else content
                source_info = doc.metadata.get("source", "Unknown Source")
                formatted_response += f"{i}. From: `{source_info}`\n   Content: `{preview}`\n\n"

        return formatted_response

    except Exception as e:
        return f"Error during chat: {str(e)}"

In [None]:
# Demo Usage

def demo_usage():
    """Run an interactive console session for chatting with a PDF.

    Prompts for a PDF path, initializes the chatbot, then loops reading
    questions. ``quit`` ends the session and ``history`` prints the chat
    history; both are matched case-insensitively after trimming whitespace.
    Blank input is skipped, and Ctrl-C / Ctrl-D end the session cleanly.
    """
    print("PDF Chatbot Demo")
    print("=" * 50)

    # Pasted or drag-and-dropped paths often carry whitespace or quotes.
    pdf_path = input(" Enter the path to your PDF file (e.g., my_document.pdf): ")
    pdf_path = pdf_path.strip().strip("'\"")

    if not initialize_chatbot(pdf_path):
        print(" Failed to initialize chatbot. Please check errors above.")
        return

    print("\n" + "=" * 50)
    print(" Chat Interface Ready!")
    print("Type 'quit' to exit, 'history' to see chat history")
    print("=" * 50)

    while True:
        try:
            question = input("\n🧑 You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # End of input or an interrupt is treated the same as 'quit'.
            print("\n Goodbye!")
            break

        if not question:
            continue

        command = question.lower()
        if command == 'quit':
            print(" Goodbye!")
            break
        if command == 'history':
            print(get_chat_history())
            continue

        response = chat_with_pdf(question)
        print(f"\n{response}")

# Start the interactive demo only when run directly (script or notebook
# cell), so importing this module does not block on input().
if __name__ == "__main__":
    demo_usage()