<a href="https://colab.research.google.com/github/Chinmaysahoo03/RAG_Document_QA/blob/main/RAG_Docs_QA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install langchain-groq langchain sentence-transformers faiss-cpu pypdf
!pip install langchain_community

In [None]:
import os
from google.colab import userdata, files
from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.llms import HuggingFacePipeline
from langchain.schema import Document
import warnings
warnings.filterwarnings("ignore")

# Model identifiers, kept in one place so they are easy to swap.
GROQ_MODEL_NAME = "gemma2-9b-it"
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Chat model: the API key is read through Colab's `userdata` under the name
# 'groq_api' instead of being written into the notebook.
groq_api = userdata.get('groq_api')
llm = ChatGroq(
    model=GROQ_MODEL_NAME,
    temperature=0.1,
    api_key=groq_api,
)

# Embedding model used to vectorise document chunks (Hugging Face sentence-transformers).
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

print("Setup complete!")

In [None]:
# Function to load docs (PDF or text)
def load_documents(file_path):
    """Load a file into a list of documents.

    Files whose name ends in ``.pdf`` (case-insensitive, so ``.PDF`` also
    matches) are read with ``PyPDFLoader``; every other file is read with
    ``TextLoader``.

    Args:
        file_path: Path of the file to load.

    Returns:
        Whatever the selected loader's ``load()`` returns.
    """
    # Compare on the lower-cased name so upper/mixed-case extensions such as
    # "REPORT.PDF" are not mistakenly handed to the plain-text loader.
    if file_path.lower().endswith('.pdf'):
        loader = PyPDFLoader(file_path)
    else:
        loader = TextLoader(file_path)
    return loader.load()

# Sample: Upload a file (run this to upload)
uploaded = files.upload()  # Upload a PDF or TXT (e.g., INFOSYS -APTITUDE-MODEL PAPERS_1473177928759.pdf)
if not uploaded:
    # Nothing was uploaded: fail with a clear message instead of an opaque
    # IndexError when picking the first file name below.
    raise ValueError("No file was uploaded. Re-run this cell and choose a PDF or TXT file.")

# Only the first uploaded file is processed; the keys of `uploaded` are used as file names.
file_name = next(iter(uploaded))
docs = load_documents(file_name)
print(f"Loaded {len(docs)} documents.")

# Split into overlapping chunks (1000 characters per chunk, 200 overlap as configured here)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Filter out empty or whitespace-only chunks
cleaned_splits = [doc for doc in splits if doc.page_content and doc.page_content.strip()]
print(f"Split into {len(cleaned_splits)} valid chunks after cleaning.")

In [None]:
# Create FAISS vector store from the cleaned chunks.
# Indexing `cleaned_splits` (not the raw `splits`) keeps empty or whitespace-only
# chunks out of the index, so the filtering step above actually takes effect.
if not cleaned_splits:
    # Fail early with a clear message rather than an obscure error from
    # building an index over zero documents.
    raise ValueError("No non-empty text chunks to index; the uploaded file appears to contain no extractable text.")
vectorstore = FAISS.from_documents(cleaned_splits, embeddings)

# Save index (optional, for reuse)
vectorstore.save_local("faiss_index")

print("Vector store built! Ready for retrieval.")

In [None]:
# Q&A prompt: the retrieved text fills {context}, the user's query fills {question}.
# NOTE(review): the prompt asks the model to cite a "source chunk index", but nothing
# in this cell adds chunk indices to the context — confirm the citations are meaningful.
prompt_template = """
Use the following context to answer the question. If you don't know, say so. Cite sources with [source chunk index].

Context: {context}

Question: {question}

Answer:
"""
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Number of chunks fetched from the vector store for each question.
TOP_K = 3
retriever = vectorstore.as_retriever(search_kwargs={"k": TOP_K})

# "stuff" chain: retrieved chunks are placed into the single prompt above.
# Source documents are returned alongside the answer so they can be displayed.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT},
)

print("RAG chain ready!")

In [None]:
# Interactive loop: read a question, run it through the RAG chain, then print
# the answer followed by a preview of each retrieved source chunk.
print("RAG Q&A System Ready! Type 'quit' to exit.")

while True:
    # Strip whitespace so inputs like " quit " still exit.
    query = input("\nQuestion: ").strip()
    if query.lower() == 'quit':
        break
    if not query:
        # Skip blank input instead of sending an empty question to the model.
        continue

    # Run query (`invoke` replaces the deprecated direct call on the chain)
    result = qa_chain.invoke({"query": query})
    answer = result['result']
    sources = result['source_documents']

    print(f"\nAnswer: {answer}")
    print("\nSources:")
    for i, doc in enumerate(sources, start=1):
        # `metadata` is a dict, so the page must be read with `.get`;
        # `getattr` on a dict never finds the key and always printed 'N/A'.
        page = doc.metadata.get('page', 'N/A')
        print(f"[Chunk {i}]: {doc.page_content[:200]}... (Page: {page})")