**Project Name**: PDF RAG Assistant

**Objective:**

To create an AI-powered system that can read any PDF document uploaded by the user and answer questions about it accurately, using a combination of:

- Semantic retrieval (to find relevant sections of the PDF)

- A large language model (LLM) (to generate human-like answers from the retrieved context)

Essentially, this is a Retrieval-Augmented Generation (RAG) system for PDFs.

## **Install necessary libraries**

In [None]:
!pip install pypdf
!pip install PyPDF2
from PyPDF2 import PdfReader
import io
!pip install langchain openai faiss-cpu





# File uploading

In [None]:
from IPython.display import display
from ipywidgets import FileUpload

# Upload widget limited to a single file with a .pdf extension; the next code
# cell reads the uploaded content back from `uploader.value`.
uploader = FileUpload(accept='.pdf', multiple=False)
# Render the widget in the cell output so a file can be chosen.
display(uploader)


FileUpload(value={}, accept='.pdf', description='Upload')

# Read PDF text and split it into chunks

In [None]:
from PyPDF2 import PdfReader
import io

# `uploader.value` is a dict keyed by file name in ipywidgets 7 and a tuple of
# per-file dicts in ipywidgets 8; normalise both layouts to a list of files.
uploaded_files = uploader.value
if isinstance(uploaded_files, dict):
    uploaded_files = list(uploaded_files.values())
if not uploaded_files:
    # Previously this surfaced as a bare IndexError with no hint of the cause.
    raise ValueError("No PDF uploaded yet - use the upload widget above first.")

pdf_file = uploaded_files[0]
pdf_bytes = pdf_file['content']

reader = PdfReader(io.BytesIO(pdf_bytes))

# One newline after every page keeps words on either side of a page break
# apart. `or ""` guards against pages that yield no text (the same guard
# process_pdf applies in the final cell), so the concatenation cannot fail.
pdf_text = "".join((page.extract_text() or "") + "\n" for page in reader.pages)

# Preview the start of the extracted text as a sanity check.
print(pdf_text[:500])


 Smart Shopping Cart using RFID Technology for 
Automated Billing and Enhanced Retail Experience  
1*Rathiya R   
Department of Information 
Technnology  
Dr.N.G.P. Institute of Technology  
Coimbatore , India  
*vr.rathiya@gmail.com  
4Bharath Kumar P  
Department of Information Technology  
Dr.N.G.P. Institute of Technology  
Coimbatore , India  
24205011@drngpit.ac.in 2Madhumitha  S 
Department of Information  
Technology  
Dr.N.G.P. Institute of Technology  
Coimbatore , India  
24205058@drn


In [None]:
# Fixed-size character windows: each chunk holds up to `chunk_size` characters
# and consecutive chunks share `overlap` characters.
chunk_size = 500
overlap = 50

# Distance between the start offsets of two consecutive chunks.
step = chunk_size - overlap

chunks = [
    pdf_text[start:start + chunk_size]
    for start in range(0, len(pdf_text), step)
]

print(f"Total chunks: {len(chunks)}")
print(chunks[:2])

Total chunks: 52
[' Smart Shopping Cart using RFID Technology for \nAutomated Billing and Enhanced Retail Experience  \n1*Rathiya R   \nDepartment of Information \nTechnnology  \nDr.N.G.P. Institute of Technology  \nCoimbatore , India  \n*vr.rathiya@gmail.com  \n4Bharath Kumar P  \nDepartment of Information Technology  \nDr.N.G.P. Institute of Technology  \nCoimbatore , India  \n24205011@drngpit.ac.in 2Madhumitha  S \nDepartment of Information  \nTechnology  \nDr.N.G.P. Institute of Technology  \nCoimbatore , India  \n24205058@drn', ' of Technology  \nCoimbatore , India  \n24205058@drngpit.ac.in  \n5Chith irika K S \nDepartment of Information Technology  \nDr.N.G.P. Institute of Technology  \nCoimbatore , India  \n24205014@drngpit.ac.in 3Elakiya VL  \nDepartment of Information  \nTechnology  \nDr.N.G.P. Institute of Technology                 \nCoimbatore , India  \n24205022@drngpit.ac.in  \n6Archana  R \nDepartment of Information Technology  \nDr.N.G.P. Institute of Technology  \nCoim

# Convert chunks into embeddings

In [None]:
!pip install --upgrade langchain langchain-community openai faiss-cpu


Collecting langchain
  Downloading langchain-1.0.8-py3-none-any.whl.metadata (4.9 kB)
Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting openai
  Downloading openai-2.8.1-py3-none-any.whl.metadata (29 kB)
Collecting langchain-core<2.0.0,>=1.0.6 (from langchain)
  Downloading langchain_core-1.0.7-py3-none-any.whl.metadata (3.6 kB)
Collecting langgraph<1.1.0,>=1.0.2 (from langchain)
  Downloading langgraph-1.0.3-py3-none-any.whl.metadata (7.8 kB)
Collecting langchain-classic<2.0.0,>=1.0.0 (from langchain-community)
  Downloading langchain_classic-1.0.0-py3-none-any.whl.metadata (3.9 kB)
Collecting requests<3.0.0,>=2.32.5 (from langchain-community)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7.0,>=0.6.7-

In [51]:
from langchain_community.embeddings import OpenAIEmbeddings
# Embedding client configured for the "text-embedding-3-small" model.
# NOTE(review): no visible cell calls this object -- the following cells embed
# with a local SentenceTransformer and a later cell rebinds the name
# `embeddings` -- confirm whether this cell (which presumably needs an OpenAI
# API key) is still required.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Store embeddings in FAISS

In [None]:
!pip install sentence-transformers

from sentence_transformers import SentenceTransformer

# Load a lightweight local model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode chunks locally (no API, no quota)
chunk_embeddings = model.encode(chunks)

import faiss
import numpy as np

embedding_matrix = np.array(chunk_embeddings).astype("float32")
dimension = embedding_matrix.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embedding_matrix)

print(f"FAISS index contains {index.ntotal} vectors")




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

FAISS index contains 52 vectors


# Test retrieval with a simple query

In [None]:
# Example user query
query = "what is smart shopping cart"

query_embedding = model.encode([query]).astype("float32")

k = 3
distances, indices = index.search(query_embedding, k)
retrieved_chunks = [chunks[idx] for idx in indices[0]]

print("Top retrieved chunks:")
for i, chunk in enumerate(retrieved_chunks):
    print(f"\nChunk {i+1}: {chunk[:200]}...")


Top retrieved chunks:

Chunk 1: t applicable in 
supermarkets, hypermarkets, and departmental stores. The 
Smart Shopping Cart provides a moderate opportunity to the 
current smart retail systems. Although certain solutions like 
th...

Chunk 2:  that is easier to deploy and administer. 
Altogether, the Smart Shopping Cart system is being 
presented in the form of a smart solution which will help to 
overcome the most significant disadvantage...

Chunk 3: is system, the smart carts are to be equipped to automatically identify the 
products loaded in it, and then compute the total bill and 
allow the customer to make the payment online, without 
necessa...


# Embedding & Vector Indexing Pipeline and RAG Retriever

In [52]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Embed the chunks produced from the uploaded PDF by the earlier cells.
# The previous version of this cell re-imported everything, then overwrote
# `chunks` with placeholder strings ("chunk1 text...") and indexed those
# instead, leaving the FAISS index unrelated to the document.
model = SentenceTransformer('all-MiniLM-L6-v2')
chunk_embeddings = model.encode(chunks)

# Flat L2 index over a float32 matrix with one row per chunk.
embedding_matrix = np.array(chunk_embeddings).astype("float32")
dimension = embedding_matrix.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embedding_matrix)


In [None]:
!pip install --upgrade langchain faiss-cpu sentence-transformers openai




In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Build the index over the `chunks` list that already exists in the notebook.
# This cell used to replace `chunks` with two placeholder strings, which
# discarded the text extracted from the PDF.
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(chunks)

# Flat L2 index sized to the embedding dimension, filled with float32 rows.
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(np.array(embeddings).astype("float32"))

def retrieve_chunks(query, index, chunks, model, top_k=3):
    """Return the chunks whose embeddings are closest to *query*.

    Args:
        query: Question text to embed.
        index: Object exposing ``search(vectors, k)`` that returns a
            ``(distances, ids)`` pair, e.g. a FAISS index.
        chunks: Sequence of chunk texts, positionally aligned with the
            vectors stored in *index*.
        model: Object exposing ``encode(list_of_texts)``.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of at most *top_k* chunk texts in the order the index returned
        them. Empty when *chunks* is empty or *top_k* is not positive.
    """
    if top_k <= 0 or len(chunks) == 0:
        return []

    # Embed the query and run the nearest-neighbour search.
    query_embedding = model.encode([query])
    _, ids = index.search(np.array(query_embedding).astype("float32"), top_k)

    # FAISS pads the id row with -1 when the index holds fewer than top_k
    # vectors; Python would read -1 as "last chunk", so drop any id that is
    # not a real position in `chunks`.
    return [chunks[i] for i in ids[0] if 0 <= i < len(chunks)]


# PDF Question-Answering Engine with Local RAG

In [53]:
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import io
from IPython.display import display, clear_output
import ipywidgets as widgets

# ------------------------------
# Load models
# ------------------------------
# Sentence embedding model used for both chunk and query vectors.
embed_model = SentenceTransformer('all-MiniLM-L12-v2')  # stronger embeddings
# Tokenizer and seq2seq model used to generate answers; both are loaded from
# the same "google/flan-t5-base" checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
llm_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

# ------------------------------
# Global variables
# ------------------------------
# Shared state: written by on_upload_change, read by on_question_submit.
chunks = []  # chunk texts of the currently loaded PDF
index = None  # FAISS index over `chunks`; None until a PDF has been processed
debug_mode = False  # set True to see retrieved chunks

# ------------------------------
# PDF upload widget
# ------------------------------
# Single-file upload restricted to .pdf files.
upload_widget = widgets.FileUpload(
    accept='.pdf',
    multiple=False,
    description='Upload PDF'
)
display(upload_widget)

# ------------------------------
# Question input + output
# ------------------------------
# Text box for the question; the callbacks print into `answer_output`.
question_widget = widgets.Text(
    description='Your Question:',
    placeholder='Type your question and press Enter...'
)
answer_output = widgets.Output()
display(question_widget, answer_output)

# ------------------------------
# PDF processing functions
# ------------------------------
def process_pdf(file_bytes):
    """Extract the text of every page of a PDF held in memory.

    Args:
        file_bytes: Raw PDF content as a bytes-like object.

    Returns:
        The text of all pages that produced any, joined with newlines. Empty
        string when no page yields text.
    """
    reader = PdfReader(io.BytesIO(file_bytes))
    page_texts = (page.extract_text() for page in reader.pages)
    # Join with a newline: plain concatenation fused the last word of one page
    # with the first word of the next. Pages with no text are skipped.
    return "\n".join(text for text in page_texts if text)

def split_into_chunks(text, chunk_size=500, overlap=50):
    """Split *text* into overlapping fixed-size character windows.

    Args:
        text: String to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunks whose start offsets are ``chunk_size - overlap``
        apart. Empty list for empty *text*.

    Raises:
        ValueError: If *chunk_size* or *overlap* is out of range. Previously
            ``overlap > chunk_size`` silently produced no chunks and a
            negative overlap silently skipped text between chunks.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

def create_faiss_index(chunks_local):
    """Embed *chunks_local* with ``embed_model`` and index the vectors.

    Args:
        chunks_local: List of chunk texts to embed.

    Returns:
        A ``faiss.IndexFlatL2`` holding one float32 vector per chunk.
    """
    vectors = np.array(embed_model.encode(chunks_local)).astype("float32")
    # The index dimension is the width of the embedding matrix.
    flat_index = faiss.IndexFlatL2(vectors.shape[1])
    flat_index.add(vectors)
    return flat_index

def _select_context_chunks(ids, chunks_local):
    """Map FAISS result ids to unique chunk texts in document order."""
    # FAISS pads the id row with -1 when it has fewer vectors than requested;
    # Python would read -1 as "last chunk", so keep real positions only.
    valid_ids = sorted({int(i) for i in ids if 0 <= i < len(chunks_local)})
    # dict.fromkeys removes repeated chunk texts while keeping their order.
    return list(dict.fromkeys(chunks_local[i] for i in valid_ids))


def answer_question(query, chunks_local, faiss_index):
    """Answer *query* from the most relevant chunks of the indexed document.

    Args:
        query: Question text.
        chunks_local: Chunk texts, positionally aligned with *faiss_index*.
        faiss_index: FAISS index built over the embeddings of *chunks_local*.

    Returns:
        The decoded text generated by ``llm_model`` for the prompt.

    Raises:
        ValueError: If no document has been indexed yet (*faiss_index* is
            None or *chunks_local* is empty).
    """
    if faiss_index is None or not chunks_local:
        raise ValueError("No document has been indexed yet; upload a PDF first.")

    # Retrieve top-5 chunks
    k = 5
    query_emb = embed_model.encode([query]).astype("float32")
    _, indices = faiss_index.search(query_emb, k)
    sorted_chunks = _select_context_chunks(indices[0], chunks_local)
    context = "\n".join(sorted_chunks)

    if debug_mode:
        print("\n--- Retrieved Chunks ---")
        for i, c in enumerate(sorted_chunks):
            print(f"[{i}] {c}\n---")

    # The question goes before the context: the tokenizer truncates from the
    # end of the prompt, so a long context must not be able to cut it off.
    prompt = (
        "You are an expert assistant. Using the context below, answer the question concisely.\n"
        "If the answer is not in the context, respond with 'Answer not found in the document'.\n"
        "\n"
        f"Question: {query}\n"
        "\n"
        "Context:\n"
        f"{context}\n"
    )
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
    outputs = llm_model.generate(**inputs, max_new_tokens=256)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# ------------------------------
# Callbacks
# ------------------------------
def on_upload_change(change):
    """Rebuild the chunk list and FAISS index when a PDF is uploaded.

    Reads the file from ``upload_widget``, updates the module-level ``chunks``
    and ``index``, and reports the outcome in ``answer_output``.

    Args:
        change: Change payload passed by the widget observer; not used.
    """
    global chunks, index

    uploaded = upload_widget.value
    if not uploaded:
        return  # no file uploaded yet

    # ipywidgets 7 exposes a dict keyed by file name, ipywidgets 8 a tuple of
    # per-file dicts; take the first file from either layout.
    files = uploaded.values() if isinstance(uploaded, dict) else uploaded
    uploaded_file = next(iter(files))
    file_bytes = uploaded_file.get('content')
    if not file_bytes:
        return

    pdf_text = process_pdf(file_bytes)
    new_chunks = split_into_chunks(pdf_text)

    if not new_chunks:
        # Nothing to embed (e.g. no page produced text). Building an index
        # would fail, so drop any previous document and tell the user.
        chunks[:] = []
        index = None
        with answer_output:
            clear_output()
            print("⚠️ No extractable text was found in this PDF.")
        return

    # Build the index first so `chunks` and `index` change together.
    new_index = create_faiss_index(new_chunks)
    chunks[:] = new_chunks  # update global in place
    index = new_index

    with answer_output:
        clear_output()
        print("✅ PDF processed! You can now ask questions below.")

def on_question_submit(sender):
    """Answer the question typed into *sender* and print it to the output area.

    Args:
        sender: Widget whose ``value`` holds the question text. It is cleared
            after an answer has been printed.
    """
    query = sender.value.strip()
    if not query:
        return
    with answer_output:
        if index is None or not chunks:
            # No document indexed yet. Leave the question in the box so it
            # can be resubmitted after a PDF has been uploaded.
            print("⚠️ Please upload a PDF before asking a question.")
            return
        print(f"\nQ: {query}")
        answer = answer_question(query, chunks, index)
        print(f"A: {answer}\n")
    sender.value = ""  # clear input for next question

# ------------------------------
# Link callbacks
# ------------------------------
# Call on_upload_change whenever the upload widget's `value` trait changes.
upload_widget.observe(on_upload_change, names='value')
# Call on_question_submit when the question text is submitted.
# NOTE(review): `Text.on_submit` is reportedly deprecated in ipywidgets 8 --
# confirm against the installed version before relying on it.
question_widget.on_submit(on_question_submit)


FileUpload(value={}, accept='.pdf', description='Upload PDF')

Text(value='', description='Your Question:', placeholder='Type your question and press Enter...')

Output()