# PageWhisper

### A Retrieval-Augmented Generation (RAG) system that lets users upload a PDF and ask questions about it, generating context-aware answers with a local LLM

#### Importing the required libraries and frameworks

In [1]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import json
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import os
from ollama import Client
import ipywidgets as widgets
from IPython.display import display, clear_output
import time

#### Function to extract text from a PDF file

In [2]:
def extract_pdf(pdf_path, status):
    """Dump the text of every page of a PDF into ``extracted_text.txt``.

    Args:
        pdf_path: Path of the PDF file handed to ``PyPDFLoader``.
        status: Object whose ``value`` attribute receives progress messages.

    Returns:
        The path of the text file that was written. It is always
        ``"extracted_text.txt"`` (relative to the working directory), so each
        call overwrites the previous extraction.
    """
    output_path = "extracted_text.txt"

    status.value = "Extracting text from PDF..."
    documents = PyPDFLoader(pdf_path).load()

    # Page texts are glued together with a single newline between pages.
    page_texts = [document.page_content for document in documents]
    with open(output_path, "w", encoding="utf-8") as out_file:
        out_file.write("\n".join(page_texts))

    status.value = "✅ Text extracted from PDF."
    return output_path

#### Function to split the text file into chunks

In [3]:
def chunking(text_file_path, status):
    """Read a UTF-8 text file and split its contents into overlapping chunks.

    The split is delegated to ``RecursiveCharacterTextSplitter`` configured
    with ``chunk_size=700`` and ``chunk_overlap=100``; it tries the separators
    in order (paragraph break, newline, period, space, then no separator).

    Args:
        text_file_path: Path of the text file to split.
        status: Object whose ``value`` attribute receives progress messages.

    Returns:
        The list of text chunks produced by the splitter.
    """
    status.value = "Splitting text into chunks..."

    with open(text_file_path, "r", encoding="utf-8") as source:
        document_text = source.read()

    splitter_options = {
        "chunk_size": 700,
        "chunk_overlap": 100,
        "separators": ["\n\n", "\n", ".", " ", ""],
    }
    pieces = RecursiveCharacterTextSplitter(**splitter_options).split_text(document_text)

    status.value = f"✅ Chunks created: {len(pieces)}"
    return pieces



#### Function for embedding text chunks 

In [4]:
def embed_chunks(chunks, status, model_name="all-MiniLM-L6-v2"):
    """Encode text chunks into embedding vectors with a SentenceTransformer.

    Args:
        chunks: The text chunks to embed.
        status: Object whose ``value`` attribute receives progress messages.
        model_name: Name of the SentenceTransformer model to load. The model
            is instantiated on every call.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for ``chunks`` (one
        embedding per chunk).
    """
    status.value = "Embedding chunks..."

    encoder = SentenceTransformer(model_name)
    vectors = encoder.encode(chunks, show_progress_bar=True)

    status.value = f"✅ Embeddings generated: {len(vectors)}"
    return vectors

#### Function to store the embeddings in a vector database

In [5]:
def save_embeddings_faiss(embeddings, chunks, status, index_file="faiss_index.index"):
    """Build a flat L2 FAISS index from the embeddings and persist it to disk.

    Args:
        embeddings: 2-D array-like of shape ``(n_chunks, dim)``. It is
            converted to ``float32`` before being added to the index.
        chunks: The text chunks the embeddings were computed from. Not used
            here; kept so existing callers keep working.
        status: Object whose ``value`` attribute receives progress messages.
        index_file: Path the index is written to with ``faiss.write_index``.

    Returns:
        The populated ``faiss.IndexFlatL2`` instance.

    Raises:
        ValueError: If ``embeddings`` is empty or not two-dimensional, e.g.
            because the source document produced no text chunks.
    """
    vectors = np.asarray(embeddings, dtype="float32")

    # Fail early with a readable message instead of an IndexError from
    # ``shape[1]`` when there is nothing to index.
    if vectors.ndim != 2 or vectors.shape[0] == 0:
        raise ValueError(
            "embeddings must be a non-empty 2-D array of shape (n_chunks, dim); "
            f"got shape {vectors.shape}"
        )

    status.value = "Saving embeddings to FAISS index..."
    dim = vectors.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(vectors)
    faiss.write_index(index, index_file)
    status.value = f"✅ FAISS index saved as {index_file}"
    return index

#### Function to send the user query along with retrieved chunks to the LLM for response generation

In [6]:
def ask_llm_with_context(query, relevant_chunks, status,
                         model="llama3.2", host="http://localhost:11434"):
    """Ask an Ollama-served LLM to answer a question from retrieved chunks.

    The chunks are joined with blank lines into a single context section and
    embedded in a prompt that instructs the model to answer only from that
    context, or to reply with a fixed "no content" sentence otherwise.

    Args:
        query: The user's question.
        relevant_chunks: Iterable of text chunks to use as context.
        status: Object whose ``value`` attribute receives progress messages.
        model: Name of the Ollama model to chat with. Defaults to the
            previously hard-coded ``"llama3.2"``.
        host: Base URL of the Ollama server. Defaults to the previously
            hard-coded local server address.

    Returns:
        The text content of the model's reply.
    """
    client = Client(host=host)
    status.value = "🤖 Asking the LLM..."
    context = "\n\n".join(relevant_chunks)
    prompt = f"""You are a helpful assistant. Use only the information provided in the context below to answer the question. 
If the context does not contain information relevant to the question, respond with:
"There is no content related to your query in the provided document."

Context:
{context}

Question: {query}

Answer:"""
    # The whole prompt is sent as a single user message (no system message).
    response = client.chat(model=model, messages=[
        {"role": "user", "content": prompt}
    ])

    status.value = "✅ Response generated!"
    return response['message']['content']


#### Basic UI for demonstration purposes using widgets

In [8]:
# --- Shared state: written by the upload handler, read by the query handler ---
chunks = []
index = None
pdf_name = ""

# --- Static page header ---
heading = widgets.HTML(
    "<div style='margin-bottom: 20px;'>"
    "<h1 style='font-size: 36px;'>PageWhisper</h1>"
    "<p style='font-size: 18px; color: #555;'>Ask context-aware questions from your uploaded PDFs using RAG + LLM magic</p>"
    "</div>"
)

# --- Upload controls and progress feedback ---
file_name_label = widgets.Label(value="📄 No file uploaded yet.")
pdf_upload = widgets.FileUpload(accept='.pdf', multiple=False)
status_widget = widgets.Label(value="")

# --- Query controls: created hidden; the upload handler reveals them ---
query_input = widgets.Textarea(
    value='',
    placeholder='Ask your question...',
    description='Query:',
    layout=widgets.Layout(width='auto', height='100px', display='none'),
)
spacer = widgets.Box(layout=widgets.Layout(height='30px'))
run_button = widgets.Button(
    description="Run PDF QA",
    button_style='success',
    layout=widgets.Layout(display='none'),
)

# --- Area where answers and messages are printed ---
output = widgets.Output()

# Stack everything vertically in the order it should appear on screen.
page_children = [
    heading,
    file_name_label,
    pdf_upload,
    status_widget,
    query_input,
    spacer,
    run_button,
    output,
]
display(widgets.VBox(page_children))



def on_pdf_upload_change(change):
    """Ingest a newly uploaded PDF and make it available for querying.

    Saves the uploaded bytes to the working directory, then runs the
    extract -> chunk -> embed -> index pipeline. On success the module-level
    ``chunks``, ``index`` and ``pdf_name`` are replaced together and the
    query box and run button are revealed.

    Args:
        change: Change notification passed by the widget's ``observe``
            callback. Not inspected; the current upload is read from
            ``pdf_upload.value``.
    """
    global chunks, index, pdf_name

    with output:
        clear_output()

        # The trait also changes when the upload is cleared; nothing to do then.
        if not pdf_upload.value:
            return

        status_widget.value = "🔄 Processing PDF..."
        uploaded_file = pdf_upload.value[0]

        # The file name comes from the client: keep only its final path
        # component so the write cannot land outside the working directory.
        safe_name = os.path.basename(uploaded_file['name']) or "uploaded.pdf"
        file_name_label.value = f"📄 Uploaded File: {safe_name}"

        try:
            with open(safe_name, 'wb') as f:
                f.write(uploaded_file['content'])

            text_file = extract_pdf(safe_name, status_widget)
            new_chunks = chunking(text_file, status_widget)
            if not new_chunks:
                # E.g. a scanned PDF without a text layer: nothing to embed.
                status_widget.value = "⚠️ No extractable text found in this PDF."
                return

            embeddings = embed_chunks(new_chunks, status_widget)
            new_index = save_embeddings_faiss(embeddings, new_chunks, status_widget)
        except Exception:
            # Do not leave a stale progress message; the traceback is still
            # shown in the output widget.
            status_widget.value = "❌ Failed to process the PDF."
            raise

        # Publish the new state only after the whole pipeline succeeded, so
        # ``chunks`` and ``index`` always describe the same document.
        chunks, index, pdf_name = new_chunks, new_index, safe_name

        status_widget.value = "✅ PDF processed! You can now ask your question."

        query_input.layout.display = 'block'
        run_button.layout.display = 'block'

# Query-time embedding model, loaded on first use and reused afterwards.
_query_encoder = None


def _get_query_encoder():
    """Return the query embedding model, loading it only on the first call."""
    global _query_encoder
    if _query_encoder is None:
        _query_encoder = SentenceTransformer("all-MiniLM-L6-v2")
    return _query_encoder


def on_run_button_click(b):
    """Answer the question in the query box using the indexed PDF.

    Embeds the query, retrieves up to three nearest chunks from the FAISS
    index, passes them to the LLM as context and prints the answer into the
    output widget.

    Args:
        b: The button instance passed by ``on_click``. Not used.
    """
    with output:
        clear_output()
        status_widget.value = ""

        # Compare against None explicitly rather than relying on the
        # truthiness of the FAISS index object.
        if index is None or not chunks:
            print("Please upload and process a PDF first.")
            return

        query = query_input.value.strip()
        if not query:
            print("Enter a query to proceed.")
            return

        status_widget.value = "Searching for relevant chunks..."

        query_embedding = _get_query_encoder().encode([query])

        # Never ask for more neighbours than there are chunks: FAISS pads
        # missing results with -1, and chunks[-1] would silently return the
        # last chunk a second time.
        top_k = min(3, len(chunks))
        _, neighbor_ids = index.search(
            np.array(query_embedding).astype("float32"), k=top_k
        )
        retrieved_chunks = [
            chunks[i] for i in neighbor_ids[0] if 0 <= i < len(chunks)
        ]

        response = ask_llm_with_context(query, retrieved_chunks, status_widget)
        print("😎 Answer for your Query : \n",response)

# Event bindings: run the PDF handler whenever the upload widget's `value`
# trait changes, and the query handler on every click of the run button.
pdf_upload.observe(on_pdf_upload_change, names='value')
run_button.on_click(on_run_button_click)


The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


VBox(children=(HTML(value="<div style='margin-bottom: 20px;'><h1 style='font-size: 36px;'>PageWhisper</h1><p s…