In [29]:
!pip show gradio

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Name: gradio
Version: 3.39.0
Summary: Python library for easily interacting with trained machine learning models
Home-page: https://github.com/gradio-app/gradio
Author: 
Author-email: Abubakar Abid <team@gradio.app>, Ali Abid <team@gradio.app>, Ali Abdalla <team@gradio.app>, Dawood Khan <team@gradio.app>, Ahsen Khaliq <team@gradio.app>, Pete Allen <team@gradio.app>, Ömer Faruk Özdemir <team@gradio.app>
License: 
Location: /Users/aaron/Documents/code/Deep-Learning-AI/deeplearningai/lib/python3.9/site-packages
Requires: aiofiles, aiohttp, altair, fastapi, ffmpy, gradio-client, httpx, huggingface-hub, jinja2, markdown-it-py, markupsafe, matplotlib, mdit-py-plugins, numpy, orjson, packaging, pandas, pillow, pydantic, pydub, python-multipart, pyyaml, requests, semantic-version, typing-extensions, uvicorn, websockets
Required-by: 


In [30]:
!pip install gradio==3.39.0

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [31]:
# Execute the sibling utils notebook inside this kernel; its captured output
# reports a successful MongoDB connection.
# NOTE(review): which names it defines for later cells is not visible here — confirm.
%run ./utils.ipynb

✅ Successfully connected to MongoDB.


In [40]:
import gradio as gr
import os
import shutil
import fitz  # PyMuPDF
from pymongo import MongoClient
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import List, Dict
import chromadb
from chromadb.config import Settings

# Ensure ./PDFs directory exists
os.makedirs("./PDFs", exist_ok=True)

# MongoDB setup
# The connection string is read from the MONGO_URI environment variable when set.
# SECURITY: the literal fallback below embeds a username/password in the notebook;
# it is kept only so existing runs keep working. Rotate that credential and drop
# the fallback once MONGO_URI is provided through the environment.
MONGO_URI = os.environ.get(
    "MONGO_URI",
    "mongodb+srv://Aaron:1234@cluster0.erwea75.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0",
)

def get_mongo_client(uri):
    """Construct a new ``MongoClient`` for the given connection string.

    Args:
        uri: MongoDB connection URI, passed unchanged to ``MongoClient``.

    Returns:
        The newly created client. Nothing is cached, so every call builds
        a separate client object.
    """
    mongo_client = MongoClient(uri)
    return mongo_client

def get_collection(client, db_name, collection_name):
    """Look up a collection by subscripting the client twice.

    Args:
        client: Object indexable by database name (e.g. a ``MongoClient``).
        db_name: Key used to select the database from ``client``.
        collection_name: Key used to select the collection from that database.

    Returns:
        Whatever ``client[db_name][collection_name]`` evaluates to.
    """
    database = client[db_name]
    return database[collection_name]

def insert_pdf_pages_to_mongo(collection, pdf_name, pages):
    """Insert each page of a PDF unless that page is already stored.

    Args:
        collection: Object exposing ``find_one`` and ``insert_one``.
        pdf_name: Name stored in the ``pdf_name`` field of every document.
        pages: Iterable of dicts carrying ``page_number`` and ``text`` keys.

    A page counts as already stored when ``find_one`` returns a truthy
    result for the same ``pdf_name`` / ``page_number`` pair.
    """
    for entry in pages:
        lookup = {"pdf_name": pdf_name, "page_number": entry["page_number"]}
        if collection.find_one(lookup):
            continue  # this page is already present
        collection.insert_one({
            "pdf_name": pdf_name,
            "page_number": entry["page_number"],
            "text": entry["text"],
        })

def extract_text_from_pdf(file_path):
    """Extract the stripped text of every page in a PDF.

    Args:
        file_path: Path handed to ``fitz.open``.

    Returns:
        A list with one dict per page, in page order, each holding
        ``page_number`` (1-based) and ``text`` (page text with surrounding
        whitespace stripped).

    The document is closed in a ``finally`` block so the handle is released
    even when loading or reading a page raises.
    """
    doc = fitz.open(file_path)
    try:
        pages = []
        for page_index in range(len(doc)):
            page = doc.load_page(page_index)
            pages.append({
                "page_number": page_index + 1,  # stored page numbers are 1-based
                "text": page.get_text().strip(),
            })
        return pages
    finally:
        doc.close()

# Upload Logic
# Plain filenames (relative to ./PDFs) from the most recent upload; read by upload_to_mongo.
uploaded_pdfs = []

def upload_docs(files):
    """Copy uploaded files into ./PDFs and remember their filenames.

    Args:
        files: Iterable of objects exposing a ``name`` attribute that holds
            the source path, or ``None``/empty when nothing was selected.

    Returns:
        A status string listing the handled files. A file whose source path
        already is its destination inside ./PDFs is not copied and is
        reported with an " (already exists)" suffix.

    Side effects:
        Creates ./PDFs if needed and replaces the module-level
        ``uploaded_pdfs`` list. That list always holds bare filenames; the
        " (already exists)" suffix is display-only so later steps can build
        valid paths from the stored names.
    """
    global uploaded_pdfs

    dest_dir = "./PDFs"
    os.makedirs(dest_dir, exist_ok=True)

    stored_names = []  # real filenames, consumed by later steps
    report = []        # human-readable entries for the status message
    for file in files or []:
        filename = os.path.basename(file.name)
        dest_path = os.path.join(dest_dir, filename)

        if os.path.abspath(file.name) != os.path.abspath(dest_path):
            shutil.copy(file.name, dest_path)
            report.append(filename)
        else:
            # Source is the destination itself: nothing to copy.
            report.append(filename + " (already exists)")
        stored_names.append(filename)

    uploaded_pdfs = stored_names
    if not report:
        return "❌ No files selected."
    return f"Uploaded: {', '.join(report)}"

# Mongo Upload Logic
def upload_to_mongo():
    """Extract text from the most recently uploaded PDFs and store it in MongoDB.

    Reads the module-level ``uploaded_pdfs`` list. For every name, the PDF
    under ./PDFs is skipped when the ``pdf_rag_db.pages`` collection already
    holds a document with that ``pdf_name``; otherwise its pages are
    extracted and inserted.

    Returns:
        A status string naming the inserted and the skipped PDFs, or an
        error message when nothing has been uploaded yet.

    The client is closed in a ``finally`` block so each call releases its
    connection. A trailing " (already exists)" display marker on a name is
    stripped before the name is used as a path or database key.
    """
    if not uploaded_pdfs:
        return "❌ No PDFs uploaded yet."

    display_marker = " (already exists)"
    inserted = []
    skipped = []

    client = get_mongo_client(MONGO_URI)
    try:
        collection = get_collection(client, "pdf_rag_db", "pages")

        for entry in uploaded_pdfs:
            # Tolerate names that still carry the UI-only marker.
            pdf = entry[:-len(display_marker)] if entry.endswith(display_marker) else entry
            full_path = os.path.join("./PDFs", pdf)

            if collection.find_one({"pdf_name": pdf}):
                skipped.append(pdf)
                continue

            pages = extract_text_from_pdf(full_path)
            insert_pdf_pages_to_mongo(collection, pdf, pages)
            inserted.append(pdf)
    finally:
        client.close()

    result = ""
    if inserted:
        result += f"✅ Uploaded to MongoDB: {', '.join(inserted)}\n"
    if skipped:
        result += f"⚠️ Skipped (already in DB): {', '.join(skipped)}"
    return result.strip()

# Add this helper function to get list of uploaded PDFs
def get_uploaded_pdfs():
    """List the PDF filenames currently present in ./PDFs.

    Returns:
        Filenames whose extension is ``.pdf`` in any letter case, sorted so
        the result is stable across calls. An empty list is returned when
        the ./PDFs directory does not exist.
    """
    pdf_dir = "./PDFs"
    if not os.path.isdir(pdf_dir):
        return []
    return sorted(
        name for name in os.listdir(pdf_dir)
        if name.lower().endswith(".pdf")  # also accept e.g. "REPORT.PDF"
    )

# ------------------ CHUNKING LOGIC ------------------
def chunking(
    selected_pdf,
    db_name: str = "pdf_rag_db",
    collection_name: str = "pages",
    chunk_size: int = 500,
    chunk_overlap: int = 100
) -> str:
    """Split the stored pages of one PDF into chunks and save them to MongoDB.

    Args:
        selected_pdf: Value matched against the ``pdf_name`` field.
        db_name: Database holding both the pages and the ``chunks`` collection.
        collection_name: Collection the page documents are read from.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter``.

    Returns:
        A status string: the number of stored chunks, or an error message
        when no pages were found or no chunks were produced.

    Existing chunks for ``selected_pdf`` are deleted before the new ones are
    inserted, so running this again replaces the chunks instead of
    accumulating duplicates. The client is closed when the function exits.
    """
    client = get_mongo_client(MONGO_URI)
    try:
        pages_collection = get_collection(client, db_name, collection_name)
        documents = list(pages_collection.find({"pdf_name": selected_pdf}))
        if not documents:
            return f"❌ No pages found in MongoDB for {selected_pdf}."

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", ".", " ", ""]
        )

        chunked_docs = []
        for doc in documents:
            # chunk_index restarts at 0 for every page
            for idx, chunk in enumerate(text_splitter.split_text(doc["text"])):
                chunked_docs.append({
                    "pdf_name": doc["pdf_name"],
                    "page_number": doc["page_number"],
                    "chunk_index": idx,
                    "chunk_text": chunk
                })

        if not chunked_docs:
            return "❌ No chunks created."

        chunk_collection = get_collection(client, db_name, "chunks")
        # Replace, don't append: drop chunks left over from an earlier run.
        chunk_collection.delete_many({"pdf_name": selected_pdf})
        chunk_collection.insert_many(chunked_docs)
        return f"✅ Stored {len(chunked_docs)} chunks for {selected_pdf} in MongoDB."
    finally:
        client.close()

def display_chunks(selected_pdf):
    """Fetch every stored chunk of one PDF from ``pdf_rag_db.chunks``.

    Args:
        selected_pdf: Value matched against the ``pdf_name`` field.

    Returns:
        A list of chunk documents with ``_id`` projected out, or a warning
        string when no chunk matches.

    The client is closed in a ``finally`` block once the results have been
    materialised into a list.
    """
    client = get_mongo_client(MONGO_URI)
    try:
        collection = get_collection(client, "pdf_rag_db", "chunks")
        chunks = list(collection.find({"pdf_name": selected_pdf}, {"_id": 0}))
    finally:
        client.close()
    # NOTE(review): the empty case returns a plain string while the UI wires this
    # to a JSON output — confirm the component accepts a non-JSON string.
    return chunks if chunks else f"⚠️ No chunks found in MongoDB for {selected_pdf}."

# ------------------ EMBEDDING & CHROMADB ------------------
def store_embedded_chunks_to_chroma(selected_pdf):
    """Copy the MongoDB chunks of one PDF into the ``pdf_chunks`` Chroma collection.

    Args:
        selected_pdf: Value matched against the ``pdf_name`` field of
            ``pdf_rag_db.chunks``.

    Returns:
        A status string: how many new chunks were added, that everything was
        already stored, or that no chunks exist for the PDF.

    Each chunk gets the id ``"<pdf_name>_<page_number>_<chunk_index>"``. Ids
    already present in the Chroma collection are skipped, and so are ids
    repeated within the same batch, so duplicated chunk documents in MongoDB
    cannot hand Chroma the same id twice. Only ids, documents and metadata
    are passed to ``add``; no embeddings are supplied explicitly.
    """
    client = get_mongo_client(MONGO_URI)
    try:
        chunks_collection = get_collection(client, "pdf_rag_db", "chunks")
        # Fetch chunks for selected PDF (materialised before the client is closed)
        chunks = list(chunks_collection.find({"pdf_name": selected_pdf}))
    finally:
        client.close()

    if not chunks:
        return f"❌ No chunks found for {selected_pdf}. Please chunk it first."

    # Init Chroma client (local)
    chroma_client = chromadb.Client()
    chroma_collection = chroma_client.get_or_create_collection(name="pdf_chunks")

    # Prepare lists for new chunks only
    new_ids, new_docs, new_metadata = [], [], []
    seen_ids = set(chroma_collection.get(ids=None)["ids"])  # ids already in Chroma

    for chunk in chunks:
        chunk_id = f"{chunk['pdf_name']}_{chunk['page_number']}_{chunk['chunk_index']}"
        if chunk_id in seen_ids:
            continue  # already embedded, or repeated earlier in this batch
        seen_ids.add(chunk_id)
        new_ids.append(chunk_id)
        new_docs.append(chunk["chunk_text"])
        new_metadata.append({
            "pdf_name": chunk["pdf_name"],
            "page_number": chunk["page_number"],
            "chunk_index": chunk["chunk_index"]
        })

    if not new_ids:
        return f"✅ All chunks of {selected_pdf} are already stored in ChromaDB."

    chroma_collection.add(
        ids=new_ids,
        documents=new_docs,
        metadatas=new_metadata
    )

    return f"✅ Embedded and stored {len(new_ids)} new chunks for {selected_pdf} in ChromaDB."

# ------------------ UI ------------------
def view_all_chunks(collection_name: str = "pdf_chunks"):
    """Render up to 500 stored chunks of a Chroma collection as text.

    Defined before the UI is built so the click handler below can reference
    it on a fresh kernel.

    Args:
        collection_name: Name of the Chroma collection to read.

    Returns:
        One text block per chunk (document plus the first five embedding
        values), an error string when the collection cannot be opened, or a
        warning string when the collection holds no chunks.
    """
    client = chromadb.Client()
    try:
        collection = client.get_collection(name=collection_name)
    except Exception as e:
        return f"❌ Error: {str(e)}"

    results = collection.get(include=["documents", "embeddings"], limit=500)

    chunks = []
    for i, (doc, embedding) in enumerate(zip(results["documents"], results["embeddings"])):
        chunks.append(f"🧩 Chunk {i+1}:\n{doc}\n🔢 Embedding (first 5 values): {embedding[:5]}\n{'-'*40}")
    return "\n".join(chunks) if chunks else "⚠️ No chunks found in collection."


def _refresh_pdf_choices():
    """Return one choices-update per PDF dropdown, re-reading ./PDFs."""
    choices = get_uploaded_pdfs()
    return [gr.update(choices=choices) for _ in range(3)]


with gr.Blocks() as demo:
    gr.Markdown("## 📄 PDF Upload + MongoDB + Knowledge Nuggets")

    with gr.Tab("Upload PDFs"):
        upload_input = gr.File(label="Upload PDFs", file_types=[".pdf"], file_count="multiple")
        upload_result = gr.Textbox(label="Upload Result")
        mongo_result = gr.Textbox(label="MongoDB Result")

        upload_btn = gr.Button("Upload to ./PDFs")
        mongo_btn = gr.Button("Upload Extracted Text to MongoDB")

        # Keep the event handle: the dropdown refresh is chained onto it below,
        # once the dropdowns exist.
        upload_event = upload_btn.click(upload_docs, inputs=upload_input, outputs=upload_result)
        mongo_btn.click(upload_to_mongo, outputs=mongo_result)

    with gr.Tab("Knowledge Nuggets"):
        pdf_select_chunk = gr.Dropdown(label="Select PDF to Chunk", choices=get_uploaded_pdfs(), interactive=True)
        chunk_btn = gr.Button("🔪 Chunk PDF Pages")
        chunk_output = gr.Textbox(label="Chunking Result")

        pdf_select_display = gr.Dropdown(label="Select PDF to Display Chunks", choices=get_uploaded_pdfs(), interactive=True)
        display_btn = gr.Button("📚 Display Chunks")
        display_output = gr.JSON(label="Chunks from MongoDB")

        chunk_btn.click(chunking, inputs=pdf_select_chunk, outputs=chunk_output)
        display_btn.click(display_chunks, inputs=pdf_select_display, outputs=display_output)

    with gr.Tab("Embed to Chroma"):
        # PDF selection for embedding
        pdf_select_embed = gr.Dropdown(label="Select PDF to Embed", choices=get_uploaded_pdfs(), interactive=True)
        embed_btn = gr.Button("💾 Embed and Store in ChromaDB")
        embed_output = gr.Textbox(label="Embedding Result")

        # Show all chunks from ChromaDB collection
        view_btn = gr.Button("👁️ View All Chunks in ChromaDB")
        view_output = gr.Textbox(label="Chunks in ChromaDB")

        # Embed selected PDF into ChromaDB
        embed_btn.click(store_embedded_chunks_to_chroma, inputs=pdf_select_embed, outputs=embed_output)

        # Show all chunks (no PDF selection); registered exactly once
        view_btn.click(fn=view_all_chunks, outputs=view_output)

    with gr.Tab("Query Knowledge Nuggets"):
        # Placeholder: this tab previously only re-registered the view handler,
        # which made every click run view_all_chunks twice.
        gr.Markdown("Querying is not implemented yet.")

    # After an upload finishes, refresh every PDF dropdown so the new files are
    # selectable without rebuilding the UI.
    upload_event.then(
        _refresh_pdf_choices,
        outputs=[pdf_select_chunk, pdf_select_display, pdf_select_embed],
    )


# Start the Gradio app. With share=True the captured output shows a public
# *.gradio.live URL in addition to the local one.
# NOTE(review): no auth argument is passed and the UI writes to MongoDB —
# confirm that exposing it through a public link is intended.
demo.launch(share=True)

Running on local URL:  http://127.0.0.1:7883
IMPORTANT: You are using gradio version 3.39.0, however version 4.44.1 is available, please upgrade.
--------


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Running on public URL: https://712262c4f1220c50aa.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


