In [37]:
import os
from typing import List, Dict, Optional

from pymongo import MongoClient

In [1]:
def check_mongo_connection(uri) -> bool:
    """
    Check whether a MongoDB deployment is reachable.

    Opens a short-lived client with a 3 second server-selection timeout,
    sends a "ping" admin command, prints the outcome, and always closes the
    probe client before returning so no connection pool is left behind.

    Parameters:
        uri: MongoDB connection string

    Returns:
        True if the ping succeeded, False if any exception was raised.
    """
    client = None  # stays None if the constructor itself raises (e.g. bad URI)
    try:
        client = MongoClient(uri, serverSelectionTimeoutMS=3000)
        client.admin.command("ping")  # lightweight command to check connection
        print("✅ Successfully connected to MongoDB.")
        return True
    except Exception as e:
        print("❌ Failed to connect to MongoDB.")
        print("Error:", e)
        return False
    finally:
        # The client exists only for this probe; release its resources on
        # both the success and the failure path.
        if client is not None:
            client.close()

In [39]:
# Take the connection string from the environment so that no username or
# password is stored in the notebook source or its saved outputs.
uri = os.environ.get("MONGODB_URI")
if not uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to your MongoDB connection string."
    )
check_mongo_connection(uri)

✅ Successfully connected to MongoDB.


True

In [40]:
def get_mongo_client(uri) -> MongoClient:
    """
    Build a MongoClient for the given connection string and hand it back.

    Parameters:
        uri: MongoDB connection string, passed to MongoClient unchanged

    Returns:
        The newly constructed MongoClient.
    """
    mongo_client = MongoClient(uri)
    return mongo_client

def get_collection(client: MongoClient, db_name: str, collection_name: str):
    """
    Look up a collection by database name and collection name.

    Parameters:
        client: MongoDB client to index into
        db_name: Key used to select the database from the client
        collection_name: Key used to select the collection from that database

    Returns:
        Whatever client[db_name][collection_name] evaluates to.
    """
    database = client[db_name]
    return database[collection_name]


In [41]:
# Build a client from the module-level `uri`; the bare name on the last line
# makes the notebook display the client's repr as the cell output.
client = get_mongo_client(uri)
client

MongoClient(host=['ac-59lzpu1-shard-00-01.erwea75.mongodb.net:27017', 'ac-59lzpu1-shard-00-00.erwea75.mongodb.net:27017', 'ac-59lzpu1-shard-00-02.erwea75.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', appname='Cluster0', authsource='admin', replicaset='atlas-gzneto-shard-0', tls=True)

In [42]:
def insert_pdf_pages_to_mongo(collection, pdf_name: str, pages: List[Dict]):
    """
    Store one document per extracted PDF page, tagged with the PDF's name.

    Parameters:
        collection: Object exposing insert_many() (the target collection)
        pdf_name: The name of the PDF file (e.g., "example.pdf")
        pages: List of dicts, each providing "page_number" and "text" keys

    Only "page_number" and "text" are copied from each page dict; any other
    keys are left out of the stored document. When `pages` is empty a warning
    is printed and nothing is inserted.
    """
    if len(pages) == 0 if pages is not None else True:
        print("⚠️ No pages to insert.")
        return

    documents = [
        {
            "pdf_name": pdf_name,
            "page_number": entry["page_number"],
            "text": entry["text"],
        }
        for entry in pages
    ]

    collection.insert_many(documents)
    inserted_count = len(documents)
    print(f"✅ Inserted {inserted_count} pages from '{pdf_name}' into MongoDB.")

In [43]:
def insert_chunks_to_mongo(
    chunks: List[Dict],
    db_name: str = "pdf_rag_db",
    collection_name: str = "chunks"
):
    """
    Inserts chunked documents into a MongoDB collection.

    A client is opened from the module-level `uri` for this call and is
    closed again before returning, whether or not the insert succeeds.

    Parameters:
        chunks: List of dictionaries to store; the dictionaries themselves
            are not modified (shallow copies are inserted)
        db_name: Name of the MongoDB database (default: "pdf_rag_db")
        collection_name: Name of the collection to insert into (default: "chunks")

    Raises:
        Any exception raised while connecting or inserting is propagated
        after the client has been closed.
    """
    if not chunks:
        print("⚠️ No chunks to insert.")
        return

    client = get_mongo_client(uri)
    try:
        chunks_collection = get_collection(client, db_name, collection_name)
        # PyMongo adds an "_id" key to each dict handed to insert_many().
        # Inserting copies keeps the caller's chunks untouched, so they can be
        # inserted again or passed elsewhere without a leftover "_id".
        chunks_collection.insert_many([dict(chunk) for chunk in chunks])
    finally:
        # This client is private to the call; do not leak its connections.
        client.close()

    print(f"✅ Inserted {len(chunks)} chunks into MongoDB collection '{collection_name}'.")

In [2]:
import chromadb

def view_all_chunks(collection_name: str = "default", limit: int = 500):
    """
    Print the documents of a Chroma collection with the head of each embedding.

    Parameters:
        collection_name: Name of the collection to read (default: "default")
        limit: Maximum number of records requested from the collection
            (default: 500). Collections holding more records than this are
            only partially listed; raise the limit to see more.

    The collection is looked up through chromadb.Client(); whatever
    get_collection() raises for an unknown name is propagated.
    """
    client = chromadb.Client()
    collection = client.get_collection(name=collection_name)

    # Request documents together with their embeddings, capped at `limit`.
    results = collection.get(include=["documents", "embeddings"], limit=limit)
    documents = results["documents"]
    embeddings = results["embeddings"]

    print(f"\n📦 Found {len(documents)} documents in '{collection_name}' collection:\n")
    for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
        print(f"🧩 Chunk {i+1}:\n{doc}\n🔢 Embedding (first 5 values): {embedding[:5]}\n{'-'*40}")


In [3]:
import chromadb
from chromadb.config import Settings
from chromadb.errors import NotFoundError

def check_chroma_connection(persist_directory: str = "./chroma_store", collection_name: str = "pdf_chunks"):
    """
    Open a persistent Chroma store and return the requested collection.

    The collection is fetched if it already exists and created otherwise;
    a message is printed for either outcome.

    Parameters:
        persist_directory: Directory where Chroma keeps its data
            (default: "./chroma_store")
        collection_name: Name of the collection to fetch or create
            (default: "pdf_chunks")

    Returns:
        The collection object, or None if connecting, fetching, or creating
        raised an exception (the error is printed, not re-raised).
    """
    try:
        # Chroma releases that provide chromadb.errors.NotFoundError reject the
        # legacy Settings(chroma_db_impl="duckdb+parquet", ...) configuration;
        # PersistentClient is the documented way to get on-disk storage.
        client = chromadb.PersistentClient(path=persist_directory)

        # Try getting the collection or create if it doesn't exist
        try:
            collection = client.get_collection(name=collection_name)
            print(f"✅ Successfully connected to ChromaDB. Collection '{collection_name}' found.")
        except NotFoundError:
            collection = client.create_collection(name=collection_name)
            print(f"✅ Connected to ChromaDB. Collection '{collection_name}' created.")

        return collection
    except Exception as e:
        # Best-effort check: report the failure and signal it with None.
        print("❌ Failed to connect to ChromaDB.")
        print("Error:", e)
        return None