In [None]:
import os
import chromadb
from pypdf import PdfReader
from chromadb.utils import embedding_functions
# Import the specific text splitter class
from langchain_text_splitters import RecursiveCharacterTextSplitter

def load_pdf_into_chromadb(file_path: str, collection_name: str = "alphabet_10k_collection_chunks", db_path: str = "../chroma_db_chunks"):
    """
    Reads a PDF file, chunks its content using RecursiveCharacterTextSplitter,
    and loads the chunks into a persistent ChromaDB collection.

    Args:
        file_path: Path of the PDF to ingest. Its basename is used as the
            "source" metadata value and as the prefix of every chunk id.
        collection_name: Name of the collection to fetch or create.
        db_path: Directory handed to chromadb.PersistentClient.

    Returns:
        None. Progress is reported with print(). If the PDF yields no usable
        text (or no chunks), a message is printed and the database is left
        untouched.
    """
    # 1. Extract raw text from PDF, one newline-terminated entry per page.
    reader = PdfReader(file_path)
    # `or ""` guards against a page yielding None instead of a string.
    full_text = "".join((page.extract_text() or "") + "\n" for page in reader.pages)

    # Every page contributes at least a "\n", so a plain truthiness test would
    # pass for a PDF with no extractable text; test the stripped text instead.
    if not full_text.strip():
        print(f"Could not extract text from {file_path}")
        return

    # 2. Initialize the RecursiveCharacterTextSplitter
    # This splitter attempts to keep paragraphs and sentences intact
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,  # Measures length by character count
    )

    # Split the single large text string into a list of smaller strings (chunks)
    documents = text_splitter.split_text(full_text)
    if not documents:
        # Nothing to store; avoid handing empty lists to the collection.
        print(f"No chunks produced from {file_path}")
        return

    # Per-chunk metadata and ids are 1-based and derived from the file name.
    source_name = os.path.basename(file_path)
    metadata_list = [
        {"source": source_name, "chunk_id": number}
        for number in range(1, len(documents) + 1)
    ]
    ids = [f"{source_name}_chunk_{number}" for number in range(1, len(documents) + 1)]

    # 3. Setup ChromaDB client and embedding function
    embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
    client = chromadb.PersistentClient(path=db_path)
    collection = client.get_or_create_collection(name=collection_name, embedding_function=embedding_function)

    # 4. Write the chunks. The ids are deterministic, so upsert (rather than
    # add) lets the notebook cell be re-run for the same file: existing ids
    # are refreshed instead of colliding with the records already stored.
    collection.upsert(
        documents=documents,
        metadatas=metadata_list,
        ids=ids
    )

    print(f"Successfully loaded {len(documents)} chunks into ChromaDB collection '{collection_name}' using RecursiveCharacterTextSplitter.")






In [4]:
# Ingest the Alphabet 2024 Form 10-K PDF using the default collection and DB path
pdf_file_path = "../data/alphabet-form-10-K-2024.pdf"
load_pdf_into_chromadb(file_path=pdf_file_path)

Successfully loaded 406 chunks into ChromaDB collection 'alphabet_10k_collection_chunks' using RecursiveCharacterTextSplitter.
