In [4]:
from langchain_core.documents import Document


In [None]:
# Smoke test: build one Document by hand and let the cell echo its repr.
doc = Document(metadata={"source": "test_source"}, page_content="This is a test document.")
doc

Document(metadata={'source': 'test_source'}, page_content='This is a test document.')

: 

In [6]:
from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Folder (relative to the notebook) that is searched for "*.pdf" files.
DATA_DIR = "../data/pdffiles"

# Load PDFs: every matching file is read through PyMuPDFLoader.
pdf_loader = DirectoryLoader(DATA_DIR, glob="*.pdf", loader_cls=PyMuPDFLoader, show_progress=False)
documents = pdf_loader.load()

# Split into chunks: 800-character windows with a 150-character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=150)
chunks = text_splitter.split_documents(documents)

# Quick sanity check of how much was loaded, then preview the first chunk.
print("Pages:", len(documents))
print("Chunks:", len(chunks))
chunks[0]

Pages: 1
Chunks: 5


Document(metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2025-12-14T19:42:54+05:30', 'source': '..\\data\\pdffiles\\Narra_Devender_dev.pdf', 'file_path': '..\\data\\pdffiles\\Narra_Devender_dev.pdf', 'total_pages': 1, 'format': 'PDF 1.7', 'title': '', 'author': 'devendernarra@gmail.com', 'subject': '', 'keywords': '', 'moddate': '2025-12-14T19:42:54+05:30', 'trapped': '', 'modDate': "D:20251214194254+05'30'", 'creationDate': "D:20251214194254+05'30'", 'page': 0}, page_content='DEVENDER NARRA \n          GitHuB LinkedIn devendernarra@gmail.com 9849619620 | Warangal | Telangana |  India \n \n   Cloud / DevOps Engineer with strong backend development experience and hands-on expertise in AWS, \nKubernetes, Docker, and CI/CD automation. Skilled in deploying cloud-native applications, designing \nevent-driven systems, and supporting scalable, production-ready infrastructure.          \n                                                         

In [14]:
import os
import uuid
from typing import List
import numpy as np

import chromadb
from sentence_transformers import SentenceTransformer

from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document


# ---------------- EMBEDDING MANAGER ---------------- #

class EmbeddingManager:
    """Wraps a SentenceTransformer model and turns lists of text into embeddings.

    The model is loaded eagerly in the constructor, so building an instance
    already pays the model-loading cost.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Remember ``model_name`` and load the corresponding model right away."""
        self.model_name = model_name
        self.model = None  # replaced by the loaded model in _load_model()
        self._load_model()

    def _load_model(self):
        """Instantiate the SentenceTransformer and report its embedding dimension."""
        print(f"\nLoading model: {self.model_name}...")
        loaded_model = SentenceTransformer(self.model_name)
        self.model = loaded_model
        print("Model loaded successfully.")
        print("Embedding dimension:", loaded_model.get_sentence_embedding_dimension())

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Encode ``texts`` with the loaded model and return the result.

        Args:
            texts: The strings to embed, one entry per chunk.

        Returns:
            Whatever ``model.encode`` returns for ``texts`` when called with a
            progress bar and ``normalize_embeddings=True``.

        Raises:
            ValueError: If no model is attached to this manager.
        """
        model = self.model
        if model is None:
            raise ValueError("Model not loaded.")

        print(f"\nGenerating embeddings for {len(texts)} chunks...")
        encode_options = {"show_progress_bar": True, "normalize_embeddings": True}
        vectors = model.encode(texts, **encode_options)
        print("Embeddings generated successfully.")
        return vectors


# ---------------- VECTOR STORE ---------------- #

class VectorStore:
    """Persistent ChromaDB collection holding chunk text, metadata and embeddings.

    Records are written with deterministic ids and ``upsert``, so running the
    pipeline again over the same chunks overwrites the existing records instead
    of appending duplicate copies to the on-disk collection.
    """

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vectorstore"):
        """Open (or create) ``collection_name`` in a persistent client at ``persist_directory``."""
        self.collection_name = collection_name
        self.persist_directory = persist_directory

        print("\nInitializing ChromaDB...")
        self.client = chromadb.PersistentClient(path=self.persist_directory)
        self.collection = self.client.get_or_create_collection(name=self.collection_name)
        print("Vector database ready.")

    @staticmethod
    def _stable_ids(chunks: List["Document"]) -> List[str]:
        """Derive one reproducible id per chunk from its source, page and text.

        Identical chunks inside the same batch receive an occurrence counter so
        the ids stay unique within a single write.
        """
        occurrences = {}
        ids = []
        for chunk in chunks:
            metadata = chunk.metadata or {}
            base_key = "\x1f".join(
                [
                    str(metadata.get("source", "")),
                    str(metadata.get("page", "")),
                    chunk.page_content,
                ]
            )
            seen_before = occurrences.get(base_key, 0)
            occurrences[base_key] = seen_before + 1
            ids.append(str(uuid.uuid5(uuid.NAMESPACE_URL, f"{base_key}\x1f{seen_before}")))
        return ids

    def add_documents(self, chunks: List["Document"], vectors: np.ndarray):
        """Store ``chunks`` together with their embedding ``vectors``.

        Args:
            chunks: Documents providing ``page_content`` and ``metadata``.
            vectors: One embedding per chunk, in the same order as ``chunks``.

        Raises:
            ValueError: If the number of chunks and vectors differ.
        """
        if len(chunks) != len(vectors):
            raise ValueError(
                f"Got {len(chunks)} chunks but {len(vectors)} vectors; they must match one-to-one."
            )

        # An empty batch is treated as a no-op instead of being sent to the database.
        if len(chunks) == 0:
            print("\nNo chunks to store; skipping vector DB write.")
            return

        print(f"\nStoring {len(chunks)} chunks into vector DB...")

        ids = self._stable_ids(chunks)
        documents = [chunk.page_content for chunk in chunks]
        metadatas = [chunk.metadata for chunk in chunks]

        # upsert (not add): writing the same ids again replaces the records.
        self.collection.upsert(
            ids=ids,
            documents=documents,
            embeddings=np.asarray(vectors).tolist(),
            metadatas=metadatas
        )

        print("Storage completed successfully.")


# ---------------- MAIN PIPELINE ---------------- #

def main(
    data_dir: str = "../data/pdffiles",
    chunk_size: int = 800,
    chunk_overlap: int = 150,
):
    """Run the PDF -> chunks -> embeddings -> vector store pipeline.

    Args:
        data_dir: Folder searched for ``*.pdf`` files.
        chunk_size: ``chunk_size`` passed to RecursiveCharacterTextSplitter.
        chunk_overlap: ``chunk_overlap`` passed to RecursiveCharacterTextSplitter.

    Returns:
        None. Progress is reported with ``print``. The function returns early,
        before loading the embedding model, when there is nothing to embed.
    """
    # 1. Load PDFs
    print("\nLoading documents...")
    loader = DirectoryLoader(data_dir, glob="*.pdf", loader_cls=PyMuPDFLoader)
    documents = loader.load()

    print(f"Loaded {len(documents)} pages")
    if not documents:
        print(f"No PDF pages found in {data_dir!r}; nothing to embed or store.")
        return

    # 2. Chunking
    print("\nSplitting into chunks...")
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )

    chunks = splitter.split_documents(documents)
    print(f"Created {len(chunks)} chunks")
    if not chunks:
        # Pages were loaded but the splitter produced no chunks from them.
        print("No text chunks were produced; nothing to embed or store.")
        return

    # 3. Extract text
    texts = [chunk.page_content for chunk in chunks]

    # 4. Embeddings
    embedder = EmbeddingManager()
    vectors = embedder.generate_embeddings(texts)

    print("\nFinal Stats:")
    print("Chunks:", len(chunks))
    print("Vector shape:", vectors.shape)

    # 5. Store in Vector DB
    vectorstore = VectorStore()
    vectorstore.add_documents(chunks, vectors)


# Entry point: run the whole pipeline when this code executes as the main module.
# NOTE(review): inside a notebook cell this condition presumably holds as well,
# so main() runs every time the cell is executed — confirm that is intended.
if __name__ == "__main__":
    main()



Loading documents...
Loaded 1 pages

Splitting into chunks...
Created 5 chunks

Loading model: all-MiniLM-L6-v2...


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 609.08it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Model loaded successfully.
Embedding dimension: 384

Generating embeddings for 5 chunks...


Batches: 100%|██████████| 1/1 [00:00<00:00,  2.82it/s]


Embeddings generated successfully.

Final Stats:
Chunks: 5
Vector shape: (5, 384)

Initializing ChromaDB...
Vector database ready.

Storing 5 chunks into vector DB...
Storage completed successfully.
