In [None]:
import numpy as np
import chromadb
import uuid
from typing import List, Dict, Tuple, Any
from sklearn.metrics.pairwise import cosine_similarity
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
import os

In [None]:
class EmbeddingManager:
    """Handles document embeddings generation using SentenceTransformer.

    The model is loaded eagerly by the constructor, so a successfully
    constructed manager is immediately ready to encode text.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the EmbeddingManager and load the requested model.

        :param model_name: Name of the model handed to ``SentenceTransformer``.
            Defaults to "all-MiniLM-L6-v2".
        :raises Exception: any error raised while loading the model is
            printed and then propagated unchanged.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self) -> None:
        """
        Load the SentenceTransformer model named by ``self.model_name``.

        :raises Exception: re-raises whatever the loader raised after
            printing a diagnostic line.
        """
        try:
            print(f'loading embedding model: {self.model_name}')
            self.model = SentenceTransformer(self.model_name)
            print(f'embedding model loaded successfully. Embedding dimension: {self.get_embedding_dimension()}')
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise  # Re-raise so the caller sees the original failure

    def _require_model(self):
        """
        Return the loaded model, or fail if none has been loaded.

        :return: the object stored in ``self.model``
        :raises ValueError: if ``self.model`` is ``None``
        """
        # Identity check rather than truthiness: a model object that defines
        # ``__len__`` could evaluate as falsy even though it is loaded.
        if self.model is None:
            raise ValueError("Model not loaded")
        return self.model

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts.

        :param texts: List of text strings to embed
        :return: a numpy array of embeddings with shape (len(texts), embedding_dim)
        :raises ValueError: if the model has not been loaded
        """
        model = self._require_model()

        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = model.encode(texts, show_progress_bar=True)
        print(f"Embeddings generated successfully. Shape: {embeddings.shape}")
        return embeddings

    def get_embedding_dimension(self) -> int:
        """
        Get the dimension of the vectors produced by the embedding model.

        :return: the value reported by the model's
            ``get_sentence_embedding_dimension()``
        :raises ValueError: if the model has not been loaded
        """
        return self._require_model().get_sentence_embedding_dimension()

# Initialize the embedding manager with the default model name; the
# constructor loads the model immediately and re-raises if loading fails.
embedding_manager = EmbeddingManager()
# Bare expression as the last statement of the cell — presumably so the
# notebook echoes the object; confirm before removing.
embedding_manager

In [None]:
### Vector Store Database
class VectorStore:
    """Manages document embeddings in a persistent ChromaDB vector store."""

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """
        Initialize the vector store and open (or create) its collection.

        :param collection_name: Name of the ChromaDB collection to use.
        :param persist_directory: Directory in which the vector store is persisted.
        :raises Exception: any error raised while initializing ChromaDB is
            printed and then propagated unchanged.
        """

        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self) -> None:
        """
        Initialize the ChromaDB client and collection.

        Creates ``self.persist_directory`` if it is missing, opens a
        persistent client on it and gets or creates the collection named
        ``self.collection_name``.

        :raises Exception: re-raises whatever went wrong after printing a
            diagnostic line.
        """
        try:
            # Create persistent chromadb client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # Get or create a collection
            self.collection = self.client.get_or_create_collection(
                # Comparable to a database table: CREATE TABLE IF NOT EXISTS
                name=self.collection_name,
                metadata={"description": "PDF documents embeddings for RAG"}
            )

            print(f"Vector store initialized successfully. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
        except Exception as e:
            print(f"Error initializing ChromaDB client: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray) -> None:
        """
        Add documents and their embeddings to the vector store.

        Each document must expose ``metadata`` (a mapping) and
        ``page_content`` (a string). Every stored record gets a freshly
        generated id, a copy of the document metadata extended with
        ``doc_index`` and ``content_length``, the document text and its
        embedding.

        :param documents: List of langchain documents
        :param embeddings: Corresponding embeddings for the documents, one
            row per document
        :raises ValueError: if the number of documents and embeddings differ
        :raises Exception: any error raised by the collection while adding is
            printed and then propagated unchanged.
        """

        if len(documents) != len(embeddings):
            raise ValueError("Number of documents and embeddings must be equal")

        # Prepare data for db
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        # zip pairs every document with its corresponding embedding
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID: random prefix plus position within this batch
            ids.append(f"doc_{uuid.uuid4().hex[:8]}_{i}")

            # Prepare metadata on a copy so the caller's document is untouched
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            # Document content
            documents_text.append(doc.page_content)

            # Embedding as a plain list of Python floats instead of a NumPy
            # row view, which is the portable input form for the collection
            embeddings_list.append(np.asarray(embedding).tolist())

        if not ids:
            # Nothing to store; do not forward an empty batch to the collection.
            print(f"No documents to add to collection: {self.collection_name}")
            return

        # Add to a collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Documents added to collection: {self.collection_name}")
            print(f"Total documents in collection: {self.collection.count()}")
        except Exception as e:
            print(f"Error adding documents to collection: {e}")
            raise


# Create the vector store with its default collection name and persist
# directory; the constructor opens (or creates) the ChromaDB collection.
vector_store = VectorStore()
# Bare expression as the last statement of the cell — presumably so the
# notebook echoes the object; confirm before removing.
vector_store