In [None]:
!pip install langchain-core langchain-community pymupdf

In [None]:
from langchain_core.documents import Document

In [None]:
# Hand-built sample Document: the text is passed as `page_content` and the
# descriptive key/value pairs are passed as `metadata`.
doc = Document(
    metadata={
        "source": "example.txt",
        "pages": 1,
        "author": "Aashik Thakur",
        "date_created": "2025-12-17",
    },
    page_content="Hello world",
)
doc

In [None]:
### TextLoader
from langchain_community.document_loaders import TextLoader

# Read a single text file, decoding it as UTF-8.
loader = TextLoader("text-files/ai.txt", encoding="utf-8")
document = loader.load()

# End on the bare expression (like the other cells in this notebook) so the
# notebook renders the result as the cell output instead of a stdout dump.
document


In [None]:
### directory loader
from langchain_community.document_loaders import DirectoryLoader

# Load every file under "text-files" that matches the glob pattern.
# Each match is opened with TextLoader, which receives encoding="utf-8"
# through loader_kwargs.
dir_loader = DirectoryLoader(
    "text-files",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
    glob="**/*.txt",
)
documents = dir_loader.load()
documents

In [None]:
### load a pdf file
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader

# Same DirectoryLoader approach as for the text files, but pointed at
# "pdf-files" and using PyMuPDFLoader to open each matching *.pdf.
dir_loader = DirectoryLoader(
    "pdf-files",
    loader_cls=PyMuPDFLoader,
    glob="**/*.pdf",
)

pdf_documents = dir_loader.load()
pdf_documents

In [9]:

!pip install chromadb


Collecting chromadb
  Downloading chromadb-1.3.7-cp39-abi3-win_amd64.whl.metadata (7.3 kB)
Collecting build>=1.0.3 (from chromadb)
  Using cached build-1.3.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.3-cp313-cp313-win_amd64.whl.metadata (9.1 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Using cached uvicorn-0.38.0-py3-none-any.whl.metadata (6.8 kB)
Collecting numpy>=1.22.5 (from chromadb)
  Using cached numpy-2.3.5-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Using cached onnxruntime-1.23.2-cp313-cp313-win_amd64.whl.metadata (5.3 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Using cached opentelemetry_api-1.39.1-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Using 

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
numba 0.61.0 requires numpy<2.2,>=1.24, but you have numpy 2.3.5 which is incompatible.
opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 2.3.5 which is incompatible.


In [10]:
import os
from typing import List, Any
import chromadb
import chromadb.config

from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter


class ChromaVectorStore:
    """Small convenience wrapper around a LangChain ``Chroma`` vector store.

    Construction creates ``persist_dir`` if needed, builds the
    sentence-transformer embedding function and opens a ``Chroma`` store
    backed by that directory. ``build_from_documents`` chunks and indexes
    documents; ``query`` runs a similarity search against the store.
    """

    def __init__(
        self,
        persist_dir: str = "chroma_store",
        embedding_model_name: str = "all-MiniLM-L6-v2",
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
    ) -> None:
        """Prepare the embedding function and open the store.

        Args:
            persist_dir: Directory handed to Chroma as ``persist_directory``;
                created if it does not exist.
            embedding_model_name: Model name passed to
                ``SentenceTransformerEmbeddings``.
            chunk_size: ``chunk_size`` used by the text splitter in
                ``build_from_documents``.
            chunk_overlap: ``chunk_overlap`` used by the same splitter.
        """
        self.persist_dir = persist_dir
        os.makedirs(self.persist_dir, exist_ok=True)

        # Embedding function shared by indexing and querying.
        self.embedding_function = SentenceTransformerEmbeddings(
            model_name=embedding_model_name
        )

        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        # Open the store on persist_dir (loads an existing DB if present).
        self.vectorstore = Chroma(
            persist_directory=self.persist_dir,
            embedding_function=self.embedding_function,
        )

        print(f"[OK] Using embedding model: {embedding_model_name}")

    def build_from_documents(self, documents: List[Any]) -> None:
        """Split ``documents`` into chunks and index them in Chroma.

        The resulting store replaces ``self.vectorstore``.

        Args:
            documents: Non-empty list of documents accepted by
                ``RecursiveCharacterTextSplitter.split_documents``.

        Raises:
            ValueError: If ``documents`` is empty or ``None``.
        """
        if not documents:
            raise ValueError("No documents provided")

        print(f"[INFO] Building vector store from {len(documents)} documents")

        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )

        chunks = splitter.split_documents(documents)

        self.vectorstore = Chroma.from_documents(
            documents=chunks,
            embedding=self.embedding_function,
            persist_directory=self.persist_dir,
        )

        # No manual persist() call: since Chroma 0.4 a store created with a
        # persist_directory is written to disk automatically, and
        # Chroma.persist() is deprecated in langchain-community.
        print(f"[OK] Vector store saved to {self.persist_dir}")

    def query(self, query_text: str, top_k: int = 5) -> List[Any]:
        """Return the ``top_k`` stored chunks most similar to ``query_text``.

        Args:
            query_text: Text to search for.
            top_k: Passed to ``similarity_search`` as ``k``.

        Returns:
            Whatever ``similarity_search`` returns for the query.

        Raises:
            RuntimeError: If ``self.vectorstore`` is ``None``.
        """
        # Identity check on purpose: a truthiness test would treat a store
        # object that defines __len__ as "not initialized" whenever it is
        # merely empty.
        if self.vectorstore is None:
            raise RuntimeError("Vector store not initialized")

        print(f"[INFO] Querying for: {query_text}")

        return self.vectorstore.similarity_search(query_text, k=top_k)


In [None]:
!pip install sentence-transformers
# Initialize vector store
chroma_store = ChromaVectorStore()

# Build only if empty
if chroma_store.vectorstore._collection.count() == 0:
    chroma_store.build_from_documents(pdf_documents)
else:
    print("[INFO] Using existing Chroma DB")

# Query
query = "Explain attention mechanism in transformer neural networks"
query_results = chroma_store.query(query, top_k=2)

# Display results
if not query_results:
    print("No relevant documents found.")
else:
    for i, doc in enumerate(query_results, 1):
        print(f"\nResult {i}:")
        print(doc.page_content)
        print("Metadata:", doc.metadata)
