## Creating embeddings and storing them in Chroma DB

In [None]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader

# Build a persistent Chroma collection from a PDF, one entry per loaded page.

# 1. Load the Document
loader = PyPDFLoader("your_document.pdf")
documents = loader.load()

# 2. Prepare Metadata
# PyPDFLoader stores the page index under the "page" metadata key; there is no
# "page_number" key, so reading that would raise KeyError. The value is kept
# as-is and exposed under "page_number".
# NOTE(review): the loader's index appears to be 0-based — confirm before
# showing it to users as a page number.
metadata = [{"page_number": doc.metadata["page"]} for doc in documents]

# 3. Configure the embedding model
# OpenAIEmbeddings takes the model identifier as `model`, not `model_name`.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# 4. Create or Load a Collection in ChromaDB
# The LangChain Chroma wrapper has no `from_embeddings` constructor. `from_texts`
# embeds the page texts with `embeddings` and stores text, vector and metadata
# together, so a separate embed_documents() call beforehand would only embed
# (and bill) every page a second time.
collection_name = "my_document_collection"
vectorstore = Chroma.from_texts(
    texts=[doc.page_content for doc in documents],
    embedding=embeddings,
    metadatas=metadata,
    persist_directory="chroma_db",
    collection_name=collection_name,
)
# NOTE(review): explicit persist() belongs to the older Chroma integration;
# confirm it is still needed with the installed versions.
vectorstore.persist()

print(f"Embeddings and metadata persisted successfully in collection: {collection_name}.")


## Adding embeddings for new books to the existing collection

In [None]:
from chromadb import Client
from chromadb.config import Settings

# Initialize ChromaDB client
# A bare Client() is an in-memory instance and cannot see anything written to
# disk, so point it at the "chroma_db" directory the vector store persists to.
# NOTE(review): this is the legacy (chromadb < 0.4) persistent configuration,
# chosen because this notebook calls client.persist(). On chromadb >= 0.4 use
# chromadb.PersistentClient(path="chroma_db") instead.
client = Client(
    Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory="chroma_db",
    )
)

# Load the existing database
# The name must match the collection created when the first PDF was stored.
collection = client.get_collection("my_document_collection")


In [None]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

# Read the raw text of the additional PDF.
# NOTE(review): load_pdf is neither defined nor imported in the visible cells —
# confirm where it comes from.
new_pdf_text = load_pdf('new_resource.pdf')

# Cut the text into overlapping chunks so each piece is embedded on its own.
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
texts = text_splitter.split_text(new_pdf_text)

# Embed every chunk; no model is named, so the class default is used.
embeddings = OpenAIEmbeddings()
new_embeddings = embeddings.embed_documents(texts)


In [None]:
import uuid

# Add new embeddings to the collection
# Collection.add() requires an explicit ID for every record, so generate one
# unique ID per chunk; without `ids` the call fails before anything is stored.
new_ids = [str(uuid.uuid4()) for _ in texts]
collection.add(
    ids=new_ids,               # One unique ID per chunk
    documents=texts,           # The new texts or documents
    embeddings=new_embeddings, # The corresponding embeddings
)


In [None]:
# Ask the client to persist its data, then print the collection's record count
# as a quick check that the new chunks were added.
client.persist()
print(f"Total number of documents: {collection.count()}") 

## Deleting documents from the vector DB

In [None]:
# Option 1: name the IDs to delete explicitly (placeholder values).
# The search cell that follows reassigns this same variable, so run only one of the two.
document_ids_to_delete = ["doc_id_1", "doc_id_2"]


In [None]:
# Perform a search to identify documents to delete
# Embed the query with the same OpenAI embedding object used for the stored
# vectors. Passing query_texts would make Chroma embed the text with the
# collection's own embedding function, whose vector size need not match.
query_embedding = embeddings.embed_query("some search text")
results = collection.query(query_embeddings=[query_embedding], n_results=5)

# query() returns one list of hits per query, and the record IDs live under
# "ids". results["documents"] holds plain text strings, so indexing them with
# ['id'] would raise TypeError. Flatten the per-query lists into one ID list.
document_ids_to_delete = [
    doc_id for hit_ids in results["ids"] for doc_id in hit_ids
]


In [None]:
# Delete documents by their IDs
# Skip the call when nothing was selected (e.g. the search returned no hits).
# NOTE(review): depending on the chromadb version, an empty ID list is either
# rejected or not treated as "delete nothing" — confirm for the installed release.
if document_ids_to_delete:
    collection.delete(ids=document_ids_to_delete)
else:
    print("No documents selected for deletion.")


In [None]:
# Ask the client to persist its data again so the deletion is written out.
# NOTE(review): persist() exists only on older chromadb clients — confirm for
# the installed version.
client.persist()

In [None]:
# Print the collection's record count again to confirm the deletion took effect.
print(f"Total number of documents after deletion: {collection.count()}")
