# Generate a different vector store for RAG

In [1]:
from langchain_huggingface import HuggingFaceEmbeddings

model_name = "dunzhang/stella_en_1.5B_v5" #"BAAI/bge-small-en-v1.5" #dunzhang/stella_en_1.5B_v5
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}
embd = HuggingFaceEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

In [3]:
import pickle
import os

# load pickled documents
pickle_file_path = '../data/storage/full_all_documents.pkl'
if os.path.exists(pickle_file_path):
    with open(pickle_file_path, 'rb') as f:
        all_pdf_docs, all_yt_docs, all_blog_docs = pickle.load(f)
else:
    print("Pickle file not found.")

#check if the documents are loaded
print("Number of PDF documents:", len(all_pdf_docs))
print("Number of YouTube documents:", len(all_yt_docs))
print("Number of blog documents:", len(all_blog_docs))

all_docs = all_pdf_docs + all_yt_docs + all_blog_docs
print("Total number of documents:", len(all_docs))

Number of PDF documents: 2048
Number of YouTube documents: 442
Number of blog documents: 11
Total number of documents: 2501


In [4]:
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain_text_splitters import RecursiveCharacterTextSplitter


#filter complex meta data like lists from yt videos
simple_metadata_docs =  filter_complex_metadata(all_docs)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    add_start_index=True #track index in orginal document
)

print("splitting documents...")
all_splits = text_splitter.split_documents(all_docs)

print(f"split all docs: {len(all_docs)} into subdocuments: {len(all_splits)}")

#prevent issues in metadata
def clean_metadata(metadata):
    """
    Replace None values in metadata with a default value.
    """
    
    for key, value in metadata.items():
        if value is None:
            metadata[key] = ""  # Replace None with an empty string or a default value
    return metadata

print("cleaning metadata...")
for split in all_splits:
    split.metadata= clean_metadata(split.metadata)

splitting documents...
split all docs: 2501 into subdocuments: 74407
cleaning metadata...


In [None]:
from langchain_chroma import Chroma
import os
from tqdm import tqdm

vector_store_path = "../data/storage/chroma_db_stella_1.5B_chunk150"
                               
vector_store = Chroma(
    embedding_function=embd,
    persist_directory=vector_store_path,
    collection_name="full_vstore_stella1.5B_chunk150",
)

document_ids = []
for doc in tqdm(all_splits, desc="Adding documents to vector store", unit="doc"):
    doc_id = vector_store.add_documents(documents=[doc])
    document_ids.extend(doc_id)

print(f"Added {len(document_ids)} documents to the vector store. Example document ID: {document_ids[0]}")

#save db
vector_store._persist_directory

Adding documents to vector store:  67%|██████▋   | 49945/74407 [9:44:27<4:44:05,  1.44doc/s] 

In [None]:
vector_store_check = Chroma(
    embedding_function=embd,
    persist_directory=vector_store_path,
    collection_name="full_vstore_stella1.5B_chunk150",
)

# Get the number of documents in the vector store
num_documents = vector_store_check._collection.count()

# Print the number of documents
print(f"Number of documents in the vector store: {num_documents}")

# Preview one of the documents (assuming the documents are stored in a collection)
if num_documents > 0:
    # Retrieve the first document (or any document by its ID)
    document = vector_store_check._collection.peek(limit=1)
    
    # Print a preview of the document
    print("Preview of the first document:")
    print(document)
else:
    print("No documents found in the vector store.")

#check that the vector store is saved
print(os.listdir(vector_store._persist_directory))