In [1]:
import pandas as pd



In [3]:
# Load preprocessed data
df = pd.read_csv("../data/filtered_complaints.csv")
# Display first few rows
df[['Product', 'cleaned_narrative']].head()

Unnamed: 0,Product,cleaned_narrative
0,Credit card,a xxxx xxxx card was opened under my name by a...
1,Credit card,dear cfpb i have a secured credit card with ci...
2,Credit card,i have a citi rewards cards the credit balance...
3,Credit card,bi am writing to dispute the following charges...
4,Credit card,although the account had been deemed closed i ...


In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [None]:
# 2. Chunk Text Using LangChain's
# Initialize splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,       # You can adjust this
    chunk_overlap=50      # Overlap to maintain context
)

# Apply chunking to each narrative
docs = []
for idx, row in df.iterrows():
    chunks = text_splitter.split_text(row['cleaned_narrative'])
    for chunk in chunks:
        docs.append({
            "complaint_id": idx,
            "product": row['Product'],
            "text": chunk
        })

print(f"Total chunks created: {len(docs)}")


Total chunks created: 412349


In [6]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
#3. Choose and Load an Embedding Model
# # Load embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')


In [8]:
#4. Generate Embeddings for Chunks
# Get all text chunks
texts = [doc['text'] for doc in docs]

# Generate embeddings
embeddings = model.encode(texts, show_progress_bar=True)


Batches: 100%|██████████| 12886/12886 [1:08:45<00:00,  3.12it/s]


In [9]:
import chromadb
from chromadb.utils import embedding_functions

In [10]:
# Set Up ChromaDB Collection

# Create persistent ChromaDB client
chroma_client = chromadb.PersistentClient(path="../vector_store/chroma")

# Create embedding function using sentence-transformers
embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)

# Create (or get) collection
collection = chroma_client.get_or_create_collection(
    name="complaints",
    embedding_function=embedding_fn
)


In [13]:
from tqdm import tqdm

In [14]:
# -------------------------
# Step 4: Prepare Data and Add to ChromaDB
# -------------------------

# Constants
MAX_BATCH_SIZE = 5000  # must be ≤ 5461

# Generate unique string IDs
ids = [str(i) for i in range(len(docs))]

# Extract text and metadata
documents = [doc["text"] for doc in docs]
metadatas = [{"complaint_id": doc["complaint_id"], "product": doc["product"]} for doc in docs]

# Sanity check
assert len(ids) == len(documents) == len(metadatas), "❌ Mismatch in data lengths!"

# Add in batches
print("🚀 Adding vectors to ChromaDB in batches...")
for i in tqdm(range(0, len(ids), MAX_BATCH_SIZE), desc="Adding to ChromaDB"):
    try:
        batch_ids = ids[i:i + MAX_BATCH_SIZE]
        batch_docs = documents[i:i + MAX_BATCH_SIZE]
        batch_meta = metadatas[i:i + MAX_BATCH_SIZE]

        collection.add(
            ids=batch_ids,
            documents=batch_docs,
            metadatas=batch_meta
        )
    except Exception as e:
        print(f"❌ Error adding batch {i // MAX_BATCH_SIZE + 1}: {e}")

print(f"✅ Successfully stored {len(ids)} vectors in ChromaDB.")


🚀 Adding vectors to ChromaDB in batches...


Adding to ChromaDB: 100%|██████████| 83/83 [1:23:07<00:00, 60.09s/it] 

✅ Successfully stored 412349 vectors in ChromaDB.



