Importing dependencies

In [2]:
import pandas as pd
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.utils import embedding_functions
from tqdm.auto import tqdm
import os
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

Load filtered data

In [3]:
import pandas as pd
from collections import defaultdict
import random
import os

# Set seed for reproducibility
random.seed(42)

csv_path = 'data/processed/filtered_complaints.csv'
sample_size = 12000  # Target ~12K

# We'll collect complaint_id and cleaned_narrative per category
samples = defaultdict(list)

# Desired proportions based on your known distribution (~40% CC, 33% SA, 21% MT, 6% PL)
proportions = {
    'Credit Cards': 0.403,   # 189334 / 469922 ≈ 0.403
    'Savings Accounts': 0.330,
    'Money Transfers': 0.210,
    'Personal Loans': 0.057
}

target_per_category = {cat: int(sample_size * prop) for cat, prop in proportions.items()}
print("Target per category:", target_per_category)

chunksize = 50_000  # Smaller chunks to stay safe
collected = {cat: 0 for cat in target_per_category}

print("Starting chunked sampling...")

for chunk in pd.read_csv(csv_path, usecols=['complaint_id', 'product_category', 'cleaned_narrative'], chunksize=chunksize):
    for cat in target_per_category:
        if collected[cat] >= target_per_category[cat]:
            continue  # Already enough for this category
        
        cat_mask = chunk['product_category'] == cat
        cat_chunk = chunk[cat_mask]
        
        needed = target_per_category[cat] - collected[cat]
        if len(cat_chunk) > needed:
            cat_chunk = cat_chunk.sample(n=needed, random_state=42)
        
        # Store as dicts to save memory
        for _, row in cat_chunk.iterrows():
            samples[cat].append({
                'complaint_id': row['complaint_id'],
                'product_category': cat,
                'cleaned_narrative': row['cleaned_narrative']
            })
        
        collected[cat] += len(cat_chunk)
    
    # Print progress
    print(f"Collected: {dict(collected)}")

# Combine all samples
all_samples = []
for cat_list in samples.values():
    all_samples.extend(cat_list)

# Shuffle final sample
random.shuffle(all_samples)

df_sample = pd.DataFrame(all_samples)

print(f"\nFinal sample size: {len(df_sample)}")
print("Distribution:")
print(df_sample['product_category'].value_counts())

# Save small sample for Task 2
os.makedirs('data/processed', exist_ok=True)
df_sample.to_csv('data/processed/sample_12k_complaints.csv', index=False)
print("\nSaved small sample to data/processed/sample_12k_complaints.csv")
df_sample.head()

Target per category: {'Credit Cards': 4836, 'Savings Accounts': 3960, 'Money Transfers': 2520, 'Personal Loans': 684}
Starting chunked sampling...


ParserError: Error tokenizing data. C error: out of memory

In [4]:
import pandas as pd

df_sample = pd.read_csv('data/processed/sample_12k_complaints.csv')

print(f"Loaded sample: {len(df_sample)} complaints")
print(df_sample['product_category'].value_counts())
print("\nSample narrative example:")
print(df_sample['cleaned_narrative'].iloc[0][:500] + "...")
df_sample.head()

Loaded sample: 12000 complaints
product_category
Credit Cards        4836
Savings Accounts    3960
Money Transfers     2520
Personal Loans       684
Name: count, dtype: int64

Sample narrative example:
it was an application for the xx/xx/ credit card i never finished to changed my mind, i never signed anything for a credit pull, the app started with some undisclosed pre - something i never seen any written, and i definitely did not physically sign for anything! i tried very hard to make a good credit score and i was doing real good and now, they jacked it all up. , in, i believe it was on the , elan financial, xx/xx/. thank you!...


Unnamed: 0,complaint_id,product_category,cleaned_narrative
0,10055116,Credit Cards,it was an application for the xx/xx/ credit ca...
1,11657736,Credit Cards,i need your help to review accounts from compa...
2,11771646,Savings Accounts,i am writing to file a complaint against navy ...
3,11221539,Credit Cards,the company must ensure a transparent and effi...
4,11940104,Credit Cards,"on or around xx/xx/year>, i called the chase ,..."


Text Chunking with LangChain

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# As per pre-built vector store specs
chunk_size = 500
chunk_overlap = 50

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    length_function=len,  # character-based
    separators=["\n\n", "\n", " ", ""]
)

# Test on one narrative
example_text = df_sample['cleaned_narrative'].iloc[0]
chunks = text_splitter.split_text(example_text)

print(f"Example narrative split into {len(chunks)} chunks")
for i, chunk in enumerate(chunks[:3]):
    print(f"\nChunk {i+1} ({len(chunk)} chars):")
    print(chunk[:300] + "..." if len(chunk) > 300 else chunk)

Example narrative split into 1 chunks

Chunk 1 (434 chars):
it was an application for the xx/xx/ credit card i never finished to changed my mind, i never signed anything for a credit pull, the app started with some undisclosed pre - something i never seen any written, and i definitely did not physically sign for anything! i tried very hard to make a good cre...


Create Chunks with Metadata

In [6]:
from tqdm.auto import tqdm

chunks_with_metadata = []

for idx, row in tqdm(df_sample.iterrows(), total=len(df_sample), desc="Chunking complaints"):
    narrative = row['cleaned_narrative']
    complaint_id = str(row['complaint_id'])
    product_category = row['product_category']
    
    # Split text
    text_chunks = text_splitter.split_text(narrative)
    
    # Add metadata to each chunk
    for chunk_idx, chunk_text in enumerate(text_chunks):
        chunks_with_metadata.append({
            'text': chunk_text,
            'complaint_id': complaint_id,
            'product_category': product_category,
            'chunk_index': chunk_idx,
            'total_chunks': len(text_chunks)
        })

print(f"\nCreated {len(chunks_with_metadata)} chunks from {len(df_sample)} complaints")
print(f"Average chunks per complaint: {len(chunks_with_metadata)/len(df_sample):.2f}")

Chunking complaints: 100%|██████████| 12000/12000 [00:21<00:00, 568.48it/s] 


Created 32060 chunks from 12000 complaints
Average chunks per complaint: 2.67





In [7]:
import pickle

# Save chunks to file
with open('data/processed/chunks_with_metadata.pkl', 'wb') as f:
    pickle.dump(chunks_with_metadata, f)

print("Chunks saved to data/processed/chunks_with_metadata.pkl")

Chunks saved to data/processed/chunks_with_metadata.pkl


Generate Embeddings and Build ChromaDB

In [2]:
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
import gc
import os

# Load chunks if kernel restarted
if 'chunks_with_metadata' not in globals():
    import pickle
    with open('data/processed/chunks_with_metadata.pkl', 'rb') as f:
        chunks_with_metadata = pickle.load(f)
    print("Loaded chunks_with_metadata from pickle file")

# Embedding function
embedding_function = SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

# Persistent Chroma client
client = chromadb.PersistentClient(path="vector_store/chroma_db")

# Correct method name!
collection = client.get_or_create_collection(
    name="complaints_sample",
    embedding_function=embedding_function
)

print(f"Collection has {collection.count()} vectors before adding")

# Prepare lists
ids = []
documents = []
metadatas = []

for i, chunk in enumerate(chunks_with_metadata):
    chunk_id = f"complaint_{chunk['complaint_id']}_chunk_{chunk['chunk_index']}"
    ids.append(chunk_id)
    documents.append(chunk['text'])
    metadatas.append({
        'complaint_id': chunk['complaint_id'],
        'product_category': chunk['product_category'],
        'chunk_index': chunk['chunk_index'],
        'total_chunks': chunk['total_chunks'],
        'source': 'sample_12k'
    })

# Add in very small batches to avoid memory spike
batch_size = 200
total_added = 0

print("Starting batch insertion...")
for start in range(0, len(ids), batch_size):
    end = min(start + batch_size, len(ids))
    print(f"Adding batch {start//batch_size + 1}: chunks {start} to {end-1}")
    
    collection.add(
        ids=ids[start:end],
        documents=documents[start:end],
        metadatas=metadatas[start:end]
    )
    
    total_added += (end - start)
    
    # Force garbage collection
    gc.collect()

print(f"\nSuccessfully added {total_added} vectors!")
print(f"Final collection count: {collection.count()}")

# Create vector_store folder if not exists
os.makedirs("vector_store/chroma_db", exist_ok=True)
print("Vector store saved at: vector_store/chroma_db")

Loaded chunks_with_metadata from pickle file


  from .autonotebook import tqdm as notebook_tqdm


Collection has 8000 vectors before adding
Starting batch insertion...
Adding batch 1: chunks 0 to 199
Adding batch 2: chunks 200 to 399
Adding batch 3: chunks 400 to 599
Adding batch 4: chunks 600 to 799
Adding batch 5: chunks 800 to 999
Adding batch 6: chunks 1000 to 1199
Adding batch 7: chunks 1200 to 1399
Adding batch 8: chunks 1400 to 1599
Adding batch 9: chunks 1600 to 1799
Adding batch 10: chunks 1800 to 1999
Adding batch 11: chunks 2000 to 2199
Adding batch 12: chunks 2200 to 2399
Adding batch 13: chunks 2400 to 2599
Adding batch 14: chunks 2600 to 2799
Adding batch 15: chunks 2800 to 2999
Adding batch 16: chunks 3000 to 3199
Adding batch 17: chunks 3200 to 3399
Adding batch 18: chunks 3400 to 3599
Adding batch 19: chunks 3600 to 3799
Adding batch 20: chunks 3800 to 3999
Adding batch 21: chunks 4000 to 4199
Adding batch 22: chunks 4200 to 4399
Adding batch 23: chunks 4400 to 4599
Adding batch 24: chunks 4600 to 4799
Adding batch 25: chunks 4800 to 4999
Adding batch 26: chunks 50

InternalError: Error in compaction: Failed to apply logs to the metadata segment

Quick Test Query

In [3]:
query = "Why are customers unhappy with credit card fees?"

results = collection.query(
    query_texts=[query],
    n_results=5,
    include=["documents", "metadatas", "distances"]
)

print(f"Query: {query}\n")
for i in range(5):
    doc = results['documents'][0][i]
    meta = results['metadatas'][0][i]
    dist = results['distances'][0][i]
    print(f"Result {i+1} [Distance: {dist:.4f}] - {meta['product_category']} (Complaint {meta['complaint_id']})")
    print(doc[:400] + "...\n")

Query: Why are customers unhappy with credit card fees?

Result 1 [Distance: 0.3374] - Credit Cards (Complaint 1793173)
seems incompetent to be offering a credit card to customers....

Result 2 [Distance: 0.3875] - Credit Cards (Complaint 13799302)
i have been taken advantage by this credit card company disclosure failure on their part the credit limit was very low as { {$500.00} }. they did not disclose the annual fee of { {$120.00} } will be charged within months after the card was issued. the fee is almost of the credit limit and they will charge to added fees. the interest rate is very high at and the balance goes up to the point that it...

Result 3 [Distance: 0.3879] - Credit Cards (Complaint 11348756)
provisions enforced by the cfpb, and stated to that, according to the card act, creditors are required to provide justification of real processing costs when issuers impose late fees in excess of {$8.00}. assured me that american express encourages cardholders to " have a voice. ''

Create Stratified Sample(10,000-15,000 complaints)

In [None]:
# Target: ~12,000 complaints, proportional to product_category
sample_size = 12000

df_sample = df.groupby('product_category', group_keys=False).apply(
    lambda x: x.sample(int(sample_size * len(x) / len(df)), replace=False)
)

# If slightly off due to rounding, adjust
df_sample = df_sample.sample(frac=1, random_state=42).reset_index(drop=True)  # Shuffle

print(f"Sampled {len(df_sample)} complaints")
print(df_sample['product_category'].value_counts())