# Task 2: Chunking, Embedding, Indexing

In [None]:
# !pip install langchain
# !pip install sentence-transformers
!pip install chromadb
# !pip install tqdm

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
from tqdm import tqdm

In [3]:
# Load the cleaned dataset
import pandas as pd
# df = pd.read_csv('../data/processed/filtered_complaints.csv')
df = pd.read_csv('./filtered_complaints.csv')

In [None]:
df.head(3)

CHUNKING

In [5]:
# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50
)

# Prepare documents and metadatas using a list comprehension for efficiency
documents = []
metadatas = []

for idx, row in df.iterrows():
    # Convert the 'Cleaned_Narrative' to string to handle potential non-string values
    narrative = str(row['Cleaned_Narrative'])
    chunks = text_splitter.split_text(narrative)
    for i, chunk in enumerate(chunks):
        documents.append(chunk)
        metadatas.append({
            'product': row['Product'],
            'complaint_id': str(row['Complaint ID']),
            'chunk_id': f"{row['Complaint ID']}_{i}"
        })

print(f"Total chunks created: {len(documents)}")

Total chunks created: 128840


In [6]:
# check if the number of documents matches the number of metadatas
assert len(documents) == len(metadatas), "Mismatch between documents and metadatas length"

<!--  -->

In [7]:
# check documents
documents[0]

'bank of america has charged me a 1200 monthly maintenance fee several times incorrectly i have called in the past and have been able to rescind the fee however in xxxx of 2023 it happened again i called xxxx2023 spoke with xxxx in florida and xxxx in north carolina 2 bank of america employees and explained the fee was charged even though i maintained the daily minimum balance required by the bank to have the fee waived this is evident in my bank statements there was no followup action or call to'

In [8]:
metadatas[0]

{'product': 'Checking or savings account',
 'complaint_id': '7585703',
 'chunk_id': '7585703_0'}

EMBEDING

In [9]:
# Load embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')
print(f"Model loaded: {model}")
print(f"Embedding dimension: {model.get_sentence_embedding_dimension()}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Model loaded: SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)
Embedding dimension: 384


In [10]:
# Generate embeddings
embeddings = model.encode(documents, show_progress_bar=True)

Batches:   0%|          | 0/4027 [00:00<?, ?it/s]

In [11]:
print(f"Embeddings shape: {embeddings.shape}")
print(f"Each embedding has {embeddings.shape[1]} dimensions")

Embeddings shape: (128840, 384)
Each embedding has 384 dimensions


Create ChromaDB Vector Store

In [16]:
# Create a persistent ChromaDB client and collection
# Specify a path to a directory where the data will be stored
client = chromadb.Client(Settings(persist_directory="./vector_store"))

# Delete collection if it exists
try:
    client.delete_collection("complaint_chunks")
except:
    pass

# Create new collection
collection = client.get_or_create_collection(name="complaint_chunks")

In [13]:
# Add to ChromaDB
for i in tqdm(range(len(documents))):
    collection.add(
        documents=[documents[i]],
        embeddings=[embeddings[i]],
        metadatas=[metadatas[i]],
        ids=[metadatas[i]['chunk_id']]
    )


print("✅ ChromaDB vector store created and saved.")


100%|██████████| 128840/128840 [1:04:01<00:00, 33.54it/s]

✅ ChromaDB vector store created and saved.





Search from chromadb

In [21]:
print(f"Added {collection.count()} documents to ChromaDB")

Added 0 documents to ChromaDB


In [19]:
# SAVE THE STORE (call persist on the client)
# client.persist() # This line is not needed in recent versions of ChromaDB with a persistent directory
print("✅ ChromaDB vector store created and saved.")

✅ ChromaDB vector store created and saved.


Now let's perform a similarity search to check the store.

In [20]:
# Example query
query_text = "problems with bank account fees"

# Generate embedding for the query
query_embedding = model.encode([query_text])

# Perform the search
results = collection.query(
    query_embeddings=[query_embedding.tolist()[0]],
    n_results=5,  # Get the top 5 most similar results
    include=['documents', 'metadatas']
)

# Display the results
print("Search Results:")
for i, (doc, meta) in enumerate(zip(results['documents'][0], results['metadatas'][0])):
    print(f"Result {i+1}:")
    print(f"  Document: {doc}")
    print(f"  Metadata: {meta}")
    print("-" * 20)

Search Results:
