In [1]:
import pandas as pd

df = pd.read_csv('../data/processed/filtered_complaints.csv')
print(df.head())

  Date received                      Product  \
0    2025-06-13                  Credit card   
1    2025-06-13  Checking or savings account   
2    2025-06-12                  Credit card   
3    2025-06-12                  Credit card   
4    2025-06-09                  Credit card   

                                  Sub-product  \
0                           Store credit card   
1                            Checking account   
2  General-purpose credit card or charge card   
3  General-purpose credit card or charge card   
4  General-purpose credit card or charge card   

                                             Issue  \
0                            Getting a credit card   
1                              Managing an account   
2               Other features, terms, or problems   
3             Incorrect information on your report   
4  Problem with a purchase shown on your statement   

                                           Sub-issue  \
0        Card opened without my con

In [2]:
# Word-based chunker
def word_chunker(text, chunk_size=200, chunk_overlap=50):
    words = text.split()
    chunks = []
    i = 0
    while i < len(words):
        chunk = words[i:i+chunk_size]
        chunks.append(' '.join(chunk))
        i += chunk_size - chunk_overlap
    return chunks

# Recursive character splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

def recursive_chunker(text, chunk_size=600, chunk_overlap=100):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    return splitter.split_text(text)


In [4]:
word_chunks = []
word_metadata = []

recursive_chunks = []
recursive_metadata = []

for idx, row in df.iterrows():
    text = row['cleaned_narrative']
    product = row['Product']

    # Word-based
    w_chunks = word_chunker(text)
    for chunk in w_chunks:
        word_chunks.append(chunk)
        word_metadata.append({'product': product, 'source_row': idx, 'chunk': chunk})

    # Recursive
    r_chunks = recursive_chunker(text)
    for chunk in r_chunks:
        recursive_chunks.append(chunk)
        recursive_metadata.append({'product': product, 'source_row': idx, 'chunk': chunk})

print(f"Word-based total chunks: {len(word_chunks)}")
print(f"Recursive total chunks: {len(recursive_chunks)}")

Word-based total chunks: 626628
Recursive total chunks: 946278


In [5]:
# Choose a manageable sample size
sample_size = 50000  # adjust if needed

# Word-based sample
word_chunks_sample = word_chunks[:sample_size]
word_metadata_sample = word_metadata[:sample_size]

# Recursive sample
recursive_chunks_sample = recursive_chunks[:sample_size]
recursive_metadata_sample = recursive_metadata[:sample_size]

print(f"Word-based sample: {len(word_chunks_sample)} chunks")
print(f"Recursive sample: {len(recursive_chunks_sample)} chunks")

Word-based sample: 50000 chunks
Recursive sample: 50000 chunks


In [6]:
from sentence_transformers import SentenceTransformer
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

model = SentenceTransformer('all-MiniLM-L6-v2', device=device)


Using device: cpu


In [None]:
import faiss
import numpy as np
import pickle

# Embed word-based sample
word_embeddings = model.encode(
    word_chunks_sample,
    batch_size=128,
    show_progress_bar=True
)
word_embeddings = np.array(word_embeddings).astype('float32')

# Build FAISS index
word_index = faiss.IndexFlatL2(word_embeddings.shape[1])
word_index.add(word_embeddings)

# Save index
faiss.write_index(word_index, '../data/processed/faiss_word_sample.index')

# Save metadata
with open('../data/processed/word_metadata_sample.pkl', 'wb') as f:
    pickle.dump(word_metadata_sample, f)

print(f"✅ Word-based sample index and metadata saved.")

Batches: 100%|██████████| 391/391 [36:36<00:00,  5.62s/it]


✅ Word-based sample index and metadata saved.


In [6]:
import faiss
import numpy as np
import pickle

# Embed recursive sample
recursive_embeddings = model.encode(
    recursive_chunks_sample,
    batch_size=128,
    show_progress_bar=True
)
recursive_embeddings = np.array(recursive_embeddings).astype('float32')

# Build FAISS index
recursive_index = faiss.IndexFlatL2(recursive_embeddings.shape[1])
recursive_index.add(recursive_embeddings)

# Save index
faiss.write_index(recursive_index, '../data/processed/faiss_recursive_sample.index')

# Save metadata
with open('../data/processed/recursive_metadata_sample.pkl', 'wb') as f:
    pickle.dump(recursive_metadata_sample, f)

print(f"✅ Recursive sample index and metadata saved.")


Batches: 100%|██████████| 391/391 [27:49<00:00,  4.27s/it]


✅ Recursive sample index and metadata saved.


In [None]:
import faiss

# Example test query
query = "Why are people unhappy with credit card charges?"
query_embedding = model.encode([query]).astype('float32')

word_index = faiss.read_index('../data/processed/faiss_word_sample.index')

# Word-based search
D_word, I_word = word_index.search(query_embedding, k=5)
print("\n🔍 Word-based top 5:")
for i in I_word[0]:
    print("-", word_chunks_sample[i][:200], "...")

recursive_index = faiss.read_index('../data/processed/faiss_recursive_sample.index')

# Recursive search
D_rec, I_rec = recursive_index.search(query_embedding, k=5)
print("\n🔍 Recursive top 5:")
for i in I_rec[0]:
    print("-", recursive_chunks_sample[i][:200], "...")


🔍 Word-based top 5:
- card on xxxxyear and told the person this charge was excessive and they refused to offer credit to my account i was refused credit or any consideration by xxxx different peopleas a xxxx xxxx xxxx xxxx ...
- by the website itself i am still being charged interest on a card i simply can not pay the balance of again i repeat this is the case of many people these are unjust and illegal practices by comenity  ...
- actions when it comes to using a credit card ...
- i have a couple of store credit cards and noticed they are now charging a fee for paper statements i feel this is something the cfpb should address as it is taking advantage of the consumer digital st ...
- a productive call and hung up on me it is predatory to add on fees before a payment is made and to not have that included in the payment when opting to pay off the entire card and it is predatory to o ...

🔍 Recursive top 5:
- it makes it confusing so that they can assess these insane rates higher than a

In [None]:
# Rebuild the sample recursive metadata with the chunk text included

import pickle

print(f"Sample size: {len(recursive_chunks_sample)} chunks")
print(f"Original metadata entries: {len(recursive_metadata_sample)}")

# ✅ Rebuild metadata for the sample to include `chunk` text
recursive_sample_metadata = []
for old_meta, chunk in zip(recursive_metadata_sample, recursive_chunks_sample):
    new_meta = dict(old_meta)
    new_meta["chunk"] = chunk
    recursive_sample_metadata.append(new_meta)

print("✅ New sample metadata example:", recursive_sample_metadata[0])

# ✅ Save the fixed sample metadata
with open("../vector_store/recursive_sample_metadata.pkl", "wb") as f:
    pickle.dump(recursive_sample_metadata, f)

print("✅ Done! Your sample metadata now includes the chunk text.")


Sample size: 50000 chunks
Original metadata entries: 50000
✅ New sample metadata example: {'product': 'Credit card', 'source_row': 0, 'chunk': 'a xxxx xxxx card was opened under my name by a fraudster i received a notice from xxxx  that an account was just opened under my name i reached out to xxxx xxxx to state that this activity was unauthorized and not me xxxx xxxx confirmed this was fraudulent and immediately closed the card however they have failed to remove this from the three credit agencies and this fraud is now impacting my credit score based on a hard credit pull done by xxxx xxxx that was done by a fraudster'}
✅ Done! Your sample metadata now includes the chunk text.
