In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer


In [2]:
DATA_PATH = "../data/processed/filtered_complaints.csv"

df = pd.read_csv(DATA_PATH)

print("Dataset shape:", df.shape)
df.head()


Dataset shape: (80667, 20)


Unnamed: 0,date_received,product,sub_product,issue,sub_issue,consumer_complaint_narrative,company_public_response,company,state,zip_code,tags,consumer_consent_provided,submitted_via,date_sent_to_company,company_response_to_consumer,timely_response,consumer_disputed,complaint_id,word_count,cleaned_narrative
0,2025-06-13,Credit card,Store credit card,Getting a credit card,Card opened without my consent or knowledge,A XXXX XXXX card was opened under my name by a...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",TX,78230,Servicemember,Consent provided,Web,2025-06-13,Closed with non-monetary relief,Yes,,14069121,91,a xxxx xxxx card was opened under my name by a...
1,2025-06-12,Credit card,General-purpose credit card or charge card,"Other features, terms, or problems",Other problem,"Dear CFPB, I have a secured credit card with c...",Company has responded to the consumer and the ...,"CITIBANK, N.A.",NY,11220,,Consent provided,Web,2025-06-13,Closed with monetary relief,Yes,,14047085,156,dear cfpb i have a secured credit card with ci...
2,2025-06-12,Credit card,General-purpose credit card or charge card,Incorrect information on your report,Account information incorrect,I have a Citi rewards cards. The credit balanc...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",IL,60067,,Consent provided,Web,2025-06-12,Closed with explanation,Yes,,14040217,233,i have a citi rewards cards the credit balance...
3,2025-06-09,Credit card,General-purpose credit card or charge card,Problem with a purchase shown on your statement,Credit card company isn't resolving a dispute ...,b'I am writing to dispute the following charge...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",TX,78413,Older American,Consent provided,Web,2025-06-09,Closed with monetary relief,Yes,,13968411,454,b i am writing to dispute the following charge...
4,2025-06-09,Credit card,General-purpose credit card or charge card,Problem when making payments,Problem during payment process,"Although the account had been deemed closed, I...",Company believes it acted appropriately as aut...,Atlanticus Services Corporation,NY,11212,Older American,Consent provided,Web,2025-06-09,Closed with monetary relief,Yes,,13965746,170,although the account had been deemed closed i ...


In [4]:

df_sample = df.sample(n=min(10000, len(df)), random_state=42)

print("Working sample shape:", df_sample.shape)


Working sample shape: (10000, 20)


In [5]:
def chunk_text(text, chunk_size=500, overlap=50):
    """
    Splits text into overlapping chunks.
    """
    chunks = []
    start = 0

    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start = end - overlap

    return chunks


In [6]:
all_chunks = []
metadata = []

for idx, row in df_sample.iterrows():
    chunks = chunk_text(row["cleaned_narrative"])

    for i, chunk in enumerate(chunks):
        all_chunks.append(chunk)
        metadata.append({
            "complaint_id": row["complaint_id"],
            "product": row["product"],
            "chunk_index": i
        })

print("Total chunks created:", len(all_chunks))


Total chunks created: 29038


In [7]:
#Load embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [8]:
embeddings = model.encode(
    all_chunks,
    show_progress_bar=True
)

print("Embedding shape:", embeddings.shape)


Batches:   0%|          | 0/908 [00:00<?, ?it/s]

Embedding shape: (29038, 384)


In [9]:
#Store embeddings in FAISS
import faiss
import numpy as np

dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)

index.add(np.array(embeddings))

print("Total vectors in index:", index.ntotal)


Total vectors in index: 29038


In [10]:
faiss.write_index(index, "../vector_store/complaints_faiss.index")

print("FAISS index saved.")


FAISS index saved.


In [11]:
# Test semantic search
query = "unauthorized credit card charges"

# Embed the query
query_embedding = model.encode([query])

# Search top 5 similar chunks
D, I = index.search(query_embedding, k=5)

print("Top 5 distances:", D[0])
print("Top 5 indices:", I[0])


Top 5 distances: [0.42959756 0.4936814  0.51731825 0.52561575 0.527644  ]
Top 5 indices: [18923 18922 27471 15485 28679]


In [12]:
# Show retrieved complaint chunks
for idx in I[0]:
    print("-" * 80)
    print(all_chunks[idx][:500])


--------------------------------------------------------------------------------
rd i firmly believe that my card may have been used by someone without my consent leading to this unauthorized charge in light of this i felt compelled to dispute the transaction and subsequently canceled my card to prevent any further unauthorized activity this unauthorized transactions have caused significant financial strain and i am eager to resolve this matter promptly i am fully committed to cooperating with your investigation and providing any necessary information to clarify the nature o
--------------------------------------------------------------------------------
bring to your attention that the particular charge in question was not authorized by me i discovered that charge on my credit card statement that i did not authorize or recognize after thorough investigation it has become evident that this transaction was not initiated by me and i believe my card information may have been compromised t