## Import libraries and configure the pipeline

In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append('..')
from src.embedding import ComplaintEmbeddingPipeline
# Configure the pipeline once; the cells below call its methods step by step.
pipeline = ComplaintEmbeddingPipeline(
    # Input file and the columns the pipeline is told to use
    csv_path="../data/processed/filtered_complaints.csv",
    product_col="Product",
    narrative_col="cleaned_narrative",
    complaint_id_col="Complaint ID",
    # Number of complaints in the stratified sample
    sample_size=14000,
    # Chunking parameters (units — characters vs. tokens — depend on the
    # splitter used inside src.embedding; TODO confirm)
    chunk_size=500,
    chunk_overlap=100,
    # Sentence-embedding model identifier
    embedding_model_name="sentence-transformers/all-MiniLM-L6-v2",
)

  from .autonotebook import tqdm as notebook_tqdm


## Load cleaned data from Task 1

In [2]:
# Read the CSV given as csv_path into pipeline.df, then preview the first rows.
# NOTE(review): the recorded preview shows 'Unnamed: 0.1' / 'Unnamed: 0' columns,
# which looks like an index written to CSV upstream — confirm and drop in Task 1.
# NOTE(review): row 4 of the preview has a narrative starting with b'... whose
# cleaned form starts with 'bi am ...' — looks like a bytes-literal prefix
# leaking through the cleaning step; verify upstream.
pipeline.load_data()
pipeline.df.head()

[INFO] Loaded 454472 complaints from ../data/processed/filtered_complaints.csv


Unnamed: 0.1,Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,...,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID,narrative_word_count,cleaned_narrative
0,12237,2025-06-13,credit card,Store credit card,Getting a credit card,Card opened without my consent or knowledge,A XXXX XXXX card was opened under my name by a...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",TX,...,Servicemember,Consent provided,Web,2025-06-13,Closed with non-monetary relief,Yes,,14069121,91,a xxxx xxxx card was opened under my name by a...
1,12532,2025-06-13,checking or savings account,Checking account,Managing an account,Deposits and withdrawals,I made the mistake of using my wellsfargo debi...,Company has responded to the consumer and the ...,WELLS FARGO & COMPANY,ID,...,,Consent provided,Web,2025-06-13,Closed with explanation,Yes,,14061897,109,i made the mistake of using my wellsfargo debi...
2,13280,2025-06-12,credit card,General-purpose credit card or charge card,"Other features, terms, or problems",Other problem,"Dear CFPB, I have a secured credit card with c...",Company has responded to the consumer and the ...,"CITIBANK, N.A.",NY,...,,Consent provided,Web,2025-06-13,Closed with monetary relief,Yes,,14047085,156,dear cfpb i have a secured credit card with ci...
3,13506,2025-06-12,credit card,General-purpose credit card or charge card,Incorrect information on your report,Account information incorrect,I have a Citi rewards cards. The credit balanc...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",IL,...,,Consent provided,Web,2025-06-12,Closed with explanation,Yes,,14040217,233,i have a citi rewards cards the credit balance...
4,13955,2025-06-09,credit card,General-purpose credit card or charge card,Problem with a purchase shown on your statement,Credit card company isn't resolving a dispute ...,b'I am writing to dispute the following charge...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",TX,...,Older American,Consent provided,Web,2025-06-09,Closed with monetary relief,Yes,,13968411,454,bi am writing to dispute the following charges...


## Stratified sampling by product category (`Product` column)

In [3]:
# Draw the stratified sample. The target size is not chosen here: it was fixed
# by sample_size=14000 when the pipeline was constructed.
pipeline.stratified_sample()



[INFO] Stratified sample created with 14000 complaints.


## Sanity check: product distribution in the full data vs. the sample

In [4]:
# Compare each product's share of complaints in the full dataset against its
# share in the sample; the two columns should be nearly identical if the
# sample preserved the product proportions.
pd.concat(
    [
        frame[pipeline.product_col].value_counts(normalize=True).rename(label)
        for label, frame in (
            ("Original", pipeline.df),
            ("Sample", pipeline.df_sample),
        )
    ],
    axis=1,
)


Unnamed: 0_level_0,Original,Sample
Product,Unnamed: 1_level_1,Unnamed: 2_level_1
checking or savings account,0.308752,0.308786
credit card or prepaid card,0.239106,0.239071
"money transfer, virtual currency, or money service",0.213848,0.213857
credit card,0.177496,0.1775
"payday loan, title loan, or personal loan",0.03793,0.037929
"payday loan, title loan, personal loan, or advance loan",0.019574,0.019571
money transfers,0.003294,0.003286


## Text Chunking Strategy

In [5]:
# Split narratives into chunks and preview the resulting table. The chunking
# settings (chunk_size=500, chunk_overlap=100) come from the pipeline
# constructor; presumably this operates on the stratified sample — confirm in
# src.embedding.
pipeline.chunk_text()
pipeline.chunks_df.head()

[INFO] created 43129 text chunks


Unnamed: 0,complaint_id,product_category,chunk_index,chunk_text
0,11631421,"money transfer, virtual currency, or money ser...",0,i am writing to formally express my concerns r...
1,11631421,"money transfer, virtual currency, or money ser...",1,these transactions was unfeasible the cfpb s f...
2,8456496,checking or savings account,0,on xxxx2024 we terminated the office manager o...
3,8456496,checking or savings account,1,a pile of unopened mail in the drawer of her d...
4,3032678,credit card or prepaid card,0,i was very happy to apply american express xxx...


In [6]:
# (rows, columns) of the chunk table: one row per text chunk.
pipeline.chunks_df.shape

(43129, 4)

## Load Embedding Model

In [7]:
# Load the model named by embedding_model_name in the pipeline constructor.
pipeline.load_embedding_model()


[INFO] lOADED EMBEDDING MODEL: sentence-transformers/all-MiniLM-L6-v2


## Generate Embeddings

In [8]:
# Embed every chunk; results are stored in the "embedding" column of chunks_df.
# batchsize=64 is presumably the encoding batch size — confirm in src.embedding.
pipeline.generate_embeddings(batchsize=64)
# Shape of the first chunk's embedding vector (the recorded run shows (384,)).
pipeline.chunks_df["embedding"].iloc[0].shape


[INFO] Generated embeddings for 43129 chunks


(384,)

## Save outputs (metadata + embeddings)

In [9]:
# Persist the results under ../vector_store/: chunk metadata as a CSV file and
# the embeddings as a NumPy .npy array.
pipeline.save_chunks_and_embeddings(
    metadata_path="../vector_store/chunks_metadata.csv",
    embedding_path="../vector_store/chunks_embeddings.npy"
)


[INFO] Saved chunks metadata to ../vector_store/chunks_metadata.csv
[INFO] Saved embeddings array to ../vector_store/chunks_embeddings.npy
