## Import Libraries

In [8]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

## Load cleaned data from Task 1

In [3]:
# Location of the filtered complaints file produced by Task 1
# (relative to the notebook's working directory).
FILTERED_COMPLAINTS_CSV = '../data/processed/filtered_complaints.csv'

# Read the whole file into memory and preview the first five rows.
df = pd.read_csv(FILTERED_COMPLAINTS_CSV)
df.head()

Unnamed: 0.1,Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,...,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID,narrative_word_count,cleaned_narrative
0,12237,2025-06-13,credit card,Store credit card,Getting a credit card,Card opened without my consent or knowledge,A XXXX XXXX card was opened under my name by a...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",TX,...,Servicemember,Consent provided,Web,2025-06-13,Closed with non-monetary relief,Yes,,14069121,91,a xxxx xxxx card was opened under my name by a...
1,12532,2025-06-13,checking or savings account,Checking account,Managing an account,Deposits and withdrawals,I made the mistake of using my wellsfargo debi...,Company has responded to the consumer and the ...,WELLS FARGO & COMPANY,ID,...,,Consent provided,Web,2025-06-13,Closed with explanation,Yes,,14061897,109,i made the mistake of using my wellsfargo debi...
2,13280,2025-06-12,credit card,General-purpose credit card or charge card,"Other features, terms, or problems",Other problem,"Dear CFPB, I have a secured credit card with c...",Company has responded to the consumer and the ...,"CITIBANK, N.A.",NY,...,,Consent provided,Web,2025-06-13,Closed with monetary relief,Yes,,14047085,156,dear cfpb i have a secured credit card with ci...
3,13506,2025-06-12,credit card,General-purpose credit card or charge card,Incorrect information on your report,Account information incorrect,I have a Citi rewards cards. The credit balanc...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",IL,...,,Consent provided,Web,2025-06-12,Closed with explanation,Yes,,14040217,233,i have a citi rewards cards the credit balance...
4,13955,2025-06-09,credit card,General-purpose credit card or charge card,Problem with a purchase shown on your statement,Credit card company isn't resolving a dispute ...,b'I am writing to dispute the following charge...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",TX,...,Older American,Consent provided,Web,2025-06-09,Closed with monetary relief,Yes,,13968411,454,bi am writing to dispute the following charges...


## Stratified sampling by the `Product` column

In [4]:
# Number of complaints to keep in the sample (choose between 10,000–15,000).
SAMPLE_SIZE = 14_000

# Draw a stratified sample: passing the 'Product' column as `stratify`
# keeps each product's share in the sample close to its share in `df`.
# Only the "train" part of the split is used; the remainder is discarded.
# A fixed random_state makes the sample reproducible across runs.
df_sample, _ = train_test_split(
    df,
    train_size=SAMPLE_SIZE,
    stratify=df['Product'],
    random_state=42,
)



## Sanity Check (Important)

In [5]:
# Verify that the sample preserves the product mix of the full dataset:
# compute each product's relative frequency in both frames and show them
# side by side (one row per product, one column per frame).
original_share = df["Product"].value_counts(normalize=True).rename("Original")
sample_share = df_sample["Product"].value_counts(normalize=True).rename("Sample")

pd.concat([original_share, sample_share], axis=1)


Unnamed: 0_level_0,Original,Sample
Product,Unnamed: 1_level_1,Unnamed: 2_level_1
checking or savings account,0.308752,0.308786
credit card or prepaid card,0.239106,0.239071
"money transfer, virtual currency, or money service",0.213848,0.213857
credit card,0.177496,0.1775
"payday loan, title loan, or personal loan",0.03793,0.037929
"payday loan, title loan, personal loan, or advance loan",0.019574,0.019571
money transfers,0.003294,0.003286


## Text Chunking Strategy

In [None]:
# Final chunking configuration: chunk_size=500 with chunk_overlap=100 so that
# neighbouring chunks share some context.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

# Column layout of the resulting chunk table. Passed explicitly so the frame
# has these columns even when no chunk is produced.
CHUNK_COLUMNS = ['Complain_id', 'Product_category', 'Chunk_index', 'Chunk text']

# Collect one record per chunk; the DataFrame is built once, after the loop.
chunks = []

# Iterate only over the three columns that are needed instead of
# materialising a full row Series per complaint with iterrows().
sampled_records = zip(
    df_sample['Complaint ID'],
    df_sample['Product'],
    df_sample['cleaned_narrative'],
)
for complaint_id, product, narrative in sampled_records:
    # A missing narrative comes back from read_csv as NaN (a float), which
    # split_text cannot process; skip those and blank strings.
    if not isinstance(narrative, str) or not narrative.strip():
        continue
    for idx, chunk in enumerate(text_splitter.split_text(narrative)):
        chunks.append({
            'Complain_id': complaint_id,
            'Product_category': product,
            'Chunk_index': idx,  # position of the chunk within its complaint
            'Chunk text': chunk
        })

# Build the frame a single time, outside the loop (previously it was rebuilt
# on every iteration and the preview below was never displayed).
chunk_df = pd.DataFrame(chunks, columns=CHUNK_COLUMNS)
chunk_df.head()

## Load Embedding Model

In [None]:
# Load the all-MiniLM-L6-v2 sentence-embedding model and run it on the CPU.
# `token=False` replaces the deprecated `use_auth_token=False` argument:
# no Hugging Face access token is passed when fetching the model files.
# NOTE(review): requires a sentence-transformers release that accepts
# `token` — confirm against the pinned version.
model = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2",
    token=False,
    device='cpu',
)

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 0a5e6b37-3db1-4e32-92cc-5003550f5b07)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: d501c46c-fa48-46a9-8d78-33884f72c1eb)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./config_sentence_transformers.json
Retrying in 1s [Retry 1/5].


README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: ce3c4278-57e1-42e7-9ffc-605e0816aa00)')' thrown while requesting GET https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/model.safetensors
Retrying in 1s [Retry 1/5].


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Error while downloading from https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Error while downloading from https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Error while downloading from https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

## Generate Embeddings

In [None]:
# Pull the chunk texts out of the chunk table as a plain Python list,
# in the same order as the rows of `chunk_df`.
text_chunk = chunk_df['Chunk text'].to_list()

# Encode every chunk with the loaded model, 64 texts per batch, showing a
# progress bar while the batches are processed.
embeddings = model.encode(
    text_chunk,
    batch_size=64,
    show_progress_bar=True,
)