In [2]:
import pandas as pd
import numpy as np
from langchain_text_splitters import RecursiveCharacterTextSplitter
df = pd.read_csv("../data/processed/filtered_complaints.csv")
df.shape

(462436, 20)

In [3]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=len
)

In [4]:
df.columns

Index(['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue',
       'Consumer complaint narrative', 'Company public response', 'Company',
       'State', 'ZIP code', 'Tags', 'Consumer consent provided?',
       'Submitted via', 'Date sent to company', 'Company response to consumer',
       'Timely response?', 'Consumer disputed?', 'Complaint ID',
       'narrative_word_count', 'clean_narrative'],
      dtype='object')

In [5]:
product_mapping = {
    "Credit card": "Credit Cards",
    "Credit card or prepaid card": "Credit Cards",
    "Consumer Loan": "Personal Loans",
    "Payday loan, title loan, or personal loan": "Personal Loans",
    "Payday loan, title loan, personal loan, or advance loan": "Personal Loans",
    "Checking or savings account": "Savings Accounts",
    "Money transfer, virtual currency, or money service": "Money Transfers"
}

df["product_category"] = df["Product"].map(product_mapping)

In [6]:
chunks = []
for _, row in df.iterrows():
    if isinstance(row["clean_narrative"], str):
        split_texts = text_splitter.split_text(row["clean_narrative"])
        for text in split_texts:
            chunks.append({
                "text": text,
                "product_category": row["product_category"]
            })
len(chunks)

1319180

In [7]:
chunks_df = pd.DataFrame(chunks)
chunks_df.head()

Unnamed: 0,text,product_category
0,a xxxx xxxx card was opened under my name by a...,Credit Cards
1,i made the mistake of using my wellsfargo debi...,Savings Accounts
2,my dispute was rejected i went back into xxxx ...,Savings Accounts
3,dear cfpb i have a secured credit card with ci...,Credit Cards
4,wait for some form that might be sent to me vi...,Credit Cards


In [8]:
chunks_df["product_category"].value_counts()

product_category
Credit Cards        551189
Savings Accounts    421684
Money Transfers     246309
Personal Loans       99998
Name: count, dtype: int64

In [9]:
sample_size = 12000
sample_df = (
    df.groupby("product_category", group_keys=False)
      .apply(
          lambda x: x.sample(
              n=int(sample_size * len(x) / len(df)),
              random_state=42
          ),
          include_groups=False
      )
)
sample_df.shape

(11998, 20)

In [10]:
product_mapping = {
    "Credit card": "Credit Cards",
    "Credit card or prepaid card": "Credit Cards",
    "Consumer Loan": "Personal Loans",
    "Payday loan, title loan, or personal loan": "Personal Loans",
    "Payday loan, title loan, personal loan, or advance loan": "Personal Loans",
    "Checking or savings account": "Savings Accounts",
    "Money transfer, virtual currency, or money service": "Money Transfers"
}

df["product_category"] = df["Product"].map(product_mapping)

In [11]:
df["product_category"].value_counts()

product_category
Credit Cards        189334
Savings Accounts    140319
Money Transfers      97188
Personal Loans       35595
Name: count, dtype: int64

In [12]:
sample_size = 0.03
sample_df = (
    df.groupby("product_category", group_keys=False)
      .apply(lambda x: x.sample(
          n=int(sample_size * len(x)),
          random_state=42
      ))
)
sample_df.shape

  .apply(lambda x: x.sample(


(13871, 21)

In [13]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=len
)
chunks = []
for _, row in sample_df.iterrows():
    split_texts = text_splitter.split_text(row["clean_narrative"])
    for i, text in enumerate(split_texts):
        chunks.append({
            "text": text,
            "product_category": row["product_category"],
            "complaint_id": row["Complaint ID"],
            "chunk_index": i,
            "total_chunks": len(split_texts)
        })
sample_chunks_df = pd.DataFrame(chunks)
sample_chunks_df.head()

Unnamed: 0,text,product_category,complaint_id,chunk_index,total_chunks
0,my soon to be ex wife took out several credit ...,Credit Cards,7471816,0,1
1,after informing bank of america of my identity...,Credit Cards,3952712,0,1
2,i am deeply troubled by the inclusion of this ...,Credit Cards,9669839,0,1
3,on xx xx xxxx i reached out to discover bank t...,Credit Cards,7420081,0,23
4,positive balance since i had no balance due on...,Credit Cards,7420081,1,23


In [14]:
from sentence_transformers import SentenceTransformer
import numpy as np
model = SentenceTransformer('all-MiniLM-L6-v2')
sample_chunks_df['embedding'] = sample_chunks_df['text'].apply(lambda x: model.encode(x))

In [15]:
sample_chunks_df['embedding'].iloc[0].shape

(384,)

In [16]:
embeddings = np.vstack(sample_chunks_df['embedding'].values)
embeddings.shape

(39577, 384)

In [17]:
query = "unauthorized credit card charge"
query_embedding = model.encode(query)
from sklearn.metrics.pairwise import cosine_similarity
scores = cosine_similarity([query_embedding], embeddings)[0]
top_k = 5
top_indices = scores.argsort()[-top_k:][::-1]
sample_chunks_df.iloc[top_indices][
    ['text', 'product_category', 'complaint_id']
]

Unnamed: 0,text,product_category,complaint_id
14697,i do not use the card i did not notice that th...,Credit Cards,2851514
8079,an unauthorized charge from xxxxxxxx xxxx xxxx...,Credit Cards,8012052
10964,charge and i refuse to use their credit card e...,Credit Cards,9920626
7447,xx xx xxxx xx xx xxxx i handed the card over t...,Credit Cards,3374042
9211,recognize all the other charges you don t need...,Credit Cards,7912467


In [18]:
import pickle
with open("sample_embeddings.pkl", "wb") as f:
    pickle.dump(sample_chunks_df, f)

In [19]:
import faiss
import numpy as np
import os
os.makedirs("../vector_store", exist_ok=True)
embeddings = np.vstack(sample_chunks_df["embedding"].values).astype("float32")
dim = embeddings.shape[1] 
index = faiss.IndexFlatL2(dim)
index.add(embeddings)
print("Total vectors indexed:", index.ntotal)

Total vectors indexed: 39577


In [20]:
faiss.write_index(index, "../vector_store/sample_faiss.index")
sample_chunks_df.drop(columns=["embedding"]).to_parquet(
    "../vector_store/sample_metadata.parquet",
    index=False
)
print("Vector store saved successfully")

Vector store saved successfully
