In [2]:
import os
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pickle

# Load cleaned data

In [6]:
# Cleaned, filtered complaints produced by the preprocessing step.
# The path is relative to the notebook's working directory.
df = pd.read_csv("../data/processed/filtered_complaints.csv")


### Chunking strategy

In [7]:
# Chunking strategy
chunk_size = 300    # max characters per chunk; chosen experimentally (context vs. granularity)
chunk_overlap = 50  # characters shared by consecutive chunks so context carries across boundaries

splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

# Rows whose narrative is missing come back from the CSV as NaN. Calling str() on
# NaN yields the literal text "nan", which would otherwise be chunked, embedded
# and indexed as if it were a real complaint, so those rows are dropped here.
narratives = df.loc[df['cleaned_narrative'].notna(), ['cleaned_narrative', 'Product']]

# Prepare chunks and metadata.
# The two lists are parallel: metadatas[k] describes chunks[k]. Later cells rely
# on this positional alignment (vector k in the index <-> metadatas[k]).
chunks = []
metadatas = []
for idx, narrative, product in zip(
    narratives.index, narratives['cleaned_narrative'], narratives['Product']
):
    splits = splitter.split_text(str(narrative))
    for i, chunk in enumerate(splits):
        chunks.append(chunk)
        metadatas.append({
            # NOTE(review): this is the DataFrame row label of the filtered CSV,
            # not necessarily the source system's complaint ID — confirm that
            # downstream lookups expect a row label.
            "complaint_id": idx,
            "product": product,
            "chunk_index": i
        })

### Embedding model

In [8]:
# Sentence-embedding model used for every chunk.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(model_name)

# Encode all chunks in a single call. convert_to_numpy=True requests a NumPy
# array as the result, and row k of it corresponds to chunks[k].
embeddings = embedder.encode(
    chunks,
    convert_to_numpy=True,
    show_progress_bar=True,
)

Batches: 100%|██████████| 256/256 [01:09<00:00,  3.70it/s]


### Build FAISS index

In [9]:
# FAISS expects a C-contiguous float32 matrix. This conversion returns the same
# array untouched when the encoder already produced that layout, and converts
# it otherwise instead of failing inside index.add().
embedding_matrix = np.ascontiguousarray(embeddings, dtype=np.float32)

# Fail early with a readable message when there is nothing to index or when the
# vectors no longer line up with the metadata list (lookups are positional:
# vector k in the index <-> metadatas[k]).
if embedding_matrix.ndim != 2 or embedding_matrix.shape[0] != len(metadatas):
    raise ValueError(
        f"Expected a 2-D embedding matrix with {len(metadatas)} rows, "
        f"got shape {embedding_matrix.shape}"
    )

dim = embedding_matrix.shape[1]
index = faiss.IndexFlatL2(dim)  # flat index: exact L2 search, no training step
index.add(embedding_matrix)

#### Persist vector store and metadata

In [10]:
# Output directory is relative to the notebook's working directory;
# exist_ok=True makes re-running this cell safe.
os.makedirs("vector_store", exist_ok=True)

# 1) The FAISS index holding the chunk vectors.
faiss.write_index(index, "vector_store/complaints_faiss.index")

# 2) The metadata list, pickled separately from the index.
with open("vector_store/metadata.pkl", mode="wb") as metadata_file:
    pickle.dump(metadatas, metadata_file)

print(f"Indexed {len(chunks)} chunks. Vector store saved in vector_store/")

Indexed 8180 chunks. Vector store saved in vector_store/


## Text Chunking and Embedding Strategy

To enable efficient semantic search, we split each cleaned complaint narrative into overlapping text chunks using LangChain's `RecursiveCharacterTextSplitter`. After experimenting with different parameters, we chose a `chunk_size` of 300 characters and a `chunk_overlap` of 50 characters. This keeps each chunk focused enough to match specific queries, while the overlap carries context across chunk boundaries. Long narratives are split into several chunks rather than truncated, and narratives shorter than 300 characters are kept as a single chunk.

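For embedding, we used the `sentence-transformers/all-MiniLM-L6-v2` model. This model is lightweight, fast, and provides strong performance for semantic similarity tasks, making it well-suited for large-scale complaint datasets. Its default input limit is 256 word pieces, so 300-character chunks are embedded without truncation. Each chunk's embedding is added to a FAISS `IndexFlatL2` index (exact L2 nearest-neighbour search), and a parallel list of metadata (complaint ID, product, chunk index) is pickled alongside it. Here the complaint ID is the complaint's row index in the filtered dataset. Entry *k* of the metadata list describes vector *k* of the index, which enables traceability from search results back to the original complaint.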