In [1]:
# 02_chunking_embedding_indexing.py
# Chunking, Embeddings, and Vector Store Indexing for CrediTrust Complaints

# -----------------------------
# 1. Imports
# -----------------------------
import pandas as pd
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import faiss
import pickle
from tqdm import tqdm

# -----------------------------
# 2. Load preprocessed dataset
# -----------------------------
# Cleaned complaints CSV produced by the preprocessing step. The path is
# relative, so it is resolved against the current working directory
# (the original note says it was adjusted for running from the src folder).
processed_file = "../data/processed/filtered_complaints.csv"

# Read the whole file into memory and report (rows, columns) as a sanity check.
df = pd.read_csv(filepath_or_buffer=processed_file)
dataset_shape = df.shape
print(f"Loaded preprocessed dataset: {dataset_shape}")

# -----------------------------
# 3. Text chunking
# -----------------------------
# Normalise the narrative column so every value is a string: missing
# narratives become '' and are skipped by the loop below.
df['Consumer complaint narrative'] = df['Consumer complaint narrative'].fillna('').astype(str)

# Configure text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,      # reasonable size for semantic chunks
    chunk_overlap=50     # overlap carried between consecutive chunks
)

# Parallel lists: metadata[i] describes chunks[i]. The embeddings and FAISS
# index built later inherit this ordering, so a FAISS row id can be used
# directly as an index into `metadata`.
chunks = []
metadata = []

print("Splitting narratives into chunks...")
# Iterate only the three columns that are actually used. df.iterrows() builds
# a full Series for every row, which is needlessly slow on a wide frame, and
# its index value was never used here.
complaint_rows = zip(
    df['Consumer complaint narrative'],
    df['Complaint ID'],
    df['Product'],
)
for raw_narrative, complaint_id, product in tqdm(complaint_rows, total=len(df)):
    narrative = raw_narrative.strip()
    if not narrative:
        # Empty / whitespace-only narratives yield no chunks.
        continue

    split_texts = text_splitter.split_text(narrative)
    chunks.extend(split_texts)

    # One metadata record per chunk. 'complaint_id' and 'product' are the
    # original keys; 'chunk_index' and 'text' are additive so that a search
    # hit can be traced to the exact passage — the FAISS index stores only
    # vectors, and the chunk text was otherwise never persisted.
    metadata.extend(
        {
            'complaint_id': complaint_id,
            'product': product,
            'chunk_index': chunk_index,
            'text': chunk_text,
        }
        for chunk_index, chunk_text in enumerate(split_texts)
    )

print(f"✅ Total rows: {len(df)}, Total chunks created: {len(chunks)}")

# -----------------------------
# 4. Embeddings
# -----------------------------
# Sentence-embedding model used to vectorise every chunk.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(model_name)

print("Generating embeddings in batches...")
# Encode the full chunk list, 64 texts per forward pass, with a progress bar.
embeddings = embedder.encode(
    chunks,
    batch_size=64,
    show_progress_bar=True,
)
embeddings_shape = embeddings.shape
print(f"✅ Embeddings shape: {embeddings_shape}")

# -----------------------------
# 5. FAISS vector store
# -----------------------------
# Size the index from the second axis of the embedding matrix
# (one row per chunk, one column per embedding dimension).
embedding_dim = embeddings.shape[1]

# Flat L2 index: vectors are added in the same order as `embeddings`,
# so FAISS row ids line up with positions in the chunk/metadata lists.
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)

indexed_vector_count = index.ntotal
print(f"✅ FAISS index contains {indexed_vector_count} vectors")

# -----------------------------
# 6. Save FAISS index and metadata
# -----------------------------
# Create the output directory if needed; an existing directory is not an error.
os.makedirs("vector_store", exist_ok=True)

# Both artefacts live side by side, relative to the current working directory.
index_path = "vector_store/complaints_faiss.index"
metadata_path = "vector_store/metadata.pkl"

# Write the FAISS index first, then the per-chunk metadata list.
faiss.write_index(index, index_path)
with open(metadata_path, mode="wb") as metadata_file:
    pickle.dump(metadata, metadata_file)

print("FAISS index and metadata saved to vector_store/")


Loaded preprocessed dataset: (82164, 19)
Splitting narratives into chunks...


100%|██████████| 82164/82164 [00:35<00:00, 2318.45it/s]


✅ Total rows: 82164, Total chunks created: 422805
Generating embeddings in batches...


Batches:   0%|          | 0/6607 [00:00<?, ?it/s]

✅ Embeddings shape: (422805, 384)
✅ FAISS index contains 422805 vectors
FAISS index and metadata saved to vector_store/
