# Task 2: Text Chunking, Embedding, and Vector Store Indexing
Splits narratives from `filtered_complaints.csv` into chunks, generates embeddings using `sentence-transformers/all-MiniLM-L6-v2`, and stores them in a FAISS vector database for Task 3 retrieval.

In [None]:
# Cell 1: Setup and load data
import os
import pandas as pd

# Input produced by Task 1 and the output location for the chunk table
input_file = '../data/processed/filtered_complaints.csv'
chunks_file = '../data/processed/complaint_chunks.csv'

# Fail early, reporting the absolute path, when the input CSV is absent
input_present = os.path.exists(input_file)
if not input_present:
    raise FileNotFoundError(f"File not found at: {os.path.abspath(input_file)}")

# Read the filtered complaints into memory
print("Loading filtered complaints...")
df = pd.read_csv(input_file)
print(f"Loaded {len(df)} complaints with columns: {df.columns.tolist()}")

# The chunking step reads these three columns; stop here if any is absent
required_columns = ['Complaint ID', 'Product', 'Consumer complaint narrative']
missing_columns = [name for name in required_columns if name not in df.columns]
if len(missing_columns) > 0:
    raise ValueError(f"Missing columns: {missing_columns}")

In [None]:
# Cell 2: Chunk narratives
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Initialize text splitter.
# chunk_size and chunk_overlap are measured in whitespace-separated words,
# because length_function counts words rather than characters.
# NOTE(review): the embedding model used in Cell 3 (all-MiniLM-L6-v2) truncates
# input beyond 256 word pieces per its model card, so the tail of a 500-word
# chunk will not influence its embedding - consider a smaller chunk_size.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,  # Adjust based on Task 1 narrative length stats
    chunk_overlap=50,
    length_function=lambda x: len(x.split())
)

# Split narratives
print("Chunking narratives...")
chunks = []
skipped_narratives = 0
# Iterate the three needed columns directly instead of building a Series per
# row with iterrows().
narrative_rows = zip(
    df['Complaint ID'], df['Product'], df['Consumer complaint narrative']
)
for complaint_id, product, narrative in narrative_rows:
    # Missing narratives are float NaN after read_csv; split_text expects a
    # string, so skip anything that is not non-blank text.
    if not isinstance(narrative, str) or not narrative.strip():
        skipped_narratives += 1
        continue
    for split in splitter.split_text(narrative):
        chunks.append({
            'complaint_id': complaint_id,
            'product': product,
            'chunk': split
        })

if skipped_narratives:
    print(f"Skipped {skipped_narratives} complaints with missing or empty narratives")

# Save chunks. Passing explicit columns keeps the CSV header intact even when
# no chunks were produced, so the next cell can still read the file.
df_chunks = pd.DataFrame(chunks, columns=['complaint_id', 'product', 'chunk'])
os.makedirs('../data/processed', exist_ok=True)
df_chunks.to_csv(chunks_file, index=False)
print(f"Created {len(df_chunks)} chunks, saved to {chunks_file}")

In [None]:
# Cell 3: Generate embeddings
from sentence_transformers import SentenceTransformer
import numpy as np

# Load chunks
df_chunks = pd.read_csv(chunks_file)

# read_csv parses empty and NA-like cells (e.g. "N/A", "null") as float NaN,
# which is not valid text input for the encoder. Drop those rows and reset the
# index so row positions stay aligned with the embedding matrix built below.
missing_chunks = df_chunks['chunk'].isna()
if missing_chunks.any():
    print(f"Dropping {int(missing_chunks.sum())} chunks with missing text")
    df_chunks = df_chunks[~missing_chunks].reset_index(drop=True)
df_chunks['chunk'] = df_chunks['chunk'].astype(str)

# Initialize model and encode every chunk in batches
print("Generating embeddings...")
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(df_chunks['chunk'].tolist(), batch_size=32, show_progress_bar=True)

# Store one embedding (as a list of floats) per chunk row; Cell 4 reads this column
df_chunks['embedding'] = embeddings.tolist()
print(f"Generated embeddings for {len(df_chunks)} chunks")

In [None]:
# Cell 4: Create and save FAISS index
import faiss

# Destinations for the serialized index and the metadata table written with it
index_file = '../vector_store/faiss_index.bin'
metadata_file = '../vector_store/metadata.csv'

# Stack the per-chunk embedding lists into a single float32 matrix
embedding_rows = df_chunks['embedding'].tolist()
embeddings_np = np.array(embedding_rows, dtype=np.float32)
dimension = embeddings_np.shape[1]  # 384 for all-MiniLM-L6-v2

# Flat L2 index over all chunk vectors
print("Building FAISS index...")
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

# Persist the index, then the metadata rows in the same order as the vectors
os.makedirs('../vector_store', exist_ok=True)
faiss.write_index(index, index_file)
metadata_columns = ['complaint_id', 'product', 'chunk']
df_chunks[metadata_columns].to_csv(metadata_file, index=False)
print(f"Saved FAISS index with {index.ntotal} vectors to {index_file}")
print(f"Saved metadata to {metadata_file}")

In [None]:
# Cell 5: Verify FAISS index
# Reload both artifacts from disk to confirm that what was written is readable.
index = faiss.read_index(index_file)
print(f"Verified FAISS index: {index.ntotal} vectors, dimension {index.d}")
metadata = pd.read_csv(metadata_file)
print(f"Verified metadata: {len(metadata)} rows with columns {metadata.columns.tolist()}")

# A flat FAISS index identifies hits by vector position, so retrieval can only
# map a hit back to its chunk if the metadata has exactly one row per vector.
if index.ntotal != len(metadata):
    raise ValueError(
        f"FAISS index has {index.ntotal} vectors but metadata has {len(metadata)} rows"
    )