# Task 2: Text Chunking, Embedding, and Vector Store Indexing

In [4]:
# %pip install faiss-cpu

In [6]:
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import faiss
import os
import pickle
import sys
sys.path.append(os.path.abspath("../src"))
import warnings
warnings.filterwarnings('ignore')

In [8]:
# %%
# Move to the project root so the relative "Data/..." and "vector_store/..."
# paths used by later cells resolve.
# The original unconditional os.chdir("..") climbed one more directory every
# time this cell was re-run. The guard makes the cell idempotent.
# NOTE(review): assumes the "Data" directory exists only at the project root
# (later cells read "Data/filtered_complaints.csv" after this step) — confirm.
if not os.path.isdir("Data"):
    os.chdir("..")  # Go up a directory

In [7]:
from Embeding_Index import *

In [8]:
# Access Function
def run_pipeline(input_path="Data/filtered_complaints.csv", chunk_size=300, chunk_overlap=50):
    """
    Run the load -> chunk -> embed -> index pipeline on one complaints file.

    Every stage is delegated to a helper that is not defined in this cell
    (load_cleaned_data, chunk_texts, generate_embeddings, index_with_faiss).
    The function always returns None; when loading yields None the remaining
    stages are skipped.
    """
    complaints = load_cleaned_data(input_path)

    if complaints is not None:
        print(" Chunking text...")
        # NOTE(review): chunk_size/chunk_overlap are passed positionally. The
        # chunk_texts defined further down this notebook takes `column` as its
        # second positional parameter, so running this function after that
        # cell has executed would bind these arguments to the wrong names.
        chunk_list, chunk_meta = chunk_texts(complaints, chunk_size, chunk_overlap)

        print(" Generating embeddings...")
        vectors = generate_embeddings(chunk_list)

        print("Indexing to FAISS...")
        index_with_faiss(vectors, chunk_meta)

# Run the pipeline with its default arguments when this cell is executed.
# NOTE(review): inside a notebook kernel __name__ is normally "__main__", so
# this guard presumably always passes here — confirm it is still wanted.
if __name__ == "__main__":
    run_pipeline()

 Chunking text...
 Generating embeddings...


Batches: 100%|██████████| 45/45 [00:25<00:00,  1.79it/s]

Indexing to FAISS...
Vector store saved in vector_store/faiss_index





In [2]:
def chunk_texts(df, column='cleaned_narrative', chunk_size=500, chunk_overlap=100):
    """
    Splits text into overlapping chunks for better embeddings.
    Returns list of chunks and their metadata.

    Args:
        df: DataFrame holding one complaint per row.
        column: Name of the column containing the narrative text.
        chunk_size: Passed to RecursiveCharacterTextSplitter as chunk_size.
        chunk_overlap: Passed to RecursiveCharacterTextSplitter as chunk_overlap.

    Returns:
        (texts, metadata): two parallel lists. metadata[i] is a dict with keys
        "complaint_id" and "product" describing the row texts[i] came from.
        Rows whose narrative is missing are skipped. On any error a message is
        printed and ([], []) is returned (best-effort contract kept).

    Notes:
        * "complaint_id" falls back to the row index when the "Complaint ID"
          column is absent or the value is missing.
        * "product" falls back to "Unknown" when the "Product" column is absent
          or the value is missing, so NaN never reaches the metadata (json.dump
          would otherwise write the non-standard token NaN).
    """
    try:
        if column not in df.columns:
            print(f"Error in chunking: column '{column}' not found in DataFrame.")
            return [], []

        splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        texts, metadata = [], []

        for idx, row in df.iterrows():
            narrative = row[column]
            if pd.isna(narrative):
                continue

            # Coerce to str so a stray numeric cell cannot abort the whole run.
            chunks = splitter.split_text(str(narrative))
            if not chunks:
                continue

            complaint_id = row["Complaint ID"] if "Complaint ID" in row else idx
            if pd.isna(complaint_id):
                complaint_id = idx

            product = row.get("Product", "Unknown")
            if pd.isna(product):
                product = "Unknown"

            texts.extend(chunks)
            # One fresh dict per chunk so entries can be mutated independently.
            metadata.extend(
                {"complaint_id": complaint_id, "product": product} for _ in chunks
            )

        print(f"Chunked {len(texts)} text segments from {len(df)} records.")
        return texts, metadata

    except Exception as e:
        print(f"Error in chunking: {e}")
        return [], []

In [4]:
from sentence_transformers import SentenceTransformer
import numpy as np
import json

def embed_and_index(texts, metadata, model_name="all-MiniLM-L6-v2"):
    """
    Embeds text chunks using a SentenceTransformer and stores in FAISS.
    Returns FAISS index and metadata list.

    Args:
        texts: Sequence of text chunks to embed.
        metadata: Sequence parallel to `texts`; returned unchanged on success.
        model_name: Name handed to SentenceTransformer.

    Returns:
        (index, metadata) on success, where index is a faiss.IndexFlatL2
        holding one vector per entry of `texts`. On any failure a message is
        printed and (None, []) is returned, so callers must check for None
        before using the index.
    """
    try:
        # Fail fast with a clear message instead of loading the model and then
        # hitting an IndexError on an empty embedding array.
        if len(texts) == 0:
            print(" Error in embedding/indexing: no text chunks to embed.")
            return None, []

        # Row i of the index is described by metadata[i]; a length mismatch
        # would silently mis-attribute search hits.
        if metadata is not None and len(metadata) != len(texts):
            print(
                f" Error in embedding/indexing: {len(texts)} texts but "
                f"{len(metadata)} metadata entries."
            )
            return None, []

        print("Loading embedding model...")
        model = SentenceTransformer(model_name)
        embeddings = model.encode(texts, show_progress_bar=True)

        # FAISS expects a C-contiguous float32 matrix of shape (n, dim).
        vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
        dim = vectors.shape[1]
        index = faiss.IndexFlatL2(dim)
        index.add(vectors)

        print(f"Embedded and indexed {len(texts)} text chunks.")
        return index, metadata

    except Exception as e:
        print(f" Error in embedding/indexing: {e}")
        return None, []

In [9]:
# Load cleaned data (make sure it's already filtered/cleaned)
# The path is relative to the current working directory.
df_cleaned = pd.read_csv("Data/filtered_complaints.csv")
# Step 1: Chunk narratives
# NOTE(review): presumably this resolves to the chunk_texts defined earlier in
# this notebook (defaults: column='cleaned_narrative', chunk_size=500,
# chunk_overlap=100) rather than one pulled in by a wildcard import — confirm.
texts, meta = chunk_texts(df_cleaned)

# Step 2: Embed and index
# embed_and_index returns (None, []) on failure, so faiss_index may be None.
faiss_index, metadata = embed_and_index(texts, meta)

Chunked 924 text segments from 286 records.
Loading embedding model...


Batches: 100%|██████████| 29/29 [00:08<00:00,  3.28it/s]

Embedded and indexed 924 text chunks.





In [10]:
# Persist the FAISS index and its chunk metadata side by side.

# embed_and_index signals failure by returning None for the index; stop here
# with a clear message instead of handing None to faiss.write_index.
if faiss_index is None:
    raise RuntimeError("No FAISS index to save: embedding/indexing did not produce one.")

# Both writes below need the target directory to exist.
os.makedirs("vector_store", exist_ok=True)

# Save FAISS index
faiss.write_index(faiss_index, "vector_store/faiss_index.index")

# Save metadata
with open("vector_store/metadata.json", "w") as f:
    json.dump(metadata, f)

print("✅ Vector store saved.")


✅ Vector store saved.
