In [2]:
!pip install tqdm langchain sentence-transformers

Collecting langchain
  Using cached langchain-0.3.26-py3-none-any.whl.metadata (7.8 kB)
Collecting sentence-transformers
  Using cached sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting langchain-core<1.0.0,>=0.3.66 (from langchain)
  Using cached langchain_core-0.3.68-py3-none-any.whl.metadata (5.8 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.8 (from langchain)
  Using cached langchain_text_splitters-0.3.8-py3-none-any.whl.metadata (1.9 kB)
Collecting langsmith>=0.1.17 (from langchain)
  Using cached langsmith-0.4.4-py3-none-any.whl.metadata (15 kB)
Collecting orjson<4.0.0,>=3.9.14 (from langsmith>=0.1.17->langchain)
  Using cached orjson-3.10.18-cp312-cp312-win_amd64.whl.metadata (43 kB)
Collecting sympy==1.13.1 (from torch>=1.11.0->sentence-transformers)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Using cached langchain-0.3.26-py3-none-any.whl (1.0 MB)
Using cached sentence_transformers-5.0.0-py3-none-any.whl (470 kB)
Using cached langc

In [None]:
# Standard library
import os
import pickle

# Third-party: numerics, data handling, vector index
import faiss
import numpy as np
import pandas as pd
import torch

# Progress bars: `tqdm.auto` picks the widget bar when ipywidgets is available
# and falls back to the plain-text bar otherwise. (The previous line imported
# the non-existent name `tqd`, so `tqdm` was never defined.)
from tqdm.auto import tqdm
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

In [6]:
# --- Input / output locations (relative to the notebook's directory) ---
DATA_PATH = "../data/filtered_complaints.csv"   # CSV holding the cleaned narratives
VECTOR_STORE_DIR = "../vector_store"            # destination for the index + metadata

# --- Text chunking (sizes are in characters) ---
CHUNK_SIZE = 1_000
CHUNK_OVERLAP = 250

# --- Embedding ---
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"       # sentence-transformers model id
BATCH_SIZE = 32                                 # chunks encoded per forward pass

In [8]:
# End-to-end pipeline: load the complaints CSV, split each narrative into
# overlapping chunks, embed the chunks, index them with FAISS, and persist the
# index together with per-chunk metadata.

# -------------------------
# Load data
# -------------------------
print("[+] Loading dataset...")
df = pd.read_csv(DATA_PATH)
# Fail fast on every column the chunking loop indexes directly, instead of
# hitting a KeyError part-way through the loop.
assert "cleaned_narrative" in df.columns, "Missing 'cleaned_narrative' column"
assert "Product" in df.columns, "Missing 'Product' column"

# -------------------------
# Initialize components
# -------------------------
print("[+] Initializing text splitter and model...")
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP
)

# Embedding is pinned to the CPU; change this value to run on another device.
device = 'cpu'
print(f"[+] Using device: {device}")

model = SentenceTransformer(EMBEDDING_MODEL_NAME, device=device)

# -------------------------
# Chunk narratives and collect metadata
# -------------------------
print("[+] Chunking text and collecting metadata...")
all_chunks = []       # chunk texts, in the order they are embedded and indexed
metadata_list = []    # metadata_list[i] describes the vector at index position i
skipped_rows = 0

for idx, row in tqdm(df.iterrows(), total=len(df)):
    text = row["cleaned_narrative"]
    # pandas reads empty CSV cells as NaN (a float), which the splitter cannot
    # handle; blank narratives yield no chunks anyway, so skip both up front.
    if not isinstance(text, str) or not text.strip():
        skipped_rows += 1
        continue

    # Fall back to the row index when the CSV carries no "Complaint ID" column.
    complaint_id = row.get("Complaint ID", idx)
    product = row["Product"]

    for chunk in text_splitter.split_text(text):
        all_chunks.append(chunk)
        metadata_list.append({
            "complaint_id": complaint_id,
            "product": product,
            "text": chunk
        })

if skipped_rows:
    print(f"[!] Skipped {skipped_rows} rows with empty or missing narratives")
print(f"[+] Total chunks: {len(all_chunks)}")

# Without chunks there is nothing to embed, and `embeddings.shape[1]` below
# would fail with an opaque IndexError.
if not all_chunks:
    raise ValueError("No text chunks were produced; nothing to embed or index")

# -------------------------
# Generate embeddings (batched, on `device`)
# -------------------------
print("[+] Generating embeddings...")
embeddings = model.encode(
    all_chunks,
    batch_size=BATCH_SIZE,
    show_progress_bar=True,
    convert_to_numpy=True,
    device=device
)
# FAISS only accepts C-contiguous float32 matrices; enforce that explicitly
# rather than relying on the encoder's default output dtype/layout.
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

# -------------------------
# Build FAISS index
# -------------------------
print("[+] Building FAISS index...")
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)   # exact (brute-force) L2 search
index.add(embeddings)

# Index positions are the only link between a vector and its metadata entry,
# so the two collections must have exactly the same length before saving.
assert index.ntotal == len(metadata_list), "FAISS index and metadata are out of sync"

# -------------------------
# Save index and metadata
# -------------------------
os.makedirs(VECTOR_STORE_DIR, exist_ok=True)

faiss.write_index(index, os.path.join(VECTOR_STORE_DIR, "faiss_index.bin"))
with open(os.path.join(VECTOR_STORE_DIR, "metadata.pkl"), "wb") as f:
    pickle.dump(metadata_list, f)

print(f"[✓] Saved FAISS index and metadata for {len(embeddings)} chunks.")

[+] Loading dataset...
[+] Initializing text splitter and model...
[+] Using device: cpu
[+] Chunking text and collecting metadata...


100%|███████████████████████████████████████████████████████████████████████████████| 104/104 [00:00<00:00, 554.71it/s]

[+] Total chunks: 691
[+] Generating embeddings...





Batches:   0%|          | 0/22 [00:00<?, ?it/s]

[+] Building FAISS index...
[✓] Saved FAISS index and metadata for 691 chunks.
