In [4]:
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
import faiss
import pickle


In [None]:
# === Configuration ===
# Paths are relative to the notebook's working directory. The vector store
# lives next to the input CSV under ../data/.
DATA_PATH = "../data/filtered_complaints.csv"
VECTOR_STORE_DIR = "../data/vector_store"
FAISS_INDEX_PATH = os.path.join(VECTOR_STORE_DIR, "faiss_index.index")
METADATA_PATH = os.path.join(VECTOR_STORE_DIR, "metadata.pkl")

# === Load Filtered Data ===
df = pd.read_csv(DATA_PATH)

# === Chunking Strategy ===
# Sizes are measured in characters because length_function is `len`.
chunk_size = 500
chunk_overlap = 100

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    length_function=len,
)

# Prepare chunks and metadata. texts[i] and metadatas[i] describe the same
# chunk, and that position is also the chunk's row id in the FAISS index, so
# the two lists must stay aligned.
texts = []
metadatas = []
skipped_rows = 0

# Zipping the three needed columns avoids the per-row Series construction that
# makes DataFrame.iterrows() slow on large frames.
for complaint_id, product, narrative in zip(
    df["Complaint ID"], df["Product"], df["cleaned_narrative"]
):
    # A missing narrative is read from CSV as NaN (a float), which the splitter
    # cannot process; skip it instead of failing mid-loop.
    if not isinstance(narrative, str):
        skipped_rows += 1
        continue

    chunks = text_splitter.split_text(narrative)
    texts.extend(chunks)
    # One independent dict per chunk so entries can be edited in isolation.
    metadatas.extend(
        {"complaint_id": complaint_id, "product": product} for _ in chunks
    )

print(f"Total chunks created: {len(texts)}")
if skipped_rows:
    print(f"Rows skipped (missing or non-text narrative): {skipped_rows}")

# Fail early with a clear message rather than an opaque shape error further down.
if not texts:
    raise ValueError(
        f"No text chunks were produced from {DATA_PATH}; nothing to embed or index."
    )

# === Embedding Model ===
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

# Generate embeddings (one vector per chunk, in the same order as `texts`).
embeddings = model.encode(texts, show_progress_bar=True)

# === FAISS Indexing ===
# Exact (brute-force) L2 index; vectors receive ids 0..n-1 in insertion order,
# which is what ties search results back to `metadatas`.
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)

# Save FAISS index and metadata
os.makedirs(VECTOR_STORE_DIR, exist_ok=True)
faiss.write_index(index, FAISS_INDEX_PATH)

# Save metadata mapping
with open(METADATA_PATH, "wb") as f:
    pickle.dump(metadatas, f)

print(f"Vector store and metadata saved to {VECTOR_STORE_DIR}/")


Total chunks created: 501679


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/15678 [00:00<?, ?it/s]