# Task-2

### Text Chunking, Embedding, and Faiss Indexing

In [1]:
import sys
import os

# Make the project root importable so the `src` package resolves.
# ".." is resolved against the kernel's current working directory, so this
# assumes the notebook is launched from its own folder.
# The membership check keeps the cell idempotent: re-running it does not
# keep appending the same entry to sys.path.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

from src.config import Config
from src.sampling import stratified_sample
from src.chunking import chunk_texts
from src.embeddings import embed_texts
from src.faiss_store import build_faiss_index, persist_faiss
from src.utils import ensure_dir, validate_narratives

import pandas as pd
import numpy as np


  from pydantic.v1.fields import FieldInfo as FieldInfoV1
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the filtered complaints table produced earlier in the pipeline, then
# run the project's narrative check on the free-text column.
# NOTE(review): what validate_narratives enforces (and whether it raises or
# returns a report) is defined in src.utils — confirm there.
complaints_csv = "../data/processed/filtered_complaints.csv"
df = pd.read_csv(complaints_csv)

validate_narratives(df, "Consumer complaint narrative")


In [7]:
# Sample the complaints with `stratified_sample`, using "Product" as the label
# column, and save the sample next to the other processed files.
#
# `Config`, `ensure_dir` and `stratified_sample` are already imported in the
# notebook's first cell, so they are not re-imported here. Re-importing does
# not reload an edited module anyway; restart the kernel (or use
# importlib.reload) to pick up config changes.

# Create the output directory before anything is written into it.
ensure_dir(Config.PROCESSED_DIR)

sampled_df = stratified_sample(
    df=df,
    label_col="Product",
    sample_size=Config.SAMPLE_SIZE,
    random_state=Config.RANDOM_SEED,
)

sampled_csv = f"{Config.PROCESSED_DIR}/sampled_complaints.csv"
sampled_df.to_csv(sampled_csv, index=False)

In [8]:
# Split every sampled narrative into chunks and record where each chunk came
# from. `chunks` and `metadata` are parallel lists: metadata[k] describes
# chunks[k].
chunks = []
metadata = []

# Walk only the index and the two columns that are needed, instead of
# `iterrows()`, which materialises a full Series for every row.
# NOTE(review): "complaint_id" is the DataFrame index label of the sampled
# row, not a dedicated ID column — confirm the index is still meaningful
# after stratified_sample().
for idx, narrative, product in zip(
    sampled_df.index,
    sampled_df["Consumer complaint narrative"],
    sampled_df["Product"],
):
    text_chunks = chunk_texts(
        narrative,
        Config.CHUNK_SIZE,
        Config.CHUNK_OVERLAP
    )

    for i, chunk in enumerate(text_chunks):
        chunks.append(chunk)
        metadata.append({
            "complaint_id": idx,
            "product": product,
            "chunk_index": i  # position of the chunk within its narrative
        })

In [9]:
# Embed every chunk with the model named in Config, then save the result in
# NumPy's .npy format so it can be reused without recomputing.
# NOTE(review): the recorded run of this cell took about 20 minutes (see the
# progress bar in the output), so avoid re-running it needlessly.
embeddings = embed_texts(chunks, Config.EMBEDDING_MODEL)

embeddings_file = f"{Config.PROCESSED_DIR}/embeddings.npy"
np.save(embeddings_file, embeddings)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Error while downloading from https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...
Batches: 100%|██████████| 1274/1274 [20:33<00:00,  1.03it/s]


In [10]:
# Build the vector index from the chunk embeddings and persist it together
# with the per-chunk metadata under Config.FAISS_DIR.

# Create the target directory up front, the same way the processed-data
# directory is prepared before files are written to it.
# NOTE(review): persist_faiss may already create the directory itself —
# this call is a harmless guard in that case; confirm in src.faiss_store.
ensure_dir(Config.FAISS_DIR)

index = build_faiss_index(embeddings)

persist_faiss(
    index=index,
    metadata=metadata,
    path=Config.FAISS_DIR
)
