In [3]:
# 📦 Imports
import pandas as pd
from tqdm import tqdm
import os
import faiss
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import pickle

# 📁 Paths and directory setup
# Example if the file is in a folder named 'data' inside your project folder:
DATA_PATH = "../data/filtered_complaints.csv"
VECTOR_STORE_DIR = "./vector_store/"
os.makedirs(VECTOR_STORE_DIR, exist_ok=True)




In [4]:
# Load the cleaned complaints CSV into a DataFrame
df = pd.read_csv(DATA_PATH)

print(f"Loaded {len(df)} complaints.")

# Prepare a list of documents with relevant fields (ID, product, text)
documents = []
for _, row in df.iterrows():
    text = str(row['Cleaned Narrative']) if pd.notna(row['Cleaned Narrative']) else ""
    documents.append({
        "id": row['Complaint ID'],
        "product": row['Product'],
        "text": text
    })

print(f"Prepared {len(documents)} documents for chunking.")


Loaded 344308 complaints.
Prepared 344308 documents for chunking.


In [37]:
# Limit to first 1000 documents and remove short text
documents_subset = [doc for doc in documents[:1000] if len(doc["text"].strip()) > 100]
print(f"Filtered documents: {len(documents_subset)}")


Filtered documents: 988


In [46]:
chunk_size = 10000  
chunk_overlap = 50

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

chunked_docs = []

for doc in documents_subset:
    chunks = text_splitter.split_text(doc['text'])
    for i, chunk in enumerate(chunks):
        chunked_docs.append({
            'id': doc['id'],
            'product': doc['product'],
            'chunk_text': chunk,
            'chunk_index': i
        })

print(f"Total chunks created: {len(chunked_docs)} with chunk_size={chunk_size}")


Total chunks created: 989 with chunk_size=10000


In [39]:
print("Estimating embedding time...")
chunk_count = len(chunked_docs)
avg_per_chunk = 0.025  # ~25ms per chunk (MiniLM L6)

estimated_time_min = (chunk_count * avg_per_chunk) / 60
print(f"Total chunks: {chunk_count} ⏱️ Estimated time: {estimated_time_min:.2f} minutes")


Estimating embedding time...
Total chunks: 989 ⏱️ Estimated time: 0.41 minutes


In [33]:
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", local_files_only=True)
print(f"Loaded embedding model: {model_name}")


Loaded embedding model: sentence-transformers/all-MiniLM-L6-v2


In [42]:
# Extract texts from the chunked documents
chunk_texts = [doc["chunk_text"] for doc in chunked_docs]

# Encode chunks with progress bar and batching
embeddings = embedding_model.encode(
    chunk_texts,
    show_progress_bar=True,
    batch_size=64
)

# Convert to float32 NumPy array
embeddings = np.array(embeddings).astype("float32")

# Normalize embeddings for cosine similarity
faiss.normalize_L2(embeddings)

print(f"Generated and normalized embeddings for {len(embeddings)} chunks.")


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Generated and normalized embeddings for 989 chunks.


In [43]:
# Get embedding dimension (e.g., 384 for MiniLM-L6-v2)
dimension = embeddings.shape[1]

# Create FAISS index for cosine similarity (via inner product)
index = faiss.IndexFlatIP(dimension)

# Add all chunk embeddings to the index
index.add(embeddings)

print(f"FAISS index created and populated with {index.ntotal} vectors.")


FAISS index created and populated with 989 vectors.


In [44]:
# Save the FAISS index to disk
faiss_index_path = os.path.join(VECTOR_STORE_DIR, "faiss_index.bin")
faiss.write_index(index, faiss_index_path)

# Save the chunked metadata (so you can trace results later)
chunk_metadata_path = os.path.join(VECTOR_STORE_DIR, "chunked_docs.pkl")
with open(chunk_metadata_path, "wb") as f:
    pickle.dump(chunked_docs, f)

print(f"Saved FAISS index to: {faiss_index_path}")
print(f"Saved chunk metadata to: {chunk_metadata_path}")


Saved FAISS index to: ./vector_store/faiss_index.bin
Saved chunk metadata to: ./vector_store/chunked_docs.pkl


In [45]:
# chunked_docs is a list of dictionaries, convert it to a DataFrame and save as CSV

pd.DataFrame(chunked_docs).to_csv(os.path.join(VECTOR_STORE_DIR, "chunked_metadata.csv"), index=False)


# Semantic Search with Text Chunking, Embedding, and FAISS Vector Index

This code demonstrates how to perform efficient semantic search over a large collection of text complaints by:

1. **Loading the Pre-built Vector Store and Metadata**  
   - The FAISS vector index (`faiss_index.bin`) containing vector embeddings of text chunks is loaded.  
   - The metadata for each chunk (complaint ID, product category, chunk text, etc.) is loaded from a saved pickle file and converted into a DataFrame for easy access.

2. **Loading the Embedding Model**  
   - We load the pre-trained `sentence-transformers/all-MiniLM-L6-v2` model, which is a lightweight and effective model for encoding text into dense vector representations suitable for semantic similarity tasks.

3. **Defining the Semantic Search Function**  
   - Given a user query (a short text), the function encodes it into a vector using the embedding model.  
   - The query vector is normalized to match the FAISS index's format.  
   - The FAISS index is queried to find the top-k most similar text chunks by comparing vector distances.  
   - For each closest match, the function retrieves the corresponding metadata, including the original complaint ID, product category, chunk index, and a snippet of the chunk text, along with the similarity score.

4. **Running a Sample Query**  
   - We run a sample query ("fraudulent charge on credit card") to demonstrate how the semantic search returns relevant complaint chunks.  
   - The top results are printed with key metadata and a snippet of the matching text.

---

### Summary

- This pipeline enables fast and scalable semantic search on large text corpora by splitting long narratives into manageable chunks, embedding them into vector space, and indexing with FAISS for quick similarity retrieval.  
- Metadata stored alongside embeddings ensures that search results can be traced back to their original source documents for interpretability.  
- The sentence-transformers model provides a balance of speed and accuracy for embedding generation, making it suitable for production-scale applications.



In [52]:
import os
import pickle
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

# Directory where vector store and metadata are saved
VECTOR_STORE_DIR = "../vector_store/"


In [53]:
faiss_index_path = os.path.join(VECTOR_STORE_DIR, "faiss_index.bin")
index = faiss.read_index(faiss_index_path)
print(f"Loaded FAISS index with {index.ntotal} vectors.")


Loaded FAISS index with 989 vectors.


In [54]:
metadata_path = os.path.join(VECTOR_STORE_DIR, "chunked_docs.pkl")

with open(metadata_path, "rb") as f:
    chunked_docs = pickle.load(f)

metadata_df = pd.DataFrame(chunked_docs)
print(f"Loaded metadata with {len(metadata_df)} chunks.")


Loaded metadata with 989 chunks.


In [55]:
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
embedding_model = SentenceTransformer(model_name)
print(f"Loaded embedding model: {model_name}")


Loaded embedding model: sentence-transformers/all-MiniLM-L6-v2


In [56]:
def semantic_search(query, top_k=5):
    # Embed the query to a vector
    query_vec = embedding_model.encode([query])
    query_vec = np.array(query_vec).astype("float32")
    
    # Normalize the query vector (since FAISS index is normalized)
    faiss.normalize_L2(query_vec)
    
    # Search the FAISS index for the closest vectors
    distances, indices = index.search(query_vec, top_k)
    
    results = []
    for dist, idx in zip(distances[0], indices[0]):
        if idx == -1:
            continue
        row = metadata_df.iloc[idx]
        results.append({
            "id": row['id'],
            "product": row['product'],
            "chunk_index": row['chunk_index'],
            "chunk_text": row['chunk_text'],
            "score": dist
        })
    return results


In [57]:
query = "fraudulent charge on credit card"
results = semantic_search(query)

print(f"Top {len(results)} results for query: '{query}'\n")

for i, res in enumerate(results, 1):
    print(f"{i}. Complaint ID: {res['id']}, Product: {res['product']}, Chunk index: {res['chunk_index']}, Similarity score: {res['score']:.4f}")
    print(f"   Text snippet: {res['chunk_text'][:200]}...\n")


Top 5 results for query: 'fraudulent charge on credit card'

1. Complaint ID: 13885761, Product: Credit card, Chunk index: 0, Similarity score: 0.7013
   Text snippet: on xxxxyear i made rented a car in the xxxx xxxx company name xxxx xxxx xxxx originally i was going to put the charge of my card but they offered a substantial discount if i paid in cash they informed...

2. Complaint ID: 13993418, Product: Credit card, Chunk index: 0, Similarity score: 0.6777
   Text snippet: xxxx credit card has charged me exponentially more than what my original borrow request was for after trying to resolve the situation i informed them that if they did not remove the outrageous interes...

3. Complaint ID: 13575993, Product: Credit card, Chunk index: 0, Similarity score: 0.6647
   Text snippet: on xxxxscrub 2024 i incurred a fraudulent charge on my citibank xxxx xxxx xxxx ending in xxxx for 91000 from xxxx xxxx the fraud was immediately reported and a new card was issued despite my numerous ...

4. 