# Task 2: Text Chunking, Embedding, and Vector Store Indexing
## Intelligent Complaint Analysis for Financial Services

**Objective:** Convert cleaned text narratives into a format suitable for efficient semantic search by creating embeddings and building a vector store.

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import pickle
from typing import List, Dict
from tqdm import tqdm

# LangChain imports
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Sentence Transformers for embeddings
from sentence_transformers import SentenceTransformer

# Vector stores
import chromadb
from chromadb.config import Settings

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# imported above - consider narrowing the filter while developing.
warnings.filterwarnings('ignore')

## 1. Load Cleaned Dataset

In [None]:
# Read the Task 1 output (filtered and cleaned complaints) into a DataFrame.
data_path = Path('..') / 'data' / 'processed' / 'filtered_complaints.csv'
print(f"Loading cleaned data from: {data_path}")

df = pd.read_csv(data_path)
print("\nDataset loaded successfully!")
frame_shape = df.shape
print(f"Shape: {frame_shape}")
column_names = df.columns.tolist()
print(f"Columns: {column_names}")
# Final expression of the cell: the notebook renders the first rows.
df.head()

## 2. Create Stratified Sample

We'll create a stratified sample of 10,000-15,000 complaints ensuring proportional representation across all product categories.

In [None]:
# Count complaints per product across the full dataset (most frequent first).
print("Product distribution in full dataset:")
product_dist = df['Product'].value_counts()
print(product_dist)
total_complaints = len(df)
print(f"\nTotal complaints: {total_complaints:,}")

In [None]:
# Set sample size
SAMPLE_SIZE = 12000  # Target sample size

# Fail early with a clear message instead of a ZeroDivisionError below.
if len(df) == 0:
    raise ValueError("Cannot sample: the complaints dataset is empty.")

# Fraction of every product group to keep. Capped at 1.0 so that a dataset
# smaller than the target is used in full rather than requesting frac > 1.
sampling_fraction = min(SAMPLE_SIZE / len(df), 1.0)

print(f"Target sample size: {SAMPLE_SIZE:,}")
print(f"Sampling fraction: {sampling_fraction:.4f}")

# Stratified sampling by Product.
# GroupBy.sample draws the same fraction from every product group and always
# keeps the 'Product' column. The previous groupby(...).apply(lambda ...) form
# relies on the grouping column being passed into apply, which pandas has
# deprecated; once it is excluded, df_sample would lose 'Product'.
# NOTE: the draw stays reproducible (random_state=42), but the exact rows
# differ from the apply-based version, which re-seeded for every group.
df_sample = (
    df.groupby('Product')
    .sample(frac=sampling_fraction, random_state=42)
    .reset_index(drop=True)
)

print(f"\nActual sample size: {len(df_sample):,}")

In [None]:
# Per-product counts inside the sample.
print("\nProduct distribution in sample:")
sample_dist = df_sample['Product'].value_counts()
print(sample_dist)

# Express both distributions as percentages (two decimals) and put them side
# by side so the stratification can be checked by eye.
full_share_pct = (product_dist / len(df) * 100).round(2)
sample_share_pct = (sample_dist / len(df_sample) * 100).round(2)
comparison = pd.DataFrame(
    {
        'Full Dataset': full_share_pct,
        'Sample': sample_share_pct,
    }
)
print("\nProportion comparison (%):")
print(comparison)

## 3. Text Chunking Strategy

We'll use `RecursiveCharacterTextSplitter` to split long narratives into manageable chunks.

**Rationale for chunk parameters:**
- **Chunk size: 500 characters** - Balances context preservation with embedding quality
- **Chunk overlap: 50 characters** - Ensures continuity between chunks and prevents loss of context at boundaries
- **Separators:** Prioritize paragraph and sentence boundaries for semantic coherence

In [None]:
# Chunking parameters, defined once so that the splitter and the printed
# summary below can never drift apart.
CHUNK_SIZE = 500     # maximum chunk length as measured by len()
CHUNK_OVERLAP = 50   # overlap between consecutive chunks, same unit
# Separators listed from coarsest to finest: paragraph break, line break,
# sentence end, space, and finally the empty string.
SEPARATORS = ["\n\n", "\n", ". ", " ", ""]

# Initialize text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
    separators=list(SEPARATORS),  # pass a copy; SEPARATORS stays untouched
)

print("Text Splitter Configuration:")
print(f"  Chunk size: {CHUNK_SIZE} characters")
print(f"  Chunk overlap: {CHUNK_OVERLAP} characters")
# Formatting the list prints the escaped form, e.g. ['\n\n', '\n', ...].
print(f"  Separators: {SEPARATORS}")

In [None]:
# Smoke-test the splitter on the first narrative of the sample.
sample_narrative = df_sample['cleaned_narrative'].iloc[0]
print("Sample narrative (first 500 chars):")
print(sample_narrative[:500])
narrative_length = len(sample_narrative)
print(f"\nFull length: {narrative_length} characters")

# Split that narrative and show at most the first two chunks.
chunks = text_splitter.split_text(sample_narrative)
chunk_count = len(chunks)
print(f"\nNumber of chunks: {chunk_count}")
print("\nFirst chunk:")
print(chunks[0])
if chunk_count >= 2:
    print("\nSecond chunk:")
    print(chunks[1])

In [None]:
# Create chunks for all complaints in the sample
def create_chunks_with_metadata(df: pd.DataFrame) -> List[Dict]:
    """
    Split every complaint narrative into chunks and attach complaint metadata.

    Each row's ``cleaned_narrative`` is split with the module-level
    ``text_splitter``; every resulting chunk becomes one dictionary. Rows
    whose narrative is missing (NaN/None) or blank are skipped, so a missing
    value is never indexed as the literal text ``"nan"``.

    Args:
        df: Complaints with ``cleaned_narrative``, ``complaint_id`` and
            ``Product`` columns. ``Issue``, ``Sub-issue``, ``Company``,
            ``State`` and ``Date received`` are optional; an absent column
            or a missing value is recorded as ``'Unknown'``.

    Returns:
        List of dictionaries containing chunk text and metadata, with keys
        ``text``, ``complaint_id``, ``product``, ``issue``, ``sub_issue``,
        ``company``, ``state``, ``date_received``, ``chunk_index`` and
        ``total_chunks``.
    """
    def _field_or_unknown(row, column):
        # row.get() only falls back when the column is absent; a present
        # column with a missing cell yields NaN, which must be mapped too.
        value = row.get(column, 'Unknown')
        return 'Unknown' if pd.isna(value) else value

    all_chunks = []

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Creating chunks"):
        narrative = row['cleaned_narrative']

        # NaN is truthy and str(NaN) == 'nan', so missing values need an
        # explicit check before the blank-text check.
        if pd.isna(narrative):
            continue
        narrative_text = str(narrative)
        if not narrative_text.strip():
            continue

        # Split into chunks
        chunks = text_splitter.split_text(narrative_text)
        total_chunks = len(chunks)

        # Metadata shared by every chunk of this complaint; built once per row.
        complaint_metadata = {
            'complaint_id': str(row['complaint_id']),
            'product': row['Product'],
            'issue': _field_or_unknown(row, 'Issue'),
            'sub_issue': _field_or_unknown(row, 'Sub-issue'),
            'company': _field_or_unknown(row, 'Company'),
            'state': _field_or_unknown(row, 'State'),
            'date_received': str(_field_or_unknown(row, 'Date received')),
        }

        for chunk_idx, chunk in enumerate(chunks):
            all_chunks.append({
                'text': chunk,
                **complaint_metadata,
                'chunk_index': chunk_idx,
                'total_chunks': total_chunks,
            })

    return all_chunks

# Chunk the whole sample and report how many chunks each complaint yields.
print("Creating chunks for all complaints...")
chunks_data = create_chunks_with_metadata(df_sample)
produced_chunks = len(chunks_data)
sampled_complaints = len(df_sample)
print(f"\n✅ Created {produced_chunks:,} chunks from {sampled_complaints:,} complaints")
print(f"Average chunks per complaint: {produced_chunks/sampled_complaints:.2f}")

In [None]:
# Summarise the character length of every chunk.
chunk_lengths = [len(record['text']) for record in chunks_data]
print("Chunk length statistics (characters):")
# (label, reducer, format spec): mean/median with two decimals, min/max as-is.
length_summaries = (
    ("Mean", np.mean, ".2f"),
    ("Median", np.median, ".2f"),
    ("Min", np.min, ""),
    ("Max", np.max, ""),
)
for stat_label, reducer, spec in length_summaries:
    print(f"  {stat_label}: {format(reducer(chunk_lengths), spec)}")

# Show one complete record (text plus metadata) as a sanity check.
print("\nSample chunk with metadata:")
print(chunks_data[0])

## 4. Embedding Model Selection

**Model:** `sentence-transformers/all-MiniLM-L6-v2`

**Rationale:**
- Lightweight and efficient (80MB)
- 384-dimensional embeddings (good balance of performance and size)
- Trained on a large corpus for semantic similarity tasks
- Fast inference time suitable for large-scale embedding generation
- Well-suited for semantic search applications

In [None]:
# Instantiate the sentence-embedding model used for the chunks and queries.
print("Loading embedding model: all-MiniLM-L6-v2")
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

print("\n✅ Model loaded successfully!")
embedding_dim = embedding_model.get_sentence_embedding_dimension()
print(f"Embedding dimension: {embedding_dim}")

In [None]:
# Encode a single sentence to confirm the model produces a vector.
test_text = "I have a problem with my credit card billing."
test_embedding = embedding_model.encode(test_text)

print(f"Test text: {test_text}")
vector_shape = test_embedding.shape
print(f"Embedding shape: {vector_shape}")
leading_values = test_embedding[:10]
print(f"Embedding (first 10 values): {leading_values}")

## 5. Generate Embeddings

In [None]:
# Collect the raw text of every chunk, in the same order as chunks_data.
chunk_texts = [record['text'] for record in chunks_data]

print(f"Generating embeddings for {len(chunk_texts):,} chunks...")
print("This may take a few minutes...")

# Encode all chunk texts in batches; the result is requested as a NumPy array.
batch_size = 32
encode_options = {
    'batch_size': batch_size,
    'show_progress_bar': True,
    'convert_to_numpy': True,
}
embeddings = embedding_model.encode(chunk_texts, **encode_options)

embedding_count = len(embeddings)
print(f"\n✅ Generated {embedding_count:,} embeddings")
print(f"Embeddings shape: {embeddings.shape}")

## 6. Build Vector Store with ChromaDB

In [None]:
# Directory that holds the persistent ChromaDB files; created if missing.
vector_store_path = Path('..') / 'vector_store'
vector_store_path.mkdir(parents=True, exist_ok=True)

print(f"Initializing ChromaDB at: {vector_store_path}")

# The client is given the directory as a plain string.
store_location = str(vector_store_path)
chroma_client = chromadb.PersistentClient(path=store_location)

print("✅ ChromaDB client initialized")

In [None]:
# Name of the collection that holds the sampled complaint chunks.
collection_name = "complaint_embeddings_sample"

# Start from a clean slate: drop any collection left over from an earlier run.
# A failed delete (presumably because the collection does not exist yet) is
# reported and skipped rather than silently ignored. Catching `Exception`
# instead of using a bare `except` lets KeyboardInterrupt and SystemExit
# propagate.
try:
    chroma_client.delete_collection(name=collection_name)
except Exception as exc:
    print(f"No existing collection deleted ({type(exc).__name__}: {exc})")
else:
    print(f"Deleted existing collection: {collection_name}")

# Create new collection
collection = chroma_client.create_collection(
    name=collection_name,
    metadata={"description": "Complaint narratives embeddings for RAG chatbot"}
)

print(f"✅ Created collection: {collection_name}")

In [None]:
# Build the three parallel lists (ids, metadatas, documents) that are indexed.
print("Preparing data for vector store...")

# One positional id per chunk: chunk_0, chunk_1, ...
ids = [f"chunk_{position}" for position, _ in enumerate(chunks_data)]

# Metadata stored next to each embedding. Every value is converted to a
# string, including chunk_index and total_chunks.
metadata_fields = (
    'complaint_id',
    'product',
    'issue',
    'sub_issue',
    'company',
    'state',
    'date_received',
    'chunk_index',
    'total_chunks',
)
metadatas = [
    {field: str(record[field]) for field in metadata_fields}
    for record in chunks_data
]

# Documents (the actual text) reuse the list that was embedded.
documents = chunk_texts

print(f"Prepared {len(ids):,} items for indexing")

In [None]:
# Write ids, embeddings, documents and metadata to the collection in slices.
print("Adding embeddings to ChromaDB...")

batch_size = 1000
item_total = len(ids)
for start in tqdm(range(0, item_total, batch_size), desc="Indexing batches"):
    # The same window is applied to all four parallel sequences.
    window = slice(start, min(start + batch_size, item_total))

    collection.add(
        ids=ids[window],
        embeddings=embeddings[window].tolist(),
        documents=documents[window],
        metadatas=metadatas[window],
    )

print(f"\n✅ Successfully indexed {item_total:,} chunks in ChromaDB")

In [None]:
# Report what the collection now contains.
print("\nCollection Statistics:")
stored_name = collection.name
print(f"  Name: {stored_name}")
stored_count = collection.count()
print(f"  Count: {stored_count:,}")
stored_metadata = collection.metadata
print(f"  Metadata: {stored_metadata}")

## 7. Test the Vector Store

In [None]:
# Unfiltered semantic search: embed a query and fetch the five nearest chunks.
test_query = "Problems with credit card billing and unauthorized charges"

print(f"Test query: {test_query}")
print("\nGenerating query embedding...")

# The query is embedded with the same model as the chunks.
query_embedding = embedding_model.encode(test_query)

# A single query vector is passed, so each result field holds one inner list.
results = collection.query(
    query_embeddings=[query_embedding.tolist()],
    n_results=5,
)

rule = "=" * 80
print("\n" + rule)
print("Top 5 Most Relevant Chunks:")
print(rule)

ranked_hits = zip(
    results['documents'][0],
    results['metadatas'][0],
    results['distances'][0],
)
for rank, (hit_text, hit_meta, hit_distance) in enumerate(ranked_hits, start=1):
    print(f"\nResult {rank}:")
    print(f"  Product: {hit_meta['product']}")
    print(f"  Issue: {hit_meta['issue']}")
    print(f"  Distance: {hit_distance:.4f}")
    print(f"  Text: {hit_text[:200]}...")
    print("-" * 80)

In [None]:
# Test with product filtering
test_query_2 = "Issues with personal loan payments"

print(f"Test query with filter: {test_query_2}")
print("Filter: Product contains 'loan'")

# ChromaDB's metadata `where` filter has no substring operator (`$contains`
# belongs to `where_document`, which searches the chunk text). The product
# names that contain "loan" are therefore resolved first (case-insensitively)
# and matched exactly with `$in`. Names are stringified because the product
# metadata was stored as str().
loan_products = sorted(
    {
        str(product)
        for product in df_sample['Product'].unique()
        if 'loan' in str(product).lower()
    }
)
print(f"Matching products: {loan_products}")

if loan_products:
    query_embedding_2 = embedding_model.encode(test_query_2)

    results_filtered = collection.query(
        query_embeddings=[query_embedding_2.tolist()],
        n_results=3,
        where={"product": {"$in": loan_products}}
    )
else:
    # `$in` needs at least one value; keep the result shape so the loop below
    # simply prints nothing.
    print("No product name contains 'loan'; skipping the filtered query.")
    results_filtered = {'documents': [[]], 'metadatas': [[]]}

print("\n" + "="*80)
print("Top 3 Results (Personal Loan only):")
print("="*80)

for i, (doc, metadata) in enumerate(zip(
    results_filtered['documents'][0],
    results_filtered['metadatas'][0]
), 1):
    print(f"\nResult {i}:")
    print(f"  Product: {metadata['product']}")
    print(f"  Issue: {metadata['issue']}")
    print(f"  Text: {doc[:200]}...")
    print("-" * 80)

## 8. Save Metadata and Configuration

In [None]:
# Record the settings and run statistics of this vector store build.
config = dict(
    embedding_model='sentence-transformers/all-MiniLM-L6-v2',
    embedding_dimension=384,
    chunk_size=500,
    chunk_overlap=50,
    sample_size=len(df_sample),
    total_chunks=len(chunks_data),
    collection_name=collection_name,
    vector_store_path=str(vector_store_path),
    products=df_sample['Product'].unique().tolist(),
)

# Persist the dictionary as a pickle file inside the vector store directory.
config_path = vector_store_path / 'config.pkl'
with config_path.open('wb') as config_file:
    pickle.dump(config, config_file)

print("Configuration saved:")
for setting in config:
    print(f"  {setting}: {config[setting]}")

print(f"\n✅ Configuration saved to: {config_path}")

## 9. Summary

### Task 2 Completed Successfully! ✅

**Sampling Strategy:**
- Created a stratified sample of ~12,000 complaints
- Ensured proportional representation across all product categories
- Maintained the original distribution of products

**Chunking Approach:**
- Used `RecursiveCharacterTextSplitter` from LangChain
- Chunk size: 500 characters (balances context and embedding quality)
- Chunk overlap: 50 characters (maintains continuity)
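- Generated chunks for every non-empty complaint narrative in the sample; the exact chunk and complaint counts are printed by the chunking cell in Section 3 and saved as `total_chunks` and `sample_size` in the configuration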

**Embedding Model:**
- Model: `sentence-transformers/all-MiniLM-L6-v2`
- Dimension: 384
- Lightweight (80MB) and efficient
- Optimized for semantic similarity tasks

**Vector Store:**
- Built using ChromaDB
- Persistent storage for reuse
- Metadata includes: complaint_id, product, issue, sub_issue, company, state, date_received, chunk_index, total_chunks
- Supports filtering and semantic search

**Next Steps:**
- Proceed to Task 3: Build RAG Core Logic
- Use the pre-built full-scale vector store for production
- Implement retrieval and generation pipeline