In [1]:
# --- TASK 2: STEP 1 - Stratified Sampling ---

import pandas as pd
import os

# Load the cleaned dataset from Task 1
df = pd.read_csv('../data/filtered_complaints.csv')

def create_stratified_sample(df, target_total=15000):
    # Identify our categories
    categories = df['Category'].unique()
    print(f"Categories found: {categories}")
    
    # We will aim for a balanced distribution where possible
    # but since some categories are very small, we take all of them.
    
    small_cats = ['Money Transfer', 'Personal Loan']
    df_small = df[df['Category'].isin(small_cats)]
    
    # Calculate how many slots are left for the big categories
    remaining_slots = target_total - len(df_small)
    large_cats = [c for c in categories if c not in small_cats]
    slots_per_large_cat = remaining_slots // len(large_cats)
    
    # Sample the large categories
    sampled_list = [df_small]
    for cat in large_cats:
        cat_df = df[df['Category'] == cat]
        sampled_list.append(cat_df.sample(n=slots_per_large_cat, random_state=42))
    
    # Combine and shuffle
    df_final = pd.concat(sampled_list).sample(frac=1, random_state=42).reset_index(drop=True)
    return df_final

# Execute sampling
df_sample = create_stratified_sample(df)

# Show results for verification
print("\n--- Stratified Sample Distribution ---")
print(df_sample['Category'].value_counts())
print(f"\nTotal Sample Size: {len(df_sample)}")

# Create the processed directory if it doesn't exist
os.makedirs('../data/processed', exist_ok=True)

# Save the sample
df_sample.to_csv('../data/processed/sampled_complaints.csv', index=False)
print("\nSample saved to data/processed/sampled_complaints.csv")

Categories found: ['Credit Card' 'Savings Account' 'Money Transfer' 'Personal Loan']

--- Stratified Sample Distribution ---
Category
Credit Card        7036
Savings Account    7036
Personal Loan       811
Money Transfer      116
Name: count, dtype: int64

Total Sample Size: 14999

Sample saved to data/processed/sampled_complaints.csv


In [4]:
# --- TASK 2: STEP 2 - Text Chunking ---

# Updated Import for current LangChain versions
from langchain_text_splitters import RecursiveCharacterTextSplitter
import pandas as pd

# 1. Initialize the splitter
# chunk_size: ~100 words (600 characters)
# chunk_overlap: to keep context across splits
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=600,
    chunk_overlap=50,
    length_function=len,
    separators=["\n\n", "\n", ".", " ", ""]
)

# 2. Function to process our dataframe into chunks
def chunk_data(df):
    chunked_records = []
    
    for idx, row in df.iterrows():
        # Split the cleaned narrative
        chunks = text_splitter.split_text(row['cleaned_narrative'])
        
        for i, chunk in enumerate(chunks):
            chunked_records.append({
                'chunk_id': f"{idx}_{i}",       # Unique ID for each chunk
                'complaint_id': idx,            # Trace back to original row index
                'category': row['Category'],    # Metadata: Product category
                'text_chunk': chunk             # The actual text to embed
            })
            
    return pd.DataFrame(chunked_records)

# 3. Create the chunked dataframe
print(f"Original records: {len(df_sample)}")
df_chunks = chunk_data(df_sample)

print(f"Total chunks created: {len(df_chunks)}")
print("\n--- Chunk Preview ---")
print(df_chunks[['category', 'text_chunk']].head())

Original records: 14999
Total chunks created: 16734

--- Chunk Preview ---
        category                                         text_chunk
0    Credit Card  my information was heisted and this accounts a...
1    Credit Card                            i signed up for an card
2    Credit Card  there were a multitude of fraudulent charges m...
3  Personal Loan  on statementno date on when maild payment due ...
4    Credit Card  see the attached documents i want the bureau t...


In [5]:
# --- TASK 2: STEP 3 - Loading the Embedding Model ---

from sentence_transformers import SentenceTransformer

# Load the model (this will download it on the first run, ~22MB)
print("Downloading and loading the 'all-MiniLM-L6-v2' model...")
model = SentenceTransformer('all-MiniLM-L6-v2')

# Test with a single chunk to see the vector
test_text = df_chunks['text_chunk'].iloc[0]
test_vector = model.encode(test_text)

print("\n--- Model Loaded Successfully ---")
print(f"Model Name: all-MiniLM-L6-v2")
print(f"Vector Dimensions: {len(test_vector)}")
print(f"Sample Vector (first 5 values): {test_vector[:5]}")

Downloading and loading the 'all-MiniLM-L6-v2' model...

--- Model Loaded Successfully ---
Model Name: all-MiniLM-L6-v2
Vector Dimensions: 384
Sample Vector (first 5 values): [-0.00969655  0.03044972 -0.10841177 -0.01133379  0.01064291]


In [7]:
# --- TASK 2: STEP 4 - Vector Store Indexing (Batch Fixed) ---

import chromadb

# 1. Initialize the Persistent Client
client = chromadb.PersistentClient(path="../vector_store")
collection = client.get_or_create_collection(name="bank_complaints")

# 2. Define the batch size based on your error message
BATCH_SIZE = 5000 
total_chunks = len(df_chunks)

print(f"Adding {total_chunks} chunks in batches of {BATCH_SIZE}...")

# 3. Loop through the data in batches
for i in range(0, total_chunks, BATCH_SIZE):
    # Slice the data for the current batch
    batch_df = df_chunks.iloc[i : i + BATCH_SIZE]
    
    # Prepare batch data
    batch_ids = batch_df['chunk_id'].astype(str).tolist()
    batch_docs = batch_df['text_chunk'].tolist()
    batch_metadatas = batch_df[['complaint_id', 'category']].to_dict(orient='records')
    
    # Generate embeddings for the current batch
    print(f"Encoding and adding batch {i//BATCH_SIZE + 1}...")
    batch_embeddings = model.encode(batch_docs, show_progress_bar=False).tolist()
    
    # Add the batch to the collection
    collection.add(
        ids=batch_ids,
        embeddings=batch_embeddings,
        metadatas=batch_metadatas,
        documents=batch_docs
    )

print(f"\nSuccessfully stored {collection.count()} vectors in 'vector_store/'.")

Adding 16734 chunks in batches of 5000...
Encoding and adding batch 1...
Encoding and adding batch 2...
Encoding and adding batch 3...
Encoding and adding batch 4...

Successfully stored 16734 vectors in 'vector_store/'.
