Importing dependencies

In [1]:
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

embedding_function = SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

client = chromadb.PersistentClient(path="vector_store/chroma_db")

collection = client.get_collection(name="complaints_sample")

print(f"Loaded sample collection with {collection.count():,} vectors")

  from .autonotebook import tqdm as notebook_tqdm


Loaded sample collection with 13,000 vectors


In [2]:
def retrieve(question, k=5, product_filter=None):
    where = {"product_category": product_filter} if product_filter else None
    
    results = collection.query(
        query_texts=[question],
        n_results=k,
        include=["documents", "metadatas", "distances", "embeddings"]
    )
    
    context_chunks = []
    sources = []
    
    for doc, meta, dist in zip(results['documents'][0], results['metadatas'][0], results['distances'][0]):
        context_chunks.append(doc)
        sources.append(f"Complaint {meta['complaint_id']} ({meta['product_category']}) - Chunk {meta['chunk_index']+1}/{meta['total_chunks']}")
    
    context = "\n\n".join(context_chunks)
    
    print(f"Retrieved {k} chunks (average distance: {sum(results['distances'][0])/k:.4f})\n")
    for s in sources:
        print(f"- {s}")
    
    return context, sources

# Test
context, sources = retrieve("Why are customers unhappy with credit card late fees?", k=6, product_filter="Credit Cards")

Retrieved 6 chunks (average distance: 0.2964)

- Complaint 13085699 (Credit Cards) - Chunk 3/3
- Complaint 11699927 (Credit Cards) - Chunk 2/2
- Complaint 11348756 (Credit Cards) - Chunk 10/13
- Complaint 12303728 (Credit Cards) - Chunk 10/10
- Complaint 11348756 (Credit Cards) - Chunk 2/13
- Complaint 12380325 (Credit Cards) - Chunk 2/2


In [3]:
from langchain_huggingface import HuggingFacePipeline
from transformers import pipeline

print("Loading tiny GPT-2 – instant (seconds)...")

llm = HuggingFacePipeline.from_model_id(
    model_id="gpt2",
    task="text-generation",
    pipeline_kwargs={
        "max_new_tokens": 400,
        "temperature": 0.1,
        "repetition_penalty": 1.1,
        "do_sample": True,
    },
    device=-1  # Force CPU
)

print("GPT-2 loaded successfully! Ready for testing.")

Loading tiny GPT-2 – instant (seconds)...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cpu


GPT-2 loaded successfully! Ready for testing.


Setup chromaDB Collection for Full Data

In [4]:
from langchain_core.prompts import PromptTemplate

prompt_template = PromptTemplate.from_template(
    """You are a financial analyst assistant for CrediTrust Financial. Your task is to answer questions about customer complaints using ONLY the provided context. 

Be concise, insightful, and evidence-based. Cite specific examples from the context. If the context does not contain enough information to answer the question, say: "I don't have enough information from the complaints to answer this."

Context:
{context}

Question: {question}

Answer:"""
)

In [9]:
def rag_answer(question, k=6, product_filter=None):
    context, sources = retrieve(question, k=k, product_filter=product_filter)
    
    prompt = prompt_template.format(context=context, question=question)
    
    # Generate answer with GPT-2
    response = llm.invoke(prompt)
    
    # Clean the response a bit
    answer = response.strip()
    
    print("\n" + "="*80)
    print("QUESTION:", question)
    print("="*80)
    print("ANSWER:\n", answer)
    print("="*80)
    print("SOURCES:")
    for i, s in enumerate(sources, 1):
        print(f"{i}. {s}")
    print("="*80)
    
    return answer, sources

Batch Load and Add(Memory-Safe)

In [10]:
rag_answer(
    question="Why are customers unhappy with credit card late fees?",
    k=6,
    product_filter="Credit Cards"
)

Retrieved 6 chunks (average distance: 0.2964)

- Complaint 13085699 (Credit Cards) - Chunk 3/3
- Complaint 11699927 (Credit Cards) - Chunk 2/2
- Complaint 11348756 (Credit Cards) - Chunk 10/13
- Complaint 12303728 (Credit Cards) - Chunk 10/10
- Complaint 11348756 (Credit Cards) - Chunk 2/13
- Complaint 12380325 (Credit Cards) - Chunk 2/2

QUESTION: Why are customers unhappy with credit card late fees?
ANSWER:
 You are a financial analyst assistant for CrediTrust Financial. Your task is to answer questions about customer complaints using ONLY the provided context. 

Be concise, insightful, and evidence-based. Cite specific examples from the context. If the context does not contain enough information to answer the question, say: "I don't have enough information from the complaints to answer this."

Context:
is a deceptive practice. it appears that the sole purpose of it is to charge more late fees wrongfully and illicitly. as the consumer financial protection bureau ( cfpb ), it is exact

('You are a financial analyst assistant for CrediTrust Financial. Your task is to answer questions about customer complaints using ONLY the provided context. \n\nBe concise, insightful, and evidence-based. Cite specific examples from the context. If the context does not contain enough information to answer the question, say: "I don\'t have enough information from the complaints to answer this."\n\nContext:\nis a deceptive practice. it appears that the sole purpose of it is to charge more late fees wrongfully and illicitly. as the consumer financial protection bureau ( cfpb ), it is exactly within your purview to stop this kind of deceptive practices.\n\nrefused to help get the late fees waved. i pay my credit cards in full every month, this was a comenity bank error and i should not have to pay the late fees. this was all over a {$19.00} purchase. now they want me to pay {$80.00} in late fees.\n\nprovisions enforced by the cfpb, and stated to that, according to the card act, creditors 

In [None]:
import os
os.environ['ARROW_PRE_0_15_IPC_FORMAT'] = '1'  # Allows legacy extension types
os.environ['PYARROW_IGNORE_TIMEZONE'] = '1'  # Optional, avoids timezone warnings

print("Legacy Arrow IPC format enabled")

Legacy Arrow IPC format enabled


Inspect the Parquet File

In [None]:
import pandas as pd

# Adjust the path if your file is in a different location
parquet_path = '../data/prebuilt/complaint_embeddings-001.parquet'  # Change if needed, e.g., 'data/raw/...' or just the filename

# Read only the first 10 rows to inspect structure (safe for memory)
df_preview = pd.read_parquet(parquet_path, engine='pyarrow')

print("Parquet file loaded successfully for preview")
print(f"Shape: {df_preview.shape}")  # (rows, columns)
print("\nColumns:")
print(df_preview.columns.tolist())

print("\nData types:")
print(df_preview.dtypes)

print("\nFirst 2 rows sample:")
print(df_preview.head(2))

# Show one full example of key columns
print("\nExample of first row key fields:")
print("Text chunk:", df_preview.iloc[0]['text'][:500] + "..." if 'text' in df_preview.columns else "No 'text' column")
if 'embedding' in df_preview.columns:
    print("Embedding shape:", len(df_preview.iloc[0]['embedding']))
if 'metadata' in df_preview.columns:
    print("Metadata:", df_preview.iloc[0]['metadata'])

ArrowMemoryError: realloc of size 251658240 failed