# Text Chunking, Embedding, and Vector Store Indexing

Importing Libraries

In [6]:
import os
import warnings
from pathlib import Path

# Silence FutureWarnings before the heavy third-party imports below.
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
from sklearn.model_selection import train_test_split

In [7]:
# Location of the preprocessed complaints CSV. Set the COMPLAINTS_CSV_PATH
# environment variable to run this notebook on another machine; the fallback is
# the original local path, kept so existing setups continue to work unchanged.
DATA_PATH = Path(
    os.environ.get(
        "COMPLAINTS_CSV_PATH",
        "/Users/elbethelzewdie/Downloads/rag-complaint-chatbot/rag-complaint-chatbot/data/preprocessed/filtered_complaints.csv",
    )
).expanduser()

df = pd.read_csv(DATA_PATH)
# Print column names, non-null counts and dtypes as a quick schema check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 479473 entries, 0 to 479472
Data columns (total 20 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   Date received                 479473 non-null  object
 1   Product                       479473 non-null  object
 2   Sub-product                   458854 non-null  object
 3   Issue                         479473 non-null  object
 4   Sub-issue                     315644 non-null  object
 5   Consumer complaint narrative  479473 non-null  object
 6   Company public response       194495 non-null  object
 7   Company                       479473 non-null  object
 8   State                         473927 non-null  object
 9   ZIP code                      479473 non-null  object
 10  Tags                          83801 non-null   object
 11  Consumer consent provided?    479473 non-null  object
 12  Submitted via                 479473 non-null  object
 13 

In [4]:
# Preview the first five rows of the loaded complaints.
df.head(n=5)

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID,narrative_word_count,Product_category
0,2025-06-13,Credit card,Store credit card,Getting a credit card,Card opened without my consent or knowledge,card opened me fraudster received notice accou...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",TX,78230,Servicemember,Consent provided,Web,2025-06-13,Closed with non-monetary relief,Yes,,14069121,91,Credit card
1,2025-06-13,Checking or savings account,Checking account,Managing an account,Deposits and withdrawals,made mistake using wellsfargo debit card depsi...,Company has responded to the consumer and the ...,WELLS FARGO & COMPANY,ID,83815,,Consent provided,Web,2025-06-13,Closed with explanation,Yes,,14061897,109,Savings account
2,2025-06-12,Credit card,General-purpose credit card or charge card,"Other features, terms, or problems",Other problem,dear cfpb secured credit card citibank changed...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",NY,11220,,Consent provided,Web,2025-06-13,Closed with monetary relief,Yes,,14047085,156,Credit card
3,2025-06-12,Credit card,General-purpose credit card or charge card,Incorrect information on your report,Account information incorrect,citi reward card credit balance issued 8400 00...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",IL,60067,,Consent provided,Web,2025-06-12,Closed with explanation,Yes,,14040217,233,Credit card
4,2025-06-09,Credit card,General-purpose credit card or charge card,Problem with a purchase shown on your statement,Credit card company isn't resolving a dispute ...,b writing dispute following charge citi credit...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",TX,78413,Older American,Consent provided,Web,2025-06-09,Closed with monetary relief,Yes,,13968411,454,Credit card


In [8]:
# Target size of the working subset (anything in the 10k-15k range is acceptable).
sample_size = 15000

# Column whose class proportions the sample must preserve.
stratify_col = "Product_category"

# train_test_split doubles as a stratified sampler here: the "train" partition
# is the sample we keep, and the remaining rows are discarded.
strat_sample, _ = train_test_split(
    df,
    train_size=sample_size,
    random_state=42,
    stratify=df[stratify_col],
)

# Show the per-category share within the sample.
print("Stratified sample distribution:")
sample_distribution = strat_sample[stratify_col].value_counts(normalize=True)
print(sample_distribution.round(3))


Stratified sample distribution:
Product_category
Credit card        0.411
Savings account    0.324
Money transfers    0.206
Personal loan      0.059
Name: proportion, dtype: float64


I applied proportional stratified sampling on the product category (`Product_category`) to create a subset of 15,000 complaints, using a fixed random seed so the draw is reproducible. This approach preserves the original class distribution observed in the cleaned dataset, ensuring that the embedding space reflects real-world complaint frequencies while maintaining representation from all product categories.

In [13]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

def run_chunking_experiment(df, chunk_size, chunk_overlap):
    """Split every complaint narrative and summarise the resulting chunks.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "Consumer complaint narrative" column. Values that are
        not ``str`` (e.g. NaN) contribute no chunks.
    chunk_size : int
        Passed to ``RecursiveCharacterTextSplitter`` as ``chunk_size``.
    chunk_overlap : int
        Passed to ``RecursiveCharacterTextSplitter`` as ``chunk_overlap``.

    Returns
    -------
    dict
        ``chunk_size`` and ``chunk_overlap`` (echoed back), ``num_chunks``
        (count of non-blank chunks), ``avg_chunks_per_doc``
        (``num_chunks / len(df)``) and ``avg_chunk_length`` (mean character
        length of the counted chunks). The two averages are NaN when their
        denominator is zero.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )

    # Collect only chunk lengths. Exploding the whole frame would copy every
    # column once per chunk, and documents that yield no chunks would come back
    # from ``explode`` as NaN rows that a truthiness filter does not remove
    # (NaN is truthy), inflating the chunk count.
    chunk_lengths = [
        len(chunk)
        for text in df["Consumer complaint narrative"]
        if isinstance(text, str)
        for chunk in splitter.split_text(text)
        if chunk.strip()  # ignore whitespace-only chunks
    ]

    num_chunks = len(chunk_lengths)
    num_docs = len(df)
    undefined = float("nan")

    return {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "num_chunks": num_chunks,
        # Guard the divisions so an empty frame / no chunks does not raise.
        "avg_chunks_per_doc": num_chunks / num_docs if num_docs else undefined,
        "avg_chunk_length": sum(chunk_lengths) / num_chunks if num_chunks else undefined,
    }



In [14]:
# (chunk_size, chunk_overlap) pairs to compare.
configs = [(300, 50), (500, 100), (800, 150)]

# Run the experiment once per configuration on the stratified sample.
experiments = [
    run_chunking_experiment(strat_sample, size, overlap)
    for size, overlap in configs
]

# One row per configuration, for side-by-side comparison.
experiment_df = pd.DataFrame(experiments)
experiment_df


Unnamed: 0,chunk_size,chunk_overlap,num_chunks,avg_chunks_per_doc,avg_chunk_length
0,300,50,46847,3.123133,255.259398
1,500,100,30711,2.0474,391.109244
2,800,150,21955,1.463667,524.781508


In [17]:
# Use the longest narrative in the full dataset as the inspection example.
narratives = df["Consumer complaint narrative"]
longest_idx = narratives.str.len().idxmax()
sample_text = narratives.loc[longest_idx]

print("Original text length:", len(sample_text))
print(sample_text[:1000])  # first 1000 characters only


Original text length: 21754
consumer fincial protection bureau cfpb dc complain ria high importance high without prejudice wish practice right customer ria use organisation service seeking formal impartial investigation amicably settle dispute ria order clear myriad letter correspondence hitherto sent ria respecting complaint believe substantially strengthen case understanding taking deeper look happening case alysing relevant fact objective comprehensive fashion crucial note manipulated socially engineered coerced engage fraudulent crimil much embarrassment recognise victim investment scam complaint cfpb arisen not consider stretch imagition conduct ria commensurate legal role responsibility customer sell service look customer protect money fincial institution maintains traditiol relationship way working customer complaint process ria found communication ineffective hide conduct magement diminishes service offering client struggling adapt business offering ever changing world developm

In [18]:
# Splitter settings keyed by "<chunk_size>_<chunk_overlap>".
chunk_configs = {
    f"{size}_{overlap}": {"chunk_size": size, "chunk_overlap": overlap}
    for size, overlap in [(300, 50), (500, 100), (800, 150)]
}

In [20]:
# Preview limits for the qualitative comparison below.
PREVIEW_CHUNKS = 3   # number of leading chunks printed per configuration
PREVIEW_CHARS = 400  # maximum characters printed from each chunk

# Split the same sample text under every configuration and print the chunk
# count plus the first few chunks.
for name, config in chunk_configs.items():
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=config["chunk_size"],
        chunk_overlap=config["chunk_overlap"],
        length_function=len
    )

    chunks = splitter.split_text(sample_text)

    print(f"\n{'='*60}")
    print(f"Configuration: chunk_size={config['chunk_size']}, "
          f"chunk_overlap={config['chunk_overlap']}")
    print(f"Number of chunks: {len(chunks)}")

    # Show only the first PREVIEW_CHUNKS chunks (enough for comparison)
    for i, chunk in enumerate(chunks[:PREVIEW_CHUNKS]):
        print(f"\n--- Chunk {i+1} (length={len(chunk)}) ---")
        print(chunk[:PREVIEW_CHARS])



Configuration: chunk_size=300, chunk_overlap=50
Number of chunks: 87

--- Chunk 1 (length=299) ---
consumer fincial protection bureau cfpb dc complain ria high importance high without prejudice wish practice right customer ria use organisation service seeking formal impartial investigation amicably settle dispute ria order clear myriad letter correspondence hitherto sent ria respecting complaint

--- Chunk 2 (length=292) ---
hitherto sent ria respecting complaint believe substantially strengthen case understanding taking deeper look happening case alysing relevant fact objective comprehensive fashion crucial note manipulated socially engineered coerced engage fraudulent crimil much embarrassment recognise victim

--- Chunk 3 (length=296) ---
crimil much embarrassment recognise victim investment scam complaint cfpb arisen not consider stretch imagition conduct ria commensurate legal role responsibility customer sell service look customer protect money fincial institution maintains trad