In [1]:
import pandas as pd

df = pd.read_csv('../data/processed/filtered_complaints.csv')
print(df.head())

  Date received                      Product  \
0    2025-06-13                  Credit card   
1    2025-06-13  Checking or savings account   
2    2025-06-12                  Credit card   
3    2025-06-12                  Credit card   
4    2025-06-09                  Credit card   

                                  Sub-product  \
0                           Store credit card   
1                            Checking account   
2  General-purpose credit card or charge card   
3  General-purpose credit card or charge card   
4  General-purpose credit card or charge card   

                                             Issue  \
0                            Getting a credit card   
1                              Managing an account   
2               Other features, terms, or problems   
3             Incorrect information on your report   
4  Problem with a purchase shown on your statement   

                                           Sub-issue  \
0        Card opened without my con

In [2]:
# Word-based chunker
def word_chunker(text, chunk_size=200, chunk_overlap=50):
    words = text.split()
    chunks = []
    i = 0
    while i < len(words):
        chunk = words[i:i+chunk_size]
        chunks.append(' '.join(chunk))
        i += chunk_size - chunk_overlap
    return chunks

# Recursive character splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

def recursive_chunker(text, chunk_size=600, chunk_overlap=100):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    return splitter.split_text(text)


In [3]:
word_chunks = []
word_metadata = []

recursive_chunks = []
recursive_metadata = []

for idx, row in df.iterrows():
    text = row['cleaned_narrative']
    product = row['Product']

    # Word-based
    w_chunks = word_chunker(text)
    for chunk in w_chunks:
        word_chunks.append(chunk)
        word_metadata.append({'product': product, 'source_row': idx})

    # Recursive
    r_chunks = recursive_chunker(text)
    for chunk in r_chunks:
        recursive_chunks.append(chunk)
        recursive_metadata.append({'product': product, 'source_row': idx})

print(f"Word-based total chunks: {len(word_chunks)}")
print(f"Recursive total chunks: {len(recursive_chunks)}")

Word-based total chunks: 626628
Recursive total chunks: 946278


In [4]:
# Choose a manageable sample size
sample_size = 50000  # adjust if needed

# Word-based sample
word_chunks_sample = word_chunks[:sample_size]
word_metadata_sample = word_metadata[:sample_size]

# Recursive sample
recursive_chunks_sample = recursive_chunks[:sample_size]
recursive_metadata_sample = recursive_metadata[:sample_size]

print(f"Word-based sample: {len(word_chunks_sample)} chunks")
print(f"Recursive sample: {len(recursive_chunks_sample)} chunks")

Word-based sample: 50000 chunks
Recursive sample: 50000 chunks


In [5]:
from sentence_transformers import SentenceTransformer
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

model = SentenceTransformer('all-MiniLM-L6-v2', device=device)


  from .autonotebook import tqdm as notebook_tqdm


Using device: cpu


In [8]:
import faiss
import numpy as np
import pickle

# Embed word-based sample
word_embeddings = model.encode(
    word_chunks_sample,
    batch_size=128,
    show_progress_bar=True
)
word_embeddings = np.array(word_embeddings).astype('float32')

# Build FAISS index
word_index = faiss.IndexFlatL2(word_embeddings.shape[1])
word_index.add(word_embeddings)

# Save index
faiss.write_index(word_index, '../data/processed/faiss_word_sample.index')

# Save metadata
with open('../data/processed/word_metadata_sample.pkl', 'wb') as f:
    pickle.dump(word_metadata_sample, f)

print(f"✅ Word-based sample index and metadata saved.")


Batches: 100%|██████████| 391/391 [36:36<00:00,  5.62s/it]


✅ Word-based sample index and metadata saved.


In [6]:
import faiss
import numpy as np
import pickle

# Embed recursive sample
recursive_embeddings = model.encode(
    recursive_chunks_sample,
    batch_size=128,
    show_progress_bar=True
)
recursive_embeddings = np.array(recursive_embeddings).astype('float32')

# Build FAISS index
recursive_index = faiss.IndexFlatL2(recursive_embeddings.shape[1])
recursive_index.add(recursive_embeddings)

# Save index
faiss.write_index(recursive_index, '../data/processed/faiss_recursive_sample.index')

# Save metadata
with open('../data/processed/recursive_metadata_sample.pkl', 'wb') as f:
    pickle.dump(recursive_metadata_sample, f)

print(f"✅ Recursive sample index and metadata saved.")


Batches: 100%|██████████| 391/391 [27:49<00:00,  4.27s/it]


✅ Recursive sample index and metadata saved.


In [7]:
import faiss

# Example test query
query = "Why are people unhappy with credit card charges?"
query_embedding = model.encode([query]).astype('float32')

word_index = faiss.read_index('../data/processed/faiss_word_sample.index')

# Word-based search
D_word, I_word = word_index.search(query_embedding, k=5)
print("\n🔍 Word-based top 5:")
for i in I_word[0]:
    print("-", word_chunks_sample[i][:200], "...")

recursive_index = faiss.read_index('../data/processed/faiss_recursive_sample.index')

# Recursive search
D_rec, I_rec = recursive_index.search(query_embedding, k=5)
print("\n🔍 Recursive top 5:")
for i in I_rec[0]:
    print("-", recursive_chunks_sample[i][:200], "...")



🔍 Word-based top 5:
- card on xxxxyear and told the person this charge was excessive and they refused to offer credit to my account i was refused credit or any consideration by xxxx different peopleas a xxxx xxxx xxxx xxxx ...
- by the website itself i am still being charged interest on a card i simply can not pay the balance of again i repeat this is the case of many people these are unjust and illegal practices by comenity  ...
- actions when it comes to using a credit card ...
- i have a couple of store credit cards and noticed they are now charging a fee for paper statements i feel this is something the cfpb should address as it is taking advantage of the consumer digital st ...
- a productive call and hung up on me it is predatory to add on fees before a payment is made and to not have that included in the payment when opting to pay off the entire card and it is predatory to o ...

🔍 Recursive top 5:
- it makes it confusing so that they can assess these insane rates higher than a

In [None]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# --------------------------------------------
# ✅ 1️⃣ Imports & setup
# --------------------------------------------

import faiss
import pickle
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Load device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# Load embedder
embedder = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# Load your FAISS index & metadata — pick which one
INDEX_PATH = '../data/processed/faiss_recursive_sample.index'
META_PATH = '../data/processed/recursive_metadata_sample.pkl'
CHUNK_SOURCE = 'recursive'  # or 'word'

# Load index
index = faiss.read_index(INDEX_PATH)

# Load metadata
with open(META_PATH, 'rb') as f:
    metadata = pickle.load(f)

# Load the same chunks again for output — optional if needed
# If you have them saved in a file, load here
# For now, store them in your session:
from pathlib import Path

# If needed: you can store chunks in metadata too
# For this template we’ll just read from metadata

# Load small LLM
llm = pipeline(
    "text-generation",
    model="mistralai/Mistral-7B-Instruct-v0.2",
    device=0 if device == 'cuda' else -1,
    max_new_tokens=300
)

print("✅ LLM pipeline ready")

# --------------------------------------------
# ✅ 2️⃣ Retrieval function
# --------------------------------------------

def retrieve_top_k(query, k=5):
    query_emb = embedder.encode([query]).astype('float32')
    D, I = index.search(query_emb, k)
    results = []
    for idx in I[0]:
        
        result = {
            "chunk": recursive_chunks_sample[idx],  # Store chunk in metadata!
            "metadata": metadata[idx]
        }
        results.append(result)
    return results

# --------------------------------------------
# ✅ 3️⃣ Prompt generator
# --------------------------------------------

def generate_answer(query, top_chunks):
    context = "\n\n".join([c['chunk'] for c in top_chunks])
    prompt = f"""
You are an internal AI assistant for CrediTrust.
Use ONLY the context below to answer the question.
If the context does not answer it, say "I don't have enough information."

Context:
{context}

Question:
{query}

Answer:
"""
    output = llm(prompt)
    return output[0]['generated_text']

# --------------------------------------------
# ✅ 4️⃣ End-to-end example
# --------------------------------------------

question = "Why are people unhappy with credit card charges?"
retrieved = retrieve_top_k(question, k=5)

print("\n🔍 Top retrieved chunks:")
for r in retrieved:
    print("-", r['chunk'][:200], "...")

answer = generate_answer(question, retrieved)

print("\n✅ Final generated answer:")
print(answer)

Using device: cpu


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2.
401 Client Error. (Request ID: Root=1-686d6a1e-631c0b242f21bbfa5ad469e5;d1614da0-5508-4445-a51b-098bd64c1660)

Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/config.json.
Access to model mistralai/Mistral-7B-Instruct-v0.2 is restricted. You must have access to it and be authenticated to access it. Please log in.