importing dependencies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# For nice display
pd.set_option('display.max_columns', None)
%matplotlib inline

Define the products I want

In [4]:
# The exact product names in the dataset (case-sensitive!)
target_products = [
    'Credit card',                  # Older complaints
    'Credit card or prepaid card',  # Newer category that includes credit cards
    'Checking or savings account',  # Covers savings accounts
    'Bank account or service',      # Older category for savings/checking
    'Money transfer, virtual currency, or money service',
    'Money transfers',              # Older
    'Consumer Loan',                # Includes personal loans
    'Payday loan, title loan, or personal loan'  # Newer personal loans
]

print("Target product categories defined.")

Target product categories defined.


Chunked Loading, Filtering and Basic Stats

In [6]:
# Path to your raw CSV (adjust if needed)
csv_path = '../data/raw/complaints.csv'  # or 'data/raw/complaints_full.csv'

# Columns we actually need (this saves HUGE memory)
usecols = [
    'Product', 
    'Sub-product', 
    'Issue', 
    'Sub-issue', 
    'Consumer complaint narrative', 
    'Company', 
    'State', 
    'Date received',
    'Complaint ID'
]

# For efficient types (optional but helps)
dtype = {
    'Product': 'category',
    'Sub-product': 'category',
    'Issue': 'category',
    'Sub-issue': 'category',
    'Company': 'category',
    'State': 'category',
    'Complaint ID': 'str'
}

chunksize = 100_000  # Adjust down to 50_000 if still memory issues
filtered_chunks = []

print("Starting chunked processing...")

for i, chunk in enumerate(pd.read_csv(csv_path, usecols=usecols, dtype=dtype, chunksize=chunksize, low_memory=False)):
    # Drop rows without narrative early
    chunk = chunk[chunk['Consumer complaint narrative'].notna()]
    chunk = chunk[chunk['Consumer complaint narrative'].str.strip() != '']
    
    # Filter to our target products (broad match)
    mask = chunk['Product'].isin(target_products)
    # Also catch if personal loans or savings are in sub-product
    if 'personal loan' in chunk.columns:
        mask |= chunk['Sub-product'].str.contains('personal loan', case=False, na=False)
    if 'savings' in chunk.columns:
        mask |= chunk['Sub-product'].str.contains('savings', case=False, na=False)
    
    filtered = chunk[mask].copy()
    
    if len(filtered) > 0:
        filtered_chunks.append(filtered)
    
    if (i + 1) % 5 == 0:
        print(f"Processed {(i+1)*chunksize:,} rows | Filtered so far: {sum(len(c) for c in filtered_chunks):,}")

# Combine all filtered chunks
df_filtered = pd.concat(filtered_chunks, ignore_index=True)

print(f"\nFinal filtered dataset shape: {df_filtered.shape}")
df_filtered.head()

Starting chunked processing...
Processed 500,000 rows | Filtered so far: 3,015
Processed 1,000,000 rows | Filtered so far: 19,308
Processed 1,500,000 rows | Filtered so far: 31,825
Processed 2,000,000 rows | Filtered so far: 58,722
Processed 2,500,000 rows | Filtered so far: 82,010
Processed 3,000,000 rows | Filtered so far: 95,532
Processed 3,500,000 rows | Filtered so far: 114,500
Processed 4,000,000 rows | Filtered so far: 140,186
Processed 4,500,000 rows | Filtered so far: 168,256
Processed 5,000,000 rows | Filtered so far: 199,175
Processed 5,500,000 rows | Filtered so far: 231,690
Processed 6,000,000 rows | Filtered so far: 267,155
Processed 6,500,000 rows | Filtered so far: 301,578
Processed 7,000,000 rows | Filtered so far: 337,133
Processed 7,500,000 rows | Filtered so far: 361,508
Processed 8,000,000 rows | Filtered so far: 394,011
Processed 8,500,000 rows | Filtered so far: 418,600
Processed 9,000,000 rows | Filtered so far: 438,340
Processed 9,500,000 rows | Filtered so far

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company,State,Complaint ID
0,2025-06-13,Credit card,Store credit card,Getting a credit card,Card opened without my consent or knowledge,A XXXX XXXX card was opened under my name by a...,"CITIBANK, N.A.",TX,14069121
1,2025-06-13,Checking or savings account,Checking account,Managing an account,Deposits and withdrawals,I made the mistake of using my wellsfargo debi...,WELLS FARGO & COMPANY,ID,14061897
2,2025-06-12,Credit card,General-purpose credit card or charge card,"Other features, terms, or problems",Other problem,"Dear CFPB, I have a secured credit card with c...","CITIBANK, N.A.",NY,14047085
3,2025-06-12,Credit card,General-purpose credit card or charge card,Incorrect information on your report,Account information incorrect,I have a Citi rewards cards. The credit balanc...,"CITIBANK, N.A.",IL,14040217
4,2025-06-09,Credit card,General-purpose credit card or charge card,Problem with a purchase shown on your statement,Credit card company isn't resolving a dispute ...,b'I am writing to dispute the following charge...,"CITIBANK, N.A.",TX,13968411


Load the csv file

In [2]:
df = pd.read_csv('../data/raw/complaints.csv')

print(f"Dataset shape: {df.shape}")
df.head()

df.columns.tolist()

MemoryError: Unable to allocate 256. KiB for an array with shape (32768,) and data type int64