In [1]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
from pathlib import Path

# Input / output locations, all relative to the notebook's working directory.
RAW_DATA_PATH = Path("../data/raw/complaints.csv")  # Adjust if needed
PROCESSED_PATH = Path("../data/processed")
FIGURES_PATH = PROCESSED_PATH / "figures"

# Make sure both output directories exist before any cell tries to write to them.
for out_dir in (PROCESSED_PATH, FIGURES_PATH):
    out_dir.mkdir(parents=True, exist_ok=True)

# Use seaborn's white-grid theme for every figure in this notebook.
sns.set_theme(style="whitegrid")

ModuleNotFoundError: No module named 'seaborn'

In [1]:
# Read the raw complaints CSV in one pass (this can take a minute on the full file).
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk; "Date received" is parsed into datetimes on load.
df = pd.read_csv(
    RAW_DATA_PATH,
    parse_dates=["Date received"],
    low_memory=False,
)

print(f"Full dataset shape: {df.shape}")
df.head()

NameError: name 'pd' is not defined

In [None]:
# Key columns used throughout the rest of the notebook
narrative_col = "Consumer complaint narrative"
product_col = "Product"

# Distribution of complaints across Products (value_counts sorts by frequency, descending)
product_counts = df[product_col].value_counts()

plt.figure(figsize=(12, 6))
sns.barplot(x=product_counts.index, y=product_counts.values, color="#2b8cbe")
plt.title("Complaints by Product (Full Dataset)")
plt.xlabel("Product")
plt.ylabel("Number of complaints")
plt.xticks(rotation=90)  # product names are long; rotate so they do not overlap
plt.tight_layout()
# Save before plt.show(), which may clear the current figure depending on the backend.
plt.savefig(FIGURES_PATH / "complaints_by_product.png", dpi=200)
plt.show()

print(product_counts.head(10))

In [None]:
# Complaints with/without narratives.
# A narrative counts as present only if it is non-null AND not just whitespace.
df["has_narrative"] = df[narrative_col].notna() & (df[narrative_col].str.strip() != "")
print(f"Total complaints: {len(df)}")
print(f"With narrative: {df['has_narrative'].sum()}")
print(f"Without narrative: {len(df) - df['has_narrative'].sum()}")

# Narrative length (whitespace-delimited word count); missing narratives count as 0 words.
df["narrative_word_count"] = df[narrative_col].fillna("").apply(lambda x: len(x.split()))

plt.figure(figsize=(10, 5))
sns.histplot(df[df["has_narrative"]]["narrative_word_count"], bins=100, kde=True, color="#1b9e77")
plt.title("Distribution of Narrative Word Counts")
plt.xlabel("Word Count")
plt.xlim(0, 1000)  # Zoom in on main distribution
plt.tight_layout()
plt.savefig(FIGURES_PATH / "narrative_word_counts.png", dpi=200)
plt.show()

# Describe only complaints that actually have a narrative. Including the
# zero-length rows for missing narratives would drag every statistic toward 0
# and disagree with the histogram above, which is also restricted to narrative rows.
print(df.loc[df["has_narrative"], "narrative_word_count"].describe())

In [None]:
# Product categories kept for the downstream analysis (matched exactly against
# the values of the product column).
relevant_products = [
    "Credit card or prepaid card",
    "Payday loan, title loan, or personal loan",
    "Checking or savings account",
    "Money transfer, virtual currency, or money service"
]

# Keep a row only if its product is in scope and it carries a non-empty narrative.
# .copy() detaches the result from df so later column assignments are safe.
df_filtered = df.loc[
    df[product_col].isin(relevant_products) & df["has_narrative"]
].copy()

# Optional refinement: uncomment to keep only savings for that combined category
# df_filtered = df_filtered[~((df_filtered[product_col] == 'Checking or savings account') & (df_filtered['Sub-product'] == 'Checking account'))]

print(f"Filtered dataset shape: {df_filtered.shape}")
print(df_filtered[product_col].value_counts())

In [None]:
# Text cleaning
def clean_narrative(text):
    """Normalize a raw complaint narrative for downstream text processing.

    Steps, in order: lowercase; collapse whitespace; strip runs of two or more
    "x" characters (redaction placeholders such as "XXXX"); strip runs of five
    or more digits; remove a small set of boilerplate opening phrases; collapse
    whitespace again and trim.

    Args:
        text: The raw narrative string, or a missing value (None / NaN).

    Returns:
        The cleaned, lowercased string. Missing input yields "".
    """
    if pd.isna(text):
        return ""
    text = text.lower()
    # Collapse whitespace up front so the boilerplate phrases below still match
    # when the raw text contains double spaces or line breaks inside them.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r' x{2,}', ' ', text)  # Remove XXXX redactions with spacing
    text = re.sub(r'x{2,}', '', text)  # Remove any remaining redaction runs
    text = re.sub(r'\d{5,}', '', text)  # Remove long number strings
    boilerplates = [
        "i am filing this complaint because",
        "i am writing to complain about"
    ]
    for phrase in boilerplates:
        text = text.replace(phrase, "")
    # The removals above can leave doubled or leading/trailing spaces behind.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Clean every narrative and store the result alongside the raw text.
df_filtered["clean_narrative"] = df_filtered[narrative_col].map(clean_narrative)

# Discard rows whose cleaned narrative has fewer than 10 whitespace-delimited words.
df_filtered = df_filtered.loc[
    df_filtered["clean_narrative"].str.split().str.len() >= 10
]

print(f"After cleaning/length filter: {df_filtered.shape}")
df_filtered[product_col].value_counts()

In [None]:
# Persist the filtered, cleaned complaints (row index omitted from the file).
output_path = PROCESSED_PATH / "filtered_complaints.csv"
df_filtered.to_csv(output_path, index=False)
print(f"Saved to {output_path}")

# Headline numbers for the write-up, gathered in one place.
summary = dict(
    full_shape=df.shape,
    # Share of all complaints that carry a non-empty narrative, in percent.
    with_narrative_pct=round(100 * df["has_narrative"].mean(), 2),
    # Median word count among complaints that have a narrative.
    median_words=int(df.loc[df["has_narrative"], "narrative_word_count"].median()),
    filtered_shape=df_filtered.shape,
    filtered_counts=df_filtered[product_col].value_counts().to_dict(),
)
summary

## Reporting Notes
- Insert the numeric values from the last cell into your write-up (e.g., total complaints, % with narratives, median words).
- Include the saved plots from `../data/processed/figures` (relative to this notebook): `complaints_by_product.png` and `narrative_word_counts.png`.
- Mention filtered counts per product to flag any class imbalance for downstream modeling.