In [2]:
# Tally complaints per product category (value_counts orders by frequency,
# largest first).
product_counts = df["Product"].value_counts()

# Only the 20 largest categories are reported here.
top_product_counts = product_counts.head(20)
print("Top 20 product types in the dataset:")
print(top_product_counts)


Top 20 product types in the dataset:
Product
Credit reporting or other personal consumer reports                             4834855
Credit reporting, credit repair services, or other personal consumer reports    2163857
Debt collection                                                                  799197
Mortgage                                                                         422254
Checking or savings account                                                      291178
Credit card                                                                      226686
Credit card or prepaid card                                                      206369
Money transfer, virtual currency, or money service                               145066
Credit reporting                                                                 140429
Student loan                                                                     109717
Bank account or service                                                    

In [4]:
# Which product categories fall outside the target list, and how big are they?
TARGET_PRODUCTS = [
    "Credit card",
    "Personal loan",
    "Buy Now, Pay Later (BNPL)",
    "Savings account",
    "Money transfers",
]

# Every distinct, non-null product label that occurs in the data.
all_products = set(df["Product"].dropna().unique())
included_products = set(TARGET_PRODUCTS)

# Labels that are not literally in TARGET_PRODUCTS.
# NOTE(review): the comparison is an exact string match, so related labels
# such as "Checking or savings account" land in the excluded set — confirm
# that this is the intended scope of the filter.
excluded_products = all_products.difference(included_products)

# Complaint volume per excluded label, reported for the ten largest.
excluded_mask = df["Product"].isin(excluded_products)
excluded_df = df[excluded_mask]
excluded_counts = excluded_df["Product"].value_counts()
print("Top 10 excluded product types (with complaints):")
print(excluded_counts.head(10))


Top 10 excluded product types (with complaints):
Product
Credit reporting or other personal consumer reports                             4834855
Credit reporting, credit repair services, or other personal consumer reports    2163857
Debt collection                                                                  799197
Mortgage                                                                         422254
Checking or savings account                                                      291178
Credit card or prepaid card                                                      206369
Money transfer, virtual currency, or money service                               145066
Credit reporting                                                                 140429
Student loan                                                                     109717
Bank account or service                                                           86205
Name: count, dtype: int64


In [None]:
# --- Imports: standard library, third-party, then project-local ---
import sys

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Make the project's src/ directory importable. The membership check keeps
# sys.path from accumulating duplicate entries when this cell is re-run in a
# live kernel.
SRC_DIR = "../src"
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# NOTE(review): the recorded run of this cell failed with
# "ImportError: cannot import name 'STRICT_PRODUCTS' from 'preprocessing'".
# Confirm that src/preprocessing.py defines STRICT_PRODUCTS and
# EXPANDED_PRODUCTS; if the module was edited after it was first imported,
# restart the kernel so the updated module is loaded.
from preprocessing import (
    EXPANDED_PRODUCTS,
    STRICT_PRODUCTS,
    load_data,
    preprocess_complaints,
    save_cleaned_data,
)

# Input and output locations, relative to the notebook's working directory.
INPUT_PATH = "../data/complaints.csv"
OUTPUT_PATH = "../data/filtered_complaints.csv"

# Load the full dataset and report its dimensions before any filtering.
df = load_data(INPUT_PATH)
initial_shape = df.shape
print("Initial shape:", initial_shape)

# === Domain-Specific EDA === #

# 1. Complaint volume per product category, 20 largest shown.
product_counts = df["Product"].value_counts()
print("Top 20 product types in the dataset:")
print(product_counts.head(20))

# 2. Product labels that a STRICT_PRODUCTS filter would leave out.
all_products = set(df["Product"].dropna().unique())
excluded_products = all_products.difference(STRICT_PRODUCTS)

is_excluded = df["Product"].isin(excluded_products)
excluded_df = df[is_excluded]
excluded_counts = excluded_df["Product"].value_counts()
print("\nTop 10 excluded product types (with complaints):")
print(excluded_counts.head(10))

# 3. Among the excluded rows, keep only those that carry a narrative and
#    chart the ten largest categories.
has_narrative = excluded_df["Consumer complaint narrative"].notna()
excluded_with_narr = excluded_df[has_narrative]
plot_data = excluded_with_narr["Product"].value_counts().head(10)

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=plot_data.values, y=plot_data.index, ax=ax)
ax.set_title("Top Excluded Product Categories (with Narratives)")
ax.set_xlabel("Complaints with Narratives")
plt.tight_layout()
plt.show()

# === Applied EDA === #

NARRATIVE_COL = "Consumer complaint narrative"


def _word_count(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    return len(str(text).split())


# Word count per complaint across the whole dataset. Missing narratives are
# replaced by an empty string first, so they count as zero words.
df["narrative_word_count"] = df[NARRATIVE_COL].fillna("").apply(_word_count)

fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(df["narrative_word_count"], bins=50, kde=True, ax=ax)
ax.set_title("Narrative Length Distribution (All Complaints)")
ax.set_xlabel("Word Count")
ax.set_ylabel("Frequency")
plt.tight_layout()
plt.show()

# How many complaints carry a narrative, and how many do not?
narrative_present = df[NARRATIVE_COL].notna()
with_narrative = narrative_present.sum()
without_narrative = len(df) - with_narrative
print(f"\nWith narratives: {with_narrative:,}, Without: {without_narrative:,}")

# === Integrated Preprocessing Workflow === #

# Both runs below pass apply_min_word_filter=False, i.e. short narratives are
# not filtered at this stage; only the product filter mode differs.
NO_MIN_WORD_FILTER = {"apply_min_word_filter": False}

# Strict product list: this is the result that gets written to OUTPUT_PATH.
df_cleaned_strict = preprocess_complaints(df, filter_mode="strict", **NO_MIN_WORD_FILTER)
print(f"\n✅ Final cleaned shape (strict filter): {df_cleaned_strict.shape}")

save_cleaned_data(df_cleaned_strict, OUTPUT_PATH)
print(f"✅ Cleaned data saved to: {OUTPUT_PATH}")

# Expanded product list: optional, kept in memory for comparison/testing only.
df_cleaned_expanded = preprocess_complaints(df, filter_mode="expanded", **NO_MIN_WORD_FILTER)
print(f"\n✅ Final cleaned shape (expanded filter): {df_cleaned_expanded.shape}")

# === Summary (to add in report) === #

# Findings to carry over into the written report; printed verbatim.
REPORT_SUMMARY = """
Summary:
- The dataset contains many complaint product categories beyond the 5 core products.
- Excluded categories include credit reporting, debt collection, mortgage, and others with millions of complaints.
- For the challenge's business focus, we use the strict 5-product list, but an expanded list is easy to generate.
- No short narrative filtering applied yet, allowing the RAG pipeline to handle that.
- Flexible filtering allows future tuning for best RAG performance.
"""
print(REPORT_SUMMARY)


ImportError: cannot import name 'STRICT_PRODUCTS' from 'preprocessing' (c:\Users\dagi\OneDrive\Desktop\Kifiya\week-6\creditrust-complaint-ra\notebooks\../src\preprocessing.py)