## 1. Load Raw Data

Load the raw dataset to understand its initial structure and volume.


In [1]:
# Load the raw CSV when it is present; otherwise fall back to an empty frame
# so the later cells can branch on `df_raw.empty`.
if not os.path.exists(RAW_DATA_PATH):
    print(f"Raw data not found at {RAW_DATA_PATH}. Please ensure the file exists.")
    df_raw = pd.DataFrame()
else:
    df_raw = pd.read_csv(RAW_DATA_PATH)
    print(f"Raw data loaded. Shape: {df_raw.shape}")
    display(df_raw.head(3))


NameError: name 'os' is not defined

## 2. Quantify Missing Narratives

A critical step is identifying how many complaints actually contain narrative text, since the narrative is the core input for our RAG system.


In [None]:
if not df_raw.empty:
    # A NaN in the narrative column marks a complaint without free text.
    missing_narratives = df_raw['Consumer complaint narrative'].isna().sum()
    total_complaints = len(df_raw)
    present_narratives = total_complaints - missing_narratives

    # Report absolute counts alongside their share of all complaints.
    print(f"Total Complaints: {total_complaints}")
    print(f"Missing Narratives: {missing_narratives} ({missing_narratives/total_complaints:.1%})")
    print(f"Usable Narratives: {present_narratives} ({present_narratives/total_complaints:.1%})")

    # Two-bar chart: complaints without vs. with a narrative.
    plt.figure(figsize=(8, 5))
    plt.bar(
        x=['Missing Narrative', 'Has Narrative'],
        height=[missing_narratives, present_narratives],
        color=['red', 'green'],
    )
    plt.ylabel("Number of Complaints")
    plt.title("Availability of Consumer Complaint Narratives")
    plt.show()


## 3. Define Preprocessing Functions

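We define the logic to filter for the four target products and clean the text. Several broader product labels in the raw data are folded into these targets (for example, "Checking or savings account" is mapped to Savings account and "Prepaid card" to Credit card).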
Target Products:
1. Credit card
2. Personal loan
3. Savings account
4. Money transfers


In [None]:
def clean_text(text):
    """Return a lowercased, whitespace-normalized copy of a narrative.

    Every occurrence of the substring "xxxx" (the lowercased redaction
    placeholder) is removed. Any input that is not a ``str`` yields "".
    """
    if isinstance(text, str):
        without_redactions = text.lower().replace("xxxx", "")
        # split() with no argument discards leading/trailing whitespace and
        # collapses internal runs, so re-joining with one space normalizes it.
        return " ".join(without_redactions.split())
    return ""

def process_complaints(df):
    """Filter complaints to the target products and clean their narratives.

    The input frame is left untouched; all derived columns are added to a
    new DataFrame.

    Args:
        df: DataFrame containing at least the 'Product' and
            'Consumer complaint narrative' columns.

    Returns:
        A new DataFrame holding only rows whose product maps to one of the
        four target categories and whose narrative is still non-empty after
        cleaning. It carries two extra columns: 'normalized_product' and
        'cleaned_narrative'.

    Raises:
        KeyError: If either required column is missing from ``df``.
    """
    # Raw product label -> target category. Labels absent from this table
    # are out of scope and get filtered out below.
    product_map = {
        "Credit card": "Credit card",
        "Credit card or prepaid card": "Credit card",
        "Prepaid card": "Credit card",
        "Payday loan, title loan, or personal loan": "Personal loan",
        "Personal loan": "Personal loan",
        "Checking or savings account": "Savings account",
        "Savings account": "Savings account",
        "Money transfer, virtual currency, or money service": "Money transfers",
        "Money transfers": "Money transfers",
    }

    # assign() returns a new frame, so the caller's DataFrame does not
    # silently gain a 'normalized_product' column.
    df_filtered = df.assign(normalized_product=df['Product'].map(product_map))

    # map() yields NaN for unmapped products; drop those rows together with
    # rows that have no narrative at all.
    df_filtered = df_filtered.dropna(
        subset=['normalized_product', 'Consumer complaint narrative']
    ).copy()

    # Clean narratives
    df_filtered['cleaned_narrative'] = (
        df_filtered['Consumer complaint narrative'].apply(clean_text)
    )

    # A narrative made up only of redaction placeholders or whitespace cleans
    # to "". Such rows carry no text, and an empty string is read back as NaN
    # after a CSV round trip, so drop them here.
    has_text = df_filtered['cleaned_narrative'] != ""
    return df_filtered[has_text].copy()


## 4. Apply Filtering and Preprocessing

Apply the transformation to the raw data.


In [None]:
# Without raw data there is nothing to process; keep `df_processed` defined
# (as an empty frame) so the following cells can still test `.empty`.
if df_raw.empty:
    df_processed = pd.DataFrame()
else:
    df_processed = process_complaints(df_raw)
    print(f"Processed Data Shape: {df_processed.shape}")
    display(df_processed.head(3))


## 5. Visualize Target Product Distribution

Analyze the distribution of complaints across the four specific product categories in the final dataset.


In [None]:
if not df_processed.empty:
    # Complaints per target category, most frequent first.
    product_counts = df_processed['normalized_product'].value_counts()

    plt.figure(figsize=(10, 6))
    sns.barplot(
        x=product_counts.index,
        y=product_counts.values,
        palette="viridis",
    )
    plt.xlabel("Product Category")
    plt.ylabel("Number of Complaints")
    plt.title("Distribution of Complaints by Target Product")
    # Category names are long; tilt them so they do not overlap.
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    print(product_counts)


## 6. Analyze Narrative Length Distributions

Analyze the length of the complaints to understand the input size for our embedding model.


In [None]:
if not df_processed.empty:
    # Word count = number of whitespace-separated tokens in the cleaned text.
    df_processed['word_count'] = [
        len(narrative.split()) for narrative in df_processed['cleaned_narrative']
    ]

    print("Narrative Length Statistics:")
    print(df_processed['word_count'].describe())

    # Histogram with a KDE overlay to show the shape of the distribution.
    plt.figure(figsize=(12, 6))
    sns.histplot(
        df_processed['word_count'],
        bins=50,
        kde=True,
        color='blue',
    )
    plt.xlabel("Word Count")
    plt.ylabel("Frequency")
    plt.title("Distribution of Complaint Narrative Lengths (Word Count)")
    plt.show()


## 7. Export Processed Data

Save the final cleaned dataset to the specified path.


In [None]:
if not df_processed.empty:
    # os.path.dirname() returns "" when PROCESSED_DATA_PATH is a bare
    # filename, and os.makedirs("") raises FileNotFoundError. Only create the
    # directory when the path actually names one.
    output_dir = os.path.dirname(PROCESSED_DATA_PATH)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # index=False keeps the DataFrame's row index out of the CSV.
    df_processed.to_csv(PROCESSED_DATA_PATH, index=False)
    print(f"Successfully saved processed data to {PROCESSED_DATA_PATH}")
