In [8]:
import pandas as pd
import os
import gc
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from tqdm import tqdm

# --- CONFIGURATION (STRICTLY "Smart Complaint Prioritizer") ---
PROJECT_PATH = "/content/drive/My Drive/Smart Complaint Prioritizer"
RAW_FILE_PATH = f"{PROJECT_PATH}/data/raw/complaints.csv"
PROCESSED_FILE_PATH = f"{PROJECT_PATH}/data/processed/complaints_subset.csv"
FINAL_OUTPUT_PATH = f"{PROJECT_PATH}/data/processed/complaints_with_features.csv"

# --- PART 1: DATA INGESTION (Generate the missing file) ---
if not os.path.exists(PROCESSED_FILE_PATH):
    print("⚠️ Processed file not found. Generating it now from Raw data...")

    if not os.path.exists(RAW_FILE_PATH):
        print(f"❌ CRITICAL ERROR: Raw file not found at {RAW_FILE_PATH}")
        print("Please check that you uploaded 'complaints.csv' to the 'data/raw' folder in Google Drive.")
    else:
        # Define columns to keep and rename
        COLUMN_MAPPING = {
            'Consumer complaint narrative': 'consumer_complaint_narrative',
            'Issue': 'issue',
            'Product': 'product',
            'Company': 'company',
            'Submitted via': 'submitted_via',
            'Date received': 'date_received',
            'Company public response': 'company_public_response',
            'Company response to consumer': 'company_response_to_consumer'
        }

        # Chunk processing
        chunk_iterator = pd.read_csv(
            RAW_FILE_PATH,
            usecols=COLUMN_MAPPING.keys(),
            chunksize=100000,
            low_memory=False
        )

        first_chunk = True
        total_rows = 0

        for chunk in chunk_iterator:
            cleaned = chunk.dropna(subset=['Consumer complaint narrative'])
            if cleaned.empty: continue

            cleaned = cleaned.rename(columns=COLUMN_MAPPING)

            mode = 'w' if first_chunk else 'a'
            header = True if first_chunk else False
            cleaned.to_csv(PROCESSED_FILE_PATH, mode=mode, header=header, index=False)

            total_rows += len(cleaned)
            first_chunk = False
            del chunk, cleaned
            gc.collect()

        print(f"✅ Data Ingestion Complete. Saved {total_rows} rows.")

else:
    print("✅ Processed file already exists. Proceeding...")

# --- PART 2: FEATURE ENGINEERING (Priority Logic) ---
print("\nLoading data for Feature Engineering...")
df = pd.read_csv(PROCESSED_FILE_PATH)
df = df.dropna(subset=['consumer_complaint_narrative']) # Safety check

# 1. Clean Text
tqdm.pandas(desc="Cleaning Text")
def clean_text(text):
    if pd.isna(text): return ""
    text = str(text).lower()
    text = re.sub(r'x{2,}', '', text) # Remove xxxx
    text = re.sub(r'[^a-z\s]', '', text) # Keep only letters
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['cleaned_narrative'] = df['consumer_complaint_narrative'].progress_apply(clean_text)

# 2. Priority Logic
analyzer = SentimentIntensityAnalyzer()
HIGH_KEYWORDS = ['fraud', 'theft', 'illegal', 'scam', 'threaten', 'harass', 'lawsuit', 'identit', 'arrest']
MEDIUM_KEYWORDS = ['fee', 'charge', 'late', 'interest', 'mistake', 'error', 'billing', 'credit report']

def get_priority_score(row):
    text = row['cleaned_narrative']
    vs = analyzer.polarity_scores(text)
    sentiment_score = vs['compound']

    has_high = any(word in text for word in HIGH_KEYWORDS)
    has_medium = any(word in text for word in MEDIUM_KEYWORDS)

    if has_high or sentiment_score < -0.70:
        return "High"
    elif has_medium or sentiment_score < -0.30:
        return "Medium"
    else:
        return "Low"

print("\nCalculating Priority Scores (approx 5-10 mins)...")
tqdm.pandas(desc="Scoring Priority")
df['priority'] = df.progress_apply(get_priority_score, axis=1)

# --- PART 3: SAVE ---
df.to_csv(FINAL_OUTPUT_PATH, index=False)
print(f"\n✅ SUCCESS! Feature-rich data saved to: {FINAL_OUTPUT_PATH}")
print("\n--- FINAL PRIORITY DISTRIBUTION ---")
print(df['priority'].value_counts())

✅ Processed file already exists. Proceeding...

Loading data for Feature Engineering...


Cleaning Text: 100%|██████████| 383564/383564 [00:52<00:00, 7285.15it/s]



Calculating Priority Scores (approx 5-10 mins)...


Scoring Priority: 100%|██████████| 383564/383564 [16:31<00:00, 386.70it/s]



✅ SUCCESS! Feature-rich data saved to: /content/drive/My Drive/Smart Complaint Prioritizer/data/processed/complaints_with_features.csv

--- FINAL PRIORITY DISTRIBUTION ---
priority
Medium    166193
High      152774
Low        64597
Name: count, dtype: int64
