In [None]:
# Mount Google Drive at /content/drive; the project paths used by the
# following cells all live underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

# Absolute location of the raw complaints CSV underneath the Drive mount point.
file_path = "/content/drive/My Drive/Smart Complaint Prioritizer/data/raw/complaints.csv"

# Sanity check before any processing: report whether the raw file is visible.
print(
    "✅ Success! Colab found the file."
    if os.path.exists(file_path)
    else "❌ Error: File not found. Check your folder names in Google Drive."
)

✅ Success! Colab found the file.


In [None]:
import pandas as pd
import os
import gc

# Project root on the mounted Drive and the name of the raw export inside it.
PROJECT_PATH = "/content/drive/My Drive/Smart Complaint Prioritizer"
RAW_DATA_FILE = "complaints.csv"

# Input (raw) and output (processed) CSV locations, both under PROJECT_PATH/data.
RAW_PATH = "/".join((PROJECT_PATH, "data", "raw", RAW_DATA_FILE))
PROCESSED_PATH = "/".join((PROJECT_PATH, "data", "processed", "complaints_subset.csv"))

# Raw CSV header -> snake_case column name used in the processed file.
COLUMN_MAPPING = {
    # Complaint content
    "Consumer complaint narrative": "consumer_complaint_narrative",
    "Issue": "issue",
    "Product": "product",
    # Who / how / when
    "Company": "company",
    "Submitted via": "submitted_via",
    "Date received": "date_received",
    # Company responses
    "Company public response": "company_public_response",
    "Company response to consumer": "company_response_to_consumer",
}

def process_and_save_data(input_path, output_path, chunk_size=100000,
                          column_mapping=None,
                          required_column='Consumer complaint narrative'):
    """Stream a large CSV in chunks and save the rows that have a required value.

    Each chunk is read with only the columns named in ``column_mapping``. Rows
    whose ``required_column`` is missing are dropped, the remaining columns are
    renamed according to ``column_mapping``, and the result is written to
    ``output_path`` (overwritten on the first write, appended to afterwards).

    Args:
        input_path: Path of the raw CSV file to read.
        output_path: Path of the CSV file to write. Missing parent
            directories are created.
        chunk_size: Number of raw rows read per chunk.
        column_mapping: Dict of raw column name -> output column name; its
            keys select the columns that are read. Defaults to the
            module-level ``COLUMN_MAPPING``.
        required_column: Raw column name that must be non-null for a row to
            be kept. Must be a key of ``column_mapping``.

    Returns:
        None. Progress and the final row count are printed. If ``input_path``
        does not exist, an error is printed and nothing is written.

    Raises:
        ValueError: If ``required_column`` is not a key of ``column_mapping``.
    """
    print(f"Starting ingestion from: {input_path}")

    if not os.path.exists(input_path):
        print(f"❌ Error: File not found at {input_path}")
        return

    if column_mapping is None:
        column_mapping = COLUMN_MAPPING
    if required_column not in column_mapping:
        raise ValueError(
            f"required_column {required_column!r} must be one of the keys of column_mapping"
        )

    # to_csv does not create directories, so make sure the target folder exists.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    first_write = True
    total_rows = 0

    # The context manager closes the underlying file handle even if a chunk
    # fails to parse or the write raises part-way through.
    with pd.read_csv(
        input_path,
        usecols=list(column_mapping),
        chunksize=chunk_size,
        low_memory=False
    ) as chunk_iterator:
        for chunk_number, chunk in enumerate(chunk_iterator, start=1):
            cleaned_chunk = chunk.dropna(subset=[required_column])

            # The very first chunk is written even when it is empty: that
            # (re)creates the output with a header row, so a stale file from an
            # earlier run can never be mistaken for the result of this one.
            if cleaned_chunk.empty and not first_write:
                continue

            cleaned_chunk = cleaned_chunk.rename(columns=column_mapping)
            cleaned_chunk.to_csv(
                output_path,
                mode='w' if first_write else 'a',
                header=first_write,
                index=False
            )
            first_write = False

            total_rows += len(cleaned_chunk)
            print(f"Processed Chunk {chunk_number}: Saved {len(cleaned_chunk)} rows. Total so far: {total_rows}")

            # Drop the references before the next chunk is read so that two
            # chunks are not held in memory at the same time.
            del chunk
            del cleaned_chunk
            gc.collect()

    if total_rows == 0:
        detail = (
            "nothing was written" if first_write
            else f"wrote a header-only file to: {output_path}"
        )
        print(f"\n⚠️ Warning: no rows with a non-null '{required_column}' value were found; {detail}")
        return

    print(f"\n✅ SUCCESS! Saved {total_rows} clean rows to: {output_path}")

# Run the ingestion: read the raw CSV at RAW_PATH in chunks and write the
# cleaned, renamed subset to PROCESSED_PATH.
process_and_save_data(RAW_PATH, PROCESSED_PATH)

Starting ingestion from: /content/drive/My Drive/Smart Complaint Prioritizer/data/raw/complaints.csv
Processed Chunk 1: Saved 23908 rows. Total so far: 23908
Processed Chunk 2: Saved 45541 rows. Total so far: 69449
Processed Chunk 3: Saved 45528 rows. Total so far: 114977
Processed Chunk 4: Saved 48298 rows. Total so far: 163275
Processed Chunk 5: Saved 49081 rows. Total so far: 212356
Processed Chunk 6: Saved 44545 rows. Total so far: 256901
Processed Chunk 7: Saved 41071 rows. Total so far: 297972
Processed Chunk 8: Saved 39677 rows. Total so far: 337649
Processed Chunk 9: Saved 41104 rows. Total so far: 378753
Processed Chunk 10: Saved 4811 rows. Total so far: 383564

✅ SUCCESS! Saved 383564 clean rows to: /content/drive/My Drive/Smart Complaint Prioritizer/data/processed/complaints_subset.csv


In [None]:
# Reload the processed subset from disk to confirm what was actually written.
df = pd.read_csv(PROCESSED_PATH)

# Report the dimensions and the column headers of the reloaded frame.
print(
    f"New Dataset Shape: {df.shape}",
    f"Columns: {list(df.columns)}",
    sep="\n",
)

# Rich preview of the first three records.
display(df.head(3))

New Dataset Shape: (383564, 8)
Columns: ['date_received', 'product', 'issue', 'consumer_complaint_narrative', 'company_public_response', 'company', 'submitted_via', 'company_response_to_consumer']


Unnamed: 0,date_received,product,issue,consumer_complaint_narrative,company_public_response,company,submitted_via,company_response_to_consumer
0,03/23/2019,"Credit reporting, credit repair services, or o...",Incorrect information on your report,The Summer of XX/XX/2018 I was denied a mortga...,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",Web,Closed with explanation
1,03/22/2019,"Credit reporting, credit repair services, or o...",Incorrect information on your report,There are many mistakes appear in my report wi...,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",Web,Closed with explanation
2,03/22/2019,"Credit reporting, credit repair services, or o...",Incorrect information on your report,There are many mistakes appear in my report wi...,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",Web,Closed with explanation
