In [1]:
# 01_data_cleaning.ipynb

import pandas as pd
import os

# 1. Load raw data
# The relative location is resolved against the current working directory, so
# the message below shows the exact file that is about to be read.
raw_path = os.path.abspath('../data/raw/consumer_spending.csv')
print(f"Loading raw data from: {raw_path}")
df = pd.read_csv(filepath_or_buffer=raw_path)

# 2. Inspect first few rows and columns
# Gather the pieces of the summary first, then print them in a fixed order:
# preview, column names, and (rows, columns) shape.
raw_preview = df.head(5)
raw_columns = df.columns.tolist()
print("First five rows of the raw data:")
print(raw_preview)
print("\nColumns:", raw_columns)
print(f"Data shape: {df.shape}")

# 3. Data Cleaning Steps

def _parse_spend_amount(values: pd.Series) -> pd.Series:
    """Convert raw spend amounts to numbers, tolerating comma separators.

    String entries have every comma removed before conversion, so a value
    such as "1,250.00" becomes 1250.0 instead of being coerced to NaN (and
    later dropped). Non-string entries are passed through untouched.
    Anything that still cannot be parsed becomes NaN.

    NOTE(review): this assumes commas are thousands separators. A decimal
    comma such as "12,50" would be read as 1250 — TODO confirm the source
    data never uses decimal commas.

    Parameters
    ----------
    values : pd.Series
        The raw ``spend_amount`` column.

    Returns
    -------
    pd.Series
        Numeric values aligned to the index of ``values``; unparseable
        entries are NaN.
    """
    # An element-wise map (rather than the .str accessor) is used so that
    # entries which are already numbers are kept as they are.
    without_commas = values.map(
        lambda value: value.replace(',', '') if isinstance(value, str) else value
    )
    return pd.to_numeric(without_commas, errors='coerce')


# Each step returns a new frame, so `df` is never mutated and no column is
# assigned into a frame that came out of a filter. Step order:
#   1. drop rows with missing spend_amount
#   2. convert date to datetime (unparseable dates become NaT)
#   3. remove rows with invalid dates
#   4. convert spend_amount to numeric (commas stripped, bad entries -> NaN)
#   5. remove rows with invalid spend_amounts
#   6. reset the index so it is contiguous again
df_clean = (
    df
    .dropna(subset=['spend_amount'])
    .assign(date=lambda frame: pd.to_datetime(frame['date'], errors='coerce'))
    .dropna(subset=['date'])
    .assign(spend_amount=lambda frame: _parse_spend_amount(frame['spend_amount']))
    .dropna(subset=['spend_amount'])
    .reset_index(drop=True)
)

# 4. Save cleaned data
# Resolve the destination against the current working directory and announce it.
output_path = os.path.abspath('../data/processed/cleaned_spending.csv')
print(f"\nSaving cleaned data to: {output_path}")

# Create the target folder (including missing parents) before writing; a
# folder that already exists is not an error.
output_dir = os.path.dirname(output_path)
os.makedirs(output_dir, exist_ok=True)

# Write the cleaned frame without its index column.
df_clean.to_csv(path_or_buf=output_path, index=False)

# 5. Final Check
# Report the (rows, columns) shape and a short preview of what was saved.
cleaned_preview = df_clean.head(5)
print(f"\nCleaned data shape: {df_clean.shape}")
print("First five rows of the cleaned data:")
print(cleaned_preview)



Loading raw data from: d:\VSC\Consumer-Spending-Dashboard\data\raw\consumer_spending.csv
First five rows of the raw data:
         date       category  spend_amount     location age_group  \
0  2022-01-01      Groceries        125.50       Dallas     25-34   
1  2022-01-01  Entertainment         55.00       Austin     18-24   
2  2022-01-02         Travel        200.00      Houston     35-44   
3  2022-01-03      Groceries        142.25       Dallas     45-54   
4  2022-01-03      Utilities         90.00  San Antonio     55-64   

  payment_method  
0          Debit  
1         Credit  
2         Credit  
3           Cash  
4          Debit  

Columns: ['date', 'category', 'spend_amount', 'location', 'age_group', 'payment_method']
Data shape: (216, 6)

Saving cleaned data to: d:\VSC\Consumer-Spending-Dashboard\data\processed\cleaned_spending.csv

Cleaned data shape: (215, 6)
First five rows of the cleaned data:
        date       category  spend_amount     location age_group  \
0 2022-