# Data Cleaning and Exploration

This notebook performs initial data cleaning and exploration checks on the Amazon Sale Report dataset.

In [None]:
import pandas as pd
import numpy as np

## 1. Load Data

In [None]:
# Location of the raw export, relative to the notebook's working directory.
raw_csv_path = '../data/Amazon Sale Report.csv'

# low_memory=False makes pandas read the file in one pass instead of chunks,
# so each column's dtype is inferred from all of its values at once.
df = pd.read_csv(raw_csv_path, low_memory=False)

# Preview the first rows (rendered as the cell output).
df.head()

## 2. Check Columns
List all columns in the dataset.

In [None]:
# Show the header label and the column index on separate lines.
print("Columns:", df.columns, sep="\n")

## 3. Check Data Types
Inspect the data types of each column.

In [None]:
# Show the header label and the per-column dtypes on separate lines.
print("Data Types:", df.dtypes, sep="\n")

## 4. Check Missing Values
Identify columns with missing values and their counts.

In [None]:
# Count the missing entries in every column (isna is an alias of isnull).
missing_per_column = df.isna().sum()

print("Missing Values:")
print(missing_per_column)

## 5. Check Duplicates
Check for and count duplicate rows.

In [None]:
# Boolean mask that is True for every row repeating an earlier row in all columns.
repeated_row_mask = df.duplicated()

# Summing the mask counts the True entries, i.e. the duplicate rows.
duplicates = repeated_row_mask.sum()
print(f"Number of duplicate rows: {duplicates}")

## 6. Fix Column Names
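Normalize column names for consistency: trim surrounding whitespace, convert to lowercase, and replace spaces with underscores. Other separators (such as hyphens) are left unchanged.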

In [None]:
# Normalize every header: trim surrounding whitespace, lowercase it, and
# replace each space with an underscore.
df.columns = df.columns.map(lambda name: name.strip().lower().replace(' ', '_'))

print("New column names:")
print(df.columns)

## 7. Convert Dates
Convert the 'date' column to datetime objects. Values that cannot be parsed as dates are converted to `NaT` (missing).

In [None]:
# Only convert when the normalized 'date' column is actually present.
if 'date' not in df.columns:
    print("Date column not found or named differently.")
else:
    # errors='coerce' turns values that cannot be parsed into NaT instead of raising.
    parsed_dates = pd.to_datetime(df['date'], errors='coerce')
    df['date'] = parsed_dates
    print("Date column converted to datetime.")
    print(df['date'].dtype)

## 8. Handle Missing Values
Drop columns where more than 50% of the values are missing, then fill the remaining gaps: `'Unknown'` for text columns and `0` for numeric columns.

In [None]:
# Drop columns with more than 50% missing values.
# `thresh` is the minimum number of non-missing values a column must have to
# be kept, so half the row count keeps columns that are at least 50% populated.
threshold = len(df) * 0.5
df = df.dropna(thresh=threshold, axis=1)

# Fill missing values.
# Pick one fill value per column and apply them with a single DataFrame-level
# fillna. Calling `df[col].fillna(..., inplace=True)` acts on a temporary
# Series: pandas 2.x warns about it, and under Copy-on-Write `df` itself is
# never updated.
fill_values = {}
for col in df.columns:
    column = df[col]
    if pd.api.types.is_datetime64_any_dtype(column):
        # 0 is not a date; leave missing dates as NaT so they stay
        # recognisable as missing instead of becoming a placeholder value.
        continue
    if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
        fill_values[col] = 'Unknown'
    else:
        fill_values[col] = 0

# Skip the call entirely when there is nothing to fill (e.g. no columns left).
if fill_values:
    df = df.fillna(value=fill_values)

print("Missing values after handling:")
print(df.isnull().sum())

## 9. Remove Duplicates
Drop duplicate rows from the dataset.

In [None]:
# Row count before de-duplication (shape[0] is the number of rows).
initial_rows = df.shape[0]

# Drop rows that repeat an earlier row in every column, modifying df in place.
df.drop_duplicates(inplace=True)

# Row count afterwards; the difference is the number of rows removed.
final_rows = df.shape[0]
print(f"Removed {initial_rows - final_rows} duplicate rows.")

## 10. Fix Inconsistent Categories
Standardize text columns by trimming surrounding whitespace and converting values to title case.

In [None]:
def _standardize_text(value):
    """Return *value* trimmed and title-cased if it is a string, else unchanged."""
    return value.strip().title() if isinstance(value, str) else value


# Object columns can hold a mix of strings and other Python objects. The
# `.str` accessor returns NaN for every non-string element, which would
# silently reintroduce missing values, so transform string elements only.
# NOTE(review): title-casing is applied to every object column, including any
# that hold codes or identifiers in upper case - confirm that is intended.
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].map(_standardize_text)

print("String columns standardized.")
df.head()

## 11. Export Cleaned Data
Save the cleaned dataframe to a CSV file.

In [None]:
# Destination for the cleaned dataset, relative to the notebook's working directory.
cleaned_csv_path = '../data/cleaned_data.csv'

# index=False keeps the DataFrame's row index out of the output file.
df.to_csv(cleaned_csv_path, index=False)
print("Cleaned data saved to ../data/cleaned_data.csv")