In [None]:
# -------------------------------------------
# 🛍️ Sales & Profitability Dashboard (India)
# Notebook: 01_clean_sales.ipynb
# Task: Load & Clean Indian Store Data
# -------------------------------------------

In [None]:
# 📌 Step 1: Import libraries
import pandas as pd
import numpy as np

In [None]:
# 📌 Step 2: Load dataset
# The CSV path is relative to the notebook's working directory;
# adjust the path if the file lives elsewhere.
df = pd.read_csv(filepath_or_buffer="store_data.csv")
print("Dataset shape:", df.shape)  # (rows, columns) of the raw load
df.head(n=5)  # last expression → rendered preview of the first rows



In [None]:
# 📌 Step 3: Quick overview
# Structural summary first (printed by pandas), then descriptive statistics
# for every column — numeric and non-numeric alike — as the cell's output.
df.info()
summary_stats = df.describe(include="all")
summary_stats

In [None]:
# 📌 Step 4: Check for duplicates
# Count rows that exactly repeat an earlier row across all columns.
# This only reports the count; no rows are removed here.
print("Duplicates:", df.duplicated().sum())

In [None]:
# 📌 Step 5: Standardize column names
# Normalize headers to snake_case so later cells can use fixed names
# such as "sales", "profit" and "order_date".
df.columns = (
    df.columns
    .str.strip()             # drop leading/trailing whitespace
    .str.lower()             # case-insensitive headers
    .str.replace(" ", "_")   # spaces → underscores
)
print("Columns:", list(df.columns))

In [None]:
# 📌 Step 6: Handle missing values
# Build the cleaned frame in a single chain instead of assigning columns onto
# the row-filtered result of dropna(): on pandas versions without
# copy-on-write that assignment can emit SettingWithCopyWarning.
# The resulting data is the same:
#   - rows missing either key metric ("sales" or "profit") are dropped;
#   - missing "category" / "region" labels become the literal "Unknown".
df = (
    df.dropna(subset=["sales", "profit"])
    .assign(
        category=lambda frame: frame["category"].fillna("Unknown"),
        region=lambda frame: frame["region"].fillna("Unknown"),
    )
)

In [None]:
# 📌 Step 7: Convert order_date to datetime
# The column is treated as optional: nothing happens when it is absent.
# errors="coerce" turns values that cannot be parsed into NaT instead of raising.
# NOTE(review): the source date format is not visible here — if dates are
# day-first (DD-MM-YYYY), confirm whether dayfirst=True is needed.
if "order_date" in df:
    df["order_date"] = pd.to_datetime(
        df["order_date"],
        errors="coerce",
    )

In [None]:
# 📌 Step 8: Basic sanity checks
# Step 7 treats "order_date" as optional, so guard it here as well; indexing it
# unconditionally would raise KeyError on datasets without that column.
if "order_date" in df.columns:
    print("Date range:", df["order_date"].min(), "→", df["order_date"].max())
else:
    print("Date range: n/a (no order_date column)")
# Headline totals, rounded to 2 decimals for display only.
print("Total sales:", round(df["sales"].sum(), 2))
print("Total profit:", round(df["profit"].sum(), 2))

In [None]:

# 📌 Step 9: Save cleaned dataset
# Write the cleaned frame next to the notebook; index=False keeps the
# DataFrame's row index out of the CSV.
df.to_csv(path_or_buf="sales_clean.csv", index=False)
print("✅ Cleaned dataset saved as sales_clean.csv")