# 📊 Baseline Analysis & Exploration

**Step 5: Explore and Build a Baseline Table**

This notebook covers:
- Loading cleaned data
- Basic exploratory analysis
- Creating baseline metrics for products
- Revenue and transaction analysis
- Saving baseline data for promo simulation

---

In [7]:
from pathlib import Path
def project_root(start: Path = None) -> Path:
    """Find the project root by walking upward from a starting directory.

    A directory counts as the root when it contains at least one of the
    folders ``data_raw``, ``outputs`` or ``data_clean``.

    Args:
        start: Directory to begin the search from. When omitted (or falsy),
            the current working directory is used.

    Returns:
        The first directory, checking ``start`` itself and then each of its
        parents in order, that contains a marker folder. If none does, the
        starting directory is returned unchanged.
    """
    marker_names = ("data_raw", "outputs", "data_clean")
    origin = start if start else Path.cwd()

    def _has_marker(folder: Path) -> bool:
        # One marker folder is enough to treat `folder` as the root.
        return any((folder / name).exists() for name in marker_names)

    # Nearest match wins; fall back to the starting point when nothing matches.
    return next(
        (folder for folder in (origin, *origin.parents) if _has_marker(folder)),
        origin,
    )
# Resolve the project root once and derive the working folders from it.
ROOT = project_root()
DATA_CLEAN, OUTPUTS = ROOT / "data_clean", ROOT / "outputs"

# Create both folders if they are missing (no error when they already exist).
# NOTE(review): these folder names are also the markers project_root() looks
# for, so creating them here affects later root detection — confirm intended.
DATA_CLEAN.mkdir(exist_ok=True, parents=True)
OUTPUTS.mkdir(exist_ok=True, parents=True)

# Echo the resolved locations so the reader can verify them.
print("ROOT:", ROOT)
print("DATA_CLEAN:", DATA_CLEAN)
print("OUTPUTS:", OUTPUTS)


ROOT: /Users/alihasan/retail-pricing-mba
DATA_CLEAN: /Users/alihasan/retail-pricing-mba/data_clean
OUTPUTS: /Users/alihasan/retail-pricing-mba/outputs


In [8]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

print("✅ Libraries imported successfully!")

✅ Libraries imported successfully!


In [9]:
# Load cleaned transactions
# Read through DATA_CLEAN (derived from the detected project root) rather than
# a path relative to the kernel's working directory, so this cell works no
# matter which folder inside the project the notebook is launched from.
df = pd.read_csv(DATA_CLEAN / 'transactions.csv')

# Parse the invoice timestamp column into datetimes.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

print(f"✅ Cleaned data loaded: {len(df):,} rows × {len(df.columns)} columns")

✅ Cleaned data loaded: 407,664 rows × 10 columns


In [10]:
# Build one baseline row per product description, ordered by revenue (highest first).
#   StockCode : StockCode of the first row in each group
#   units     : sum of Quantity
#   revenue   : sum of TotalPrice
#   avg_price : plain (unweighted) mean of Price over the group's rows
#   tx_count  : number of distinct Invoice values
# NOTE(review): groups are keyed on Description, not StockCode — confirm that
# descriptions map one-to-one onto stock codes in the cleaned data.
top_products = (
    df.groupby('Description')
    .agg(
        StockCode=('StockCode', 'first'),
        units=('Quantity', 'sum'),
        revenue=('TotalPrice', 'sum'),
        avg_price=('Price', 'mean'),
        tx_count=('Invoice', 'nunique'),
    )
    .sort_values('revenue', ascending=False)
)

print(f"✅ Baseline metrics created for {len(top_products):,} products")

✅ Baseline metrics created for 4,405 products


In [11]:
# Add margin assumptions
# One flat margin rate is applied to every product. Keeping it in a single
# named constant means the column values and the status message below cannot
# drift apart when the assumption is changed.
MARGIN_RATE = 0.50  # assumed margin as a fraction of revenue

# .assign returns a new frame, so re-running this cell always yields the same result.
top_products = top_products.assign(
    margin_rate=MARGIN_RATE,
    baseline_margin=lambda t: t['revenue'] * t['margin_rate'],
)

print(f"✅ Margin assumptions added ({MARGIN_RATE:.0%} margin rate)")

✅ Margin assumptions added (50% margin rate)


In [12]:
# Save baseline data
# Write through OUTPUTS (derived from the detected project root) rather than a
# path relative to the kernel's working directory, so the files land in the
# project's outputs folder wherever the notebook is launched from.
sku_baseline_path = OUTPUTS / 'sku_baseline.csv'
top_500_path = OUTPUTS / 'top_500_products.csv'

# Full baseline table; the Description index is written as the first CSV column.
top_products.to_csv(sku_baseline_path)

# top_products is already sorted by revenue (descending), so head(500) keeps
# the 500 highest-revenue products.
top_500_products = top_products.head(500)
top_500_products.to_csv(top_500_path)

print("✅ Baseline data saved to outputs/sku_baseline.csv")
print("✅ Top 500 products saved to outputs/top_500_products.csv")

✅ Baseline data saved to outputs/sku_baseline.csv
✅ Top 500 products saved to outputs/top_500_products.csv
