In [1]:
# Cell 1 — Title & meta
# Capstone: Retail EDA — Online Retail (example)
# Author: Your Name | Date: 2025-12-12

# Cell 2 — Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Cell 3 — Settings
%matplotlib inline
sns.set(style="whitegrid")
pd.options.display.max_columns = 50

# Cell 4 — Load data
DATA_DIR = Path("data")
df = pd.read_csv(DATA_DIR/"online_retail.csv", encoding='ISO-8859-1', parse_dates=['InvoiceDate'])

# Cell 5 — Quick peek
df.shape, df.head()

# Cell 6 — Initial cleaning (duplicates, NaNs, negative quantities)
df = df.drop_duplicates()
df = df[df['Quantity'] > 0]                # drop returns/negative as needed
df = df.dropna(subset=['CustomerID'])      # keep only rows with customers if analysis needs it
df['TotalPrice'] = df['Quantity'] * df['UnitPrice']

# Cell 7 — Convert types & add columns
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df['InvoiceYearMonth'] = df['InvoiceDate'].dt.to_period('M').astype(str)
df['InvoiceDateOnly'] = df['InvoiceDate'].dt.date

# Cell 8 — Basic aggregations / KPIs
total_revenue = df['TotalPrice'].sum()
num_orders = df['InvoiceNo'].nunique()
num_customers = df['CustomerID'].nunique()
avg_order_value = df.groupby('InvoiceNo')['TotalPrice'].sum().mean()

print(f"Revenue: {total_revenue:.2f}, Orders: {num_orders}, Customers: {num_customers}, AOV: {avg_order_value:.2f}")

# Cell 9 — Top products
top_products = df.groupby('Description')['Quantity'].sum().sort_values(ascending=False).head(10).reset_index()
top_products

# Cell 10 — Monthly revenue time series
monthly_rev = df.groupby('InvoiceYearMonth')['TotalPrice'].sum().reset_index()
monthly_rev['InvoiceYearMonth'] = pd.to_datetime(monthly_rev['InvoiceYearMonth'] + '-01')

# Cell 11 — Visualization 1: Monthly revenue line chart
plt.figure(figsize=(10,5))
plt.plot(monthly_rev['InvoiceYearMonth'], monthly_rev['TotalPrice'])
plt.title("Monthly Revenue")
plt.ylabel("Revenue")
plt.xlabel("Month")
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig("outputs/monthly_revenue.png", dpi=150)

# Cell 12 — Visualization 2: Top 10 products bar chart
plt.figure(figsize=(10,6))
sns.barplot(data=top_products, y='Description', x='Quantity', orient='h')
plt.title("Top 10 Products by Quantity Sold")
plt.xlabel("Quantity Sold")
plt.ylabel("")
plt.tight_layout()
plt.savefig("outputs/top_products.png", dpi=150)

# Cell 13 — Cohort or RFM example (optional)
# Quick RFM computation
snapshot_date = df['InvoiceDate'].max() + pd.Timedelta(days=1)
rfm = df.groupby('CustomerID').agg({
    'InvoiceDate': lambda x: (snapshot_date - x.max()).days,
    'InvoiceNo': 'nunique',
    'TotalPrice': 'sum'
}).rename(columns={'InvoiceDate':'Recency','InvoiceNo':'Frequency','TotalPrice':'Monetary'})
rfm.head()

# Cell 14 — Short conclusions & actionable insights (as markdown in notebook)


FileNotFoundError: [Errno 2] No such file or directory: 'data\\online_retail.csv'