# Retail Data Exploration & Profiling

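This notebook performs an initial exploration and profile of a retail transactions dataset to assess data quality, distributions, and key patterns. The data used here is synthetic, generated in Section 1 for demonstration purposes; replace it with the actual data source before drawing business conclusions.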

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime, timedelta
import warnings
# Suppress every warning for the rest of the session so cell output stays uncluttered.
# NOTE(review): a blanket 'ignore' also hides deprecation/future warnings raised by
# the libraries used below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Set plotting style: apply the named matplotlib style sheet first, then select
# seaborn's 'husl' palette for subsequent plots.
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

## 1. Data Loading & Initial Assessment

In [None]:
# Load sample retail data (replace with actual data source)
# For demo purposes, a synthetic dataset with a retail-like column layout is built here.

# Seed NumPy's global RNG so the generated data is the same on every run.
np.random.seed(42)
n_records = 10000

# Create sample retail dataset.
# NOTE: all np.random.* calls below draw from the same global stream, so the order
# of the dict entries determines the generated values — do not reorder them.
# NOTE: np.random.randint excludes its upper bound, so customer ids span 1-1999,
# product numbers 1-499 and quantities 1-9.
data = {
    'transaction_id': range(1, n_records + 1),
    # One transaction per hour starting 2022-01-01. The lowercase 'h' alias is used
    # because the uppercase 'H' spelling is deprecated since pandas 2.2.
    'date': pd.date_range('2022-01-01', periods=n_records, freq='h'),
    'customer_id': np.random.randint(1, 2000, n_records),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Home', 'Sports', 'Books'], n_records),
    'product_name': [f'Product_{i}' for i in np.random.randint(1, 500, n_records)],
    'quantity': np.random.randint(1, 10, n_records),
    'unit_price': np.random.uniform(10, 500, n_records),
    'discount_percent': np.random.uniform(0, 30, n_records),
    'sales_channel': np.random.choice(['Online', 'In-Store', 'Mobile'], n_records),
    'region': np.random.choice(['North', 'South', 'East', 'West'], n_records)
}

df = pd.DataFrame(data)
# Net value of each transaction after applying the percentage discount.
df['total_amount'] = df['quantity'] * df['unit_price'] * (1 - df['discount_percent']/100)

print(f"Dataset shape: {df.shape}")
print(f"Date range: {df['date'].min()} to {df['date'].max()}")
df.head()

## 2. Data Quality Assessment

In [None]:
# Data quality checks: row count, fully duplicated rows, nulls per column, dtypes.
row_count = len(df)
duplicate_rows = df.duplicated().sum()
missing_per_column = df.isnull().sum()

print("=== DATA QUALITY REPORT ===")
print(f"Total records: {row_count:,}")
print(f"Duplicate records: {duplicate_rows:,}")
print("Missing values per column:")
print(missing_per_column)
print("\nData types:")
print(df.dtypes)

## 3. Descriptive Statistics

In [None]:
# Descriptive statistics for the numeric measures of each transaction.
numerical_cols = [
    'quantity',
    'unit_price',
    'discount_percent',
    'total_amount',
]
numeric_view = df.loc[:, numerical_cols]

print("=== NUMERICAL VARIABLES SUMMARY ===")
numeric_view.describe()

## 4. Key Business Metrics

In [None]:
# Headline KPIs derived from the transaction table.
order_values = df['total_amount']

unique_customers = df['customer_id'].nunique()
unique_products = df['product_name'].nunique()
avg_discount = df['discount_percent'].mean()
total_revenue = order_values.sum()
avg_order_value = order_values.mean()

# Assemble the report first, then emit it in one call; the printed text is
# line-for-line the same as printing each entry separately.
report_lines = [
    "=== KEY BUSINESS METRICS ===",
    f"Total Revenue: ${total_revenue:,.2f}",
    f"Average Order Value: ${avg_order_value:.2f}",
    f"Unique Customers: {unique_customers:,}",
    f"Unique Products: {unique_products:,}",
    f"Average Discount: {avg_discount:.1f}%",
    f"Total Transactions: {len(df):,}",
]
print("\n".join(report_lines))

## 5. Data Visualization

In [None]:
# 2x2 overview figure: category revenue, channel mix, order-value spread, regional revenue.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
(ax_category, ax_channel), (ax_orders, ax_region) = axes

# Top-left: total revenue per product category, largest first.
category_revenue = (
    df.groupby('product_category')['total_amount']
    .sum()
    .sort_values(ascending=False)
)
ax_category.bar(category_revenue.index, category_revenue.values)
ax_category.set(title='Revenue by Product Category', ylabel='Revenue ($)')
ax_category.tick_params(axis='x', rotation=45)

# Top-right: share of transactions handled by each sales channel.
channel_counts = df['sales_channel'].value_counts()
ax_channel.pie(
    channel_counts.values,
    labels=channel_counts.index,
    autopct='%1.1f%%',
)
ax_channel.set(title='Sales Channel Distribution')

# Bottom-left: histogram of per-transaction order value.
ax_orders.hist(df['total_amount'], bins=50, alpha=0.7)
ax_orders.set(
    title='Order Value Distribution',
    xlabel='Order Value ($)',
    ylabel='Frequency',
)

# Bottom-right: total revenue per region (left in groupby order, not sorted by value).
region_revenue = df.groupby('region')['total_amount'].sum()
ax_region.bar(region_revenue.index, region_revenue.values)
ax_region.set(title='Revenue by Region', ylabel='Revenue ($)')

plt.tight_layout()
plt.show()

## 6. Time Series Analysis

In [None]:
# Daily revenue trend.
# Tag every transaction with its calendar day (kept on df as 'date_only'),
# then total the revenue per day.
df['date_only'] = df['date'].dt.date
daily_revenue = df.groupby('date_only', as_index=False)['total_amount'].sum()

trend_fig, trend_ax = plt.subplots(figsize=(15, 6))
trend_ax.plot(daily_revenue['date_only'], daily_revenue['total_amount'])
trend_ax.set_title('Daily Revenue Trend')
trend_ax.set_xlabel('Date')
trend_ax.set_ylabel('Revenue ($)')
plt.xticks(rotation=45)  # acts on the current axes, i.e. trend_ax
trend_ax.grid(True, alpha=0.3)
plt.show()

## 7. Customer Analysis

In [None]:
# Per-customer purchase behavior: spend totals, order counts, and the span
# between each customer's first and last recorded purchase.
customer_metrics = (
    df.groupby('customer_id')
    .agg(
        total_spent=('total_amount', 'sum'),
        avg_order_value=('total_amount', 'mean'),
        order_count=('total_amount', 'count'),
        first_purchase=('date', 'min'),
        last_purchase=('date', 'max'),
    )
    .round(2)
)

# Whole days between first and last purchase; 0 when both fall within the same 24 hours.
purchase_span = customer_metrics['last_purchase'] - customer_metrics['first_purchase']
customer_metrics['customer_lifetime_days'] = purchase_span.dt.days

print("=== CUSTOMER BEHAVIOR SUMMARY ===")
print(customer_metrics.describe())

## 8. Next Steps

Based on this initial exploration, the next analysis steps include:

1. **Customer Segmentation** - RFM analysis and clustering
2. **Sales Forecasting** - Time series modeling
3. **Product Analysis** - Performance and profitability
4. **Market Basket Analysis** - Association rules
5. **Dashboard Creation** - Interactive visualizations

Each of these will be covered in separate notebooks.