# Data Exploration Notebook

This notebook explores the sales data to understand patterns, trends, and characteristics.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from pathlib import Path

# Plot defaults shared by every chart below: seaborn's 'whitegrid' theme
# and a 12x6 default figure size for matplotlib.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

In [None]:
# Read the raw sales CSV into a DataFrame.
# The path is relative, so it resolves against the current working directory.
data_path = Path('../data/raw/sales_data.csv')
df = pd.read_csv(data_path)

# Report the table's dimensions and column names before previewing it.
frame_shape = df.shape
column_names = df.columns.tolist()
print(f"Data shape: {frame_shape}")
print(f"\nColumn names: {column_names}")

# Final expression of the cell: the notebook renders the first five rows.
df.head(5)

In [None]:
# Structural overview: info() prints its report directly.
df.info()

# Summary statistics, kept as the final expression so the notebook
# displays the resulting table.
summary_stats = df.describe()
summary_stats

In [None]:
# Convert date column
# Parse 'date' and make it the (sorted) index of df.
#
# The guard keeps the cell safe to re-run: after the first execution 'date'
# is the index rather than a column, so an unguarded df['date'] would raise
# KeyError on the second run.
if 'date' in df.columns:
    df['date'] = pd.to_datetime(df['date'])
    df = df.set_index('date').sort_index()
elif df.index.name != 'date':
    # Neither a 'date' column nor an index named 'date': fail loudly with the
    # same exception type the unguarded lookup would have raised.
    raise KeyError("expected a 'date' column in the sales data")

first_date = df.index.min()
last_date = df.index.max()

print(f"Date range: {first_date} to {last_date}")
# Elapsed whole days between the first and last timestamp (a span, not a row count).
print(f"Total days: {(last_date - first_date).days}")

In [None]:
# Interactive line chart of sales against date.
# Skipped entirely when the dataset has no 'sales' column.
if 'sales' in df.columns:
    # reset_index() turns the 'date' index back into a regular column so it
    # can be referenced by name as the x axis.
    sales_by_date = df.reset_index()
    axis_labels = {'sales': 'Sales ($)', 'date': 'Date'}

    fig = px.line(
        sales_by_date,
        x='date',
        y='sales',
        title='Sales Over Time',
        labels=axis_labels,
    )
    fig.show()

In [None]:
# Count missing entries per column and list only the columns that have any.
missing_values = df.isna().sum()
has_missing = missing_values.gt(0)

print("Missing values:")
print(missing_values.loc[has_missing])

In [None]:
# Sales distribution: a 50-bin histogram next to a box plot.
# Skipped entirely when the dataset has no 'sales' column.
if 'sales' in df.columns:
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    hist_ax, box_ax = axes
    sales = df['sales']

    # Left panel: histogram of sales values
    sales.hist(bins=50, ax=hist_ax)
    hist_ax.set(
        title='Sales Distribution',
        xlabel='Sales ($)',
        ylabel='Frequency',
    )

    # Right panel: box plot of the same column
    sales.plot.box(ax=box_ax)
    box_ax.set(title='Sales Box Plot', ylabel='Sales ($)')

    plt.tight_layout()
    plt.show()

In [None]:
# Seasonal patterns: mean sales per calendar month and per weekday,
# drawn as two side-by-side bar charts.
# Skipped entirely when the dataset has no 'sales' column.
if 'sales' in df.columns:
    # Calendar features read from the index and stored on df as new columns.
    df['month'] = df.index.month
    df['day_of_week'] = df.index.dayofweek

    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    month_ax, weekday_ax = axes

    # Left panel: average sales for each month value
    monthly_avg = df.groupby('month')['sales'].mean()
    monthly_avg.plot.bar(ax=month_ax)
    month_ax.set(
        title='Average Sales by Month',
        xlabel='Month',
        ylabel='Average Sales ($)',
    )

    # Right panel: average sales for each weekday value
    daily_avg = df.groupby('day_of_week')['sales'].mean()
    daily_avg.plot.bar(ax=weekday_ax)
    weekday_ax.set(
        title='Average Sales by Day of Week',
        xlabel='Day of Week (0=Monday)',
        ylabel='Average Sales ($)',
    )

    plt.tight_layout()
    plt.show()