In [1]:
import os

import pandas as pd
from sqlalchemy import create_engine, inspect, text

# Database Connection

Connect to the PostgreSQL database with the provided credentials. SQLAlchemy will be used for database operations in this notebook.

In [2]:
# Connect to PostgreSQL using SQLAlchemy.
# The connection URL is taken from the THELOOK_DB_URL environment variable so that
# real credentials do not have to be committed with the notebook. The original local
# development URL is kept as the fallback, so existing setups keep working unchanged.
DEFAULT_DB_URL = 'postgresql://azalea:azalea@localhost:5433/thelook_db'
db_url = os.environ.get('THELOOK_DB_URL', DEFAULT_DB_URL)

try:
    engine = create_engine(db_url)
    # create_engine() is lazy, so run a trivial query to prove the server is reachable
    with engine.connect() as conn:
        conn.execute(text('SELECT 1'))
    print("SQLAlchemy engine created successfully.")
except Exception as e:
    print(f"Failed to connect to database: {e}")
    print("\nPlease ensure that PostgreSQL database is running")
    # Re-raise so later cells do not run against a missing/broken engine
    raise

SQLAlchemy engine created successfully.


In [3]:
# Build a SQLAlchemy inspector for the engine and ask it for the table names.
inspector = inspect(engine)
table_names = inspector.get_table_names()
# Bare last expression: the notebook renders the list as this cell's output.
table_names

['users',
 'events',
 'orders',
 'distribution_centers',
 'products',
 'inventory_items',
 'order_items']

In [4]:
# Pull every table into memory as its own DataFrame variable.
def _read_table(table_name):
    """Return the full contents of ``table_name`` as a DataFrame via the shared engine."""
    return pd.read_sql_table(table_name, engine)


users = _read_table('users')
events = _read_table('events')
orders = _read_table('orders')
distribution_centers = _read_table('distribution_centers')
products = _read_table('products')
inventory_items = _read_table('inventory_items')
order_items = _read_table('order_items')

print('All tables read successfully.')

All tables read successfully.


In [5]:
# Release the engine's connections now that all tables are loaded into DataFrames.
# The name check keeps this cell harmless when the connection cell never ran.
if 'engine' in locals():
    try:
        engine.dispose()
        print("SQLAlchemy engine disposed.")
    except Exception as dispose_error:
        # Best effort: report the problem but let the notebook continue
        print(f"Error disposing SQLAlchemy engine: {dispose_error}")

SQLAlchemy engine disposed.


# Exploratory Data Analysis

## Data Overview and Quality Assessment

Understanding the structure and quality of the data.

In [6]:
# One ordered mapping drives both reports so they always list the same tables
# in the same order.
loaded_frames = {
    'users': users,
    'events': events,
    'orders': orders,
    'order_items': order_items,
    'products': products,
    'inventory_items': inventory_items,
    'distribution_centers': distribution_centers,
}

# Row/column counts; the label is the table name in title case ("order_items" -> "Order Items")
print("Dataset Shapes:")
for table_name, frame in loaded_frames.items():
    label = table_name.replace('_', ' ').title()
    print(f"{label}: {frame.shape}")

# In-memory footprint; deep=True also measures the contents of object columns
BYTES_PER_MB = 1024**2
print("\nMemory Usage (MB):")
for table_name, frame in loaded_frames.items():
    size_mb = frame.memory_usage(deep=True).sum() / BYTES_PER_MB
    print(f"{table_name}: {size_mb:.2f} MB")

Dataset Shapes:
Users: (100000, 16)
Events: (2428216, 13)
Orders: (125278, 9)
Order Items: (181578, 11)
Products: (29120, 9)
Inventory Items: (490176, 12)
Distribution Centers: (10, 5)

Memory Usage (MB):
users: 67.77 MB
events: 1362.93 MB
orders: 19.50 MB
order_items: 23.77 MB
products: 10.62 MB
inventory_items: 189.79 MB
distribution_centers: 0.00 MB


The datasets are large and detailed, especially `events`, which has about 2.4 million rows and uses the most memory (roughly 1.3 GB). The other tables range from tens of thousands to hundreds of thousands of rows, except for `distribution_centers`, which has only 10. Because the largest tables are so big, efficient filtering is important for analysis. Overall, the data is comprehensive and ready for business analytics.

In [7]:
# Data quality assessment
import numpy as np

def data_quality_summary(df, name):
    """Print a data-quality report for ``df`` and return its missing-value table.

    The report shows the frame's shape, the number of fully duplicated rows,
    the columns that contain nulls, and how many columns there are of each dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        The table to inspect.
    name : str
        Label used (upper-cased) in the report header.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one null, with the columns
        'Missing Count' and 'Missing %', sorted by 'Missing %' descending.
        Empty when no column has nulls.
    """
    banner = f"\n=== {name.upper()} DATA QUALITY ==="
    print(banner)
    print(f"Shape: {df.shape}")
    print(f"Duplicates: {df.duplicated().sum()}")

    # Per-column null counts and their share of all rows
    null_counts = df.isnull().sum()
    null_table = pd.DataFrame({
        'Missing Count': null_counts,
        'Missing %': (null_counts / len(df)) * 100,
    })

    # Keep only the columns that actually have gaps, worst first
    has_gaps = null_table['Missing Count'] > 0
    missing_info = null_table[has_gaps].sort_values('Missing %', ascending=False)

    if missing_info.empty:
        print("\nNo missing values found!")
    else:
        print("\nMissing Values:")
        print(missing_info)

    # How many columns there are of each dtype
    print("\nData Types:")
    print(df.dtypes.value_counts())

    return missing_info

# Run the quality report on the five tables below, in this order.
# `events` and `distribution_centers` are not part of this pass.
tables_to_check = {
    'users': users,
    'orders': orders,
    'order_items': order_items,
    'products': products,
    'inventory_items': inventory_items,
}
for table_name, frame in tables_to_check.items():
    data_quality_summary(frame, table_name)


=== USERS DATA QUALITY ===
Shape: (100000, 16)
Duplicates: 0

No missing values found!

Data Types:
object                 11
int64                   2
float64                 2
datetime64[ns, UTC]     1
Name: count, dtype: int64

=== ORDERS DATA QUALITY ===
Shape: (125278, 9)
Duplicates: 0

Missing Values:
              Missing Count  Missing %
returned_at          112833  90.066093
delivered_at          81219  64.831016
shipped_at            43559  34.769872

Data Types:
datetime64[ns, UTC]    4
int64                  3
object                 2
Name: count, dtype: int64

=== ORDER_ITEMS DATA QUALITY ===
Shape: (181578, 11)
Duplicates: 0

Missing Values:
              Missing Count  Missing %
returned_at          163615  90.107282
delivered_at         117660  64.798599
shipped_at            63029  34.711804

Data Types:
int64                  5
datetime64[ns, UTC]    4
object                 1
float64                1
Name: count, dtype: int64

=== PRODUCTS DATA QUALITY ===
Shape: (2

The data quality assessment shows:

- **Missing Values**: `users` has no missing values. In `orders` and `order_items`, the only columns with nulls are the fulfillment timestamps, and the gaps are large: `returned_at` (~90%), `delivered_at` (~65%), and `shipped_at` (~35%). Every other column in these tables, including the ID and price fields, is complete.

- **Data Types**: Appropriate data types throughout - numeric fields for calculations, text fields for categories, and timestamp fields for time series analysis.

- **Duplicates**: No duplicate records found, indicating proper data deduplication.

- **Overall**: The data quality is excellent for business analysis with minimal cleaning required. The completeness of critical fields enables comprehensive examination of customer behavior, product performance, and operational metrics.

Given the missing data patterns, we need to investigate whether these are truly missing values or represent natural business states (e.g., orders not yet shipped, products not yet sold).

In [8]:
# Missing Data Management and Analysis
print("=== MISSING DATA MANAGEMENT STRATEGY ===")
print("\n1. ORDERS TABLE - Analyzing fulfillment workflow missing data")

# Count null fulfillment timestamps per order status: if the gaps line up with
# status, they are a consequence of the order lifecycle rather than lost data.
fulfillment_columns = ['shipped_at', 'delivered_at', 'returned_at']
count_columns = ['Missing_Shipped', 'Missing_Delivered', 'Missing_Returned']

order_status_analysis = (
    orders[fulfillment_columns]
    .isnull()                      # True where the timestamp is missing
    .groupby(orders['status'])
    .sum()                         # number of missing values per status
    .astype(int)
)
order_status_analysis.columns = count_columns
order_status_analysis['Total_Orders'] = orders.groupby('status').size()

print("\nMissing fulfillment dates by order status:")
print(order_status_analysis)

# Express each missing count as a percentage of the orders in that status
for count_col in count_columns:
    missing_share = order_status_analysis[count_col] / order_status_analysis['Total_Orders']
    order_status_analysis[f'{count_col}_Pct'] = (missing_share * 100).round(1)

# Show every count next to its percentage, with the totals first
report_columns = ['Total_Orders']
for count_col in count_columns:
    report_columns += [count_col, f'{count_col}_Pct']

print("\nMissing fulfillment dates by order status (with percentages):")
print(order_status_analysis[report_columns])

=== MISSING DATA MANAGEMENT STRATEGY ===

1. ORDERS TABLE - Analyzing fulfillment workflow missing data

Missing fulfillment dates by order status:
            Missing_Shipped  Missing_Delivered  Missing_Returned  Total_Orders
status                                                                        
Cancelled             18714              18714             18714         18714
Complete                  0                  0             31614         31614
Processing            24845              24845             24845         24845
Returned                  0                  0                 0         12445
Shipped                   0              37660             37660         37660

Missing fulfillment dates by order status (with percentages):
            Total_Orders  Missing_Shipped  Missing_Shipped_Pct  \
status                                                           
Cancelled          18714            18714                100.0   
Complete           31614              

Analysis reveals that missing fulfillment dates represent the natural order processing workflow rather than data quality issues. Each status shows expected patterns:

- **Cancelled orders**: 100% missing all dates (never entered fulfillment)
- **Processing orders**: 100% missing all dates (awaiting shipment)
- **Shipped orders**: Have shipping dates but 100% missing delivered/returned dates
- **Complete orders**: No missing shipping/delivery dates, but 100% missing return dates
- **Returned orders**: Complete data across all date fields

These patterns confirm the database accurately tracks order lifecycle stages, with "missing" values actually serving as meaningful status indicators rather than data deficiencies.

In [11]:
print("2. ORDER_ITEMS TABLE - Analyzing item-level fulfillment data")

# Count null fulfillment timestamps per item status
item_date_columns = ['shipped_at', 'delivered_at', 'returned_at']
order_items_analysis = (
    order_items[item_date_columns]
    .isnull()                          # True where the timestamp is missing
    .groupby(order_items['status'])
    .sum()
    .astype(int)
)
order_items_analysis.columns = ['Missing_Shipped', 'Missing_Delivered', 'Missing_Returned']
order_items_analysis['Total_Items'] = order_items.groupby('status').size()

print("\nMissing fulfillment dates in order_items by status:")
print(order_items_analysis)

# NOTE: this cell does not cross-check order_items against their parent orders;
# it moves straight on to the inventory table.
print("\n\n3. INVENTORY_ITEMS TABLE - Analyzing sold_at missing data")

# A null sold_at is read here as "not sold yet"
sold_at_missing = inventory_items['sold_at'].isnull()
inventory_total = len(inventory_items)

inventory_sold_analysis = pd.DataFrame({
    'Total_Inventory_Items': [inventory_total],
    'Items_with_sold_at': [inventory_items['sold_at'].notna().sum()],
    'Items_NOT_sold': [sold_at_missing.sum()],
    'Percentage_Unsold': [(sold_at_missing.sum() / inventory_total * 100).round(1)],
})

print("\nInventory items sold status:")
print(inventory_sold_analysis)

# Sanity check: sold + unsold should add up to the total.
# (notna and isnull partition the rows, so this holds by construction.)
items_sold = inventory_sold_analysis.loc[0, 'Items_with_sold_at']
items_not_sold = inventory_sold_analysis.loc[0, 'Items_NOT_sold']
total_sum = items_sold + items_not_sold

print(f"\nItems with sold_at: {items_sold:,}")
print(f"Items NOT sold: {items_not_sold:,}")
print(f"Sum of both: {total_sum:,}")
print(f"Total inventory items (verification): {len(inventory_items):,}")
print(f"\nVerification: Sum equals total inventory items: {total_sum == len(inventory_items)}")

2. ORDER_ITEMS TABLE - Analyzing item-level fulfillment data

Missing fulfillment dates in order_items by status:
            Missing_Shipped  Missing_Delivered  Missing_Returned  Total_Items
status                                                                       
Cancelled             27190              27190             27190        27190
Complete                  0                  0             45955        45955
Processing            35839              35839             35839        35839
Returned                  0                  0                 0        17963
Shipped                   0              54631             54631        54631


3. INVENTORY_ITEMS TABLE - Analyzing sold_at missing data

Inventory items sold status:
   Total_Inventory_Items  Items_with_sold_at  Items_NOT_sold  \
0                 490176              181578          308598   

   Percentage_Unsold  
0               63.0  

Items with sold_at: 181,578
Items NOT sold: 308,598
Sum of both: 490,176
T

**Order Items Processing Flow**
- **Cancelled items**: 100% missing all fulfillment dates (27,190 items)
- **Processing items**: 100% missing all dates as they haven't been shipped (35,839 items)
- **Shipped items**: Have shipping dates but await delivery (54,631 items)
- **Complete items**: Full shipping and delivery information, with no return date (45,955 items)
- **Returned items**: Complete history across all date fields (17,963 items)

**Inventory Management Status**
- **37% of inventory sold**: 181,578 items have sold_at dates
- **63% of inventory unsold**: 308,598 items remain available in stock
- **Total inventory**: 490,176 items

These patterns indicate that missing date values represent meaningful business states in the order processing and inventory management workflows rather than data quality issues.

In [12]:
print("4. PRODUCTS TABLE - Analyzing missing product data")

# Null count and percentage for every products column
products_missing = products.isnull().sum()
products_missing_pct = (products_missing / len(products) * 100).round(2)

# Keep only the columns that have gaps, largest percentage first
products_missing_df = (
    pd.DataFrame({
        'Missing_Count': products_missing,
        'Missing_Percentage': products_missing_pct,
    })
    .loc[lambda table: table['Missing_Count'] > 0]
    .sort_values('Missing_Percentage', ascending=False)
)

print("\nMissing data in products table:")
print(products_missing_df)

# For each column with gaps, see whether the nulls cluster in particular
# categories or departments.
if not products_missing_df.empty:
    print("\nAnalyzing patterns in missing product data:")

    for col in products_missing_df.index:
        if col not in products.columns:
            continue

        # True where this column's value is missing
        null_flags = products[col].isnull()

        # Nulls per category, keeping only categories that have any
        missing_by_category = (
            null_flags.groupby(products['category']).sum()
            .loc[lambda counts: counts > 0]
            .sort_values(ascending=False)
        )
        if not missing_by_category.empty:
            print(f"\n{col} missing by category (top 5):")
            print(missing_by_category.head())

        # Nulls per department, keeping only departments that have any
        missing_by_dept = (
            null_flags.groupby(products['department']).sum()
            .loc[lambda counts: counts > 0]
            .sort_values(ascending=False)
        )
        if not missing_by_dept.empty:
            print(f"\n{col} missing by department:")
            print(missing_by_dept)

4. PRODUCTS TABLE - Analyzing missing product data

Missing data in products table:
       Missing_Count  Missing_Percentage
brand             24                0.08
name               2                0.01

Analyzing patterns in missing product data:

brand missing by category (top 5):
category
Intimates            4
Tops & Tees          4
Outerwear & Coats    3
Swim                 2
Accessories          2
Name: brand, dtype: int64

brand missing by department:
department
Men      12
Women    12
Name: brand, dtype: int64

name missing by category (top 5):
category
Intimates            1
Outerwear & Coats    1
Name: name, dtype: int64

name missing by department:
department
Men      1
Women    1
Name: name, dtype: int64


The products table shows excellent data quality with very limited missing information: only `brand` (24 rows, 0.08%) and `name` (2 rows, 0.01%) have gaps. The few missing values are evenly distributed across departments, suggesting random omissions rather than systematic data issues.