# Brazilian E-commerce Data Exploration
## Phase 1: Understanding the Olist Dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from pathlib import Path

# Set up plotting
plt.style.use('default')
sns.set_palette('husl')
%matplotlib inline

## 1. Load All Dataset Files
The Brazilian e-commerce dataset contains 9 CSV files. Let's load and explore each one.

In [2]:
# Define data directory
data_dir = Path('../data/raw')

# Load all datasets
datasets = {}
file_list = [
    'olist_orders_dataset.csv',
    'olist_customers_dataset.csv',
    'olist_products_dataset.csv',
    'olist_order_items_dataset.csv',
    'olist_order_reviews_dataset.csv',
    'olist_sellers_dataset.csv',
    'product_category_name_translation.csv',
    'olist_geolocation_dataset.csv',
    'olist_order_payments_dataset.csv'
]

for file in file_list:
    try:
        df_name = file.replace('olist_', '').replace('_dataset.csv', '').replace('.csv', '')
        datasets[df_name] = pd.read_csv(data_dir / file)
        print(f"‚úÖ Loaded {file}: {datasets[df_name].shape}")
    except FileNotFoundError:
        print(f"‚ùå File not found: {file}")
        print(f"   Please download from: https://www.kaggle.com/datasets/olistbr/brazilian-ecommerce")
        print(f"   And extract to: {data_dir}")

‚úÖ Loaded olist_orders_dataset.csv: (99441, 8)
‚úÖ Loaded olist_customers_dataset.csv: (99441, 5)
‚úÖ Loaded olist_products_dataset.csv: (32951, 9)
‚úÖ Loaded olist_customers_dataset.csv: (99441, 5)
‚úÖ Loaded olist_products_dataset.csv: (32951, 9)
‚úÖ Loaded olist_order_items_dataset.csv: (112650, 7)
‚úÖ Loaded olist_order_items_dataset.csv: (112650, 7)
‚úÖ Loaded olist_order_reviews_dataset.csv: (99224, 7)
‚úÖ Loaded olist_sellers_dataset.csv: (3095, 4)
‚úÖ Loaded product_category_name_translation.csv: (71, 2)
‚úÖ Loaded olist_order_reviews_dataset.csv: (99224, 7)
‚úÖ Loaded olist_sellers_dataset.csv: (3095, 4)
‚úÖ Loaded product_category_name_translation.csv: (71, 2)
‚úÖ Loaded olist_geolocation_dataset.csv: (1000163, 5)
‚úÖ Loaded olist_geolocation_dataset.csv: (1000163, 5)
‚úÖ Loaded olist_order_payments_dataset.csv: (103886, 5)
‚úÖ Loaded olist_order_payments_dataset.csv: (103886, 5)


## 2. Dataset Overview
Let's examine the structure and size of each dataset.

In [3]:
# Overview of all datasets
overview_data = []
for name, df in datasets.items():
    overview_data.append({
        'Dataset': name,
        'Rows': f"{df.shape[0]:,}",
        'Columns': df.shape[1],
        'Memory (MB)': f"{df.memory_usage(deep=True).sum() / 1024**2:.1f}"
    })

overview_df = pd.DataFrame(overview_data)
print("üìä Dataset Overview:")
print(overview_df.to_string(index=False))

üìä Dataset Overview:
                          Dataset      Rows  Columns Memory (MB)
                           orders    99,441        8        52.9
                        customers    99,441        5        26.6
                         products    32,951        9         6.3
                      order_items   112,650        7        36.0
                    order_reviews    99,224        7        39.1
                          sellers     3,095        4         0.6
product_category_name_translation        71        2         0.0
                      geolocation 1,000,163        5       129.4
                   order_payments   103,886        5        16.2


## 3. Orders Analysis - Core Business Data

In [4]:
if 'orders' in datasets:
    orders = datasets['orders']
    print("üõí Orders Dataset Structure:")
    print(orders.info())
    print("\nüìà Sample Data:")
    print(orders.head())

üõí Orders Dataset Structure:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   order_id                       99441 non-null  object
 1   customer_id                    99441 non-null  object
 2   order_status                   99441 non-null  object
 3   order_purchase_timestamp       99441 non-null  object
 4   order_approved_at              99281 non-null  object
 5   order_delivered_carrier_date   97658 non-null  object
 6   order_delivered_customer_date  96476 non-null  object
 7   order_estimated_delivery_date  99441 non-null  object
dtypes: object(8)
memory usage: 6.1+ MB
None

üìà Sample Data:
                           order_id                       customer_id  \
0  e481f51cbdc54678b7cc49136f2d6af7  9ef432eb6251297304e76186b10a928d   
1  53cdb2fc8bc7dce0b6741e2150273451  b0830fb4747a6c6d20dea0b8

## 4. Geographic Distribution - Perfect for Multi-Region
Brazilian states will represent our 'regions' for the multi-region deployment.

In [5]:
if 'customers' in datasets:
    customers = datasets['customers']
    
    # Geographic distribution
    state_dist = customers['customer_state'].value_counts()
    print("üó∫Ô∏è Customer Distribution by State:")
    print(state_dist.head(10))
    
    # Visualize top states
    fig = px.bar(
        x=state_dist.head(10).index,
        y=state_dist.head(10).values,
        title="Top 10 States by Customer Count",
        labels={'x': 'State', 'y': 'Number of Customers'}
    )
    fig.show()
    
    # Define regions for multi-region simulation
    region_mapping = {
        'SP': 'Southeast',  # S√£o Paulo - Primary Region
        'RJ': 'Southeast',  # Rio de Janeiro
        'MG': 'Southeast',  # Minas Gerais
        'RS': 'South',      # Rio Grande do Sul - Secondary Region
        'PR': 'South',      # Paran√°
        'SC': 'South',      # Santa Catarina
        'BA': 'Northeast',  # Bahia
        'GO': 'Central-West', # Goi√°s
        'DF': 'Central-West'  # Distrito Federal
    }
    
    print("\nüåé Regional Mapping for Multi-Region Architecture:")
    for state, region in region_mapping.items():
        count = state_dist.get(state, 0)
        print(f"  {state} ({region}): {count:,} customers")

üó∫Ô∏è Customer Distribution by State:
customer_state
SP    41746
RJ    12852
MG    11635
RS     5466
PR     5045
SC     3637
BA     3380
DF     2140
ES     2033
GO     2020
Name: count, dtype: int64



üåé Regional Mapping for Multi-Region Architecture:
  SP (Southeast): 41,746 customers
  RJ (Southeast): 12,852 customers
  MG (Southeast): 11,635 customers
  RS (South): 5,466 customers
  PR (South): 5,045 customers
  SC (South): 3,637 customers
  BA (Northeast): 3,380 customers
  GO (Central-West): 2,020 customers
  DF (Central-West): 2,140 customers


## 5. Product Catalog Analysis

In [6]:
if 'products' in datasets and 'product_category_name_translation' in datasets:
    products = datasets['products']
    translations = datasets['product_category_name_translation']
    
    # Merge with translations for English category names
    products_with_english = products.merge(
        translations, 
        left_on='product_category_name', 
        right_on='product_category_name_portuguese',
        how='left'
    )
    
    print("üì¶ Product Categories (Top 15):")
    category_counts = products_with_english['product_category_name_english'].value_counts()
    print(category_counts.head(15))
    
    # Visualize categories
    fig = px.bar(
        x=category_counts.head(15).values,
        y=category_counts.head(15).index,
        orientation='h',
        title="Top 15 Product Categories",
        labels={'x': 'Number of Products', 'y': 'Category'}
    )
    fig.show()

KeyError: 'product_category_name_portuguese'

## 6. Order Timeline - Data for Recommendation Engine

In [7]:
if 'orders' in datasets:
    orders['order_purchase_timestamp'] = pd.to_datetime(orders['order_purchase_timestamp'])
    
    # Timeline analysis
    orders_by_month = orders.set_index('order_purchase_timestamp').resample('M').size()
    
    print("üìÖ Order Volume by Month:")
    print(orders_by_month.tail(12))
    
    # Plot timeline
    fig = px.line(
        x=orders_by_month.index,
        y=orders_by_month.values,
        title="Order Volume Over Time",
        labels={'x': 'Date', 'y': 'Number of Orders'}
    )
    fig.show()

üìÖ Order Volume by Month:
order_purchase_timestamp
2017-11-30    7544
2017-12-31    5673
2018-01-31    7269
2018-02-28    6728
2018-03-31    7211
2018-04-30    6939
2018-05-31    6873
2018-06-30    6167
2018-07-31    6292
2018-08-31    6512
2018-09-30      16
2018-10-31       4
Freq: ME, dtype: int64



'M' is deprecated and will be removed in a future version, please use 'ME' instead.



## 7. Reviews & Ratings - Recommendation System Data

In [8]:
if 'order_reviews' in datasets:
    reviews = datasets['order_reviews']
    
    print("‚≠ê Review Score Distribution:")
    rating_dist = reviews['review_score'].value_counts().sort_index()
    print(rating_dist)
    
    # Visualize ratings
    fig = px.bar(
        x=rating_dist.index,
        y=rating_dist.values,
        title="Review Score Distribution",
        labels={'x': 'Rating (1-5 stars)', 'y': 'Number of Reviews'}
    )
    fig.show()
    
    print(f"\nüìä Review Statistics:")
    print(f"  Total Reviews: {len(reviews):,}")
    print(f"  Average Rating: {reviews['review_score'].mean():.2f}")
    print(f"  Rating Standard Deviation: {reviews['review_score'].std():.2f}")

‚≠ê Review Score Distribution:
review_score
1    11424
2     3151
3     8179
4    19142
5    57328
Name: count, dtype: int64



üìä Review Statistics:
  Total Reviews: 99,224
  Average Rating: 4.09
  Rating Standard Deviation: 1.35


## 8. Data Quality Assessment

In [9]:
print("üîç Data Quality Assessment:")
print("=" * 50)

for name, df in datasets.items():
    null_counts = df.isnull().sum()
    null_percentage = (null_counts / len(df)) * 100
    
    print(f"\nüìã {name.upper()}:")
    print(f"  Total Rows: {len(df):,}")
    print(f"  Columns with Missing Data:")
    
    missing_data = null_percentage[null_percentage > 0].sort_values(ascending=False)
    if len(missing_data) == 0:
        print("    ‚úÖ No missing data!")
    else:
        for col, pct in missing_data.head(5).items():
            print(f"    - {col}: {pct:.1f}% missing")

üîç Data Quality Assessment:

üìã ORDERS:
  Total Rows: 99,441
  Columns with Missing Data:
    - order_delivered_customer_date: 3.0% missing
    - order_delivered_carrier_date: 1.8% missing
    - order_approved_at: 0.2% missing

üìã CUSTOMERS:
  Total Rows: 99,441
  Columns with Missing Data:
    ‚úÖ No missing data!

üìã PRODUCTS:
  Total Rows: 32,951
  Columns with Missing Data:
    - product_category_name: 1.9% missing
    - product_name_lenght: 1.9% missing
    - product_description_lenght: 1.9% missing
    - product_photos_qty: 1.9% missing
    - product_weight_g: 0.0% missing

üìã ORDER_ITEMS:
  Total Rows: 112,650
  Columns with Missing Data:
    ‚úÖ No missing data!

üìã ORDER_REVIEWS:
  Total Rows: 99,224
  Columns with Missing Data:
    - review_comment_title: 88.3% missing
    - review_comment_message: 58.7% missing

üìã SELLERS:
  Total Rows: 3,095
  Columns with Missing Data:
    ‚úÖ No missing data!

üìã PRODUCT_CATEGORY_NAME_TRANSLATION:
  Total Rows: 71
  Colum

## 9. Key Insights for Multi-Region Architecture

In [11]:
print("üéØ KEY INSIGHTS FOR MULTI-REGION PROJECT:")
print("=" * 60)

if 'orders' in datasets and 'customers' in datasets:
    # Merge orders with customers for regional analysis
    order_customers = orders.merge(customers, on='customer_id', how='left')
    
    print("\nüìç REGIONAL DISTRIBUTION:")
    regional_orders = order_customers['customer_state'].value_counts()
    
    # Define primary and secondary regions
    southeast_states = ['SP', 'RJ', 'MG', 'ES']
    south_states = ['RS', 'PR', 'SC']
    
    southeast_orders = regional_orders[regional_orders.index.isin(southeast_states)].sum()
    south_orders = regional_orders[regional_orders.index.isin(south_states)].sum()
    
    print(f"  üè¢ PRIMARY REGION (Southeast): {southeast_orders:,} orders ({southeast_orders/len(orders)*100:.1f}%)")
    print(f"  üè¢ SECONDARY REGION (South): {south_orders:,} orders ({south_orders/len(orders)*100:.1f}%)")
    
    print("\nüí° ARCHITECTURE RECOMMENDATIONS:")
    print(f"  - Deploy primary MinIO cluster for Southeast region")
    print(f"  - Deploy secondary MinIO cluster for South region")
    print(f"  - Implement regional routing based on customer_state")
    print(f"  - Use {southeast_orders:,} Southeast orders for primary region testing")
    print(f"  - Use {south_orders:,} South orders for secondary region testing")

if 'order_reviews' in datasets:
    print("\nü§ñ RECOMMENDATION SYSTEM DATA:")
    print(f"  - {len(reviews):,} customer reviews (ratings 1-5)")
    print(f"  - Average rating: {reviews['review_score'].mean():.2f}/5.0")
    print(f"  - Perfect for collaborative filtering algorithms")


üéØ KEY INSIGHTS FOR MULTI-REGION PROJECT:

üìç REGIONAL DISTRIBUTION:
  üè¢ PRIMARY REGION (Southeast): 68,266 orders (68.6%)
  üè¢ SECONDARY REGION (South): 14,148 orders (14.2%)

üí° ARCHITECTURE RECOMMENDATIONS:
  - Deploy primary MinIO cluster for Southeast region
  - Deploy secondary MinIO cluster for South region
  - Implement regional routing based on customer_state
  - Use 68,266 Southeast orders for primary region testing
  - Use 14,148 South orders for secondary region testing

ü§ñ RECOMMENDATION SYSTEM DATA:
  - 99,224 customer reviews (ratings 1-5)
  - Average rating: 4.09/5.0
  - Perfect for collaborative filtering algorithms
