## Análise de e-commerce brasileiro - Olist Dataset
### Limpeza e preparação de dados
Equipe: André Braga & Cecília Medeiros.

Imports e Configuração inicial

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Configuração de visualização
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

Carregando dados

In [None]:
print("="*80)
print("ANÁLISE DE E-COMMERCE BRASILEIRO - OLIST DATASET")
print("Equipe Cedré")
print("="*80)
print()

print("1. CARREGANDO DADOS...")
print("-"*80)

orders = pd.read_csv('data/olist_orders_dataset.csv')
order_items = pd.read_csv('data/olist_order_items_dataset.csv')
order_payments = pd.read_csv('data/olist_order_payments_dataset.csv')
customers = pd.read_csv('data/olist_customers_dataset.csv')
products = pd.read_csv('data/olist_products_dataset.csv')
sellers = pd.read_csv('data/olist_sellers_dataset.csv')
category_translation = pd.read_csv('data/product_category_name_translation.csv')

print(f"✓ Orders: {orders.shape[0]:,} registros, {orders.shape[1]} colunas")
print(f"✓ Order Items: {order_items.shape[0]:,} registros, {order_items.shape[1]} colunas")
print(f"✓ Order Payments: {order_payments.shape[0]:,} registros, {order_payments.shape[1]} colunas")
print(f"✓ Customers: {customers.shape[0]:,} registros, {customers.shape[1]} colunas")
print(f"✓ Products: {products.shape[0]:,} registros, {products.shape[1]} colunas")
print(f"✓ Sellers: {sellers.shape[0]:,} registros, {sellers.shape[1]} colunas")
print()

NameError: name 'pd' is not defined

Análise de qualidade de dados

In [4]:
print("2. ANÁLISE DE QUALIDADE DOS DADOS")
print("-"*80)

print("\n2.1. Valores Ausentes (Orders):")
missing_orders = orders.isnull().sum()
missing_pct = (missing_orders / len(orders) * 100).round(2)
missing_df = pd.DataFrame({
    'Missing': missing_orders,
    'Percentage': missing_pct
})
print(missing_df[missing_df['Missing'] > 0])

print("\n2.2. Valores Ausentes (Order Items):")
missing_items = order_items.isnull().sum()
print(f"Total missing: {missing_items.sum()}")

print("\n2.3. Duplicatas:")
print(f"Orders duplicados: {orders.duplicated(subset='order_id').sum()}")
print(f"Customers duplicados: {customers.duplicated(subset='customer_id').sum()}")
print(f"Products duplicados: {products.duplicated(subset='product_id').sum()}")
print()

print("3. CONVERSÃO DE TIPOS DE DADOS")
print("-"*80)

date_columns = [
    'order_purchase_timestamp',
    'order_approved_at',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
    'order_estimated_delivery_date'
]

for col in date_columns:
    orders[col] = pd.to_datetime(orders[col], errors='coerce')
    print(f"✓ Convertido: {col}")

order_items['shipping_limit_date'] = pd.to_datetime(order_items['shipping_limit_date'], errors='coerce')
print(f"✓ Convertido: shipping_limit_date")
print()

print("4. TRATAMENTO DE STRINGS")
print("-"*80)



2. ANÁLISE DE QUALIDADE DOS DADOS
--------------------------------------------------------------------------------

2.1. Valores Ausentes (Orders):
                               Missing  Percentage
order_approved_at                  160        0.16
order_delivered_carrier_date      1783        1.79
order_delivered_customer_date     2965        2.98

2.2. Valores Ausentes (Order Items):
Total missing: 0

2.3. Duplicatas:
Orders duplicados: 0
Customers duplicados: 0
Products duplicados: 0

3. CONVERSÃO DE TIPOS DE DADOS
--------------------------------------------------------------------------------
✓ Convertido: order_purchase_timestamp
✓ Convertido: order_approved_at
✓ Convertido: order_delivered_carrier_date
✓ Convertido: order_delivered_customer_date
✓ Convertido: order_estimated_delivery_date
✓ Convertido: shipping_limit_date

4. TRATAMENTO DE STRINGS
--------------------------------------------------------------------------------


Trimming e padronização

In [5]:
orders['order_status'] = orders['order_status'].str.strip().str.lower()
order_payments['payment_type'] = order_payments['payment_type'].str.strip().str.lower()
customers['customer_state'] = customers['customer_state'].str.strip().str.upper()
sellers['seller_state'] = sellers['seller_state'].str.strip().str.upper()

print("Strings padronizadas (trim, case)")
print()

Strings padronizadas (trim, case)



Merge de datasets

In [6]:
print("5. MERGE DOS DATASETS")
print("-"*80)

# orders + order_items + order_payments
df = orders.merge(order_items, on='order_id', how='left')
print(f"Merged orders + order_items: {df.shape[0]:,} registros")

payments_agg = order_payments.groupby('order_id').agg({
    'payment_type': lambda x: x.mode()[0] if len(x.mode()) > 0 else x.iloc[0],
    'payment_value': 'sum',
    'payment_installments': 'max'
}).reset_index()

df = df.merge(payments_agg, on='order_id', how='left')
print(f"Merged payments: {df.shape[0]:,} registros")

df = df.merge(customers[['customer_id', 'customer_state', 'customer_city']], 
              on='customer_id', how='left')
print(f"Merged customers: {df.shape[0]:,} registros")

products_merged = products.merge(category_translation, 
                                  left_on='product_category_name',
                                  right_on='product_category_name',
                                  how='left')
df = df.merge(products_merged[['product_id', 'product_category_name_english']], 
              on='product_id', how='left')
print(f"Merged products: {df.shape[0]:,} registros")

df = df.merge(sellers[['seller_id', 'seller_state', 'seller_city']], 
              on='seller_id', how='left')
print(f"Merged sellers: {df.shape[0]:,} registros")
print()

5. MERGE DOS DATASETS
--------------------------------------------------------------------------------
Merged orders + order_items: 113,425 registros
Merged payments: 113,425 registros
Merged customers: 113,425 registros
Merged products: 113,425 registros
Merged sellers: 113,425 registros



Feature Engineering

In [7]:
print("6. FEATURE ENGINEERING")
print("-"*80)

# 6.1. Métricas de receita
df['subtotal'] = df['price']
df['freight'] = df['freight_value']
df['total'] = df['price'] + df['freight_value']

print("Criadas: subtotal, freight, total")

# 6.2. Métricas de entrega
df['delivery_lead_time'] = (df['order_delivered_customer_date'] - 
                             df['order_purchase_timestamp']).dt.days

df['delivery_delay_days'] = (df['order_delivered_customer_date'] - 
                               df['order_estimated_delivery_date']).dt.days

df['is_late'] = (df['delivery_delay_days'] > 0).astype(int)

print("Criadas: delivery_lead_time, delivery_delay_days, is_late")

# 6.3. Status de confirmação
df['is_confirmed'] = (df['order_status'] == 'delivered').astype(int)
df['is_canceled'] = (df['order_status'] == 'canceled').astype(int)

print("Criadas: is_confirmed, is_canceled")

# 6.4. Take-rate de frete
df['freight_share'] = df['freight'] / df['total']
df['freight_share'] = df['freight_share'].replace([np.inf, -np.inf], np.nan)

print("Criada: freight_share")

# 6.5. Temporal features
df['order_month'] = df['order_purchase_timestamp'].dt.to_period('M')
df['order_year'] = df['order_purchase_timestamp'].dt.year
df['order_month_num'] = df['order_purchase_timestamp'].dt.month
df['order_weekday'] = df['order_purchase_timestamp'].dt.dayofweek
df['order_hour'] = df['order_purchase_timestamp'].dt.hour

print("Criadas: order_month, order_year, order_month_num, order_weekday, order_hour")

# 6.6. Região geográfica (simplificada)
regiao_map = {
    'SP': 'Sudeste', 'RJ': 'Sudeste', 'MG': 'Sudeste', 'ES': 'Sudeste',
    'PR': 'Sul', 'SC': 'Sul', 'RS': 'Sul',
    'BA': 'Nordeste', 'PE': 'Nordeste', 'CE': 'Nordeste', 'MA': 'Nordeste',
    'RN': 'Nordeste', 'PB': 'Nordeste', 'SE': 'Nordeste', 'AL': 'Nordeste', 'PI': 'Nordeste',
    'GO': 'Centro-Oeste', 'MT': 'Centro-Oeste', 'MS': 'Centro-Oeste', 'DF': 'Centro-Oeste',
    'AM': 'Norte', 'PA': 'Norte', 'RO': 'Norte', 'AC': 'Norte', 'RR': 'Norte', 
    'AP': 'Norte', 'TO': 'Norte'
}
df['customer_region'] = df['customer_state'].map(regiao_map)

print("Criada: customer_region")
print()

6. FEATURE ENGINEERING
--------------------------------------------------------------------------------
Criadas: subtotal, freight, total
Criadas: delivery_lead_time, delivery_delay_days, is_late
Criadas: is_confirmed, is_canceled
Criada: freight_share
Criadas: order_month, order_year, order_month_num, order_weekday, order_hour
Criada: customer_region



Tratamento de Outliers

In [8]:
print("7. IDENTIFICAÇÃO DE OUTLIERS")
print("-"*80)

def detect_outliers_iqr(data, column):
    
    """
        Detecta outliers usando método IQR
    """

    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = ((data[column] < lower_bound) | (data[column] > upper_bound)).sum()
    return outliers, lower_bound, upper_bound

# Análise de outliers em variáveis chave
outlier_cols = ['total', 'freight', 'delivery_lead_time', 'payment_value']
for col in outlier_cols:
    if col in df.columns:
        n_outliers, lower, upper = detect_outliers_iqr(df.dropna(subset=[col]), col)
        pct = (n_outliers / df[col].notna().sum() * 100)
        print(f"{col}: {n_outliers:,} outliers ({pct:.2f}%) | Range: [{lower:.2f}, {upper:.2f}]")

print("\nOutliers identificados, mas mantidos para análise completa")
print()

7. IDENTIFICAÇÃO DE OUTLIERS
--------------------------------------------------------------------------------
total: 8,253 outliers (7.33%) | Range: [-98.86, 312.01]
freight: 12,134 outliers (10.77%) | Range: [0.98, 33.25]
delivery_lead_time: 5,560 outliers (5.05%) | Range: [-7.50, 28.50]
payment_value: 9,285 outliers (8.19%) | Range: [-128.84, 389.88]

Outliers identificados, mas mantidos para análise completa



Filtros e limpeza final

In [9]:
print("8. FILTROS E LIMPEZA FINAL")
print("-"*80)

# Remover pedidos sem informações críticas
df_clean = df.copy()
initial_count = len(df_clean)

# Manter apenas pedidos com status válido
valid_status = ['delivered', 'canceled', 'shipped', 'invoiced', 'processing']
df_clean = df_clean[df_clean['order_status'].isin(valid_status)]
print(f"Filtrado por status válido: {initial_count:,} → {len(df_clean):,}")

# Remover registros com valores negativos
df_clean = df_clean[df_clean['price'] >= 0]
df_clean = df_clean[df_clean['freight_value'] >= 0]
print(f"Removidos valores negativos: {len(df_clean):,} registros")

print()

8. FILTROS E LIMPEZA FINAL
--------------------------------------------------------------------------------
Filtrado por status válido: 113,425 → 112,807
Removidos valores negativos: 112,640 registros



Salvar dados limpos

In [10]:
print("9. SALVANDO DADOS LIMPOS")
print("-"*80)

df_clean.to_csv('data_cleaned.csv', index=False)
print(f"✓ Arquivo salvo: data_cleaned.csv ({len(df_clean):,} registros)")

# Salvar também versão agregada por pedido
df_orders = df_clean.groupby('order_id').agg({
    'customer_id': 'first',
    'order_status': 'first',
    'order_purchase_timestamp': 'first',
    'order_delivered_customer_date': 'first',
    'order_estimated_delivery_date': 'first',
    'payment_type': 'first',
    'payment_value': 'first',
    'payment_installments': 'first',
    'customer_state': 'first',
    'customer_region': 'first',
    'total': 'sum',
    'freight': 'sum',
    'subtotal': 'sum',
    'delivery_lead_time': 'first',
    'delivery_delay_days': 'first',
    'is_late': 'first',
    'is_confirmed': 'first',
    'is_canceled': 'first',
    'order_month': 'first',
    'order_year': 'first'
}).reset_index()

df_orders.to_csv('orders_aggregated.csv', index=False)
print(f"✓ Arquivo salvo: orders_aggregated.csv ({len(df_orders):,} pedidos únicos)")
print()

9. SALVANDO DADOS LIMPOS
--------------------------------------------------------------------------------
✓ Arquivo salvo: data_cleaned.csv (112,640 registros)
✓ Arquivo salvo: orders_aggregated.csv (98,658 pedidos únicos)



Resumo final

In [11]:
print("10. RESUMO FINAL")
print("="*80)
print(f"Dataset original: {initial_count:,} registros")
print(f"Dataset limpo: {len(df_clean):,} registros")
print(f"Pedidos únicos: {len(df_orders):,}")
print(f"Período: {df_clean['order_purchase_timestamp'].min()} a {df_clean['order_purchase_timestamp'].max()}")
print(f"Estados: {df_clean['customer_state'].nunique()}")
print(f"Categorias: {df_clean['product_category_name_english'].nunique()}")
print("="*80)
print("✓ LIMPEZA E PREPARAÇÃO CONCLUÍDA COM SUCESSO!")
print("="*80)


10. RESUMO FINAL
Dataset original: 113,425 registros
Dataset limpo: 112,640 registros
Pedidos únicos: 98,658
Período: 2016-09-04 21:15:19 a 2018-09-03 09:06:57
Estados: 27
Categorias: 71
✓ LIMPEZA E PREPARAÇÃO CONCLUÍDA COM SUCESSO!
