# Análise de Performance Logística e Entregas - Olist E-Commerce

**Autor:** André Bomfim  
**Data:** Novembro 2024  
**Objetivo:** Analisar SLA de entrega, atrasos, correlação com NPS e identificar gargalos logísticos para otimizar experiência do cliente

---

## Índice

1. [Setup e Configuração](#1-setup)
2. [Extração e Cálculo de Prazos](#2-extracao)
3. [SLA Compliance e Distribuição](#3-sla)
4. [Performance por Estado](#4-estados)
5. [Análise de Rotas](#5-rotas)
6. [Correlação Atraso vs NPS](#6-nps)
7. [Análise de Frete](#7-frete)
8. [Insights e Recomendações](#8-insights)

## <a id="1-setup"></a>1. Setup e Configuração

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
from datetime import datetime, timedelta
import warnings
from dotenv import load_dotenv
import os
from scipy.stats import pearsonr
import scipy.stats as stats

warnings.filterwarnings('ignore')

# Configuração de visualização
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
%matplotlib inline

# Configuração do pandas
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', '{:.2f}'.format)

# Carregar variáveis de ambiente
load_dotenv()

# Configuração BigQuery
PROJECT_ID = os.getenv('GCP_PROJECT_ID')
DATASET_ID = os.getenv('GCP_DATASET_ID', 'olist_ecommerce')

# Cliente BigQuery
client = bigquery.Client(project=PROJECT_ID)

# Criar diretório para imagens
os.makedirs('../docs/images', exist_ok=True)

# SLA Crítico (encontrado em análises anteriores)
SLA_CRITICAL = 15  # dias - após isso, NPS cai 40%

print(f"✓ Setup completo - Projeto: {PROJECT_ID}, Dataset: {DATASET_ID}")
print(f"✓ SLA Crítico definido: {SLA_CRITICAL} dias")

In [None]:
def query_bigquery(query: str) -> pd.DataFrame:
    """Run ``query`` through the module-level BigQuery client and return the rows as a DataFrame."""
    job = client.query(query)
    return job.to_dataframe()

def save_plot(fig, filename):
    """Save ``fig`` under ../docs/images/ at 300 dpi with a tight bounding box.

    Best effort: any failure is reported with print() instead of being raised.
    """
    save_options = {'dpi': 300, 'bbox_inches': 'tight'}
    try:
        target = f'../docs/images/{filename}'
        fig.savefig(target, **save_options)
        print(f"✓ Gráfico salvo: docs/images/{filename}")
    except Exception as err:
        print(f" Erro ao salvar gráfico: {err}")

def get_delay_bucket(delay_days):
    """Map a delay in days to its ordered bucket label.

    Args:
        delay_days: Scalar delay in days; values <= 0 count as on time.

    Returns:
        One of '0_on_time', '1_1-5_days', '2_6-10_days', '3_11-20_days',
        '4_21+_days', or None when ``delay_days`` is missing
        (None, NaN or pd.NA).
    """
    # NaN compares False against every threshold and would otherwise land in
    # the worst bucket; None and pd.NA would raise in the comparisons.
    if pd.isna(delay_days):
        return None
    if delay_days <= 0:
        return '0_on_time'
    elif delay_days <= 5:
        return '1_1-5_days'
    elif delay_days <= 10:
        return '2_6-10_days'
    elif delay_days <= 20:
        return '3_11-20_days'
    else:
        return '4_21+_days'

def calculate_correlation(x, y):
    """Compute Pearson's r and its p-value, skipping pairs with missing values.

    Args:
        x: Numeric sequence (list, ndarray or pandas Series).
        y: Numeric sequence of the same length; values are paired with ``x``
            by position.

    Returns:
        Tuple ``(r, p_value)``, or ``(None, None)`` when fewer than two
        complete pairs remain.

    Raises:
        ValueError: If ``x`` and ``y`` have different lengths.
    """
    # Normalise to float arrays so lists and pandas nullable dtypes work too;
    # missing entries become NaN.
    x_values = pd.Series(x).astype('float64').to_numpy()
    y_values = pd.Series(y).astype('float64').to_numpy()

    if x_values.shape != y_values.shape:
        raise ValueError("x and y must have the same length")

    # Keep only the pairs where both values are present
    complete = ~(np.isnan(x_values) | np.isnan(y_values))
    x_clean = x_values[complete]
    y_clean = y_values[complete]

    if len(x_clean) < 2:
        return None, None

    r, p_value = pearsonr(x_clean, y_clean)
    return r, p_value

def is_critical_order(row):
    """Return whether a row is critical: delay above SLA_CRITICAL days and review score below 3."""
    exceeds_sla = row['delay_days'] > SLA_CRITICAL
    # Same short-circuit as `and`: the score is only checked past the SLA.
    return (row['review_score'] < 3) if exceeds_sla else exceeds_sla

# Confirmation that the helper cell ran to completion
print(" Funções auxiliares carregadas")

## <a id="2-extracao"></a>2. Extração e Cálculo de Prazos

In [None]:
# Base delivery query: delivered orders with both delivery dates present,
# plus delivery/estimated/delay durations in whole days. Rows whose delivery
# time falls outside 0-60 days are dropped by the final WHERE clause.
# NOTE(review): the joins to order_items and order_reviews presumably yield
# one row per item (and per review), so an order can appear more than once
# and len(df_delivery) counts rows rather than distinct orders - confirm.
query_delivery_base = f"""
WITH delivery_base AS (
    SELECT 
        o.order_id,
        c.customer_state,
        s.seller_state,
        o.order_purchase_timestamp,
        o.order_delivered_customer_date,
        o.order_estimated_delivery_date,
        oi.freight_value,
        r.review_score
    FROM `{PROJECT_ID}.{DATASET_ID}.orders` o
    JOIN `{PROJECT_ID}.{DATASET_ID}.customers` c ON o.customer_id = c.customer_id
    JOIN `{PROJECT_ID}.{DATASET_ID}.order_items` oi ON o.order_id = oi.order_id
    JOIN `{PROJECT_ID}.{DATASET_ID}.sellers` s ON oi.seller_id = s.seller_id
    LEFT JOIN `{PROJECT_ID}.{DATASET_ID}.order_reviews` r ON o.order_id = r.order_id
    WHERE o.order_status = 'delivered'
    AND o.order_delivered_customer_date IS NOT NULL
    AND o.order_estimated_delivery_date IS NOT NULL
)
SELECT
    order_id,
    customer_state,
    seller_state,
    order_purchase_timestamp,
    order_delivered_customer_date,
    order_estimated_delivery_date,
    freight_value,
    review_score,
    DATE_DIFF(DATE(order_delivered_customer_date), DATE(order_purchase_timestamp), DAY) as delivery_days,
    DATE_DIFF(DATE(order_estimated_delivery_date), DATE(order_purchase_timestamp), DAY) as estimated_days,
    DATE_DIFF(DATE(order_delivered_customer_date), DATE(order_estimated_delivery_date), DAY) as delay_days
FROM delivery_base
WHERE DATE_DIFF(DATE(order_delivered_customer_date), DATE(order_purchase_timestamp), DAY) BETWEEN 0 AND 60
ORDER BY delay_days DESC
"""

df_delivery = query_bigquery(query_delivery_base)

# Derived metrics
df_delivery['is_delayed'] = df_delivery['delay_days'] > 0
df_delivery['delay_bucket'] = df_delivery['delay_days'].apply(get_delay_bucket)
# Same rule as is_critical_order(), applied column-wise instead of row by
# row. A missing review score never marks an order as critical; fillna()
# covers nullable dtypes, where the comparison yields NA instead of False.
df_delivery['is_critical'] = (
    (df_delivery['delay_days'] > SLA_CRITICAL) & (df_delivery['review_score'] < 3)
).fillna(False).astype(bool)

print(f" {len(df_delivery):,} entregas carregadas")
print(f" Período: {df_delivery['order_purchase_timestamp'].min()} a {df_delivery['order_purchase_timestamp'].max()}")
print(f" Estados clientes: {df_delivery['customer_state'].nunique()}")
print(f" Estados vendedores: {df_delivery['seller_state'].nunique()}")

In [None]:
# Descriptive statistics for the delivery base
print("="*70)
print("ESTATÍSTICAS DESCRITIVAS - BASE DE ENTREGAS")
print("="*70)

# Headline counts; each percentage is the mean of a boolean flag column
print(f"\n MÉTRICAS GLOBAIS:")
print(f"  • Total de pedidos entregues: {len(df_delivery):,}")
print(f"  • Pedidos atrasados: {df_delivery['is_delayed'].sum():,} ({df_delivery['is_delayed'].mean()*100:.1f}%)")
print(f"  • Pedidos críticos (atraso >{SLA_CRITICAL}d + NPS<3): {df_delivery['is_critical'].sum():,} ({df_delivery['is_critical'].mean()*100:.1f}%)")

# Actual vs estimated delivery time
print(f"\n PRAZOS DE ENTREGA (dias):")
print(f"  • Delivery days - Média: {df_delivery['delivery_days'].mean():.1f}, Mediana: {df_delivery['delivery_days'].median():.1f}")
print(f"  • Delivery days - P90: {df_delivery['delivery_days'].quantile(0.90):.1f}, P95: {df_delivery['delivery_days'].quantile(0.95):.1f}")
print(f"  • Estimated days - Média: {df_delivery['estimated_days'].mean():.1f}, Mediana: {df_delivery['estimated_days'].median():.1f}")

# Delay statistics, computed only over the delayed rows
print(f"\n  ATRASOS (dias):")
delayed_orders = df_delivery[df_delivery['is_delayed']]
if len(delayed_orders) > 0:
    print(f"  • Atraso médio (quando ocorre): {delayed_orders['delay_days'].mean():.1f} dias")
    print(f"  • Atraso máximo: {delayed_orders['delay_days'].max():.0f} dias")
    # Uses the shared SLA_CRITICAL threshold instead of a hard-coded 15 so
    # this line stays in sync with the critical-order definition.
    print(f"  • Atraso >{SLA_CRITICAL} dias: {(delayed_orders['delay_days'] > SLA_CRITICAL).sum():,} pedidos")

# Review scores (mean() skips rows without a review)
print(f"\n AVALIAÇÕES:")
print(f"  • NPS médio geral: {df_delivery['review_score'].mean():.2f}")
print(f"  • Pedidos com review: {df_delivery['review_score'].notna().sum():,}")

# Freight values
print(f"\n FRETE:")
print(f"  • Frete médio: R$ {df_delivery['freight_value'].mean():.2f}")
print(f"  • Frete mediano: R$ {df_delivery['freight_value'].median():.2f}")

## <a id="3-sla"></a>3. SLA Compliance e Distribuição

In [None]:
# SLA overview: 2x2 figure
plt.close('all')
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Chart 1: histogram of delivery_days with mean actual and mean estimated time
axes[0, 0].hist(df_delivery['delivery_days'], bins=30, edgecolor='black', alpha=0.7, color='blue')
axes[0, 0].axvline(df_delivery['delivery_days'].mean(), color='red', linestyle='--', label=f'Média: {df_delivery["delivery_days"].mean():.1f}d')
axes[0, 0].axvline(df_delivery['estimated_days'].mean(), color='orange', linestyle='--', label=f'Estimado: {df_delivery["estimated_days"].mean():.1f}d')
axes[0, 0].set_xlabel('Dias para Entrega')
axes[0, 0].set_ylabel('Frequência')
axes[0, 0].set_title('Distribuição de Dias para Entrega')
axes[0, 0].legend()

# Chart 2: boxplot of delivery_days split by is_delayed
sns.boxplot(data=df_delivery, x='is_delayed', y='delivery_days', ax=axes[0, 1], palette={False: 'green', True: 'red'})
axes[0, 1].set_xlabel('Atrasado')
axes[0, 1].set_ylabel('Dias para Entrega')
axes[0, 1].set_title('Distribuição de Dias para Entrega: On-time vs Atrasado')
axes[0, 1].set_xticklabels(['No Prazo', 'Atrasado'])

# Chart 3: pie chart of on-time vs delayed share
delayed_pct = df_delivery['is_delayed'].mean() * 100
on_time_pct = 100 - delayed_pct
sizes = [on_time_pct, delayed_pct]
colors = ['green', 'red']
axes[1, 0].pie(sizes, labels=['No Prazo', 'Atrasado'], colors=colors, autopct='%1.1f%%', startangle=90)
axes[1, 0].set_title('Distribuição: Entregas no Prazo vs Atrasadas')

# Chart 4: number of rows per delay bucket.
# Reindex to the full bucket list so an empty bucket shows as a zero bar;
# otherwise the five fixed labels/colours would not line up with the bars.
bucket_order = ['0_on_time', '1_1-5_days', '2_6-10_days', '3_11-20_days', '4_21+_days']
delay_bucket_counts = df_delivery['delay_bucket'].value_counts().reindex(bucket_order, fill_value=0)
bucket_labels = ['No Prazo', '1-5 dias', '6-10 dias', '11-20 dias', '21+ dias']
colors = ['green', 'yellow', 'orange', 'red', 'darkred']
bars = axes[1, 1].bar(range(len(delay_bucket_counts)), delay_bucket_counts.values, color=colors, alpha=0.7)
axes[1, 1].set_xlabel('Faixa de Atraso')
axes[1, 1].set_ylabel('Número de Pedidos')
axes[1, 1].set_title('Distribuição de Pedidos por Faixa de Atraso')
axes[1, 1].set_xticks(range(len(delay_bucket_counts)))
axes[1, 1].set_xticklabels(bucket_labels, rotation=45)

# Write the count above each bar
for bar, count in zip(bars, delay_bucket_counts.values):
    axes[1, 1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 10,
                   f'{count:,}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
save_plot(fig, 'delivery_sla_overview.png')
plt.show()

## <a id="4-estados"></a>4. Performance por Estado

In [None]:
# Performance per customer state.
# Aggregates delivered rows by customer_state, keeps states with at least 100
# joined rows (HAVING COUNT(*) >= 100) and derives delayed/on-time percentages.
# The result is ordered by on_time_pct DESC, so head() holds the best states
# and tail() the worst.
# NOTE(review): unlike query_delivery_base, this query neither requires
# order_estimated_delivery_date to be non-null nor restricts delivery time to
# 0-60 days, so its figures may not match df_delivery - confirm intended.
# NOTE(review): COUNT(*) counts joined rows (order x item x review),
# presumably not distinct orders - confirm.
query_state_performance = f"""
WITH state_stats AS (
    SELECT 
        c.customer_state,
        COUNT(*) as total_orders,
        AVG(DATE_DIFF(DATE(o.order_delivered_customer_date), DATE(o.order_purchase_timestamp), DAY)) as avg_delivery_days,
        AVG(DATE_DIFF(DATE(o.order_estimated_delivery_date), DATE(o.order_purchase_timestamp), DAY)) as avg_estimated_days,
        AVG(DATE_DIFF(DATE(o.order_delivered_customer_date), DATE(o.order_estimated_delivery_date), DAY)) as avg_delay_days,
        AVG(CAST(r.review_score AS FLOAT64)) as avg_review_score,
        AVG(oi.freight_value) as avg_freight_value,
        SUM(CASE WHEN DATE_DIFF(DATE(o.order_delivered_customer_date), DATE(o.order_estimated_delivery_date), DAY) > 0 THEN 1 ELSE 0 END) as delayed_orders
    FROM `{PROJECT_ID}.{DATASET_ID}.orders` o
    JOIN `{PROJECT_ID}.{DATASET_ID}.customers` c ON o.customer_id = c.customer_id
    JOIN `{PROJECT_ID}.{DATASET_ID}.order_items` oi ON o.order_id = oi.order_id
    LEFT JOIN `{PROJECT_ID}.{DATASET_ID}.order_reviews` r ON o.order_id = r.order_id
    WHERE o.order_status = 'delivered'
    AND o.order_delivered_customer_date IS NOT NULL
    GROUP BY c.customer_state
    HAVING COUNT(*) >= 100
)
SELECT
    customer_state,
    total_orders,
    avg_delivery_days,
    avg_estimated_days,
    avg_delay_days,
    avg_review_score,
    avg_freight_value,
    (delayed_orders / total_orders) * 100 as delayed_pct,
    (1 - (delayed_orders / total_orders)) * 100 as on_time_pct
FROM state_stats
ORDER BY on_time_pct DESC
"""

df_state_performance = query_bigquery(query_state_performance)
print(f"✓ Performance calculada para {len(df_state_performance)} estados")
# head()/tail() default to 5 rows, matching the "TOP 5" headings
print("\n TOP 5 ESTADOS - MELHOR SLA:")
print(df_state_performance.head())
print("\n  TOP 5 ESTADOS - PIOR SLA:")
print(df_state_performance.tail())

In [None]:
# State performance: 2x2 figure
plt.close('all')
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Chart 1: top 10 states by SLA compliance.
# df_state_performance is already ordered by on_time_pct DESC; re-sorting
# ascending puts the best state at the top of the horizontal bar chart.
top_states = df_state_performance.head(10).sort_values('on_time_pct', ascending=True)
bars1 = axes[0, 0].barh(range(len(top_states)), top_states['on_time_pct'], 
                       color='green', alpha=0.7, edgecolor='black')
axes[0, 0].set_yticks(range(len(top_states)))
axes[0, 0].set_yticklabels(top_states['customer_state'])
axes[0, 0].set_xlabel('Taxa de Entrega no Prazo (%)')
axes[0, 0].set_title('Top 10 Estados - Melhor SLA Compliance')
axes[0, 0].set_xlim(0, 100)

# Chart 2: bottom 10 states by SLA compliance.
# With fewer than 20 states in the result, head(10) and tail(10) overlap.
bottom_states = df_state_performance.tail(10).sort_values('on_time_pct', ascending=False)
bars2 = axes[0, 1].barh(range(len(bottom_states)), bottom_states['on_time_pct'], 
                       color='red', alpha=0.7, edgecolor='black')
axes[0, 1].set_yticks(range(len(bottom_states)))
axes[0, 1].set_yticklabels(bottom_states['customer_state'])
axes[0, 1].set_xlabel('Taxa de Entrega no Prazo (%)')
axes[0, 1].set_title('Bottom 10 Estados - Pior SLA Compliance')
axes[0, 1].set_xlim(0, 100)

# Chart 3: single-row heat strip of avg_delivery_days per state, sorted ascending
state_delivery_pivot = df_state_performance.set_index('customer_state')['avg_delivery_days'].sort_values()
im = axes[1, 0].imshow([state_delivery_pivot.values], cmap='RdYlGn_r', aspect='auto')
axes[1, 0].set_yticks([0])
axes[1, 0].set_yticklabels([''])
axes[1, 0].set_xticks(range(len(state_delivery_pivot)))
axes[1, 0].set_xticklabels(state_delivery_pivot.index, rotation=90)
axes[1, 0].set_title('Dias Médios de Entrega por Estado')
plt.colorbar(im, ax=axes[1, 0], label='Dias')

# Chart 4: row volume vs delay rate; marker size encodes avg_delivery_days
# and colour encodes avg_review_score
scatter = axes[1, 1].scatter(df_state_performance['total_orders'], 
                            df_state_performance['delayed_pct'], 
                            s=df_state_performance['avg_delivery_days']*10, 
                            alpha=0.6, 
                            c=df_state_performance['avg_review_score'], 
                            cmap='RdYlGn')
axes[1, 1].set_xlabel('Volume de Pedidos')
axes[1, 1].set_ylabel('Taxa de Atraso (%)')
axes[1, 1].set_title('Volume vs Taxa de Atraso (tamanho=dias, cor=NPS)')
plt.colorbar(scatter, ax=axes[1, 1], label='NPS Médio')

# Label the three states with the highest delay rate
for i, row in df_state_performance.nlargest(3, 'delayed_pct').iterrows():
    axes[1, 1].annotate(row['customer_state'], 
                       (row['total_orders'], row['delayed_pct']),
                       xytext=(5, 5), textcoords='offset points', fontsize=9)

plt.tight_layout()
save_plot(fig, 'delivery_performance_by_state.png')
plt.show()

## <a id="5-rotas"></a>5. Análise de Rotas

In [None]:
# Performance per route (seller_state -> customer_state).
# Keeps routes with at least 20 joined rows and orders them by priority_score
# DESC, so head() shows the routes with the largest delayed volume.
# Note: total_orders * (delayed_orders / total_orders) is algebraically the
# delayed row count, i.e. priority_score equals delayed_orders.
# NOTE(review): as in the state query, there is no estimated-date null check
# and no 0-60 day filter, unlike query_delivery_base - confirm intended.
query_route_performance = f"""
WITH route_stats AS (
    SELECT 
        s.seller_state,
        c.customer_state,
        COUNT(*) as total_orders,
        AVG(DATE_DIFF(DATE(o.order_delivered_customer_date), DATE(o.order_purchase_timestamp), DAY)) as avg_delivery_days,
        AVG(DATE_DIFF(DATE(o.order_delivered_customer_date), DATE(o.order_estimated_delivery_date), DAY)) as avg_delay_days,
        AVG(oi.freight_value) as avg_freight_value,
        SUM(CASE WHEN DATE_DIFF(DATE(o.order_delivered_customer_date), DATE(o.order_estimated_delivery_date), DAY) > 0 THEN 1 ELSE 0 END) as delayed_orders
    FROM `{PROJECT_ID}.{DATASET_ID}.orders` o
    JOIN `{PROJECT_ID}.{DATASET_ID}.customers` c ON o.customer_id = c.customer_id
    JOIN `{PROJECT_ID}.{DATASET_ID}.order_items` oi ON o.order_id = oi.order_id
    JOIN `{PROJECT_ID}.{DATASET_ID}.sellers` s ON oi.seller_id = s.seller_id
    WHERE o.order_status = 'delivered'
    AND o.order_delivered_customer_date IS NOT NULL
    GROUP BY s.seller_state, c.customer_state
    HAVING COUNT(*) >= 20
)
SELECT
    seller_state,
    customer_state,
    total_orders,
    avg_delivery_days,
    avg_delay_days,
    avg_freight_value,
    (delayed_orders / total_orders) * 100 as delayed_pct,
    total_orders * (delayed_orders / total_orders) as priority_score
FROM route_stats
ORDER BY priority_score DESC
"""

df_route_performance = query_bigquery(query_route_performance)
print(f"✓ Performance calculada para {len(df_route_performance)} rotas")
print("\n TOP 5 ROTAS MAIS PROBLEMÁTICAS:")
print(df_route_performance.head())

In [None]:
# Route analysis: two stacked charts
plt.close('all')
fig, axes = plt.subplots(2, 1, figsize=(16, 14))

# Chart 1: heatmap of delay rate per route (seller_state x customer_state).
# Routes absent from df_route_performance are filled with 0, so a cell value
# of 0 does not distinguish "no delays" from "route not in the result".
route_pivot = df_route_performance.pivot_table(
    index='seller_state',
    columns='customer_state',
    values='delayed_pct',
    aggfunc='mean'
).fillna(0)

# Colour scale is capped at 50%; higher rates share the top colour
im = axes[0].imshow(route_pivot, cmap='RdYlGn_r', aspect='auto', vmin=0, vmax=50)
axes[0].set_xlabel('Estado Cliente')
axes[0].set_ylabel('Estado Vendedor')
axes[0].set_title('Heatmap: Taxa de Atraso por Rota (Vendedor → Cliente)')
axes[0].set_xticks(range(len(route_pivot.columns)))
axes[0].set_xticklabels(route_pivot.columns, rotation=90)
axes[0].set_yticks(range(len(route_pivot.index)))
axes[0].set_yticklabels(route_pivot.index)
plt.colorbar(im, ax=axes[0], label='Taxa de Atraso (%)')

# Annotate non-zero cells; white text on the darker (>25%) cells for contrast
for i in range(len(route_pivot.index)):
    for j in range(len(route_pivot.columns)):
        value = route_pivot.iloc[i, j]
        if value > 0:
            axes[0].text(j, i, f'{value:.0f}%', 
                       ha='center', va='center', 
                       color='white' if value > 25 else 'black',
                       fontsize=8)

# Chart 2: top 10 routes by priority_score.
# The frame is already ordered DESC; re-sorting ascending puts the worst
# route at the top of the horizontal bar chart.
top_problematic_routes = df_route_performance.head(10).sort_values('priority_score', ascending=True)
top_problematic_routes['route'] = top_problematic_routes['seller_state'] + ' → ' + top_problematic_routes['customer_state']

bars = axes[1].barh(range(len(top_problematic_routes)), top_problematic_routes['priority_score'], 
                   color='red', alpha=0.7, edgecolor='black')
axes[1].set_yticks(range(len(top_problematic_routes)))
axes[1].set_yticklabels(top_problematic_routes['route'])
axes[1].set_xlabel('Priority Score (Volume × Taxa Atraso)')
axes[1].set_title('Top 10 Rotas Mais Problemáticas - Maior Impacto')

# Print delay rate and volume next to each bar.
# iterrows() yields (index, row) tuples, hence row[1] below.
for i, (bar, row) in enumerate(zip(bars, top_problematic_routes.iterrows())):
    route_data = row[1]
    axes[1].text(bar.get_width() + 5, bar.get_y() + bar.get_height()/2, 
                f'{route_data["delayed_pct"]:.1f}% atraso | {route_data["total_orders"]} pedidos', 
                va='center', fontsize=9)

plt.tight_layout()
save_plot(fig, 'delivery_routes_analysis.png')
plt.show()

## <a id="6-nps"></a>6. Correlação Atraso vs NPS

In [None]:
# Prepare data for the NPS analysis: keep only rows that have a review score
df_nps_analysis = df_delivery[df_delivery['review_score'].notna()].copy()

# Mean score, row count and mean delay per delay bucket
nps_by_delay = df_nps_analysis.groupby('delay_bucket').agg({
    'review_score': ['mean', 'count'],
    'delay_days': 'mean'
}).round(2)

nps_by_delay.columns = ['avg_review_score', 'total_orders', 'avg_delay_days']
nps_by_delay = nps_by_delay.reset_index()

# Sort by bucket using an explicit ordered categorical
bucket_order = ['0_on_time', '1_1-5_days', '2_6-10_days', '3_11-20_days', '4_21+_days']
nps_by_delay['delay_bucket'] = pd.Categorical(nps_by_delay['delay_bucket'], categories=bucket_order, ordered=True)
nps_by_delay = nps_by_delay.sort_values('delay_bucket')

# Pearson correlation between delay and review score
correlation_r, correlation_p = calculate_correlation(df_nps_analysis['delay_days'], df_nps_analysis['review_score'])

print(" ANÁLISE NPS POR ATRASO:")
print(nps_by_delay)
print(f"\n CORRELAÇÃO ATRASO vs NPS:")

# calculate_correlation() returns (None, None) when there are too few pairs,
# so the guard has to come before any of the values are formatted.
if correlation_r is not None:
    print(f"  • Pearson r: {correlation_r:.3f}")
    print(f"  • p-value: {correlation_p:.5f}")
    print(f"  • Significância: {'SIM' if correlation_p < 0.05 else 'NÃO'}")

    # Linear regression of review score on delay days
    x = df_nps_analysis['delay_days']
    y = df_nps_analysis['review_score']
    mask = ~(np.isnan(x) | np.isnan(y))
    slope, intercept, r_value, p_value, std_err = stats.linregress(x[mask], y[mask])
    # Print the real sign of the slope instead of assuming it is negative
    slope_sign = '-' if slope < 0 else '+'
    print(f"  • Regressão: NPS = {intercept:.2f} {slope_sign} {abs(slope):.3f} * dias_atraso")
    print(f"  • R²: {r_value**2:.3f}")
else:
    print("  • Dados insuficientes para calcular a correlação")

In [None]:
# Delay vs NPS: 2x2 figure
plt.close('all')
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Display labels, in the same order as bucket_order (defined in the NPS
# analysis cell)
bucket_labels_clean = ['No Prazo', '1-5 dias', '6-10 dias', '11-20 dias', '21+ dias']
label_by_bucket = dict(zip(bucket_order, bucket_labels_clean))

# Chart 1: mean score per delay bucket.
# nps_by_delay only has rows for buckets present in the data, so the tick
# labels are derived from those rows instead of assuming all five exist.
present_buckets = list(nps_by_delay['delay_bucket'])
axes[0, 0].plot(range(len(nps_by_delay)), nps_by_delay['avg_review_score'],
               marker='o', linewidth=3, markersize=8, color='red')
axes[0, 0].set_xticks(range(len(nps_by_delay)))
axes[0, 0].set_xticklabels([label_by_bucket[b] for b in present_buckets], rotation=45)
axes[0, 0].set_ylabel('NPS Médio')
axes[0, 0].set_title('NPS Médio por Faixa de Atraso')
axes[0, 0].grid(True, alpha=0.3)
axes[0, 0].set_ylim(0, 5)

# Highlight the critical point (11-20 days bucket) when it is present
critical_bucket = '3_11-20_days'
if critical_bucket in present_buckets:
    critical_idx = present_buckets.index(critical_bucket)
    axes[0, 0].axvline(critical_idx, color='darkred', linestyle='--', alpha=0.7, label='Ponto Crítico')
    axes[0, 0].legend()

# Chart 2: scatter of delay_days vs review_score on a reproducible sample
sample_size = min(1000, len(df_nps_analysis))
df_sample = df_nps_analysis.sample(sample_size, random_state=42)

scatter = axes[0, 1].scatter(df_sample['delay_days'], df_sample['review_score'],
                            alpha=0.6, s=20, c=df_sample['delay_days'], cmap='RdYlGn_r')
axes[0, 1].set_xlabel('Dias de Atraso')
axes[0, 1].set_ylabel('Review Score')
# correlation_r is None when the correlation could not be computed
nps_scatter_title = 'Correlação: Atraso vs NPS'
if correlation_r is not None:
    nps_scatter_title += f' (r = {correlation_r:.3f})'
axes[0, 1].set_title(nps_scatter_title)

# Regression line (slope/intercept come from the NPS analysis cell)
if correlation_r is not None:
    x_range = np.linspace(df_sample['delay_days'].min(), df_sample['delay_days'].max(), 100)
    y_pred = slope * x_range + intercept
    axes[0, 1].plot(x_range, y_pred, '--', color='black', linewidth=2, label='Linha de Regressão')
    axes[0, 1].legend()

# Chart 3: violin plot of review_score per delay bucket.
# order= is required: for string categories seaborn uses order of appearance,
# and df_delivery is sorted by delay DESC, so the palette and tick labels
# would otherwise be applied to the buckets in reverse.
sns.violinplot(data=df_nps_analysis, x='delay_bucket', y='review_score',
               order=bucket_order,
               ax=axes[1, 0], palette=['green', 'yellow', 'orange', 'red', 'darkred'])
axes[1, 0].set_xlabel('Faixa de Atraso')
axes[1, 0].set_ylabel('Review Score')
axes[1, 0].set_title('Distribuição de NPS por Faixa de Atraso')
axes[1, 0].set_xticklabels(bucket_labels_clean, rotation=45)

# Chart 4: share of rows per delay bucket.
# Reindex so empty buckets appear as 0% and the five labels always line up.
delay_bucket_pct = df_delivery['delay_bucket'].value_counts(normalize=True).reindex(bucket_order, fill_value=0) * 100
colors = ['green', 'yellow', 'orange', 'red', 'darkred']
bars = axes[1, 1].bar(range(len(delay_bucket_pct)), delay_bucket_pct.values,
                     color=colors, alpha=0.7, edgecolor='black')
axes[1, 1].set_xlabel('Faixa de Atraso')
axes[1, 1].set_ylabel('% de Pedidos')
axes[1, 1].set_title('Distribuição de Pedidos por Faixa de Atraso')
axes[1, 1].set_xticks(range(len(delay_bucket_pct)))
axes[1, 1].set_xticklabels(bucket_labels_clean, rotation=45)

# Write the percentage above each bar
for bar, pct in zip(bars, delay_bucket_pct.values):
    axes[1, 1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
                   f'{pct:.1f}%', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
save_plot(fig, 'delivery_nps_correlation.png')
plt.show()

## <a id="7-frete"></a>7. Análise de Frete

In [None]:
# Correlation of freight value with delivery time and with delay
correlation_freight_delivery, p_freight_delivery = calculate_correlation(
    df_delivery['freight_value'],
    df_delivery['delivery_days']
)

correlation_freight_delay, p_freight_delay = calculate_correlation(
    df_delivery['freight_value'],
    df_delivery['delay_days']
)

print(" ANÁLISE DE CORRELAÇÃO FRETE:")
# calculate_correlation() returns (None, None) when there are too few pairs;
# formatting None with ':.3f' would raise a TypeError.
if correlation_freight_delivery is not None and correlation_freight_delay is not None:
    print(f"  • Frete vs Dias Entrega: r = {correlation_freight_delivery:.3f}, p = {p_freight_delivery:.5f}")
    print(f"  • Frete vs Dias Atraso: r = {correlation_freight_delay:.3f}, p = {p_freight_delay:.5f}")
    # The conclusion is based on the freight-vs-delivery-time p-value only
    print(f"  • Conclusão: {'Correlação significativa' if p_freight_delivery < 0.05 else 'Sem correlação significativa'}")
else:
    print("  • Dados insuficientes para calcular a correlação")

In [None]:
# Freight analysis: two stacked charts
plt.close('all')
fig, axes = plt.subplots(2, 1, figsize=(14, 10))

# Chart 1: scatter of freight_value vs delivery_days on a reproducible
# sample, coloured by delay_days
sample_size_freight = min(2000, len(df_delivery))
df_freight_sample = df_delivery.sample(sample_size_freight, random_state=42)

scatter1 = axes[0].scatter(df_freight_sample['freight_value'],
                          df_freight_sample['delivery_days'],
                          c=df_freight_sample['delay_days'],
                          cmap='RdYlGn_r', alpha=0.6, s=30)
axes[0].set_xlabel('Valor do Frete (R$)')
axes[0].set_ylabel('Dias para Entrega')
# correlation_freight_delivery is None when it could not be computed
freight_scatter_title = 'Frete vs Dias Entrega'
if correlation_freight_delivery is not None:
    freight_scatter_title += f' (r = {correlation_freight_delivery:.3f})'
axes[0].set_title(freight_scatter_title)
plt.colorbar(scatter1, ax=axes[0], label='Dias de Atraso')

# Chart 2: boxplot of freight_value per delay bucket.
# order= is required: for string categories seaborn uses order of appearance,
# and df_delivery is sorted by delay DESC, so the palette and tick labels
# would otherwise be applied to the buckets in reverse.
# bucket_order and bucket_labels_clean come from the section 6 cells.
sns.boxplot(data=df_delivery, x='delay_bucket', y='freight_value',
           order=bucket_order,
           ax=axes[1], palette=['green', 'yellow', 'orange', 'red', 'darkred'])
axes[1].set_xlabel('Faixa de Atraso')
axes[1].set_ylabel('Valor do Frete (R$)')
axes[1].set_title('Distribuição de Frete por Faixa de Atraso')
axes[1].set_xticklabels(bucket_labels_clean, rotation=45)

plt.tight_layout()
save_plot(fig, 'delivery_freight_analysis.png')
plt.show()

## <a id="8-insights"></a>8. Insights e Recomendações

In [None]:
# Final executive summary.
# Reads results produced by earlier cells (df_delivery, df_state_performance,
# df_route_performance and the correlation variables) and only prints.
# NOTE(review): the correlation values are formatted unconditionally, so this
# cell assumes calculate_correlation() did not return None - confirm.
print("\n" + "="*80)
print(" RESUMO EXECUTIVO - ANÁLISE DE PERFORMANCE LOGÍSTICA")
print("="*80)

# Collect final metrics (percentages are means of the boolean flag columns)
total_orders = len(df_delivery)
sla_compliance = (1 - df_delivery['is_delayed'].mean()) * 100
avg_delivery_days = df_delivery['delivery_days'].mean()
delay_rate = df_delivery['is_delayed'].mean() * 100
critical_orders_pct = df_delivery['is_critical'].mean() * 100

# NPS comparison: mean review score of on-time vs delayed rows
on_time_nps = df_delivery[~df_delivery['is_delayed']]['review_score'].mean()
delayed_nps = df_delivery[df_delivery['is_delayed']]['review_score'].mean()
nps_drop = on_time_nps - delayed_nps

# Top/Bottom states (df_state_performance is ordered by on_time_pct DESC)
best_state = df_state_performance.iloc[0]
worst_state = df_state_performance.iloc[-1]

# Top problematic route (df_route_performance is ordered by priority_score DESC)
worst_route = df_route_performance.iloc[0]

print(f"\n MÉTRICAS GLOBAIS:")
print(f"   • Total pedidos analisados: {total_orders:,}")
print(f"   • SLA Compliance: {sla_compliance:.1f}%")
print(f"   • Taxa de atraso: {delay_rate:.1f}%")
print(f"   • Tempo médio entrega: {avg_delivery_days:.1f} dias")
print(f"   • Pedidos críticos: {critical_orders_pct:.1f}%")

print(f"\n IMPACTO NPS:")
print(f"   • NPS entregas no prazo: {on_time_nps:.2f}")
print(f"   • NPS entregas atrasadas: {delayed_nps:.2f}")
print(f"   • Queda de NPS por atraso: {nps_drop:.2f} pontos")
print(f"   • Correlação atraso-NPS: r = {correlation_r:.3f} (p = {correlation_p:.5f})")

print(f"\n PERFORMANCE GEOGRÁFICA:")
print(f"   • Melhor estado: {best_state['customer_state']} ({best_state['on_time_pct']:.1f}% no prazo)")
print(f"   • Pior estado: {worst_state['customer_state']} ({worst_state['on_time_pct']:.1f}% no prazo)")
print(f"   • Diferença: {best_state['on_time_pct'] - worst_state['on_time_pct']:.1f}%")

print(f"\n ROTAS CRÍTICAS:")
print(f"   • Rota mais problemática: {worst_route['seller_state']} → {worst_route['customer_state']}")
print(f"   • Taxa de atraso: {worst_route['delayed_pct']:.1f}%")
print(f"   • Volume: {worst_route['total_orders']} pedidos")
print(f"   • Priority score: {worst_route['priority_score']:.1f}")

print(f"\n ANÁLISE DE FRETE:")
print(f"   • Frete médio: R$ {df_delivery['freight_value'].mean():.2f}")
print(f"   • Correlação frete-prazo: r = {correlation_freight_delivery:.3f}")
# |r| < 0.3 is the cut-off this notebook uses for "no meaningful relation"
print(f"   • Insight: {'Frete mais caro NÃO garante entrega mais rápida' if abs(correlation_freight_delivery) < 0.3 else 'Possível relação entre frete e prazo'}")

print(f"\n" + "="*80)
print(" RECOMENDAÇÕES ESTRATÉGICAS")
print("="*80)

# The recommendations below are fixed text; only the figures in braces
# come from the data.
print(f"\n PRIORIDADE 1: OTIMIZAR ROTAS CRÍTICAS")
print(f"   • Foco: {worst_route['seller_state']} → {worst_route['customer_state']} ({worst_route['delayed_pct']:.1f}% atraso)")
print(f"   • Ação: Parcerias logísticas regionais, hubs intermediários")
print(f"   • Meta: Reduzir atraso para <15% em 90 dias")

print(f"\n PRIORIDADE 2: MELHORAR ESTADOS COM BAIXA PERFORMANCE")
print(f"   • Estados críticos: {worst_state['customer_state']} ({worst_state['on_time_pct']:.1f}% compliance)")
print(f"   • Ação: Investir em infraestrutura logística regional")
print(f"   • Meta: Aumentar SLA compliance para >80%")

# NOTE(review): the "too conservative" wording is printed regardless of how
# estimated_days and delivery_days actually compare - confirm.
print(f"\n PRIORIDADE 3: REVISAR ESTIMATIVAS DE ENTREGA")
print(f"   • Problema: Prazo estimado muito conservador ({df_delivery['estimated_days'].mean():.1f} vs {df_delivery['delivery_days'].mean():.1f} dias real)")
print(f"   • Ação: Recalcular estimativas baseado em performance real")
print(f"   • Meta: Reduzir estimativa média para {df_delivery['delivery_days'].mean() + 3:.1f} dias")

print(f"\n PRIORIDADE 4: SISTEMA DE ALERTA PARA PEDIDOS CRÍTICOS")
print(f"   • Critério: Atraso > {SLA_CRITICAL} dias")
print(f"   • Ação: Alerta automático D+10, ação proativa D+{SLA_CRITICAL}")
print(f"   • Compensação: Cupom 15% ou frete grátis próxima compra")

print(f"\n PRIORIDADE 5: MONITORAMENTO CONTÍNUO")
print(f"   • Dashboard: SLA compliance por estado/rota")
print(f"   • Métrica-chave: % pedidos críticos (atraso > {SLA_CRITICAL}d + NPS<3)")
print(f"   • Review: Análise mensal de performance")

print(f"\n" + "="*80)
print(" ANÁLISE DE PERFORMANCE LOGÍSTICA CONCLUÍDA")
print("="*80)