# Análise de Lifetime Value (LTV) - Olist E-Commerce

**Objetivo:** Calcular e analisar o valor vitalício dos clientes por diferentes dimensões para identificar clientes de alto valor e oportunidades de retenção.

---

## Índice

1. [Setup e Configuração](#1-setup)
2. [LTV por Cliente](#2-cliente)
3. [LTV por Estado](#3-estado)
4. [LTV por Cohort](#4-cohort)
5. [LTV por Segmento](#5-segmento)
6. [Análise Pareto](#6-pareto)
7. [LTV Forecast](#7-forecast)
8. [Insights e Conclusões](#8-insights)

## 1. Setup e Configuração

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
from datetime import datetime, timedelta
import warnings
from dotenv import load_dotenv
import os

warnings.filterwarnings('ignore')

# Configuração de visualização
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
%matplotlib inline

# Configuração do pandas
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', '{:.2f}'.format)

# Carregar variáveis de ambiente
load_dotenv()

# Configuração BigQuery
PROJECT_ID = os.getenv('GCP_PROJECT_ID')
DATASET_ID = os.getenv('GCP_DATASET_ID', 'olist_ecommerce')

# Cliente BigQuery
client = bigquery.Client(project=PROJECT_ID)

# Criar diretório para imagens
os.makedirs('../docs/images', exist_ok=True)

print(f"Setup completo - Projeto: {PROJECT_ID}, Dataset: {DATASET_ID}")

In [None]:
def query_bigquery(query: str) -> pd.DataFrame:
    """Run *query* on the module-level BigQuery ``client`` and return the rows as a DataFrame."""
    job = client.query(query)
    return job.to_dataframe()

def save_plot(fig, filename):
    """Write *fig* to ``../docs/images/<filename>`` and report the outcome.

    Any exception raised while saving is caught and printed instead of
    propagating, so a failed export does not interrupt the caller.
    """
    export_options = {'dpi': 300, 'bbox_inches': 'tight'}
    try:
        destination = f'../docs/images/{filename}'
        fig.savefig(destination, **export_options)
        print(f"✓ Gráfico salvo: docs/images/{filename}")
    except Exception as err:
        print(f" Erro ao salvar gráfico: {err}")

## 2. LTV por Cliente

In [None]:
# LTV per customer (delivered orders only).
#
# Payments and reviews are collapsed to one row per order BEFORE they are
# joined to orders. Neither table is guaranteed to hold a single row per
# order, and joining both raw tables multiplies rows (payments x reviews),
# which inflates SUM(payment_value) and turns AVG(payment_value) into an
# average over payment rows instead of an average order value.
#
# Resulting columns:
#   lifetime_value          sum of order totals for the customer
#   total_orders            number of distinct delivered orders
#   avg_order_value         mean order total
#   customer_lifetime_days  days between first and last purchase
#   avg_review_score        mean of the per-order average review score
query_ltv_cliente = f"""
WITH order_payments AS (
    SELECT
        order_id,
        SUM(payment_value) as order_value
    FROM `{PROJECT_ID}.{DATASET_ID}.payments`
    GROUP BY order_id
),
order_reviews AS (
    SELECT
        order_id,
        AVG(review_score) as review_score
    FROM `{PROJECT_ID}.{DATASET_ID}.reviews`
    GROUP BY order_id
),
customer_metrics AS (
    SELECT
        c.customer_unique_id,
        SUM(op.order_value) as lifetime_value,
        COUNT(DISTINCT o.order_id) as total_orders,
        AVG(op.order_value) as avg_order_value,
        DATE_DIFF(MAX(o.order_purchase_timestamp), MIN(o.order_purchase_timestamp), DAY) as customer_lifetime_days,
        AVG(orv.review_score) as avg_review_score,
        MIN(o.order_purchase_timestamp) as first_purchase_date,
        MAX(o.order_purchase_timestamp) as last_purchase_date
    FROM `{PROJECT_ID}.{DATASET_ID}.orders` o
    JOIN `{PROJECT_ID}.{DATASET_ID}.customers` c ON o.customer_id = c.customer_id
    JOIN order_payments op ON o.order_id = op.order_id
    LEFT JOIN order_reviews orv ON o.order_id = orv.order_id
    WHERE o.order_status = 'delivered'
    GROUP BY c.customer_unique_id
)
SELECT *
FROM customer_metrics
WHERE lifetime_value > 0
ORDER BY lifetime_value DESC
"""

df_ltv_cliente = query_bigquery(query_ltv_cliente)
print(f"{len(df_ltv_cliente):,} clientes carregados com LTV calculado")

In [None]:
# LTV distribution across customers, shown as a 2x2 grid of views
plt.close('all')
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
(ax_hist, ax_box), (ax_log, ax_pct) = axes
ltv_values = df_ltv_cliente['lifetime_value']

# Histogram with mean / median reference lines
ax_hist.hist(ltv_values, bins=50, edgecolor='black', alpha=0.7)
ax_hist.set_xlabel('Lifetime Value (R$)')
ax_hist.set_ylabel('Frequência')
ax_hist.set_title('Distribuição do LTV por Cliente')
for stat_value, stat_color, stat_label in (
    (ltv_values.mean(), 'red', 'Média'),
    (ltv_values.median(), 'green', 'Mediana'),
):
    ax_hist.axvline(stat_value, color=stat_color, linestyle='--', label=stat_label)
ax_hist.legend()

# Boxplot
ax_box.boxplot(ltv_values)
ax_box.set_ylabel('LTV (R$)')
ax_box.set_title('Boxplot - Distribuição LTV')

# Histogram of log(1 + LTV)
ax_log.hist(np.log1p(ltv_values), bins=50, edgecolor='black', alpha=0.7)
ax_log.set_xlabel('Log(LTV + 1)')
ax_log.set_ylabel('Frequência')
ax_log.set_title('Distribuição LTV (Escala Log)')

# Percentile bars (percentis / percentis_values are reused by the stats cell)
percentis = [10, 25, 50, 75, 90, 95, 99]
percentis_values = [ltv_values.quantile(p / 100) for p in percentis]
percentile_labels = [f'P{p}' for p in percentis]
ax_pct.bar(percentile_labels, percentis_values, color='orange', alpha=0.7)
ax_pct.set_xlabel('Percentil')
ax_pct.set_ylabel('LTV (R$)')
ax_pct.set_title('LTV por Percentil')
ax_pct.tick_params(axis='x', rotation=45)

plt.tight_layout()
save_plot(fig, 'ltv_distribution.png')
plt.show()

In [None]:
# Descriptive statistics and tiering of the top 100 customers
separator = "=" * 60
print(separator)
print("ESTATÍSTICAS DESCRITIVAS - LTV POR CLIENTE")
print(separator)

ltv_column = df_ltv_cliente['lifetime_value']

print("\nMÉTRICAS GERAIS:")
print(f"  • Clientes únicos: {len(df_ltv_cliente):,}")
for stat_label, stat_value in (
    ("LTV Médio", ltv_column.mean()),
    ("LTV Mediano", ltv_column.median()),
    ("Desvio Padrão", ltv_column.std()),
):
    print(f"  • {stat_label}: R$ {stat_value:.2f}")

print("\nPERCENTIS LTV:")
for pct, pct_value in zip(percentis, percentis_values):
    print(f"  • P{pct}: R$ {pct_value:.2f}")

print("\nMÉTRICAS ADICIONAIS:")
mean_ticket = df_ltv_cliente['avg_order_value'].mean()
mean_orders = df_ltv_cliente['total_orders'].mean()
mean_lifetime = df_ltv_cliente['customer_lifetime_days'].mean()
print(f"  • Ticket Médio: R$ {mean_ticket:.2f}")
print(f"  • Pedidos Médios: {mean_orders:.1f}")
print(f"  • Lifetime Médio: {mean_lifetime:.1f} dias")

# Top 100 customers (df_ltv_cliente arrives ordered by LTV, highest first)
tier_edges = [0, 100, 300, 1000, float('inf')]
tier_names = ['Standard', 'Premium', 'VIP', 'Champion']
top_100 = df_ltv_cliente.head(100).copy()
top_100['tier'] = pd.cut(top_100['lifetime_value'], bins=tier_edges, labels=tier_names)

print("\nTOP 100 CLIENTES - DISTRIBUIÇÃO POR TIER:")
print(top_100['tier'].value_counts().sort_index())

print("\nCLIENTES CHAMPION (LTV > R$ 1.000):")
champions = top_100.loc[top_100['tier'] == 'Champion']
print(f"  • Quantidade: {len(champions)}")
print(f"  • LTV Médio: R$ {champions['lifetime_value'].mean():,.2f}")
print(f"  • Pedidos Médios: {champions['total_orders'].mean():.1f}")

## 3. LTV por Estado 

In [None]:
# LTV by state.
# The SQL first sums payments per (customer_unique_id, customer_state) over
# delivered orders, then aggregates those per-customer values by state:
# customer count, total GMV, mean LTV, approximate median and P90 LTV, and the
# share of customers with 2+ orders. The final SELECT adds each state's share
# of total GMV and orders the rows by total GMV, highest first.
query_ltv_estado = f"""
WITH customer_ltv AS (
    SELECT 
        c.customer_unique_id,
        c.customer_state,
        SUM(p.payment_value) as lifetime_value,
        COUNT(DISTINCT o.order_id) as total_orders
    FROM `{PROJECT_ID}.{DATASET_ID}.orders` o
    JOIN `{PROJECT_ID}.{DATASET_ID}.customers` c ON o.customer_id = c.customer_id
    JOIN `{PROJECT_ID}.{DATASET_ID}.payments` p ON o.order_id = p.order_id
    WHERE o.order_status = 'delivered'
    GROUP BY c.customer_unique_id, c.customer_state
),
state_metrics AS (
    SELECT
        customer_state,
        COUNT(customer_unique_id) as total_customers,
        SUM(lifetime_value) as total_gmv,
        AVG(lifetime_value) as avg_ltv,
        APPROX_QUANTILES(lifetime_value, 100)[OFFSET(50)] as median_ltv,
        APPROX_QUANTILES(lifetime_value, 100)[OFFSET(90)] as p90_ltv,
        SUM(CASE WHEN total_orders >= 2 THEN 1 ELSE 0 END) / COUNT(*) as repeat_rate
    FROM customer_ltv
    GROUP BY customer_state
)
SELECT 
    *,
    total_gmv / SUM(total_gmv) OVER() as gmv_share
FROM state_metrics
ORDER BY total_gmv DESC
"""

# NOTE(review): a customer_unique_id that appears with more than one
# customer_state is counted once in each state — confirm this is intended.
df_ltv_estado = query_bigquery(query_ltv_estado)
print(f"✓ LTV calculado para {len(df_ltv_estado)} estados")

In [None]:
# LTV by state: four horizontal bar charts for the 10 states with the highest GMV
plt.close('all')
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

top_10 = df_ltv_estado.head(10)
state_labels = top_10['customer_state']

# (axis, values, extra bar kwargs, x label, title) for each panel
state_panels = [
    (axes[0, 0], top_10['avg_ltv'], {},
     'LTV Médio (R$)', 'Top 10 Estados - LTV Médio'),
    (axes[0, 1], top_10['total_gmv'], {'color': 'green'},
     'GMV Total (R$)', 'Top 10 Estados - GMV Total'),
    (axes[1, 0], top_10['total_customers'], {'color': 'orange'},
     'Número de Clientes', 'Top 10 Estados - Total Clientes'),
    (axes[1, 1], top_10['gmv_share']*100, {'color': 'red'},
     'Participação no GMV (%)', 'Top 10 Estados - Share do GMV'),
]

for panel_ax, panel_values, panel_kwargs, panel_xlabel, panel_title in state_panels:
    panel_ax.barh(state_labels, panel_values, edgecolor='black', alpha=0.7, **panel_kwargs)
    panel_ax.set_xlabel(panel_xlabel)
    panel_ax.set_title(panel_title)
    # Put the first (highest-GMV) state at the top of the chart
    panel_ax.invert_yaxis()

plt.tight_layout()
save_plot(fig, 'ltv_by_state.png')
plt.show()

In [None]:
# Detailed per-state report
separator = "=" * 60
print(separator)
print("ANÁLISE LTV POR ESTADO")
print(separator)

print("\nTOP 5 ESTADOS POR LTV MÉDIO:")
top_ltv = df_ltv_estado.nlargest(5, 'avg_ltv')[['customer_state', 'avg_ltv', 'median_ltv', 'total_customers']]
for state in top_ltv.itertuples(index=False):
    print(f"  • {state.customer_state}: R$ {state.avg_ltv:.2f} (mediana: R$ {state.median_ltv:.2f}) - {state.total_customers} clientes")

print("\nTOP 5 ESTADOS POR GMV TOTAL:")
top_gmv = df_ltv_estado.nlargest(5, 'total_gmv')[['customer_state', 'total_gmv', 'gmv_share']]
for state in top_gmv.itertuples(index=False):
    print(f"  • {state.customer_state}: R$ {state.total_gmv:,.2f} ({state.gmv_share*100:.1f}% do total)")

print("\n TAXA DE RECOMPRA POR ESTADO (TOP 5):")
top_repeat = df_ltv_estado.nlargest(5, 'repeat_rate')[['customer_state', 'repeat_rate', 'total_customers']]
for state in top_repeat.itertuples(index=False):
    print(f"  • {state.customer_state}: {state.repeat_rate*100:.1f}% ({state.total_customers} clientes)")

# df_ltv_estado is ordered by total GMV, so head(n) is the n largest states
print("\n CONCENTRAÇÃO GEOGRÁFICA:")
top_3_share = df_ltv_estado.head(3)['gmv_share'].sum() * 100
top_5_share = df_ltv_estado.head(5)['gmv_share'].sum()*100
print(f"  • Top 3 estados: {top_3_share:.1f}% do GMV total")
print(f"  • Top 5 estados: {top_5_share:.1f}% do GMV total")

## 4. LTV por Cohort 

In [None]:
# LTV by cohort.
# `first_purchases` assigns each customer to the month of their earliest
# delivered order; `customer_ltv` sums payments and counts distinct delivered
# orders per customer. The final SELECT joins the two and aggregates by cohort
# month: cohort size, mean LTV, total revenue and mean orders per customer.
query_ltv_cohort = f"""
WITH first_purchases AS (
    SELECT 
        c.customer_unique_id,
        DATE_TRUNC(MIN(o.order_purchase_timestamp), MONTH) as cohort_month
    FROM `{PROJECT_ID}.{DATASET_ID}.orders` o
    JOIN `{PROJECT_ID}.{DATASET_ID}.customers` c ON o.customer_id = c.customer_id
    WHERE o.order_status = 'delivered'
    GROUP BY c.customer_unique_id
),
customer_ltv AS (
    SELECT 
        c.customer_unique_id,
        SUM(p.payment_value) as lifetime_value,
        COUNT(DISTINCT o.order_id) as total_orders
    FROM `{PROJECT_ID}.{DATASET_ID}.orders` o
    JOIN `{PROJECT_ID}.{DATASET_ID}.customers` c ON o.customer_id = c.customer_id
    JOIN `{PROJECT_ID}.{DATASET_ID}.payments` p ON o.order_id = p.order_id
    WHERE o.order_status = 'delivered'
    GROUP BY c.customer_unique_id
)
SELECT
    fp.cohort_month,
    COUNT(fp.customer_unique_id) as cohort_size,
    AVG(cl.lifetime_value) as avg_ltv,
    SUM(cl.lifetime_value) as total_revenue,
    AVG(cl.total_orders) as avg_orders
FROM first_purchases fp
JOIN customer_ltv cl ON fp.customer_unique_id = cl.customer_unique_id
GROUP BY fp.cohort_month
ORDER BY fp.cohort_month
"""

df_ltv_cohort = query_bigquery(query_ltv_cohort)
# Percentage change in mean LTV versus the previous row (the previous cohort
# returned by the query; rows are ordered by cohort_month).
# NOTE(review): this is only month-over-month if no cohort month is missing.
df_ltv_cohort['growth_rate'] = df_ltv_cohort['avg_ltv'].pct_change() * 100
print(f"✓ {len(df_ltv_cohort)} cohorts analisados")

In [None]:
# LTV by cohort: mean LTV, cohort size, total revenue and growth rate over time
plt.close('all')
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
(ax_ltv, ax_size), (ax_revenue, ax_growth) = axes
cohort_months = df_ltv_cohort['cohort_month']

# Mean LTV per cohort
ax_ltv.plot(cohort_months, df_ltv_cohort['avg_ltv'], marker='o', linewidth=2)
ax_ltv.set_ylabel('LTV Médio (R$)')
ax_ltv.set_title('Evolução do LTV Médio por Cohort')
ax_ltv.grid(True, alpha=0.3)

# Cohort size
ax_size.bar(cohort_months, df_ltv_cohort['cohort_size'], alpha=0.7)
ax_size.set_ylabel('Número de Clientes')
ax_size.set_title('Tamanho dos Cohorts')

# Total revenue per cohort
ax_revenue.bar(cohort_months, df_ltv_cohort['total_revenue'], alpha=0.7, color='green')
ax_revenue.set_ylabel('Receita Total (R$)')
ax_revenue.set_title('Receita Total por Cohort')

# Cohort-over-cohort growth of mean LTV, with a zero reference line
ax_growth.plot(cohort_months, df_ltv_cohort['growth_rate'], marker='o', linewidth=2, color='red')
ax_growth.set_ylabel('Growth Rate (% MoM)')
ax_growth.set_title('Taxa de Crescimento LTV (Month over Month)')
ax_growth.axhline(0, color='black', linestyle='--', alpha=0.5)
ax_growth.grid(True, alpha=0.3)

# All four panels share the same x-axis label and tick rotation
for cohort_ax in (ax_ltv, ax_size, ax_revenue, ax_growth):
    cohort_ax.set_xlabel('Cohort (Mês Primeira Compra)')
    cohort_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
save_plot(fig, 'ltv_by_cohort.png')
plt.show()

In [None]:
# Cohort report
separator = "=" * 60
print(separator)
print("ANÁLISE LTV POR COHORT")
print(separator)

print("\n COHORTS MAIS RECENTES (ÚLTIMOS 3 MESES):")
recent_cohorts = df_ltv_cohort.tail(3)
for cohort in recent_cohorts.itertuples(index=False):
    print(f"  • {cohort.cohort_month.strftime('%Y-%m')}: {cohort.cohort_size} clientes, LTV R$ {cohort.avg_ltv:.2f}")

print("\n COHORTS DE MAIOR VALOR (TOP 3 LTV):")
top_cohorts = df_ltv_cohort.nlargest(3, 'avg_ltv')
for cohort in top_cohorts.itertuples(index=False):
    print(f"  • {cohort.cohort_month.strftime('%Y-%m')}: LTV R$ {cohort.avg_ltv:.2f}, {cohort.cohort_size} clientes")

# Relative change in mean LTV between the first and the last cohort row
print("\n EVOLUÇÃO TEMPORAL:")
first_cohort = df_ltv_cohort.iloc[0]
last_cohort = df_ltv_cohort.iloc[-1]
ltv_delta = last_cohort['avg_ltv'] - first_cohort['avg_ltv']
growth_pct = (ltv_delta / first_cohort['avg_ltv']) * 100
for cohort_label, cohort_row in (("Primeiro", first_cohort), ("Último", last_cohort)):
    print(f"  • {cohort_label} cohort ({cohort_row['cohort_month'].strftime('%Y-%m')}): R$ {cohort_row['avg_ltv']:.2f}")
print(f"  • Crescimento: {growth_pct:+.1f}%")

print("\n COHORTS DE ALTA QUALIDADE (LTV > R$ 200):")
high_value_cohorts = df_ltv_cohort.loc[df_ltv_cohort['avg_ltv'] > 200]
print(f"  • Quantidade: {len(high_value_cohorts)} cohorts")
print(f"  • LTV Médio desses: R$ {high_value_cohorts['avg_ltv'].mean():.2f}")

## 5. LTV por Segmento 

In [None]:
# LTV by purchase-frequency segment (delivered orders only).
# Customers are bucketed by their number of distinct orders:
#   1 order -> 'One-time', 2 orders -> 'Repeat', 3+ orders -> 'Loyal'.
#
# avg_order_value is total payments divided by the number of distinct orders.
# AVG(p.payment_value) would average individual payment rows, which understates
# the ticket whenever one order is paid with more than one payment row.
query_ltv_segmento = f"""
WITH customer_segments AS (
    SELECT
        c.customer_unique_id,
        COUNT(DISTINCT o.order_id) as total_orders,
        SUM(p.payment_value) as lifetime_value,
        SAFE_DIVIDE(SUM(p.payment_value), COUNT(DISTINCT o.order_id)) as avg_order_value,
        CASE
            WHEN COUNT(DISTINCT o.order_id) = 1 THEN 'One-time'
            WHEN COUNT(DISTINCT o.order_id) = 2 THEN 'Repeat'
            ELSE 'Loyal'
        END as segment
    FROM `{PROJECT_ID}.{DATASET_ID}.orders` o
    JOIN `{PROJECT_ID}.{DATASET_ID}.customers` c ON o.customer_id = c.customer_id
    JOIN `{PROJECT_ID}.{DATASET_ID}.payments` p ON o.order_id = p.order_id
    WHERE o.order_status = 'delivered'
    GROUP BY c.customer_unique_id
)
SELECT
    segment,
    COUNT(*) as customer_count,
    SUM(lifetime_value) as total_revenue,
    AVG(lifetime_value) as avg_ltv,
    AVG(total_orders) as avg_orders,
    AVG(avg_order_value) as avg_aov
FROM customer_segments
GROUP BY segment
ORDER BY avg_ltv DESC
"""

df_ltv_segmento = query_bigquery(query_ltv_segmento)
print(f"✓ Segmentação calculada para {df_ltv_segmento['customer_count'].sum():,} clientes")

In [None]:
# LTV by segment: mean LTV, customer / revenue split, and orders vs ticket
plt.close('all')
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
segment_names = df_ltv_segmento['segment']

# Mean LTV per segment
ax_ltv = axes[0, 0]
ax_ltv.bar(segment_names, df_ltv_segmento['avg_ltv'], edgecolor='black', alpha=0.7)
ax_ltv.set_ylabel('LTV Médio (R$)')
ax_ltv.set_title('LTV Médio por Segmento')

# Pie charts: share of customers and share of revenue per segment
for pie_ax, pie_column, pie_title in (
    (axes[0, 1], 'customer_count', 'Distribuição de Clientes por Segmento'),
    (axes[1, 0], 'total_revenue', 'Distribuição de Receita por Segmento'),
):
    pie_ax.pie(df_ltv_segmento[pie_column], labels=segment_names, autopct='%1.1f%%', startangle=90)
    pie_ax.set_title(pie_title)

# Grouped bars: mean orders next to mean ticket (both on the same axis)
x = np.arange(len(df_ltv_segmento))
width = 0.35
ax_compare = axes[1, 1]
bars1 = ax_compare.bar(x - width/2, df_ltv_segmento['avg_orders'], width, label='Pedidos Médios', alpha=0.7)
bars2 = ax_compare.bar(x + width/2, df_ltv_segmento['avg_aov'], width, label='AOV Médio (R$)', alpha=0.7)
ax_compare.set_xlabel('Segmento')
ax_compare.set_ylabel('Métricas')
ax_compare.set_title('Pedidos vs Ticket Médio por Segmento')
ax_compare.set_xticks(x)
ax_compare.set_xticklabels(segment_names)
ax_compare.legend()

plt.tight_layout()
save_plot(fig, 'ltv_by_segment.png')
plt.show()

In [None]:
# Segment report
print("="*60)
print("ANÁLISE LTV POR SEGMENTO")
print("="*60)

# One summary row per segment (raises IndexError if a segment is absent)
one_time = df_ltv_segmento[df_ltv_segmento['segment'] == 'One-time'].iloc[0]
repeat = df_ltv_segmento[df_ltv_segmento['segment'] == 'Repeat'].iloc[0]
loyal = df_ltv_segmento[df_ltv_segmento['segment'] == 'Loyal'].iloc[0]

print(f"\n DISTRIBUIÇÃO DE CLIENTES:")
total_customers = df_ltv_segmento['customer_count'].sum()
for _, row in df_ltv_segmento.iterrows():
    pct = (row['customer_count'] / total_customers) * 100
    print(f"  • {row['segment']}: {row['customer_count']:,} ({pct:.1f}%)")

print(f"\n DISTRIBUIÇÃO DE RECEITA:")
total_revenue = df_ltv_segmento['total_revenue'].sum()
for _, row in df_ltv_segmento.iterrows():
    pct = (row['total_revenue'] / total_revenue) * 100
    print(f"  • {row['segment']}: R$ {row['total_revenue']:,.2f} ({pct:.1f}%)")

print(f"\n COMPARAÇÃO DE VALOR:")
print(f"  • LTV Loyal vs One-time: {loyal['avg_ltv'] / one_time['avg_ltv']:.1f}x maior")
print(f"  • LTV Repeat vs One-time: {repeat['avg_ltv'] / one_time['avg_ltv']:.1f}x maior")
print(f"  • AOV Loyal: R$ {loyal['avg_aov']:.2f} vs One-time: R$ {one_time['avg_aov']:.2f}")

print(f"\n OPORTUNIDADES DE CONVERSÃO:")
repeat_rate = ((repeat['customer_count'] + loyal['customer_count']) / total_customers) * 100
print(f"  • Taxa de recompra atual: {repeat_rate:.1f}%")
# The figure is reported as additional revenue ("+R$"), so count only the
# increment: converted customers already contribute the One-time LTV, and the
# gain is the difference up to the Repeat LTV, not the full Repeat LTV.
converted_customers = one_time['customer_count'] * 0.1
conversion_uplift = converted_customers * (repeat['avg_ltv'] - one_time['avg_ltv'])
print(f"  • Potencial se converter 10% One-time → Repeat: +R$ {conversion_uplift:,.2f}")

## 6. Análise Pareto 

In [None]:
# Pareto analysis: rank customers by LTV and accumulate customer / revenue shares
df_pareto = (
    df_ltv_cliente[['customer_unique_id', 'lifetime_value']]
    .sort_values('lifetime_value', ascending=False)
    .reset_index(drop=True)
)
n_ranked = len(df_pareto)
revenue_sum = df_pareto['lifetime_value'].sum()

df_pareto['cumulative_revenue'] = df_pareto['lifetime_value'].cumsum()
# The index is 0..n-1 after reset_index, so index + 1 is the customer's rank
df_pareto['cumulative_pct_customers'] = (df_pareto.index + 1) / n_ranked * 100
df_pareto['cumulative_pct_revenue'] = (df_pareto['cumulative_revenue'] / revenue_sum) * 100

# Reference cut-offs (20% of customers, 80% of revenue)
top_20_customers = n_ranked * 0.2
top_20_revenue = revenue_sum * 0.8

# Customers whose cumulative revenue share is still within 80%, and the
# percentage of the customer base they represent
within_80_pct = df_pareto['cumulative_pct_revenue'] <= 80
pareto_80_customers = df_pareto[within_80_pct]
pareto_80_cutoff = len(pareto_80_customers) / n_ranked * 100

print(f"✓ Análise Pareto calculada para {len(df_pareto):,} clientes")

In [None]:
# Pareto curve: cumulative revenue share against cumulative customer share
plt.close('all')
fig, ax = plt.subplots(figsize=(14, 8))

# Concentration curve
ax.plot(
    df_pareto['cumulative_pct_customers'],
    df_pareto['cumulative_pct_revenue'],
    linewidth=3,
    label='Curva de Concentração',
    color='blue',
)

# 45-degree line of perfect equality
ax.plot([0, 100], [0, 100], '--', color='gray', alpha=0.7, label='Igualdade Perfeita')

# Cross-hair at 80% of revenue
cutoff_style = {'color': 'red', 'linestyle': '--', 'alpha': 0.7}
ax.axhline(80, label='80% da Receita', **cutoff_style)
ax.axvline(pareto_80_cutoff, **cutoff_style)

# Annotation pointing at the 80% crossing
ax.annotate(
    f'{pareto_80_cutoff:.1f}% clientes\ngeram 80% receita',
    xy=(pareto_80_cutoff, 80),
    xytext=(pareto_80_cutoff+10, 70),
    arrowprops={'arrowstyle': '->', 'color': 'red'},
    color='red',
)

ax.set_xlabel('% Acumulado de Clientes')
ax.set_ylabel('% Acumulado de Receita')
ax.set_title('Curva de Pareto - Concentração de Receita vs Clientes')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
save_plot(fig, 'ltv_pareto_curve.png')
plt.show()

In [None]:
# Value segmentation — comparative bar charts
plt.close('all')

# Segments are slices of the LTV ranking in df_pareto (highest LTV first)
total_customers = len(df_pareto)
total_revenue = df_pareto['lifetime_value'].sum()

# The top and bottom slices use the same row count and the middle slice takes
# everything in between, so the three segments always partition the ranking.
# Slicing the middle at int(n * 0.80) instead leaves one customer outside every
# segment whenever n is not a multiple of 5, because
# int(n * 0.20) + int(n * 0.80) is then n - 1.
edge_count = int(total_customers * 0.20)
low_start = total_customers - edge_count

top_segment = df_pareto.iloc[:edge_count]  # Top 20%
mid_segment = df_pareto.iloc[edge_count:low_start]  # Middle ~60%
low_segment = df_pareto.iloc[low_start:]  # Bottom 20%

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Chart 1: customers vs revenue per segment (revenue shown in R$ thousands)
segments_data = pd.DataFrame({
    'Segment': ['Top 20%', 'Mid 60%', 'Low 20%'],
    'Customers': [len(top_segment), len(mid_segment), len(low_segment)],
    'Revenue': [top_segment['lifetime_value'].sum(),
                mid_segment['lifetime_value'].sum(),
                low_segment['lifetime_value'].sum()]
})

x = np.arange(len(segments_data))
width = 0.35

ax1.bar(x - width/2, segments_data['Customers'], width, label='Clientes', alpha=0.7, color='blue')
ax1.bar(x + width/2, segments_data['Revenue']/1000, width, label='Receita (R$ mil)', alpha=0.7, color='green')
ax1.set_xlabel('Segmento')
ax1.set_ylabel('Quantidade')
ax1.set_title('Segmentação Pareto: Clientes vs Receita')
ax1.set_xticks(x)
ax1.set_xticklabels(segments_data['Segment'])
ax1.legend()

# Chart 2: each segment's share of customers and of revenue
customer_pct = [count / total_customers * 100 for count in segments_data['Customers']]
revenue_pct = [revenue / total_revenue * 100 for revenue in segments_data['Revenue']]

ax2.bar(x - width/2, customer_pct, width, label='% Clientes', alpha=0.7, color='lightblue')
ax2.bar(x + width/2, revenue_pct, width, label='% Receita', alpha=0.7, color='lightgreen')
ax2.set_xlabel('Segmento')
ax2.set_ylabel('Percentual (%)')
ax2.set_title('Participação Percentual por Segmento')
ax2.set_xticks(x)
ax2.set_xticklabels(segments_data['Segment'])
ax2.legend()

plt.tight_layout()
save_plot(fig, 'ltv_pareto_segmentation.png')
plt.show()

In [None]:
# Detailed report for the Pareto segmentation
separator = "=" * 60
print(separator)
print("ANÁLISE DE PARETO E SEGMENTAÇÃO POR VALOR")
print(separator)

print("\n SEGMENTAÇÃO PARETO (20-60-20):")
for segment_label, segment_df in (
    ("TOP 20%", top_segment),
    ("MID 60%", mid_segment),
    ("LOW 20%", low_segment),
):
    segment_revenue = segment_df['lifetime_value'].sum()
    segment_share = segment_revenue / total_revenue * 100
    print(f"  • {segment_label}: {len(segment_df):,} clientes")
    print(f"    - Receita: R$ {segment_revenue:,.2f} ({segment_share:.1f}%)")
    print(f"    - LTV Médio: R$ {segment_df['lifetime_value'].mean():.2f}")

print("\n CONCENTRAÇÃO REAL:")
top_share = top_segment['lifetime_value'].sum() / total_revenue * 100
print(f"  • {pareto_80_cutoff:.1f}% clientes geram 80% da receita")
print(f"  • Top {len(top_segment)} clientes: {top_share:.1f}% receita")

print("\n IMPLICAÇÕES ESTRATÉGICAS:")
print(f"  • Foco em reter TOP {len(top_segment):,} clientes (VIP)")
print(f"  • Upsell para MID {len(mid_segment):,} clientes")
print(f"  • Otimizar CAC para LOW {len(low_segment):,} clientes")

## 7. LTV Forecast

In [None]:
# Annual LTV projection
separator = "=" * 60
print(separator)
print("PROJEÇÃO DE LTV ANUAL")
print(separator)

# Keep customers whose first-to-last purchase span is at least 30 days
has_enough_history = df_ltv_cliente['customer_lifetime_days'] >= 30
df_forecast = df_ltv_cliente[has_enough_history].copy()

# Value per day of observed history, then scaled linearly to 365 days
df_forecast['value_per_active_day'] = df_forecast['lifetime_value'] / df_forecast['customer_lifetime_days']
df_forecast['projected_annual_ltv'] = df_forecast['value_per_active_day'] * 365
projected = df_forecast['projected_annual_ltv']

# Projection summary
current_ltv_avg = df_forecast['lifetime_value'].mean()
projected_ltv_avg = projected.mean()
growth_potential = (projected_ltv_avg - current_ltv_avg) / current_ltv_avg * 100

print("\n BASE DE PROJEÇÃO:")
print(f"  • Clientes com 30+ dias histórico: {len(df_forecast):,}")
print(f"  • Dias médios de histórico: {df_forecast['customer_lifetime_days'].mean():.1f}")
print(f"  • Valor médio por dia: R$ {df_forecast['value_per_active_day'].mean():.2f}")

print("\n LTV PROJETADO (ANUAL - 365 DIAS):")
for stat_label, stat_value in (
    ("Médio", projected_ltv_avg),
    ("Mediano", projected.median()),
    ("P75", projected.quantile(0.75)),
    ("P90", projected.quantile(0.90)),
):
    print(f"  • {stat_label}: R$ {stat_value:.2f}")

print("\n COMPARAÇÃO LTV REAL vs PROJETADO:")
print(f"  • LTV Real Médio: R$ {current_ltv_avg:.2f}")
print(f"  • LTV Projetado Anual: R$ {projected_ltv_avg:.2f}")
print(f"  • Potencial de Crescimento: {growth_potential:+.1f}%")

print("\n SEGMENTOS DE ALTO POTENCIAL:")
high_potential = df_forecast[df_forecast['projected_annual_ltv'] > 500]
high_potential_share = len(high_potential)/len(df_forecast)*100
print(f"  • Clientes com LTV projetado > R$ 500: {len(high_potential):,}")
print(f"  • LTV Médio projetado: R$ {high_potential['projected_annual_ltv'].mean():.2f}")
print(f"  • % do total: {high_potential_share:.1f}%")

In [None]:
# LTV forecast: actual vs projected mean, and distribution of the projection
plt.close('all')
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
ax_compare, ax_dist = axes

# Chart 1: actual vs projected mean LTV
ltv_comparison = [current_ltv_avg, projected_ltv_avg]
labels_comparison = ['LTV Real', 'LTV Projetado']
colors_comparison = ['blue', 'green']

bars = ax_compare.bar(labels_comparison, ltv_comparison, color=colors_comparison, alpha=0.7, edgecolor='black')
ax_compare.set_ylabel('LTV Médio (R$)')
ax_compare.set_title('Comparação: LTV Real vs Projetado (Anual)')
ax_compare.grid(True, alpha=0.3, axis='y')

# Value label centred above each bar (5 data units above the bar top)
for bar, value in zip(bars, ltv_comparison):
    label_x = bar.get_x() + bar.get_width()/2
    label_y = bar.get_height() + 5
    ax_compare.text(label_x, label_y, f'R$ {value:.2f}',
                    ha='center', va='bottom', fontweight='bold')

# Chart 2: distribution of projected annual LTV with threshold and mean markers
ax_dist.hist(df_forecast['projected_annual_ltv'], bins=50, edgecolor='black', alpha=0.7, color='orange')
for marker_x, marker_color, marker_label in (
    (500, 'red', 'Threshold Alto Potencial (R$ 500)'),
    (projected_ltv_avg, 'green', f'Média (R$ {projected_ltv_avg:.2f})'),
):
    ax_dist.axvline(marker_x, color=marker_color, linestyle='--', linewidth=2, label=marker_label)
ax_dist.set_xlabel('LTV Projetado (R$)')
ax_dist.set_ylabel('Frequência')
ax_dist.set_title('Distribuição de LTV Projetado Anual')
ax_dist.legend()
ax_dist.grid(True, alpha=0.3)

plt.tight_layout()
save_plot(fig, 'ltv_forecast.png')
plt.show()

## 8. Insights e Conclusões 

In [None]:
# Executive summary.
# Everything that can be derived from the result DataFrames is recomputed here
# rather than read from variables left behind by earlier cells. In particular
# `total_revenue` is assigned by both the segment cell and the Pareto cell, so
# its value depends on which of them ran last; the Pareto revenue base is
# therefore rebuilt locally from df_pareto.
# Still taken from earlier cells: growth_pct and high_value_cohorts (cohort
# report), projected_ltv_avg, growth_potential and high_potential (forecast).
ltv_medio = df_ltv_cliente['lifetime_value'].mean()
ltv_mediano = df_ltv_cliente['lifetime_value'].median()
p90_ltv = df_ltv_cliente['lifetime_value'].quantile(0.90)
repeat_rate = (len(df_ltv_cliente[df_ltv_cliente['total_orders'] >= 2]) / len(df_ltv_cliente)) * 100

# Pareto metrics, rebuilt from df_pareto
pareto_total_revenue = df_pareto['lifetime_value'].sum()
pareto_80_cutoff = (df_pareto['cumulative_pct_revenue'] <= 80).sum() / len(df_pareto) * 100
top_segment = df_pareto.head(int(len(df_pareto) * 0.20))
top_segment_share = top_segment['lifetime_value'].sum() / pareto_total_revenue * 100
top_3_share = df_ltv_estado.head(3)['gmv_share'].sum() * 100

# Segment metrics
one_time = df_ltv_segmento[df_ltv_segmento['segment'] == 'One-time'].iloc[0]
loyal = df_ltv_segmento[df_ltv_segmento['segment'] == 'Loyal'].iloc[0]

# Final executive summary
print("\n" + "="*80)
print(" RESUMO EXECUTIVO - ANÁLISE LIFETIME VALUE (LTV)")
print("="*80)

print(f"\n MÉTRICAS CHAVE LTV:")
print(f"   • LTV Médio: R$ {ltv_medio:.2f}")
print(f"   • LTV Mediano: R$ {ltv_mediano:.2f}")
print(f"   • P90 LTV: R$ {p90_ltv:.2f}")
print(f"   • Taxa de Recompra: {repeat_rate:.1f}%")

print(f"\n DISTRIBUIÇÃO DE VALOR:")
print(f"   • {pareto_80_cutoff:.1f}% clientes geram 80% da receita")
print(f"   • Top 20%: {len(top_segment):,} clientes → {top_segment_share:.1f}% receita")
print(f"   • Multiplicador Loyal vs One-time: {loyal['avg_ltv'] / one_time['avg_ltv']:.1f}x")

print(f"\n PERFORMANCE GEOGRÁFICA:")
top_state_ltv = df_ltv_estado.loc[df_ltv_estado['avg_ltv'].idxmax()]
top_state_gmv = df_ltv_estado.loc[df_ltv_estado['total_gmv'].idxmax()]
print(f"   • Maior LTV Médio: {top_state_ltv['customer_state']} (R$ {top_state_ltv['avg_ltv']:.2f})")
print(f"   • Maior GMV: {top_state_gmv['customer_state']} (R$ {top_state_gmv['total_gmv']:,.2f})")
print(f"   • Concentração: Top 3 estados = {top_3_share:.1f}% GMV")

print(f"\n EVOLUÇÃO TEMPORAL:")
print(f"   • Crescimento LTV cohort: {growth_pct:+.1f}%")
print(f"   • Cohorts alta qualidade: {len(high_value_cohorts)} com LTV > R$ 200")

print(f"\n POTENCIAL FUTURO:")
print(f"   • LTV Projetado Anual: R$ {projected_ltv_avg:.2f}")
print(f"   • Potencial crescimento: {growth_potential:+.1f}%")
print(f"   • Clientes alto potencial: {len(high_potential):,} (> R$ 500 LTV)")

print(f"\n" + "="*80)
print(" RECOMENDAÇÕES ESTRATÉGICAS")
print("="*80)

print(f"\n PRIORIDADE 1: PROGRAMA VIP")
print(f"   • Foco nos {len(top_segment):,} clientes Top 20% (LTV > R$ {top_segment['lifetime_value'].min():.2f})")
print(f"   • Benefícios exclusivos, atendimento dedicado")
print(f"   • Meta: reduzir churn em 15%")

print(f"\n PRIORIDADE 2: CONVERSÃO REPEAT")
print(f"   • Campanhas pós-primeira compra (dias 7-30)")
print(f"   • Incentivos para segundo pedido")
print(f"   • Meta: aumentar taxa recompra para {repeat_rate + 5:.1f}%")

print(f"\n PRIORIDADE 3: EXPANSÃO GEOGRÁFICA")
print(f"   • Foco em estados com alta LTV per capita")
print(f"   • Otimizar logística para aumentar cobertura")
print(f"   • Meta: reduzir concentração top 3 para <60%")

print(f"\n PRIORIDADE 4: OTIMIZAÇÃO COHORTS")
print(f"   • Identificar características cohorts alta qualidade")
print(f"   • Replicar estratégias vencedoras")
print(f"   • Meta: LTV médio > R$ 180 em novos cohorts")

print(f"\n" + "="*80)
print(" ANÁLISE LTV CONCLUÍDA")
print("="*80)