# Cohort Retention Analysis

**Project:** QuintoAndar Case Study - Olist E-Commerce Analysis  
**Notebook:** 03 - Cohort Retention  
**Author:** Data Science Team  
**Date:** 2024-12-10

## Objectives:
1. Analyze customer retention patterns by cohort
2. Identify best and worst performing cohorts
3. Calculate retention decay and drop-off rates
4. Measure revenue retention over time
5. Provide actionable recommendations to improve retention

## 2. SETUP & IMPORTS

In [None]:
# Required imports
import pandas as pd
import numpy as np
from datetime import datetime
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

# Try to import bigquery (optional); BIGQUERY_AVAILABLE records whether it worked
try:
    from google.cloud import bigquery
    BIGQUERY_AVAILABLE = True
except ImportError:
    print("BigQuery não disponível. Usando dados simulados.")
    BIGQUERY_AVAILABLE = False

# Display settings: show every column, render floats with two decimals
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.2f}'.format)
# NOTE: this silences every warning category for the rest of the session
warnings.filterwarnings('ignore')

# BigQuery configuration (only attempted when the import succeeded)
PROJECT_ID = 'quintoandar-ecommerce-analysis'
if BIGQUERY_AVAILABLE:
    try:
        client = bigquery.Client(project=PROJECT_ID)
        print(f"BigQuery conectado ao projeto: {PROJECT_ID}")
    except Exception as e:
        # Any failure while creating the client flips the flag off, so `client`
        # may be undefined from here on; check BIGQUERY_AVAILABLE before using it
        print(f"Erro ao conectar ao BigQuery: {e}")
        BIGQUERY_AVAILABLE = False

print("Setup completo!")
print(f"Data de execucao: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

## 3. CARREGAMENTO DE DADOS

In [None]:
def create_sample_cohort_data():
    """Build a synthetic cohort-retention table for running the notebook offline.

    Monthly cohorts from 2023-01 through 2024-06 are generated with a fixed
    random seed. Each cohort gets one row per month since first purchase
    (0 to 12), restricted to months that fall inside the generated range.

    Returns:
        pandas.DataFrame with the columns cohort_month,
        months_since_first_purchase, cohort_size, retention_rate,
        cohort_revenue, revenue_per_customer and cumulative_revenue.
    """
    np.random.seed(42)

    cohort_starts = pd.date_range('2023-01-01', '2024-06-01', freq='MS')
    n_cohorts = len(cohort_starts)
    rows = []

    for position, start in enumerate(cohort_starts):
        # Draw order (size first, then noise and ticket per row) fixes the seeded output
        size = np.random.randint(800, 1500)
        label = start.strftime('%Y-%m')

        # A cohort can only be observed up to the last generated month, capped at M12
        observed_months = min(13, n_cohorts - position)

        for age in range(observed_months):
            noisy_rate = 0.7 * np.exp(-0.3 * age) + np.random.normal(0, 0.05)
            rate = min(1.0, max(0.05, noisy_rate))

            revenue = size * rate * np.random.uniform(50, 200)
            per_customer = revenue / (size * rate) if rate > 0 else 0

            rows.append({
                'cohort_month': label,
                'months_since_first_purchase': age,
                'cohort_size': size,
                'retention_rate': rate,
                'cohort_revenue': revenue,
                'revenue_per_customer': per_customer,
                'cumulative_revenue': revenue * (age + 1) * 0.8
            })

    return pd.DataFrame(rows)

# Load from BigQuery when possible; otherwise fall back to the synthetic sample
df_cohort = None
if BIGQUERY_AVAILABLE:
    try:
        query = f"""
        SELECT *
        FROM `{PROJECT_ID}.olist_marts.mart_customer_cohort_retention`
        ORDER BY cohort_month, months_since_first_purchase
        """
        loaded = client.query(query).to_dataframe()
        print("Dados carregados do BigQuery")
        df_cohort = loaded
    except Exception as e:
        print(f"Erro ao carregar do BigQuery: {e}")

if df_cohort is None:
    # Reached both when BigQuery is unavailable and when the query failed
    print("Criando dados de exemplo...")
    df_cohort = create_sample_cohort_data()

# Mandatory sanity checks on the loaded table
n_rows, n_cols = df_cohort.shape
print(f"\nTotal de linhas carregadas: {n_rows:,}")
print(f"Shape do dataset: {df_cohort.shape}")
print(f"Total de colunas: {n_cols}")

for heading, preview in (
    ("Primeiros registros:", df_cohort.head()),
    ("Tipos de dados:", df_cohort.dtypes),
):
    print("\n" + "="*60)
    print(heading)
    print(preview)

print("\n" + "="*60)
# Cohort period covered by the data
if len(df_cohort) == 0:
    print("AVISO: DataFrame vazio.")
else:
    months_column = df_cohort['months_since_first_purchase']
    min_cohort = df_cohort['cohort_month'].min()
    max_cohort = df_cohort['cohort_month'].max()
    max_months = months_column.max()

    print(f"Periodo de cohorts: {min_cohort} ate {max_cohort}")
    print(f"Maximo de months_since_first_purchase: {max_months} meses")

    # Check that the milestone months used by later cells are present
    for month in (1, 3, 6, 12):
        if month <= max_months:
            month_data = df_cohort[months_column == month]
            print(f"Dados disponiveis para M{month}: {len(month_data)} cohorts")
        else:
            print(f"Dados para M{month}: Nao disponivel (max = {max_months})")

## 4. OVERVIEW DOS COHORTS

In [None]:
# Overview KPIs, computed from each cohort's month-0 row
if len(df_cohort) == 0:
    print("AVISO: DataFrame vazio.")
else:
    # Month 0 holds one row per cohort (the acquisition month)
    df_cohort_m0 = df_cohort[df_cohort['months_since_first_purchase'] == 0]

    if len(df_cohort_m0) == 0:
        print("AVISO: Nao ha dados para months_since_first_purchase = 0")
    else:
        cohort_sizes = df_cohort_m0['cohort_size']
        total_cohorts = len(df_cohort_m0)
        total_customers = cohort_sizes.sum()
        avg_cohort_size = cohort_sizes.mean()
        max_cohort_size = cohort_sizes.max()
        min_cohort_size = cohort_sizes.min()

        # min_cohort / max_cohort come from the data-loading cell
        overview_lines = [
            "OVERVIEW DOS COHORTS:",
            "="*60,
            f"1. Total de cohorts unicos: {total_cohorts}",
            f"2. Total de clientes (soma de cohort_size): {total_customers:,}",
            f"3. Periodo de analise: {min_cohort} ate {max_cohort}",
            f"4. Media de cohort_size: {avg_cohort_size:.0f} clientes",
            f"5. Maior cohort_size: {max_cohort_size:,} clientes",
            f"6. Menor cohort_size: {min_cohort_size:,} clientes",
            f"7. Range de tamanho: {max_cohort_size - min_cohort_size:,} clientes",
        ]
        print("\n".join(overview_lines))

        # Bar chart: new customers per cohort month
        fig = px.bar(
            df_cohort_m0.sort_values('cohort_month'),
            x='cohort_month',
            y='cohort_size',
            title='Cohort Size por Mes (Novos Clientes por Mes)',
            labels={'cohort_month': 'Mes do Cohort', 'cohort_size': 'Novos Clientes'},
            text='cohort_size',
            color='cohort_size',
            color_continuous_scale='Viridis'
        )
        fig.update_traces(texttemplate='%{text:,}', textposition='outside')
        fig.update_layout(xaxis_tickangle=-45, showlegend=False, height=400)
        fig.show()

        # Trend summary
        if total_cohorts >= 12:
            seasonality_note = f"4. Sazonalidade: Dados suficientes para analise sazonal ({total_cohorts} meses)"
        else:
            seasonality_note = f"4. Sazonalidade: Dados insuficientes para analise sazonal"

        insight_lines = [
            "\n" + "="*60,
            "INSIGHTS - Overview dos Cohorts:",
            f"1. Crescimento/Decrescimento: Analisando {total_cohorts} cohorts ao longo do tempo",
            f"2. Estabilidade: Cohorts variam de {min_cohort_size:,} a {max_cohort_size:,} clientes",
            f"3. Media: Cada cohort tem aproximadamente {avg_cohort_size:.0f} clientes novos",
            seasonality_note,
        ]
        print("\n".join(insight_lines))

## 5. HEATMAP DE RETENCAO

In [None]:
# Build the cohort x month retention matrix, plot it as a heatmap and print
# the headline retention figures (M1 / M3 / M6 averages, best and worst M3).
if len(df_cohort) > 0:
    # Keep M0..M12 only (or fewer when the data stops earlier)
    max_months_available = min(12, df_cohort['months_since_first_purchase'].max())
    df_cohort_filtered = df_cohort[df_cohort['months_since_first_purchase'] <= max_months_available]

    # One row per cohort, one column per month since first purchase
    retention_pivot = df_cohort_filtered.pivot_table(
        index='cohort_month',
        columns='months_since_first_purchase',
        values='retention_rate',
        aggfunc='first'
    )

    # Newest cohort first, so the Y axis reads from most recent to oldest
    retention_pivot = retention_pivot.sort_index(ascending=False)

    print(f"Retention Matrix criada: {retention_pivot.shape[0]} cohorts x {retention_pivot.shape[1]} meses")
    print("\nPrimeiras linhas da matrix:")
    print(retention_pivot.head())

    # Heatmap (seaborn)
    plt.figure(figsize=(14, 10))
    sns.heatmap(
        retention_pivot,
        annot=True,
        fmt='.1%',
        cmap='RdYlGn',
        center=0.5,
        linewidths=0.5,
        linecolor='gray',
        # A string colorbar format is a str.format template whose field must be
        # named `x`; the previous '{:.0%}' had no field name and failed at draw time
        cbar_kws={'label': 'Taxa de Retencao', 'format': '{x:.0%}'}
    )

    plt.title('Cohort Retention Matrix (%)', fontsize=16, fontweight='bold')
    plt.xlabel('Meses desde Primeira Compra', fontsize=12)
    plt.ylabel('Mes do Cohort', fontsize=12)
    plt.tight_layout()
    plt.show()

    # Average retention (in %) at each milestone month; None when the month is absent
    avg_m1_retention = retention_pivot[1].mean() * 100 if 1 in retention_pivot.columns else None
    avg_m3_retention = retention_pivot[3].mean() * 100 if 3 in retention_pivot.columns else None
    avg_m6_retention = retention_pivot[6].mean() * 100 if 6 in retention_pivot.columns else None

    # Best / worst cohort at M3; always defined so later code can test for None
    if avg_m3_retention is not None:
        best_m3_cohort = retention_pivot[3].idxmax()
        best_m3_value = retention_pivot[3].max() * 100
        worst_m3_cohort = retention_pivot[3].idxmin()
        worst_m3_value = retention_pivot[3].min() * 100
    else:
        best_m3_cohort = worst_m3_cohort = None
        best_m3_value = worst_m3_value = None

    print("\n" + "="*60)
    print("INSIGHTS - Heatmap de Retencao:")

    # Compare against None explicitly: an average of exactly 0.0 is valid data
    # and must not be reported as "not available"
    milestone_averages = (
        ("M1", avg_m1_retention),
        ("M3", avg_m3_retention),
        ("M6", avg_m6_retention),
    )
    for position, (label, average) in enumerate(milestone_averages, start=1):
        if average is not None:
            print(f"{position}. Retention rate no {label} (media geral): {average:.2f}%")
        else:
            print(f"{position}. Dados {label} nao disponiveis")

    if avg_m3_retention is not None:
        print(f"4. Cohort com melhor M3 retention: {best_m3_cohort} com {best_m3_value:.2f}%")
        print(f"5. Cohort com pior M3 retention: {worst_m3_cohort} com {worst_m3_value:.2f}%")
        print(f"6. Variacao M3 retention: {best_m3_value - worst_m3_value:.2f} pontos percentuais")

    # Visual patterns
    print(f"7. Padrao: Cohorts mais recentes tem dados completos ate M{max_months_available}")
    print(f"8. Padrao: Cohorts mais antigos tem dados ate 12+ meses")

    # Cohorts whose retention varies little across months (row std below 0.2)
    if len(retention_pivot) > 5:
        high_consistency = retention_pivot[retention_pivot.std(axis=1) < 0.2].index.tolist()
        if high_consistency:
            print(f"9. Cohorts com alta consistencia (baixa variacao): {', '.join(map(str, high_consistency[:3]))}")

else:
    print("AVISO: DataFrame vazio.")

## 6. CURVAS DE RETENCAO

In [None]:
# Retention curves: the average curve across all cohorts (with a +/- 1 std band),
# the individual curves of the 7 most recent cohorts, and printed decay insights.
if len(df_cohort) > 0:
    # Analysis 1: overall average curve (mean, std and cohort count per month)
    avg_retention_curve = df_cohort.groupby('months_since_first_purchase')['retention_rate'].agg(['mean', 'std', 'count']).reset_index()
    avg_retention_curve.columns = ['months_since_first_purchase', 'avg_retention_rate', 'std_retention_rate', 'cohort_count']
    
    # Keep only months backed by at least 3 cohorts
    avg_retention_curve = avg_retention_curve[avg_retention_curve['cohort_count'] >= 3]
    
    print("Curva Media de Retencao:")
    print(avg_retention_curve.head(12))
    
    # Line chart: average curve
    fig = px.line(
        avg_retention_curve,
        x='months_since_first_purchase',
        y='avg_retention_rate',
        title='Curva Media de Retencao (Todos Cohorts)',
        labels={'months_since_first_purchase': 'Meses desde Primeira Compra', 
                'avg_retention_rate': 'Taxa de Retencao Media'},
        markers=True
    )
    
    # Standard-deviation band: upper edge left-to-right, then lower edge reversed,
    # so the trace closes into a polygon that fill='toself' can shade
    fig.add_trace(go.Scatter(
        x=pd.concat([avg_retention_curve['months_since_first_purchase'], 
                     avg_retention_curve['months_since_first_purchase'][::-1]]),
        y=pd.concat([avg_retention_curve['avg_retention_rate'] + avg_retention_curve['std_retention_rate'], 
                     (avg_retention_curve['avg_retention_rate'] - avg_retention_curve['std_retention_rate'])[::-1]]),
        fill='toself',
        fillcolor='rgba(0,100,80,0.2)',
        line=dict(color='rgba(255,255,255,0)'),
        name='Desvio Padrao',
        showlegend=True
    ))
    
    fig.update_layout(
        height=500,
        yaxis_tickformat='.0%',
        hovermode='x'
    )
    fig.show()
    
    # Analysis 2: individual curves of selected cohorts
    # (selection below is by recency; cohort_size is not used for ranking)
    df_cohort_m0 = df_cohort[df_cohort['months_since_first_purchase'] == 0]
    
    if len(df_cohort_m0) > 0:
        # Take the 7 most recent cohorts
        top_cohorts = df_cohort_m0.sort_values('cohort_month', ascending=False).head(7)['cohort_month'].tolist()
        
        # Rows belonging to those cohorts
        df_top_cohorts = df_cohort[df_cohort['cohort_month'].isin(top_cohorts)]
        
        # Line chart with one line per cohort
        fig = px.line(
            df_top_cohorts,
            x='months_since_first_purchase',
            y='retention_rate',
            color='cohort_month',
            title='Curvas de Retencao - Top 7 Cohorts Mais Recentes',
            labels={'months_since_first_purchase': 'Meses desde Primeira Compra', 
                    'retention_rate': 'Taxa de Retencao',
                    'cohort_month': 'Cohort'},
            markers=True
        )
        
        fig.update_layout(
            height=500,
            yaxis_tickformat='.0%',
            hovermode='x'
        )
        fig.show()
        
        # Decay rates; only computed when the average curve has at least 6 points
        if len(avg_retention_curve) >= 6:
            m1_data = avg_retention_curve[avg_retention_curve['months_since_first_purchase'] == 1]
            m3_data = avg_retention_curve[avg_retention_curve['months_since_first_purchase'] == 3]
            m6_data = avg_retention_curve[avg_retention_curve['months_since_first_purchase'] == 6]
            
            # Relative change M1 -> M3 in percent (negative when retention falls)
            if len(m1_data) > 0 and len(m3_data) > 0:
                m1_rate = m1_data['avg_retention_rate'].iloc[0]
                m3_rate = m3_data['avg_retention_rate'].iloc[0]
                decay_m1_m3 = ((m3_rate - m1_rate) / m1_rate) * 100 if m1_rate > 0 else 0
            else:
                decay_m1_m3 = None
                
            # Relative change M1 -> M6 in percent (negative when retention falls)
            if len(m1_data) > 0 and len(m6_data) > 0:
                m1_rate = m1_data['avg_retention_rate'].iloc[0]
                m6_rate = m6_data['avg_retention_rate'].iloc[0]
                decay_m1_m6 = ((m6_rate - m1_rate) / m1_rate) * 100 if m1_rate > 0 else 0
            else:
                decay_m1_m6 = None
            
            print("\n" + "="*60)
            print("INSIGHTS - Curvas de Retencao:")
            
            # Find the month with the largest drop-off.
            # NOTE: this adds a column to avg_retention_curve, which is a filtered
            # slice; pandas may emit SettingWithCopyWarning (warnings are silenced
            # in the setup cell).
            if len(avg_retention_curve) > 1:
                avg_retention_curve['retention_change'] = avg_retention_curve['avg_retention_rate'].diff()
                if not avg_retention_curve['retention_change'].isna().all():
                    max_drop_idx = avg_retention_curve['retention_change'].idxmin()
                    max_drop_month = avg_retention_curve.loc[max_drop_idx, 'months_since_first_purchase']
                    max_drop_value = avg_retention_curve.loc[max_drop_idx, 'retention_change']
                    print(f"1. Maior drop-off: Entre M{int(max_drop_month-1)} e M{int(max_drop_month)} (queda de {abs(max_drop_value):.1%})")
            
            if decay_m1_m3 is not None:
                print(f"2. Retention M3 vs M1: Decay de {decay_m1_m3:.1f}% entre M1 e M3")
            
            if decay_m1_m6 is not None:
                print(f"3. Retention M6 vs M1: Decay de {decay_m1_m6:.1f}% entre M1 e M6")
            
            # Stabilisation check over months >= 6
            if len(avg_retention_curve) >= 8:
                last_months = avg_retention_curve[avg_retention_curve['months_since_first_purchase'] >= 6]
                if len(last_months) > 1:
                    avg_change_last = last_months['avg_retention_rate'].diff().abs().mean()
                    if avg_change_last < 0.02:  # mean absolute month-to-month change below 0.02
                        # Reports the first month of the window (the smallest month >= 6)
                        stabilization_month = last_months['months_since_first_purchase'].min()
                        print(f"4. Estabilizacao: Curva estabiliza a partir de M{stabilization_month} (variacao <2%)")
            
            print(f"5. Cohorts analisados: {len(top_cohorts)} cohorts mais recentes")
            print(f"6. Variacao entre cohorts: {df_top_cohorts.groupby('months_since_first_purchase')['retention_rate'].std().mean():.3f} (desvio padrao medio)")
            
    else:
        print("AVISO: Nao ha dados para cohorts no mes 0.")
else:
    print("AVISO: DataFrame vazio.")

## 7. ANALISE DE DROP-OFF

In [None]:
# Drop-off analysis: for every cohort find the month with the steepest fall in
# retention, aggregate those months across cohorts, chart them and print insights.
if len(df_cohort) > 0:
    dropoff_analysis = []

    # One pass over the cohorts; sort=False keeps them in order of first appearance
    for cohort, cohort_rows in df_cohort.groupby('cohort_month', sort=False):
        if len(cohort_rows) <= 1:
            continue  # a single observation has no month-to-month change

        cohort_data = cohort_rows.sort_values('months_since_first_purchase').copy()
        cohort_data['retention_change'] = cohort_data['retention_rate'].diff()

        # Only declines count; cohorts that never fall are skipped
        if not (cohort_data['retention_change'].min() < 0):
            continue

        max_drop_idx = cohort_data['retention_change'].idxmin()
        max_drop_month = cohort_data.loc[max_drop_idx, 'months_since_first_purchase']
        max_drop_value = cohort_data.loc[max_drop_idx, 'retention_change']

        # Initial cohort size is read from the month-0 row (0 when that row is missing)
        cohort_size_data = cohort_data[cohort_data['months_since_first_purchase'] == 0]['cohort_size']
        initial_size = cohort_size_data.iloc[0] if len(cohort_size_data) > 0 else 0

        dropoff_analysis.append({
            'cohort_month': cohort,
            'max_drop_month': max_drop_month,
            'max_drop_value': max_drop_value,
            'initial_size': initial_size
        })

    if dropoff_analysis:
        dropoff_df = pd.DataFrame(dropoff_analysis)

        # Aggregate by the month in which the steepest drop happened
        dropoff_by_month = dropoff_df.groupby('max_drop_month').agg(
            cohort_count=('cohort_month', 'count'),
            avg_drop_value=('max_drop_value', 'mean'),
            total_customers_affected=('initial_size', 'sum')
        ).reset_index()

        # Share of all cohorts whose steepest drop falls in each month
        total_cohorts = df_cohort['cohort_month'].nunique(dropna=False)
        dropoff_by_month['cohort_percentage'] = (dropoff_by_month['cohort_count'] / total_cohorts * 100).round(1)

        print("Analise de Drop-off por Mes:")
        print(dropoff_by_month.sort_values('max_drop_month'))

        # Bar chart: average drop per month
        fig = px.bar(
            dropoff_by_month,
            x='max_drop_month',
            y='avg_drop_value',
            title='Drop-off Medio por Periodo (Queda na Retencao)',
            labels={'max_drop_month': 'Mes do Drop-off', 
                    'avg_drop_value': 'Queda Media na Retencao',
                    'cohort_count': 'Numero de Cohorts'},
            text='cohort_count',
            color='avg_drop_value',
            color_continuous_scale='Reds'
        )

        fig.update_traces(
            texttemplate='%{text} cohorts',
            textposition='outside',
            hovertemplate='Mes: %{x}<br>Queda: %{y:.1%}<br>Cohorts: %{text}')

        fig.update_layout(
            height=500,
            yaxis_tickformat='.0%',
            showlegend=False
        )
        fig.show()

        # Aggregated metrics
        if len(dropoff_by_month) > 0:
            # Most negative average change = riskiest month; look the row up once.
            # Cells are read individually so integer columns keep their dtype.
            critical_idx = dropoff_by_month['avg_drop_value'].idxmin()
            critical_month = dropoff_by_month.loc[critical_idx, 'max_drop_month']
            critical_drop = dropoff_by_month.loc[critical_idx, 'avg_drop_value']
            critical_cohort_count = dropoff_by_month.loc[critical_idx, 'cohort_count']

            # Average retention lost between M0 and M3 (None when either month is missing)
            months_column = df_cohort['months_since_first_purchase']
            m0_data = df_cohort.loc[months_column == 0, 'retention_rate']
            m3_data = df_cohort.loc[months_column == 3, 'retention_rate']

            if len(m0_data) > 0 and len(m3_data) > 0:
                m0_avg = m0_data.mean()
                m3_avg = m3_data.mean()
                avg_loss_m0_m3 = m0_avg - m3_avg
            else:
                avg_loss_m0_m3 = None

            print("\n" + "="*60)
            print("INSIGHTS - Analise de Drop-off:")
            print(f"1. Periodo de maior risco: Entre M{int(critical_month-1)} e M{int(critical_month)} (queda media de {abs(critical_drop):.1%})")
            print(f"2. Cohorts afetados: {critical_cohort_count} cohorts tem maior drop neste periodo")

            if avg_loss_m0_m3 is not None:
                print(f"3. Media de perda de clientes entre M0 e M3: {avg_loss_m0_m3:.1%}")

                # Average number of customers lost per cohort
                cohort_size_data = df_cohort[months_column == 0]['cohort_size']
                if len(cohort_size_data) > 0:
                    avg_cohort_size = cohort_size_data.mean()
                    avg_customers_lost = avg_cohort_size * avg_loss_m0_m3
                    print(f"4. Clientes perdidos em media por cohort (M0-M3): {avg_customers_lost:.0f} clientes")

            # Campaign window derived from the critical month: the first 15 days of
            # the month preceding the drop (30-day months). The previous text was
            # hard-coded to "Dias 30-45 ... M1-M2" and could contradict insight 1;
            # that exact text is still produced when the critical month is 2.
            campaign_start_day = max(0, (int(critical_month) - 1) * 30)
            campaign_end_day = campaign_start_day + 15
            print(f"5. Sugestao de timing para campanha de reativacao: Dias {campaign_start_day}-{campaign_end_day} (antes do drop de M{int(critical_month-1)}-M{int(critical_month)})")

            # Relationship between initial cohort size and drop magnitude
            if len(dropoff_df) > 5:
                correlation = dropoff_df['initial_size'].corr(dropoff_df['max_drop_value'].abs())
                print(f"6. Correlacao tamanho-dropoff: {correlation:.3f}")

                if abs(correlation) > 0.3:
                    if correlation > 0:
                        print("   Cohorts maiores tem maior drop-off")
                    else:
                        print("   Cohorts menores tem maior drop-off")

    else:
        print("AVISO: Nao foi possivel calcular drop-off (dados insuficientes).")
else:
    print("AVISO: DataFrame vazio.")

## 8. COMPARACAO BEST VS WORST COHORTS

In [None]:
def _average_cohort_month(cohort_months):
    """Return the mean cohort month of a Series, or None when it cannot be computed.

    Numeric labels are averaged directly; any other labels (for example
    'YYYY-MM' strings or dates) are parsed with pandas.to_datetime first.
    """
    if pd.api.types.is_numeric_dtype(cohort_months):
        return cohort_months.mean()
    parsed = pd.to_datetime(cohort_months, errors='coerce')
    if parsed.isna().all():
        return None
    return parsed.mean()


def _format_cohort_month(value):
    """Render an averaged cohort month: 'YYYY-MM' for timestamps, unchanged otherwise."""
    return value.strftime('%Y-%m') if hasattr(value, 'strftime') else value


# Compare the five best and five worst cohorts by retention at month 3
if len(df_cohort) > 0:
    # Only M3 rows take part in the comparison
    df_m3 = df_cohort[df_cohort['months_since_first_purchase'] == 3]

    if len(df_m3) > 0:
        df_m3_sorted = df_m3.sort_values('retention_rate', ascending=False)

        # Top 5 cohorts
        top_5 = df_m3_sorted.head(5).copy()
        top_5['retained_customers'] = top_5['cohort_size'] * top_5['retention_rate']

        # Bottom 5 cohorts
        bottom_5 = df_m3_sorted.tail(5).copy()
        bottom_5['retained_customers'] = bottom_5['cohort_size'] * bottom_5['retention_rate']

        print("TOP 5 COHORTS (Melhor M3 Retention):")
        print(top_5[['cohort_month', 'cohort_size', 'retention_rate', 'retained_customers']])

        print("\nBOTTOM 5 COHORTS (Pior M3 Retention):")
        print(bottom_5[['cohort_month', 'cohort_size', 'retention_rate', 'retained_customers']])

        # Comparative figures (in percent)
        avg_top_retention = top_5['retention_rate'].mean() * 100
        avg_bottom_retention = bottom_5['retention_rate'].mean() * 100
        retention_diff = avg_top_retention - avg_bottom_retention

        # Are older or newer cohorts doing better? The previous isinstance(int, float)
        # check was never true for string/date cohort labels, so this analysis
        # never ran; the labels are now parsed as dates.
        top_avg_month = _average_cohort_month(top_5['cohort_month'])
        bottom_avg_month = _average_cohort_month(bottom_5['cohort_month'])

        # Comparative chart
        comparison_data = pd.concat([
            top_5.assign(group='Top 5'),
            bottom_5.assign(group='Bottom 5')
        ])

        fig = px.bar(
            comparison_data,
            x='cohort_month',
            y='retention_rate',
            color='group',
            title='Comparacao: Best vs Worst Cohorts (M3 Retention)',
            labels={'cohort_month': 'Mes do Cohort', 
                    'retention_rate': 'Retencao no M3',
                    'group': 'Grupo'},
            barmode='group',
            text='retention_rate'
        )

        fig.update_traces(texttemplate='%{text:.1%}', textposition='outside')
        fig.update_layout(
            height=500,
            yaxis_tickformat='.0%'
        )
        fig.show()

        print("\n" + "="*60)
        print("ANALISE COMPARATIVA BEST VS WORST:")
        print(f"1. Range de variacao (best - worst): {retention_diff:.1f} pontos percentuais")
        print(f"2. Media Top 5: {avg_top_retention:.1f}%")
        print(f"3. Media Bottom 5: {avg_bottom_retention:.1f}%")

        # Temporal pattern: an earlier average month means older cohorts
        if top_avg_month is not None and bottom_avg_month is not None:
            top_label = _format_cohort_month(top_avg_month)
            bottom_label = _format_cohort_month(bottom_avg_month)
            if top_avg_month < bottom_avg_month:
                print(f"4. Padrao temporal: Cohorts mais antigos performam melhor (top: {top_label}, bottom: {bottom_label})")
            elif top_avg_month > bottom_avg_month:
                print(f"4. Padrao temporal: Cohorts mais recentes performam melhor (top: {top_label}, bottom: {bottom_label})")
            else:
                print(f"4. Padrao temporal: Sem padrao claro (top: {top_label}, bottom: {bottom_label})")
        else:
            print("4. Padrao temporal: Nao foi possivel analisar padrao temporal")

        # Hypotheses about what drives the variation
        print("\nHipoteses sobre causas da variacao:")
        print("1. Sazonalidade: Cohorts de diferentes epocas do ano podem ter diferentes perfis")
        print("2. Mudancas de produto: Novos features ou alteracoes no produto")
        print("3. Campanhas de aquisicao: Diferentes fontes/canais de aquisicao")
        print("4. Mudancas operacionais: Alteracoes no onboarding ou suporte")

        # Cohort size vs retention (only with more than 10 cohorts at M3)
        if len(df_m3) > 10:
            correlation_size_retention = df_m3['cohort_size'].corr(df_m3['retention_rate'])
            print(f"\nCorrelacao cohort_size vs retention_rate: {correlation_size_retention:.3f}")

            if abs(correlation_size_retention) > 0.3:
                if correlation_size_retention > 0:
                    print("Cohorts maiores tendem a ter maior retencao")
                else:
                    print("Cohorts menores tendem a ter maior retencao")

        # Customers retained by each group
        top_customers_retained = top_5['retained_customers'].sum()
        bottom_customers_retained = bottom_5['retained_customers'].sum()
        customers_diff = top_customers_retained - bottom_customers_retained

        print(f"\nImpacto em clientes:")
        print(f"Top 5 cohorts retiveram: {top_customers_retained:.0f} clientes no M3")
        print(f"Bottom 5 cohorts retiveram: {bottom_customers_retained:.0f} clientes no M3")
        print(f"Diferenca: {customers_diff:.0f} clientes a mais nos top cohorts")

    else:
        print("AVISO: Nao ha dados para months_since_first_purchase = 3")
else:
    print("AVISO: DataFrame vazio.")

## 9. REVENUE RETENTION

In [None]:
if len(df_cohort) > 0:
    # Verificar se as colunas de receita existem
    revenue_columns = ['cohort_revenue', 'revenue_per_customer', 'cumulative_revenue']
    existing_revenue_cols = [col for col in revenue_columns if col in df_cohort.columns]
    
    if len(existing_revenue_cols) >= 2:
        # Analise de receita agregada por mes
        revenue_by_month = df_cohort.groupby('months_since_first_purchase').agg({
            'cohort_revenue': 'sum',
            'revenue_per_customer': 'mean',
            'retention_rate': 'mean'
        }).reset_index()
        
        # Calcular cumulative revenue
        if 'cumulative_revenue' in df_cohort.columns:
            cumulative_revenue_by_month = df_cohort.groupby('months_since_first_purchase')['cumulative_revenue'].mean().reset_index()
            revenue_by_month = revenue_by_month.merge(cumulative_revenue_by_month, on='months_since_first_purchase')
        else:
            # Calcular cumulative manualmente
            revenue_by_month['cumulative_revenue'] = revenue_by_month['cohort_revenue'].cumsum()
        
        print("Receita por Mes desde Primeira Compra:")
        print(revenue_by_month.head(12))
        
        # Visualizacao 1: Total revenue por mes (line + bar combinado)
        fig = make_subplots(
            rows=2, cols=1,
            subplot_titles=('Total Revenue por Mes', 'Cumulative Revenue'),
            vertical_spacing=0.15
        )
        
        # Grafico 1: Total revenue (bar)
        fig.add_trace(
            go.Bar(
                x=revenue_by_month['months_since_first_purchase'],
                y=revenue_by_month['cohort_revenue'],
                name='Total Revenue',
                marker_color='skyblue',
                text=['R$ {:,.0f}'.format(x) for x in revenue_by_month['cohort_revenue']],
                textposition='outside'
            ),
            row=1, col=1
        )
        
        # Grafico 2: Revenue per customer (line)
        fig.add_trace(
            go.Scatter(
                x=revenue_by_month['months_since_first_purchase'],
                y=revenue_by_month['revenue_per_customer'],
                name='Revenue per Customer',
                mode='lines+markers',
                line=dict(color='orange', width=3),
                yaxis='y2'
            ),
            row=1, col=1
        )
        
        # Grafico 3: Cumulative revenue (area)
        fig.add_trace(
            go.Scatter(
                x=revenue_by_month['months_since_first_purchase'],
                y=revenue_by_month['cumulative_revenue'],
                name='Cumulative Revenue',
                mode='lines',
                fill='tozeroy',
                line=dict(color='green', width=2)
            ),
            row=2, col=1
        )
        
        fig.update_layout(
            title_text='Analise de Revenue Retention',
            height=700,
            showlegend=True,
            yaxis=dict(title='Total Revenue (R$)', side='left'),
            yaxis2=dict(title='Revenue per Customer (R$)', side='right', overlaying='y'),
            yaxis3=dict(title='Cumulative Revenue (R$)', side='left')
        )
        
        fig.update_xaxes(title_text='Meses desde Primeira Compra', row=2, col=1)
        fig.show()
        
        # Calcular metricas de revenue
        total_revenue_all = revenue_by_month['cohort_revenue'].sum()
        
        m0_data = revenue_by_month[revenue_by_month['months_since_first_purchase'] == 0]
        revenue_m0 = m0_data['cohort_revenue'].iloc[0] if len(m0_data) > 0 else 0
        
        revenue_m1_m3 = revenue_by_month[revenue_by_month['months_since_first_purchase'].between(1, 3)]['cohort_revenue'].sum()
        revenue_m4_plus = revenue_by_month[revenue_by_month['months_since_first_purchase'] >= 4]['cohort_revenue'].sum()
        
        pct_m0 = (revenue_m0 / total_revenue_all * 100) if total_revenue_all > 0 else 0
        pct_m1_m3 = (revenue_m1_m3 / total_revenue_all * 100) if total_revenue_all > 0 else 0
        pct_m4_plus = (revenue_m4_plus / total_revenue_all * 100) if total_revenue_all > 0 else 0
        
        # Revenue per customer no M6 vs M0
        m0_rpc = revenue_by_month[revenue_by_month['months_since_first_purchase'] == 0]
        m6_rpc = revenue_by_month[revenue_by_month['months_since_first_purchase'] == 6]
        
        if len(m0_rpc) > 0 and len(m6_rpc) > 0:
            rpc_m0 = m0_rpc['revenue_per_customer'].iloc[0]
            rpc_m6 = m6_rpc['revenue_per_customer'].iloc[0]
            rpc_growth = ((rpc_m6 - rpc_m0) / rpc_m0 * 100) if rpc_m0 > 0 else 0
        else:
            rpc_growth = None
        
        # Valor incremental de reter cliente por mais 3 meses
        if len(revenue_by_month) >= 4:
            m0_m3_data = revenue_by_month[revenue_by_month['months_since_first_purchase'].between(0, 3)]
            m4_m6_data = revenue_by_month[revenue_by_month['months_since_first_purchase'].between(4, 6)]
            
            if len(m0_m3_data) > 0 and len(m4_m6_data) > 0:
                avg_revenue_m0_m3 = m0_m3_data['revenue_per_customer'].mean()
                avg_revenue_m4_m6 = m4_m6_data['revenue_per_customer'].mean()
                incremental_value = avg_revenue_m4_m6 - avg_revenue_m0_m3
            else:
                incremental_value = None
        else:
            incremental_value = None
        
        print("\n" + "="*60)
        print("INSIGHTS - Revenue Retention:")
        print(f"1. Distribuicao de receita:")
        print(f"   - M0 (primeira compra): {pct_m0:.1f}% da receita total")
        print(f"   - M1-M3 (primeiros 3 meses): {pct_m1_m3:.1f}% da receita total")
        print(f"   - M4+ (apos 3 meses): {pct_m4_plus:.1f}% da receita total")
        print(f"2. Total revenue analisado: R$ {total_revenue_all:,.0f}")
        
        if rpc_growth is not None:
            print(f"3. Revenue per customer no M6 vs M0: Crescimento de {rpc_growth:.1f}%")
            
        if incremental_value is not None and incremental_value > 0:
            print(f"4. Valor incremental de reter cliente por mais 3 meses (M4-M6): R$ {incremental_value:.2f} por cliente")
            
        # Analise de retencao de receita vs retencao de clientes
        if 'retention_rate' in revenue_by_month.columns:
            correlation_rev_ret = revenue_by_month['revenue_per_customer'].corr(revenue_by_month['retention_rate'])
            print(f"5. Correlacao revenue per customer vs retention rate: {correlation_rev_ret:.3f}")
            
        # Valor total em risco baseado em churn
        cohort_size_data = df_cohort[df_cohort['months_since_first_purchase'] == 0]['cohort_size']
        m3_retention_data = revenue_by_month[revenue_by_month['months_since_first_purchase'] == 3]['retention_rate']
        m3_rpc_data = revenue_by_month[revenue_by_month['months_since_first_purchase'] == 3]['revenue_per_customer']
        
        if len(cohort_size_data) > 0 and len(m3_retention_data) > 0 and len(m3_rpc_data) > 0:
            avg_cohort_size = cohort_size_data.mean()
            avg_m3_retention = m3_retention_data.iloc[0]
            avg_rpc_m3 = m3_rpc_data.iloc[0]
            churn_rate_m3 = 1 - avg_m3_retention
            avg_customers_churned = avg_cohort_size * churn_rate_m3
            revenue_at_risk_m3 = avg_customers_churned * avg_rpc_m3
            print(f"6. Revenue at risk (churn ate M3): R$ {revenue_at_risk_m3:,.0f} por cohort em media")
        
    else:
        print(f"AVISO: Colunas de receita nao encontradas. Colunas disponiveis: {list(df_cohort.columns)}")
        print("Colunas esperadas: cohort_revenue, revenue_per_customer, cumulative_revenue")
else:
    print("AVISO: DataFrame vazio.")

## 10. LTV POR COHORT

In [None]:
# Section 10 - LTV (lifetime value) per cohort.
# Builds one summary row per cohort from the notebook-level `df_cohort`
# frame, charts the average LTV and prints the derived insights.
if len(df_cohort) > 0:
    # LTV is derived from the running revenue total, so that column is required
    if 'cumulative_revenue' in df_cohort.columns:
        # Per cohort: peak cumulative revenue divided by the month-0 cohort size
        ltv_by_cohort = []

        for cohort in df_cohort['cohort_month'].unique():
            cohort_data = df_cohort[df_cohort['cohort_month'] == cohort]
            max_cumulative_rev = cohort_data['cumulative_revenue'].max()

            # Cohort size is read from the month-0 row; a cohort without
            # that row gets size 0 and is left out of the summary
            cohort_size_data = cohort_data[cohort_data['months_since_first_purchase'] == 0]['cohort_size']
            cohort_size = cohort_size_data.iloc[0] if len(cohort_size_data) > 0 else 0

            if cohort_size > 0:
                avg_ltv = max_cumulative_rev / cohort_size

                # Month-3 retention is stored alongside so it can be correlated with LTV
                m3_retention_data = cohort_data[cohort_data['months_since_first_purchase'] == 3]['retention_rate']
                m3_retention = m3_retention_data.iloc[0] if len(m3_retention_data) > 0 else None

                ltv_by_cohort.append({
                    'cohort_month': cohort,
                    'cohort_size': cohort_size,
                    'total_cumulative_revenue': max_cumulative_rev,
                    'avg_ltv': avg_ltv,
                    'm3_retention': m3_retention
                })

        if ltv_by_cohort:
            ltv_df = pd.DataFrame(ltv_by_cohort)
            # Chronological view, sorted once and shared by the bar chart,
            # the moving average and the early/late split below
            ltv_sorted = ltv_df.sort_values('cohort_month')

            print("LTV por Cohort:")
            print(ltv_df.sort_values('avg_ltv', ascending=False).head(10))

            # Bar chart: average LTV per cohort
            fig = px.bar(
                ltv_sorted,
                x='cohort_month',
                y='avg_ltv',
                title='LTV Medio por Cohort',
                labels={'cohort_month': 'Mes do Cohort', 'avg_ltv': 'LTV Medio (R$)'},
                text='avg_ltv',
                color='avg_ltv',
                color_continuous_scale='Viridis'
            )

            fig.update_traces(
                texttemplate='R$ %{text:.0f}',
                textposition='outside'
            )

            # Overlay a 3-cohort moving average as the trend line
            fig.add_trace(
                go.Scatter(
                    x=ltv_sorted['cohort_month'],
                    y=ltv_sorted['avg_ltv'].rolling(window=3, min_periods=1).mean(),
                    mode='lines',
                    name='Media Movel (3 meses)',
                    line=dict(color='red', width=3, dash='dash')
                )
            )

            fig.update_layout(
                height=500,
                xaxis_tickangle=-45,
                showlegend=True
            )
            fig.show()

            # Summary statistics
            avg_ltv_all = ltv_df['avg_ltv'].mean()
            max_ltv = ltv_df['avg_ltv'].max()
            min_ltv = ltv_df['avg_ltv'].min()
            ltv_range = max_ltv - min_ltv

            # Correlation between cohort size and LTV
            correlation_size_ltv = ltv_df['cohort_size'].corr(ltv_df['avg_ltv'])

            # Correlation between M3 retention and LTV, only with more than 5 cohorts that have M3 data
            ltv_df_valid = ltv_df[ltv_df['m3_retention'].notna()]
            if len(ltv_df_valid) > 5:
                correlation_retention_ltv = ltv_df_valid['m3_retention'].corr(ltv_df_valid['avg_ltv'])
            else:
                correlation_retention_ltv = None

            # Temporal analysis: older vs more recent cohorts
            if len(ltv_df) > 12:
                # Positional split of the chronologically sorted cohorts. For
                # unique datetime labels this is the same split as
                # "< median" / ">= median", but it does not call
                # Series.median(), which fails on non-numeric labels
                # (cohort_month is presumably a 'YYYY-MM' style string in
                # some data sources - TODO confirm).
                split_at = len(ltv_sorted) // 2
                early_cohorts = ltv_sorted.iloc[:split_at]
                late_cohorts = ltv_sorted.iloc[split_at:]

                avg_ltv_early = early_cohorts['avg_ltv'].mean()
                avg_ltv_late = late_cohorts['avg_ltv'].mean()
                ltv_trend = ((avg_ltv_late - avg_ltv_early) / avg_ltv_early * 100) if avg_ltv_early > 0 else 0
            else:
                avg_ltv_early = avg_ltv_late = ltv_trend = None

            # The best/worst ratio is only meaningful for a positive minimum
            if min_ltv > 0:
                ltv_ratio_label = f"{max_ltv / min_ltv:.1f}x maior"
            else:
                ltv_ratio_label = "razao indefinida"

            print("\n" + "="*60)
            print("INSIGHTS - LTV por Cohort:")
            print(f"1. LTV medio geral de todos os cohorts: R$ {avg_ltv_all:.2f}")
            print(f"2. Diferenca entre melhor e pior cohort: R$ {ltv_range:.2f} ({ltv_ratio_label})")
            print(f"3. Melhor cohort (LTV): R$ {max_ltv:.2f}")
            print(f"4. Pior cohort (LTV): R$ {min_ltv:.2f}")
            print(f"5. Correlacao cohort_size vs LTV: {correlation_size_ltv:.3f}")

            if correlation_retention_ltv is not None:
                print(f"6. Correlacao M3 retention vs LTV: {correlation_retention_ltv:.3f}")

            if ltv_trend is not None:
                # ltv_trend compares late against early cohorts, so the
                # sentence is always phrased from the recent cohorts' side
                if ltv_trend > 0:
                    trend_direction = 'maior'
                elif ltv_trend < 0:
                    trend_direction = 'menor'
                else:
                    trend_direction = 'igual'
                print(f"7. Tendencia temporal: Cohorts mais recentes tem LTV {trend_direction}")
                print(f"   - Early cohorts (antigos): R$ {avg_ltv_early:.2f}")
                print(f"   - Late cohorts (recentes): R$ {avg_ltv_late:.2f}")
                print(f"   - Tendencia: {ltv_trend:+.1f}%")

            # Scatter plot: cohort size vs LTV
            # NOTE(review): plotly's 'ols' trendline presumably needs the
            # optional statsmodels package - confirm it is installed.
            fig = px.scatter(
                ltv_df,
                x='cohort_size',
                y='avg_ltv',
                size='cohort_size',
                color='cohort_month',
                title='Relacao: Cohort Size vs LTV Medio',
                labels={'cohort_size': 'Tamanho do Cohort', 'avg_ltv': 'LTV Medio (R$)', 'cohort_month': 'Mes do Cohort'},
                trendline='ols'
            )

            fig.update_layout(height=500)
            fig.show()

        else:
            print("AVISO: Nao foi possivel calcular LTV (dados insuficientes).")
    else:
        print("AVISO: Coluna 'cumulative_revenue' nao encontrada para calculo de LTV.")
else:
    print("AVISO: DataFrame vazio.")

## 11. SIMULACAO DE MELHORIA DE RETENCAO

In [None]:
# Section 11 - retention-improvement simulation.
# Estimates additional revenue, campaign cost and ROI for three what-if
# scenarios, using averages taken from the notebook-level `df_cohort` frame.

def _mean_retention_or_none(rows):
    """Return the mean of rows['retention_rate'], or None when no usable value exists."""
    if len(rows) == 0:
        return None
    mean_rate = rows['retention_rate'].mean()
    # A mean over all-NaN rates is NaN; report it as missing instead of
    # letting NaN flow into the scenario arithmetic (int(NaN) raises).
    return None if pd.isna(mean_rate) else mean_rate


if len(df_cohort) > 0:
    # Simulation assumptions, named once so the calculations and the printed
    # assumptions list cannot drift apart
    default_cohort_size = 1000
    default_revenue_per_customer = 500
    m1_uplift = 0.05
    m3_uplift = 0.10
    combo_m3_uplift = 0.08
    cost_per_customer_m1 = 20
    cost_per_customer_m3 = 30
    cost_per_customer_combo = 40
    revenue_multiplier_m3 = 2
    revenue_multiplier_combo = 3

    # Current retention at months 1, 3 and 6, averaged across cohorts
    m1_data = df_cohort[df_cohort['months_since_first_purchase'] == 1]
    m3_data = df_cohort[df_cohort['months_since_first_purchase'] == 3]
    m6_data = df_cohort[df_cohort['months_since_first_purchase'] == 6]

    current_m1_retention = _mean_retention_or_none(m1_data)
    current_m3_retention = _mean_retention_or_none(m3_data)
    current_m6_retention = _mean_retention_or_none(m6_data)

    # Average cohort size and revenue per customer, with fixed defaults when
    # the revenue columns are not available
    if 'cumulative_revenue' in df_cohort.columns and 'cohort_size' in df_cohort.columns:
        df_cohort_m0 = df_cohort[df_cohort['months_since_first_purchase'] == 0]
        if len(df_cohort_m0) > 0:
            avg_cohort_size = df_cohort_m0['cohort_size'].mean()
        else:
            avg_cohort_size = default_cohort_size

        # Peak cumulative revenue of each cohort, averaged across cohorts
        max_cumulative_by_cohort = df_cohort.groupby('cohort_month')['cumulative_revenue'].max().reset_index()
        if len(max_cumulative_by_cohort) > 0:
            avg_cumulative_revenue = max_cumulative_by_cohort['cumulative_revenue'].mean()
            avg_revenue_per_customer = avg_cumulative_revenue / avg_cohort_size if avg_cohort_size > 0 else 0
        else:
            avg_revenue_per_customer = default_revenue_per_customer
    else:
        avg_cohort_size = default_cohort_size
        avg_revenue_per_customer = default_revenue_per_customer

    # Explicit None checks: a genuine 0.0 retention is still a valid input
    if current_m1_retention is not None and current_m3_retention is not None:
        print("SIMULACAO: Melhoria de Retencao")
        print("="*60)
        print(f"Cenario Atual:")
        print(f"- M1 Retention: {current_m1_retention:.1%}")
        print(f"- M3 Retention: {current_m3_retention:.1%}")
        if current_m6_retention is not None:
            print(f"- M6 Retention: {current_m6_retention:.1%}")
        print(f"- Cohort size medio: {avg_cohort_size:.0f} clientes")
        print(f"- Revenue per customer medio: R$ {avg_revenue_per_customer:.2f}")

        simulation_scenarios = []

        # Scenario 1: raise M1 retention (capped at 100%)
        new_m1_retention = min(current_m1_retention + m1_uplift, 1.0)
        additional_retention_m1 = new_m1_retention - current_m1_retention
        additional_customers_m1 = avg_cohort_size * additional_retention_m1
        additional_revenue_m1 = additional_customers_m1 * avg_revenue_per_customer
        campaign_cost_m1 = avg_cohort_size * cost_per_customer_m1
        roi_m1 = (additional_revenue_m1 - campaign_cost_m1) / campaign_cost_m1 * 100 if campaign_cost_m1 > 0 else 0

        simulation_scenarios.append({
            'Scenario': f'Improve M1 Retention +{m1_uplift:.0%}',
            # Reported from the actual gain so the label stays truthful when the cap applies
            'Retention Increase': f'{additional_retention_m1:+.1%}',
            'Customers Retained': int(additional_customers_m1),
            'Additional Revenue (R$)': additional_revenue_m1,
            'Campaign Cost (R$)': campaign_cost_m1,
            'ROI': roi_m1
        })

        # Scenario 2: raise M3 retention (capped at 100%), with a revenue multiplier
        new_m3_retention = min(current_m3_retention + m3_uplift, 1.0)
        additional_retention_m3 = new_m3_retention - current_m3_retention
        additional_customers_m3 = avg_cohort_size * additional_retention_m3
        additional_revenue_m3 = additional_customers_m3 * avg_revenue_per_customer * revenue_multiplier_m3
        campaign_cost_m3 = avg_cohort_size * cost_per_customer_m3
        roi_m3 = (additional_revenue_m3 - campaign_cost_m3) / campaign_cost_m3 * 100 if campaign_cost_m3 > 0 else 0

        simulation_scenarios.append({
            'Scenario': f'Improve M3 Retention +{m3_uplift:.0%}',
            'Retention Increase': f'{additional_retention_m3:+.1%}',
            'Customers Retained': int(additional_customers_m3),
            'Additional Revenue (R$)': additional_revenue_m3,
            'Campaign Cost (R$)': campaign_cost_m3,
            'ROI': roi_m3
        })

        # Scenario 3: improve M1 and M3 together. Retained customers are
        # counted from the M3 gain only; the combined effect is represented
        # by the larger revenue multiplier (a modelling assumption).
        new_m1_retention_combo = min(current_m1_retention + m1_uplift, 1.0)
        new_m3_retention_combo = min(current_m3_retention + combo_m3_uplift, 1.0)
        additional_retention_combo_m1 = new_m1_retention_combo - current_m1_retention
        additional_retention_combo_m3 = new_m3_retention_combo - current_m3_retention

        additional_customers_combo = avg_cohort_size * additional_retention_combo_m3
        additional_revenue_combo = additional_customers_combo * avg_revenue_per_customer * revenue_multiplier_combo
        campaign_cost_combo = avg_cohort_size * cost_per_customer_combo
        roi_combo = (additional_revenue_combo - campaign_cost_combo) / campaign_cost_combo * 100 if campaign_cost_combo > 0 else 0

        simulation_scenarios.append({
            'Scenario': f'Improve M1 (+{m1_uplift:.0%}) & M3 (+{combo_m3_uplift:.0%})',
            'Retention Increase': f'{additional_retention_combo_m1:+.1%} (M1), {additional_retention_combo_m3:+.1%} (M3)',
            'Customers Retained': int(additional_customers_combo),
            'Additional Revenue (R$)': additional_revenue_combo,
            'Campaign Cost (R$)': campaign_cost_combo,
            'ROI': roi_combo
        })

        # Results table; numeric values are kept in simulation_df, and a
        # formatted copy is used only for printing
        simulation_df = pd.DataFrame(simulation_scenarios)

        simulation_display = simulation_df.copy()
        simulation_display['Additional Revenue (R$)'] = simulation_display['Additional Revenue (R$)'].apply(lambda x: f'R$ {x:,.0f}')
        simulation_display['Campaign Cost (R$)'] = simulation_display['Campaign Cost (R$)'].apply(lambda x: f'R$ {x:,.0f}')
        simulation_display['ROI'] = simulation_display['ROI'].apply(lambda x: f'{x:.1f}%')

        print("\nTabela de Simulacao de Cenarios:")
        print(simulation_display)

        # Bar chart: ROI per scenario
        fig = px.bar(
            simulation_df,
            x='Scenario',
            y='ROI',
            title='ROI de Diferentes Cenarios de Melhoria de Retencao',
            labels={'Scenario': 'Cenario', 'ROI': 'ROI (%)'},
            text='ROI',
            color='ROI',
            color_continuous_scale='RdYlGn'
        )

        fig.update_traces(
            texttemplate='%{text:.1f}%',
            textposition='outside'
        )
        fig.update_layout(
            height=400,
            showlegend=False
        )
        fig.show()

        # Recommend the scenario with the highest ROI
        best_scenario_idx = simulation_df['ROI'].idxmax()
        best_scenario = simulation_df.loc[best_scenario_idx]

        print("\n" + "="*60)
        print("RECOMENDACAO BASEADA NA SIMULACAO:")
        print(f"Cenario ideal: {best_scenario['Scenario']}")
        print(f"ROI esperado: {best_scenario['ROI']:.1f}%")
        print(f"Clientes adicionais retidos: {best_scenario['Customers Retained']:,}")
        print(f"Receita adicional: R$ {best_scenario['Additional Revenue (R$)']:,.0f}")
        print(f"Custo da campanha: R$ {best_scenario['Campaign Cost (R$)']:,.0f}")

        # Assumptions behind the simulation, printed from the same constants used above
        print("\nPressupostos da Simulacao:")
        print(f"1. Cohort size medio: {avg_cohort_size:.0f} clientes")
        print(f"2. Revenue per customer medio: R$ {avg_revenue_per_customer:.2f}")
        print(f"3. Custo por cliente (M1): R$ {cost_per_customer_m1}")
        print(f"4. Custo por cliente (M3): R$ {cost_per_customer_m3}")
        print(f"5. Custo por cliente (combinado): R$ {cost_per_customer_combo}")
        print(f"6. Multiplicador de revenue (M3): {revenue_multiplier_m3}x")
        print(f"7. Multiplicador de revenue (combinado): {revenue_multiplier_combo}x")
    else:
        print("AVISO: Dados de retencao insuficientes para simulacao.")
else:
    print("AVISO: DataFrame vazio.")

## 12. RECOMENDACOES POR COHORT PERFORMANCE

In [None]:
# Section 12 - recommendations by cohort performance.
# Ranks cohorts by month-3 retention, splits them into three performance
# groups, prints a recommendation table and exports supporting CSV files.
if len(df_cohort) == 0:
    print("AVISO: DataFrame vazio.")
else:
    # Only the month-3 rows are used to classify cohorts
    df_m3 = df_cohort[df_cohort['months_since_first_purchase'] == 3]

    if len(df_m3) == 0:
        print("AVISO: Nao ha dados para months_since_first_purchase = 3")
    else:
        # Percentile rank of each cohort's month-3 retention (new frame, source rows untouched)
        df_m3 = df_m3.assign(retention_rank=df_m3['retention_rate'].rank(pct=True))
        rank_pct = df_m3['retention_rank']

        # Three groups: top 30%, middle 40%, bottom 30% by percentile rank
        high_performers = df_m3[rank_pct >= 0.7]
        medium_performers = df_m3[(rank_pct >= 0.3) & (rank_pct < 0.7)]
        low_performers = df_m3[rank_pct < 0.3]

        # Recommended playbook for each group
        group_playbook = {
            'High Performers': {
                'Action': 'Manter engajamento, programas de fidelidade premium, referrals',
                'Channel': 'Email personalizado, App push, Programa de embaixadores',
                'Frequency': 'Mensal',
                'Expected Lift': 'Retencao estavel (+0-2%)',
            },
            'Medium Performers': {
                'Action': 'Campanhas de incentivo, upsell, cross-sell, onboarding melhorado',
                'Channel': 'Email marketing, SMS, Notificacoes push',
                'Frequency': 'Quinzenal',
                'Expected Lift': 'Melhoria moderada (+3-5%)',
            },
            'Low Performers': {
                'Action': 'Win-back agressivo, pesquisa de satisfacao, ofertas personalizadas, revisao de onboarding',
                'Channel': 'Email win-back, Telefone, Pesquisa NPS, Ofertas diretas',
                'Frequency': 'Semanal (primeiro mes), depois quinzenal',
                'Expected Lift': 'Recuperacao significativa (+5-10%)',
            },
        }

        segments = (
            ('High Performers', high_performers),
            ('Medium Performers', medium_performers),
            ('Low Performers', low_performers),
        )

        # One table row per non-empty group: headline metrics plus its playbook
        recommendations_data = [
            {
                'Cohort Group': segment_name,
                'Cohort Count': len(segment_rows),
                'Total Customers': f"{segment_rows['cohort_size'].sum():,}",
                'Avg M3 Retention': f"{segment_rows['retention_rate'].mean() * 100:.1f}%",
                **group_playbook[segment_name],
            }
            for segment_name, segment_rows in segments
            if len(segment_rows) > 0
        ]

        if not recommendations_data:
            print("AVISO: Nao foi possivel criar recomendacoes.")
        else:
            recommendations_df = pd.DataFrame(recommendations_data)

            print("RECOMENDACOES POR COHORT PERFORMANCE:")
            print("="*60)
            print(recommendations_df)

            # Export the low performers, with retention written as a percentage string
            if len(low_performers) > 0:
                low_performers_export = low_performers[['cohort_month', 'cohort_size', 'retention_rate']].copy()
                low_performers_export['retention_rate'] = low_performers_export['retention_rate'].map('{:.1%}'.format)
                low_performers_export.to_csv('low_performing_cohorts.csv', index=False)
                print(f"\nLista de Low Performers exportada para 'low_performing_cohorts.csv' ({len(low_performers)} cohorts)")

            # Export the retention matrix too, when an earlier cell created it
            if 'retention_pivot' in locals():
                retention_pivot.to_csv('retention_matrix.csv')
                print("Retention matrix exportada para 'retention_matrix.csv'")

            print("\n" + "="*60)
            print("RESUMO DOS GRUPOS:")
            for row in recommendations_df.to_dict('records'):
                print(f"\n{row['Cohort Group']}:")
                print(f"  - Cohorts: {row['Cohort Count']}")
                print(f"  - Clientes: {row['Total Customers']}")
                print(f"  - M3 Retention: {row['Avg M3 Retention']}")
                print(f"  - Acao Principal: {row['Action'][:80]}...")

## 13. ANALISE TEMPORAL DE COHORTS

In [None]:
# Section 13 - temporal analysis of cohorts.
# Compares retention of early vs late cohorts, by quarter and by calendar
# month, working on a copy of the notebook-level `df_cohort` frame. Falls back
# to a per-year view when the detailed analysis raises.
if len(df_cohort) > 0:
    df_cohort_temp = df_cohort.copy()

    try:
        # Derive date, quarter and year from the cohort label
        df_cohort_temp['cohort_date'] = pd.to_datetime(df_cohort_temp['cohort_month'])
        df_cohort_temp['cohort_quarter'] = df_cohort_temp['cohort_date'].dt.to_period('Q').astype(str)
        df_cohort_temp['cohort_year'] = df_cohort_temp['cohort_date'].dt.year

        # Early vs late split on the cohort date, taken over the distinct
        # cohorts. Splitting on the median of the row-level year leaves the
        # 'Early' group empty whenever the first year holds most of the rows.
        cohort_dates = (
            df_cohort_temp['cohort_date']
            .dropna()
            .drop_duplicates()
            .sort_values()
            .reset_index(drop=True)
        )
        split_date = cohort_dates.iloc[len(cohort_dates) // 2]
        split_label = split_date.strftime('%Y-%m')
        df_cohort_temp['cohort_period'] = np.where(
            df_cohort_temp['cohort_date'] < split_date, 'Early', 'Late'
        )

        # Average retention and cohort size per period and month offset
        period_analysis = df_cohort_temp.groupby(['cohort_period', 'months_since_first_purchase']).agg({
            'retention_rate': 'mean',
            'cohort_size': 'mean'
        }).reset_index()

        print("Analise Temporal: Early vs Late Cohorts")
        print(period_analysis.head(10))

        # Line chart: retention curves per period
        fig = px.line(
            period_analysis,
            x='months_since_first_purchase',
            y='retention_rate',
            color='cohort_period',
            title='Curvas de Retencao: Early vs Late Cohorts',
            labels={'months_since_first_purchase': 'Meses desde Primeira Compra', 
                    'retention_rate': 'Taxa de Retencao',
                    'cohort_period': 'Periodo do Cohort'},
            markers=True
        )

        fig.update_layout(
            height=500,
            yaxis_tickformat='.0%'
        )
        fig.show()

        # Quarterly view, only with at least four distinct quarters
        if len(df_cohort_temp['cohort_quarter'].unique()) >= 4:
            quarter_analysis = df_cohort_temp.groupby(['cohort_quarter', 'months_since_first_purchase']).agg({
                'retention_rate': 'mean'
            }).reset_index()

            # Keep only month 3 for the quarterly comparison
            quarter_m3 = quarter_analysis[quarter_analysis['months_since_first_purchase'] == 3]

            if len(quarter_m3) > 0:
                fig = px.bar(
                    quarter_m3.sort_values('cohort_quarter'),
                    x='cohort_quarter',
                    y='retention_rate',
                    title='M3 Retention por Trimestre',
                    labels={'cohort_quarter': 'Trimestre', 'retention_rate': 'Retencao M3'},
                    text='retention_rate',
                    color='retention_rate',
                    color_continuous_scale='RdYlGn'
                )

                fig.update_traces(
                    texttemplate='%{text:.1%}',
                    textposition='outside'
                )
                fig.update_layout(
                    height=400,
                    xaxis_tickangle=-45
                )
                fig.show()

        # Month-3 retention of each period, for the early-vs-late comparison
        early_cohorts_data = period_analysis[period_analysis['cohort_period'] == 'Early']
        late_cohorts_data = period_analysis[period_analysis['cohort_period'] == 'Late']

        m3_early = early_cohorts_data[early_cohorts_data['months_since_first_purchase'] == 3]
        m3_late = late_cohorts_data[late_cohorts_data['months_since_first_purchase'] == 3]

        if len(m3_early) > 0 and len(m3_late) > 0:
            m3_early_rate = m3_early['retention_rate'].iloc[0]
            m3_late_rate = m3_late['retention_rate'].iloc[0]
            m3_change = ((m3_late_rate - m3_early_rate) / m3_early_rate * 100) if m3_early_rate > 0 else 0
        else:
            m3_early_rate = m3_late_rate = m3_change = None

        n_early_cohorts = len(df_cohort_temp[df_cohort_temp['cohort_period'] == 'Early']['cohort_month'].unique())
        n_late_cohorts = len(df_cohort_temp[df_cohort_temp['cohort_period'] == 'Late']['cohort_month'].unique())

        print("\n" + "="*60)
        print("INSIGHTS - Analise Temporal:")
        print(f"1. Periodo de analise: {df_cohort_temp['cohort_year'].min()} a {df_cohort_temp['cohort_year'].max()}")
        print(f"2. Total de cohorts: {df_cohort_temp['cohort_month'].nunique()}")
        print(f"3. Early cohorts (antes de {split_label}): {n_early_cohorts}")
        print(f"4. Late cohorts ({split_label}+): {n_late_cohorts}")

        if m3_change is not None:
            print(f"5. Performance M3: Late cohorts tem retencao {m3_change:+.1f}% vs early cohorts")
            print(f"   - Early cohorts M3: {m3_early_rate:.1%}")
            print(f"   - Late cohorts M3: {m3_late_rate:.1%}")

            if m3_change > 0:
                print(f"6. Tendencia: Retencao MELHORANDO ao longo do tempo")
            elif m3_change < 0:
                print(f"6. Tendencia: Retencao PIORANDO ao longo do tempo")
            else:
                print(f"6. Tendencia: Retencao ESTAVEL ao longo do tempo")

        # Seasonality: month-3 retention grouped by the acquisition calendar month
        if 'cohort_date' in df_cohort_temp.columns:
            df_cohort_temp['cohort_month_num'] = df_cohort_temp['cohort_date'].dt.month

            seasonal_analysis = df_cohort_temp.groupby(['cohort_month_num', 'months_since_first_purchase']).agg({
                'retention_rate': 'mean'
            }).reset_index()

            # Explicit copy so the new column is added to an independent frame,
            # not to a filtered slice of seasonal_analysis
            seasonal_m3 = seasonal_analysis[seasonal_analysis['months_since_first_purchase'] == 3].copy()

            if len(seasonal_m3) > 0:
                month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
                seasonal_m3['month_name'] = seasonal_m3['cohort_month_num'].apply(lambda x: month_names[x-1] if 1 <= x <= 12 else str(x))

                best_month = seasonal_m3.loc[seasonal_m3['retention_rate'].idxmax()]
                worst_month = seasonal_m3.loc[seasonal_m3['retention_rate'].idxmin()]

                print(f"7. Sazonalidade - Melhor mes para aquisicao: {best_month['month_name']} ({best_month['retention_rate']:.1%} M3 retention)")
                print(f"8. Sazonalidade - Pior mes para aquisicao: {worst_month['month_name']} ({worst_month['retention_rate']:.1%} M3 retention)")

    except Exception as e:
        # Best-effort cell: any failure above downgrades to a per-year view
        print(f"AVISO: Nao foi possivel fazer analise temporal detalhada. Erro: {e}")
        print("Tentando analise basica por ano...")

        try:
            # Take the year from the first four characters of the label;
            # astype(str) lets this also work when the label is not a string
            df_cohort_temp['cohort_year'] = df_cohort_temp['cohort_month'].astype(str).str[:4].astype(int)
            year_analysis = df_cohort_temp.groupby(['cohort_year', 'months_since_first_purchase']).agg({
                'retention_rate': 'mean'
            }).reset_index()

            # Keep only month 3
            year_m3 = year_analysis[year_analysis['months_since_first_purchase'] == 3]

            if len(year_m3) > 1:
                fig = px.line(
                    year_m3.sort_values('cohort_year'),
                    x='cohort_year',
                    y='retention_rate',
                    title='Evolucao da Retencao M3 por Ano',
                    labels={'cohort_year': 'Ano', 'retention_rate': 'Retencao M3'},
                    markers=True
                )

                fig.update_layout(
                    height=400,
                    yaxis_tickformat='.0%'
                )
                fig.show()

                print(f"Analise por ano: {len(year_m3)} anos analisados")

        except Exception as fallback_error:
            # Still best-effort, but no longer swallows KeyboardInterrupt /
            # SystemExit, and the cause is shown instead of being hidden
            print("AVISO: Analise temporal nao disponivel com formato atual dos dados.")
            print(f"Erro: {fallback_error}")

else:
    print("AVISO: DataFrame vazio.")

## 14. KEY INSIGHTS & EXECUTIVE SUMMARY

In [None]:
# Compute the final metrics for the executive summary.
def _mean_retention_pct(frame):
    """Return the mean of ``retention_rate`` in *frame*, scaled by 100.

    Returns None when *frame* has no rows or the mean is NaN, so callers can
    distinguish "no data" from a genuine value of 0.
    """
    if len(frame) == 0:
        return None
    mean_rate = frame['retention_rate'].mean()
    return None if pd.isna(mean_rate) else mean_rate * 100


if len(df_cohort) > 0:
    # M1: average retention and the complementary first-month loss.
    m1_data = df_cohort[df_cohort['months_since_first_purchase'] == 1]
    avg_m1_retention = _mean_retention_pct(m1_data)
    m1_loss = None if avg_m1_retention is None else 100 - avg_m1_retention

    # M3: average retention plus the best and worst cohorts.
    m3_data = df_cohort[df_cohort['months_since_first_purchase'] == 3]
    avg_m3_retention = m3_active = _mean_retention_pct(m3_data)
    # Always (re)initialise these names so a value left over from a previous
    # run of this cell can never be reported when M3 data is missing.
    best_cohort_m3 = worst_cohort_m3 = None
    best_cohort_value = worst_cohort_value = variation_range = None
    if avg_m3_retention is not None:
        m3_rates = m3_data['retention_rate']
        # A non-NaN mean guarantees at least one valid rate, so idxmax/idxmin
        # (which skip NaN by default) have something to return.
        best_idx = m3_rates.idxmax()
        worst_idx = m3_rates.idxmin()
        best_cohort_m3 = m3_data.loc[best_idx, 'cohort_month']
        best_cohort_value = m3_data.loc[best_idx, 'retention_rate'] * 100
        worst_cohort_m3 = m3_data.loc[worst_idx, 'cohort_month']
        worst_cohort_value = m3_data.loc[worst_idx, 'retention_rate'] * 100
        variation_range = best_cohort_value - worst_cohort_value

    # M6: average retention only.
    m6_data = df_cohort[df_cohort['months_since_first_purchase'] == 6]
    avg_m6_retention = _mean_retention_pct(m6_data)

    # Drop-off: reuse `dropoff_by_month` if an earlier cell produced it.
    critical_drop_pct = None
    if 'dropoff_by_month' in locals() and len(dropoff_by_month) > 0:
        # The most negative average drop is the critical period.
        critical_drop_value = dropoff_by_month['avg_drop_value'].min()
        if not pd.isna(critical_drop_value):
            critical_drop_pct = abs(critical_drop_value) * 100

    # Revenue impact: reuse `revenue_at_risk_m3` if an earlier cell produced it.
    revenue_at_risk = 0
    if 'revenue_at_risk_m3' in locals():
        revenue_at_risk = revenue_at_risk_m3

    # Retention ROI: reuse `simulation_df` if an earlier cell produced it.
    roi_retention = 0
    if 'simulation_df' in locals() and len(simulation_df) > 0:
        roi_retention = simulation_df['ROI'].max()

    # Total customers = sum of cohort sizes taken from the month-0 rows.
    df_cohort_m0 = df_cohort[df_cohort['months_since_first_purchase'] == 0]
    total_customers = df_cohort_m0['cohort_size'].sum() if len(df_cohort_m0) > 0 else 0

    # Customers that could be retained with an intervention.
    # `is not None` (rather than truthiness) keeps a 0% M1 retention, i.e. a
    # 100% loss, from being treated as "no data".
    if m1_loss is not None and total_customers > 0:
        intervention_success_rate = 0.15  # assumed 15% success rate
        customers_saved = total_customers * (m1_loss / 100) * intervention_success_rate
    else:
        customers_saved = 0

    print("=" * 60)
    print("COHORT RETENTION ANALYSIS - SUMMARY")
    print("=" * 60)
    print(f"Total Cohorts Analyzed: {df_cohort['cohort_month'].nunique()}")
    print(f"Total Customers: {total_customers:,}")

    if avg_m1_retention is not None:
        print(f"Average M1 Retention: {avg_m1_retention:.2f}%")

    if avg_m3_retention is not None:
        print(f"Average M3 Retention: {avg_m3_retention:.2f}%")

    if avg_m6_retention is not None:
        print(f"Average M6 Retention: {avg_m6_retention:.2f}%")

    if revenue_at_risk > 0:
        print(f"Revenue at Risk (annual): R$ {revenue_at_risk:,.0f}")

    if roi_retention > 0:
        print(f"Estimated ROI of Retention Campaign: {roi_retention:.1f}%")

    if variation_range is not None:
        print(f"Best vs Worst Cohort Variation: {variation_range:.1f} percentage points")

    if critical_drop_pct is not None:
        print(f"Critical Drop-off Period: {critical_drop_pct:.1f}% loss")

    print(f"Customers Potentially Saved with Intervention: {customers_saved:.0f}")
    print("=" * 60)

    # Render the executive summary text from the computed values.
    print("\n" + "="*60)
    print("EXECUTIVE SUMMARY ATUALIZADO:")
    print("="*60)

    # Collect the pieces and join once instead of repeated `+=`.
    summary_parts = ["Key Findings:\n\n", "Retention Patterns:\n"]

    if avg_m1_retention is not None:
        summary_parts.append(
            f"- M1 retention: {avg_m1_retention:.1f}% (perda de {m1_loss:.1f}% dos clientes no primeiro mes)\n"
        )
    else:
        summary_parts.append("- M1 retention: Dados nao disponiveis\n")

    if avg_m3_retention is not None:
        summary_parts.append(
            f"- M3 retention: {avg_m3_retention:.1f}% (apenas {m3_active:.1f}% permanecem ativos apos 3 meses)\n"
        )
    else:
        summary_parts.append("- M3 retention: Dados nao disponiveis\n")

    if avg_m6_retention is not None:
        summary_parts.append(f"- M6 retention: {avg_m6_retention:.1f}% (base estabilizada)\n\n")
    else:
        summary_parts.append("- M6 retention: Dados nao disponiveis\n\n")

    summary_parts.append("Critical Periods:\n")
    if m1_loss is not None:
        summary_parts.append(f"- Maior drop-off: Entre M0 e M1 (perda de {m1_loss:.1f}%)\n")

    if avg_m1_retention is not None and avg_m3_retention is not None:
        m1_m3_loss = avg_m1_retention - avg_m3_retention
        summary_parts.append(f"- Segundo maior: Entre M1 e M3 (perda de {m1_m3_loss:.1f}%)\n")

    summary_parts.append(
        f"- Oportunidade: Intervencao entre dias 30-60 pode reter {customers_saved:.0f} clientes\n\n"
    )

    # The best/worst cohort values are only set when M3 retention is available.
    if avg_m3_retention is not None:
        summary_parts.append("Cohort Performance:\n")
        summary_parts.append(f"- Best cohort: {best_cohort_m3} com {best_cohort_value:.1f}% M3 retention\n")
        summary_parts.append(f"- Worst cohort: {worst_cohort_m3} com {worst_cohort_value:.1f}% M3 retention\n")
        summary_parts.append(f"- Variacao: Range de {variation_range:.1f} percentage points\n\n")

    summary_parts.append("Revenue Impact:\n")
    summary_parts.append(f"- Valor total em risco (churn): R$ {revenue_at_risk:,.0f}\n")

    if roi_retention > 0:
        summary_parts.append(f"- ROI de campanha de retencao: {roi_retention:.1f}%\n\n")
    else:
        summary_parts.append("- ROI de campanha de retencao: [CALCULAR]%\n\n")

    summary_parts.extend([
        "Recommendations Priority:\n",
        "1. Implementar campanha onboarding (dias 0-30)\n",
        "2. Win-back automatico em dia 45 (before M2)\n",
        "3. Programa de fidelidade para M3+ survivors\n",
        "4. Pesquisa de satisfacao em M1 para low-performing cohorts\n",
    ])

    summary_text = "".join(summary_parts)
    print(summary_text)

else:
    print("AVISO: Nao foi possivel gerar summary (DataFrame vazio).")

print("\n" + "="*60)
print("ANALISE DE COHORT RETENTION CONCLUIDA")
print("="*60)