# Geographic LTV Analysis
- **Project:** QuintoAndar Case Study - Olist E-Commerce Analysis
- **Notebook:** 04 - LTV & Geographic Performance
- **Author:** Data Science Team
- **Date:** 2024-12-10

## Objectives:
- Analyze LTV distribution across Brazilian states
- Identify high-value and expansion opportunity regions
- Correlate LTV with operational metrics (delivery, reviews)
- Map geographic concentration and market penetration
- Provide actionable recommendations for geographic expansion


## 1. SETUP & IMPORTS

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from google.cloud import bigquery
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

# Notebook-wide display settings: floats print with two decimals and wide
# frames are never column-truncated.
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_columns', None)
# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# BigQuery client bound to the analysis project; PROJECT_ID is also
# interpolated into the table paths of the queries in the data-loading cell.
PROJECT_ID = "quintoandar-ecommerce-analysis"
client = bigquery.Client(project=PROJECT_ID)

# Confirmation banner with the project and today's date (YYYY-MM-DD).
print("Setup completed successfully!")
print(f"Project ID: {PROJECT_ID}")
print(f"Date: {datetime.now():%Y-%m-%d}")

## 2. DATA LOADING

In [None]:
# Query 1: Geographic Performance (every column of the state-level mart)
query_geo = f"""
SELECT *
FROM `{PROJECT_ID}.olist_marts.mart_geographic_performance`
ORDER BY state_code
"""

# Query 2: Customer LTV (one row per customer_id is assumed - TODO confirm
# against the mart definition)
query_ltv = f"""
SELECT 
    customer_id,
    customer_state,
    ltv,
    total_orders,
    total_revenue,
    avg_order_value,
    customer_lifespan_days,
    first_purchase_date,
    last_purchase_date
FROM `{PROJECT_ID}.olist_marts.mart_customer_ltv`
ORDER BY ltv DESC
"""

# Load data.
# Each table is loaded independently. The load stays best-effort (a failed
# query yields an empty DataFrame that the validation below reports), but a
# failure in one query no longer discards the table that did load.
loaded_frames = {}
load_failed = False
for table_label, table_sql in (
    ("geographic performance", query_geo),
    ("customer LTV", query_ltv),
):
    try:
        loaded_frames[table_label] = client.query(table_sql).to_dataframe()
    except Exception as e:
        load_failed = True
        loaded_frames[table_label] = pd.DataFrame()
        print(f"Error loading data ({table_label}): {e}")

df_geo = loaded_frames["geographic performance"]
df_customer_ltv = loaded_frames["customer LTV"]

if not load_failed:
    print("Data loaded successfully!")

# Validate data
print("\n=== VALIDATION CHECKS ===")

# Report the row count of each table, or that it is empty.
for table_name, frame in (
    ("Geographic Performance", df_geo),
    ("Customer LTV", df_customer_ltv),
):
    if len(frame) > 0:
        print(f"{table_name}: {len(frame)} rows loaded")
    else:
        print(f"{table_name} DataFrame is empty")

# Check critical columns
critical_geo_columns = ['state_code', 'state_name', 'region', 'avg_ltv', 'total_customers', 'total_revenue']
missing_geo_cols = [col for col in critical_geo_columns if col not in df_geo.columns]
if missing_geo_cols:
    print(f"Missing columns in geographic data: {missing_geo_cols}")
else:
    print("All critical columns present in geographic data")

critical_ltv_columns = ['customer_id', 'customer_state', 'ltv']
missing_ltv_cols = [col for col in critical_ltv_columns if col not in df_customer_ltv.columns]
if missing_ltv_cols:
    print(f"Missing columns in customer LTV data: {missing_ltv_cols}")
else:
    print("All critical columns present in customer LTV data")

# Display basic info
print("\n=== GEOGRAPHIC DATA SHAPE ===")
print(f"Shape: {df_geo.shape}")
print("\nFirst 5 rows:")
print(df_geo.head())
print("\nData types:")
print(df_geo.dtypes)

print("\n=== CUSTOMER LTV DATA SHAPE ===")
print(f"Shape: {df_customer_ltv.shape}")
print("\nFirst 5 rows:")
print(df_customer_ltv.head())

# Statistical summary (only meaningful when the LTV column was loaded)
print("\n=== STATISTICAL SUMMARY (GEOGRAPHIC DATA) ===")
if len(df_geo) > 0 and 'avg_ltv' in df_geo.columns:
    print(df_geo.describe())
else:
    print("Cannot display summary - insufficient data")

## 3. GEOGRAPHIC OVERVIEW (KPIs + CHART)

In [None]:
print("=== GEOGRAPHIC LTV OVERVIEW ===\n")

# Columns this cell cannot work without. 'region' is optional and only feeds
# the regional-dominance insight.
overview_required = ['state_code', 'avg_ltv', 'total_customers']

# Calculate KPIs dynamically
if len(df_geo) > 0 and all(col in df_geo.columns for col in overview_required):
    # KPI 1: Total states
    total_states = df_geo['state_code'].nunique()

    # KPI 2: Total customers
    total_customers = df_geo['total_customers'].sum()

    # KPI 3: Average LTV Brazil
    # NOTE: this is the simple mean of the per-state averages; it is not
    # weighted by each state's customer count.
    avg_ltv_brazil = df_geo['avg_ltv'].mean()

    # KPI 4 / KPI 5: States with the highest and lowest LTV.
    # The frame is known to be non-empty here, so no fallback value is needed.
    max_ltv_state = df_geo.loc[df_geo['avg_ltv'].idxmax(), 'state_code']
    max_ltv_value = df_geo['avg_ltv'].max()
    min_ltv_state = df_geo.loc[df_geo['avg_ltv'].idxmin(), 'state_code']
    min_ltv_value = df_geo['avg_ltv'].min()

    # KPI 6: LTV range
    ltv_range = max_ltv_value - min_ltv_value

    # KPI 7: Coefficient of variation (std as a percentage of the mean)
    ltv_std = df_geo['avg_ltv'].std()
    ltv_cv = (ltv_std / avg_ltv_brazil * 100) if avg_ltv_brazil != 0 else 0

    # Display KPIs
    print(f"1. Total states analyzed: {total_states}")
    print(f"2. Total customers analyzed: {total_customers:,}")
    print(f"3. Average LTV (Brazil): R$ {avg_ltv_brazil:.2f}")
    print(f"4. State with highest LTV: {max_ltv_state} (R$ {max_ltv_value:.2f})")
    print(f"5. State with lowest LTV: {min_ltv_state} (R$ {min_ltv_value:.2f})")
    print(f"6. LTV range: R$ {ltv_range:.2f}")
    print(f"7. Coefficient of variation: {ltv_cv:.1f}%")

    # Top and bottom states by LTV. head/tail return fewer than three rows
    # when fewer states are loaded, so the insight lines are built from
    # whatever is available instead of indexing positions 0..2.
    df_sorted = df_geo.sort_values('avg_ltv', ascending=False)
    top_3_states = df_sorted.head(3)['state_code'].tolist()
    top_3_values = df_sorted.head(3)['avg_ltv'].tolist()
    bottom_3_states = df_sorted.tail(3)['state_code'].tolist()
    bottom_3_values = df_sorted.tail(3)['avg_ltv'].tolist()
    top_values_text = ', '.join(f"{value:.2f}" for value in top_3_values)
    bottom_values_text = ', '.join(f"{value:.2f}" for value in bottom_3_values)

    # Percent difference between best and worst
    percent_diff = ((max_ltv_value - min_ltv_value) / min_ltv_value * 100) if min_ltv_value != 0 else 0

    # Regional pattern (skipped when the mart has no 'region' column)
    if 'region' in df_geo.columns:
        region_avg_ltv = df_geo.groupby('region')['avg_ltv'].mean().sort_values(ascending=False)
    else:
        region_avg_ltv = pd.Series(dtype=float)
    top_region = region_avg_ltv.index[0] if len(region_avg_ltv) > 0 else "N/A"
    top_region_value = region_avg_ltv.iloc[0] if len(region_avg_ltv) > 0 else 0

    # States whose average LTV sits below the (unweighted) national average
    states_below_avg = int((df_geo['avg_ltv'] < avg_ltv_brazil).sum())

    # Insights
    print("\n=== INSIGHTS ===")
    print(f"1. Top {len(top_3_states)} states with highest LTV: {', '.join(top_3_states)} (R$ {top_values_text})")
    print(f"2. Bottom {len(bottom_3_states)} states with lowest LTV: {', '.join(bottom_3_states)} (R$ {bottom_values_text})")
    print(f"3. Percent difference between best ({max_ltv_state}) and worst ({min_ltv_state}): {percent_diff:.1f}%")
    if len(region_avg_ltv) > 0:
        print(f"4. Regional dominance: {top_region} region has highest average LTV (R$ {top_region_value:.2f})")
    else:
        print("4. Regional dominance: not available (no region data)")
    print(f"5. Opportunity: States with LTV below national average: {states_below_avg} out of {total_states}")

    # Visualization 1: Horizontal Bar Chart
    fig1 = px.bar(
        df_sorted,
        y='state_code',
        x='avg_ltv',
        orientation='h',
        color='avg_ltv',
        color_continuous_scale='RdYlGn',
        title='Average LTV by State (Sorted)',
        labels={'avg_ltv': 'Average LTV (R$)', 'state_code': 'State'},
        text='avg_ltv'
    )

    fig1.update_traces(
        texttemplate='R$ %{text:.0f}',
        textposition='outside'
    )

    fig1.update_layout(
        height=800,
        yaxis={'categoryorder': 'total ascending'},
        coloraxis_showscale=True,
        xaxis_title="Average LTV (R$)",
        yaxis_title="State"
    )

    fig1.show()

else:
    print("Insufficient data for analysis")


## 4. BRAZIL MAP - LTV HEATMAP


In [None]:
print("=== BRAZIL HEATMAP - LTV DISTRIBUTION ===\n")

if len(df_geo) > 0 and 'state_code' in df_geo.columns and 'avg_ltv' in df_geo.columns:
    # Check if we have all 27 states (26 states + DF). Distinct codes are
    # counted so a duplicated row cannot inflate the figure.
    states_present = df_geo['state_code'].nunique()
    print(f"States in data: {states_present} out of 27")

    if states_present < 27:
        print(f"Note: Missing {27 - states_present} state(s)")

    # Only hover columns that were actually loaded are passed to plotly; the
    # validation step does not guarantee the delivery/review columns exist.
    heatmap_hover_candidates = ['state_name', 'total_customers', 'total_revenue',
                                'avg_delivery_days', 'avg_review_score']
    heatmap_hover = [col for col in heatmap_hover_candidates if col in df_geo.columns]

    # Visualization 2: Choropleth Map
    # NOTE(review): locationmode='ISO-3' matches `locations` against
    # three-letter country codes, while the insight code below treats
    # state_code as two-letter state abbreviations ('SP', 'RJ', ...). If that
    # is what the column holds, no region will be coloured. Drawing Brazilian
    # states needs a state-boundary GeoJSON passed through `geojson=` /
    # `featureidkey=`; none is available in this notebook, so the call is
    # left as-is - confirm and supply the GeoJSON.
    fig2 = px.choropleth(
        df_geo,
        locations='state_code',
        locationmode='ISO-3',
        color='avg_ltv',
        hover_data=heatmap_hover or None,
        color_continuous_scale='RdYlGn',
        title='Brazil LTV Heatmap by State',
        labels={'avg_ltv': 'Average LTV (R$)'},
        scope='south america'
    )

    fig2.update_layout(
        geo=dict(
            showframe=False,
            showcoastlines=False,
            projection_type='mercator',
            center={'lat': -14, 'lon': -55},
            lataxis_range=[-35, 5],
            lonaxis_range=[-75, -30]
        ),
        height=600,
        coloraxis_colorbar=dict(
            title="LTV (R$)",
            thickness=20,
            len=0.75
        )
    )

    fig2.show()

    # Insights
    print("\n=== INSIGHTS FROM HEATMAP ===")

    # Check geographic concentration
    southeast_states = ['SP', 'RJ', 'MG', 'ES']
    southeast_data = df_geo[df_geo['state_code'].isin(southeast_states)]

    if len(southeast_data) > 0:
        southeast_avg_ltv = southeast_data['avg_ltv'].mean()

        # The revenue share needs the revenue column; the LTV line does not.
        if 'total_revenue' in df_geo.columns:
            overall_revenue = df_geo['total_revenue'].sum()
            southeast_revenue_pct = (southeast_data['total_revenue'].sum() / overall_revenue * 100) if overall_revenue != 0 else 0
            print(f"1. Southeast region (SP, RJ, MG, ES) represents {southeast_revenue_pct:.1f}% of total revenue")

        print(f"2. Average LTV in Southeast: R$ {southeast_avg_ltv:.2f}")

    # Identify states with low penetration (customer count below the median)
    if 'total_customers' in df_geo.columns:
        low_customer_states = df_geo[df_geo['total_customers'] < df_geo['total_customers'].median()]
        if len(low_customer_states) > 0:
            print(f"3. Low penetration states: {', '.join(low_customer_states['state_code'].tolist())}")

    # Identify clusters: strictly above the 75th / below the 25th percentile
    high_ltv_states = df_geo[df_geo['avg_ltv'] > df_geo['avg_ltv'].quantile(0.75)]
    low_ltv_states = df_geo[df_geo['avg_ltv'] < df_geo['avg_ltv'].quantile(0.25)]

    if len(high_ltv_states) > 0:
        print(f"4. High LTV cluster: {', '.join(high_ltv_states['state_code'].tolist())}")

    if len(low_ltv_states) > 0:
        print(f"5. Low LTV cluster: {', '.join(low_ltv_states['state_code'].tolist())}")

else:
    print("Insufficient data for heatmap")

## 5. SCATTER PLOT: VOLUME VS LTV

In [None]:
print("=== VOLUME VS LTV ANALYSIS ===\n")

# Quadrant labels, shared by the classification, the revenue estimate and the
# printed breakdowns below.
QUADRANT_IDEAL = 'High Volume, High LTV (Ideal)'
QUADRANT_PROBLEM = 'High Volume, Low LTV (Problem)'
QUADRANT_OPPORTUNITY = 'Low Volume, High LTV (Opportunity)'
QUADRANT_CHALLENGE = 'Low Volume, Low LTV (Challenge)'

scatter_required = ['state_code', 'total_customers', 'avg_ltv']

if len(df_geo) > 0 and all(col in df_geo.columns for col in scatter_required):
    # Calculate correlation
    correlation = df_geo['total_customers'].corr(df_geo['avg_ltv'])
    print(f"Pearson correlation between volume (customers) and LTV: {correlation:.3f}")

    # Identify outliers: more than two standard deviations from the mean on
    # either axis
    z_score_ltv = (df_geo['avg_ltv'] - df_geo['avg_ltv'].mean()) / df_geo['avg_ltv'].std()
    z_score_customers = (df_geo['total_customers'] - df_geo['total_customers'].mean()) / df_geo['total_customers'].std()

    outliers = df_geo[(abs(z_score_ltv) > 2) | (abs(z_score_customers) > 2)]

    if len(outliers) > 0:
        print(f"Outlier states detected: {', '.join(outliers['state_code'].tolist())}")

    # Calculate quadrant thresholds
    median_customers = df_geo['total_customers'].median()
    median_ltv = df_geo['avg_ltv'].median()

    # Classify states by quadrant with boolean masks instead of a row loop.
    # The three explicit quadrants are mutually exclusive; anything left over
    # (including rows with missing values) keeps the "Challenge" default,
    # which matches the final `else` of a row-by-row comparison.
    high_volume = df_geo['total_customers'] >= median_customers
    low_volume = df_geo['total_customers'] < median_customers
    high_ltv = df_geo['avg_ltv'] >= median_ltv
    low_ltv = df_geo['avg_ltv'] < median_ltv

    df_geo['quadrant'] = QUADRANT_CHALLENGE
    df_geo.loc[high_volume & high_ltv, 'quadrant'] = QUADRANT_IDEAL
    df_geo.loc[high_volume & low_ltv, 'quadrant'] = QUADRANT_PROBLEM
    df_geo.loc[low_volume & high_ltv, 'quadrant'] = QUADRANT_OPPORTUNITY

    # Count states in each quadrant
    quadrant_counts = df_geo['quadrant'].value_counts()
    print("\nQuadrant Distribution:")
    for quadrant, count in quadrant_counts.items():
        print(f"  {quadrant}: {count} states")

    # Calculate potential market size
    has_revenue = 'total_revenue' in df_geo.columns
    if has_revenue:
        opportunity_quadrant = df_geo[df_geo['quadrant'] == QUADRANT_OPPORTUNITY]
        if len(opportunity_quadrant) > 0:
            opportunity_revenue = opportunity_quadrant['total_revenue'].sum()
            total_revenue = df_geo['total_revenue'].sum()
            opportunity_pct = (opportunity_revenue / total_revenue * 100) if total_revenue != 0 else 0
            print(f"\nOpportunity Quadrant Revenue: R$ {opportunity_revenue:.0f} ({opportunity_pct:.1f}% of total)")

    # Only pass plotly the optional columns that were actually loaded.
    scatter_hover = [col for col in ['state_name', 'total_revenue', 'avg_order_value']
                     if col in df_geo.columns]

    # Visualization 3: Scatter Plot
    fig3 = px.scatter(
        df_geo,
        x='total_customers',
        y='avg_ltv',
        size='total_revenue' if has_revenue else None,
        color='region' if 'region' in df_geo.columns else None,
        hover_name='state_code',
        hover_data=scatter_hover or None,
        title='Customer Volume vs LTV by State',
        labels={
            'total_customers': 'Total Customers',
            'avg_ltv': 'Average LTV (R$)',
            'region': 'Region',
            'total_revenue': 'Total Revenue'
        },
        size_max=50
    )

    # Axis extremes, reused by the quadrant lines and annotations
    customers_min = df_geo['total_customers'].min()
    customers_max = df_geo['total_customers'].max()
    ltv_min = df_geo['avg_ltv'].min()
    ltv_max = df_geo['avg_ltv'].max()

    # Add quadrant lines (vertical at median volume, horizontal at median LTV)
    fig3.add_shape(
        type="line",
        x0=median_customers,
        y0=ltv_min,
        x1=median_customers,
        y1=ltv_max,
        line=dict(color="gray", width=1, dash="dash")
    )

    fig3.add_shape(
        type="line",
        x0=customers_min,
        y0=median_ltv,
        x1=customers_max,
        y1=median_ltv,
        line=dict(color="gray", width=1, dash="dash")
    )

    # Add quadrant annotations
    quadrant_annotations = (
        (customers_max * 0.75, ltv_max * 0.75, "High Volume<br>High LTV"),
        (customers_max * 0.75, ltv_min * 1.25, "High Volume<br>Low LTV"),
        (customers_min * 1.25, ltv_max * 0.75, "Low Volume<br>High LTV"),
        (customers_min * 1.25, ltv_min * 1.25, "Low Volume<br>Low LTV"),
    )
    for label_x, label_y, label_text in quadrant_annotations:
        fig3.add_annotation(
            x=label_x,
            y=label_y,
            text=label_text,
            showarrow=False,
            font=dict(size=10)
        )

    fig3.update_layout(
        height=600,
        xaxis_title="Total Customers",
        yaxis_title="Average LTV (R$)"
    )

    fig3.show()

    # Detailed quadrant analysis
    print("\n=== QUADRANT ANALYSIS ===")

    for quadrant in (QUADRANT_IDEAL, QUADRANT_PROBLEM, QUADRANT_OPPORTUNITY, QUADRANT_CHALLENGE):
        quadrant_states = df_geo[df_geo['quadrant'] == quadrant]
        if len(quadrant_states) > 0:
            states_list = ', '.join(quadrant_states['state_code'].tolist())
            avg_ltv_val = quadrant_states['avg_ltv'].mean()
            avg_customers = quadrant_states['total_customers'].mean()

            print(f"\n{quadrant}:")
            print(f"  States: {states_list}")
            print(f"  Avg LTV: R$ {avg_ltv_val:.2f}")
            print(f"  Avg Customers: {avg_customers:.0f}")

    # Insights
    print("\n=== INSIGHTS ===")

    # A zero or undefined (NaN) correlation is neither positive nor negative.
    if correlation > 0:
        relationship = 'positive'
    elif correlation < 0:
        relationship = 'negative'
    else:
        relationship = 'no linear'
    print(f"1. Correlation between volume and LTV is {correlation:.3f} ({relationship} relationship)")

    ideal_states = df_geo[df_geo['quadrant'] == QUADRANT_IDEAL]
    if len(ideal_states) > 0:
        print(f"2. Ideal states (high volume, high LTV): {', '.join(ideal_states['state_code'].tolist())}")

    opportunity_states = df_geo[df_geo['quadrant'] == QUADRANT_OPPORTUNITY]
    if len(opportunity_states) > 0:
        print(f"3. Top expansion opportunities: {', '.join(opportunity_states['state_code'].tolist())}")

else:
    print("Insufficient data for scatter plot")

## 6. STATE RANKING (TOP 10 + BOTTOM 10)

In [None]:
print("=== STATE RANKING ANALYSIS ===\n")

ranking_required = ['state_code', 'avg_ltv', 'total_revenue']

if len(df_geo) > 0 and all(col in df_geo.columns for col in ranking_required):
    # Sort by LTV for ranking; the index becomes the 1-based rank
    df_sorted_ltv = df_geo.sort_values('avg_ltv', ascending=False).reset_index(drop=True)
    df_sorted_ltv.index = df_sorted_ltv.index + 1

    # Calculate revenue contribution
    total_revenue = df_geo['total_revenue'].sum()
    df_geo['revenue_contribution_pct'] = (df_geo['total_revenue'] / total_revenue * 100) if total_revenue != 0 else 0

    # Sort by revenue for concentration analysis
    df_sorted_revenue = df_geo.sort_values('total_revenue', ascending=False).reset_index(drop=True)

    # Calculate cumulative revenue percentage
    df_sorted_revenue['cumulative_revenue'] = df_sorted_revenue['total_revenue'].cumsum()
    df_sorted_revenue['cumulative_pct'] = (df_sorted_revenue['cumulative_revenue'] / total_revenue * 100) if total_revenue != 0 else 0

    # Ranking tables show whichever of these columns were loaded.
    table_columns = [col for col in ['state_code', 'state_name', 'avg_ltv', 'total_customers',
                                     'total_revenue', 'avg_order_value']
                     if col in df_sorted_ltv.columns]

    # Table 1: Top 10 States by LTV
    # .copy() so the added column lands on an independent frame rather than
    # on a slice of df_sorted_ltv.
    print("TOP 10 STATES BY LTV:")
    print("=" * 80)
    top_10_ltv = df_sorted_ltv.head(10)[table_columns].copy()
    top_10_ltv['revenue_contribution_pct'] = (top_10_ltv['total_revenue'] / total_revenue * 100) if total_revenue != 0 else 0
    print(top_10_ltv.round(2))

    # Table 2: Bottom 10 States by LTV
    print("\n\nBOTTOM 10 STATES BY LTV:")
    print("=" * 80)
    bottom_10_ltv = df_sorted_ltv.tail(10)[table_columns].copy()
    bottom_10_ltv['revenue_contribution_pct'] = (bottom_10_ltv['total_revenue'] / total_revenue * 100) if total_revenue != 0 else 0
    print(bottom_10_ltv.round(2))

    # Concentration metrics
    print("\n=== CONCENTRATION ANALYSIS ===")

    # Top 5 states revenue percentage
    top_5_revenue = df_sorted_revenue.head(5)['total_revenue'].sum()
    top_5_pct = (top_5_revenue / total_revenue * 100) if total_revenue != 0 else 0
    print(f"Top 5 states account for {top_5_pct:.1f}% of total revenue")

    # Top 10 states revenue percentage
    top_10_revenue = df_sorted_revenue.head(10)['total_revenue'].sum()
    top_10_pct = (top_10_revenue / total_revenue * 100) if total_revenue != 0 else 0
    print(f"Top 10 states account for {top_10_pct:.1f}% of total revenue")

    # Gini index from the Lorenz curve.
    # Missing revenues are dropped, matching .sum() above which skips them.
    sorted_revenue = np.sort(df_geo['total_revenue'].to_numpy(dtype=float, na_value=np.nan))
    sorted_revenue = sorted_revenue[~np.isnan(sorted_revenue)]
    n = len(sorted_revenue)
    cumulative_revenue = np.cumsum(sorted_revenue)

    if total_revenue > 0 and n > 0:
        # Lorenz curve values, starting at the origin (0 states -> 0 revenue).
        # Without that first point the area under the curve is underestimated
        # and perfectly equal revenues would not give a Gini of 0.
        lorenz = np.concatenate(([0.0], cumulative_revenue / float(total_revenue)))
        # Trapezoid rule over n equal-width steps of 1/n, written out by hand
        # so it does not depend on np.trapz (deprecated in NumPy 2.0).
        area_under_lorenz = ((lorenz[1:] + lorenz[:-1]) / 2).sum() / n
        # Gini = 1 - 2 * area under the Lorenz curve
        gini = 1 - 2 * area_under_lorenz
        print(f"Gini Index (revenue concentration): {gini:.3f}")

    # Number of states to reach 80% revenue (Pareto): every state whose
    # running share is still below 80%, plus the one that crosses the line.
    if total_revenue != 0:
        states_below_80 = int((df_sorted_revenue['cumulative_pct'] < 80).sum())
        states_to_80 = min(states_below_80 + 1, len(df_sorted_revenue))
    else:
        states_to_80 = 0
    print(f"Number of states to reach 80% of revenue: {states_to_80}")

    # Visualization 4: Pareto Chart
    fig4 = make_subplots(specs=[[{"secondary_y": True}]])

    # Bar chart for revenue
    fig4.add_trace(
        go.Bar(
            x=df_sorted_revenue['state_code'],
            y=df_sorted_revenue['total_revenue'],
            name="Revenue",
            marker_color='blue'
        ),
        secondary_y=False
    )

    # Line chart for cumulative percentage
    fig4.add_trace(
        go.Scatter(
            x=df_sorted_revenue['state_code'],
            y=df_sorted_revenue['cumulative_pct'],
            name="Cumulative %",
            mode='lines+markers',
            line=dict(color='red', width=2)
        ),
        secondary_y=True
    )

    # Add 80% line
    fig4.add_hline(
        y=80,
        line_dash="dash",
        line_color="green",
        annotation_text="80% Threshold",
        annotation_position="bottom right",
        secondary_y=True
    )

    fig4.update_layout(
        title="Pareto Chart: Revenue Concentration by State",
        xaxis_title="State (sorted by revenue)",
        height=600,
        showlegend=True
    )

    fig4.update_yaxes(
        title_text="Revenue (R$)",
        secondary_y=False
    )

    fig4.update_yaxes(
        title_text="Cumulative Percentage (%)",
        range=[0, 100],
        secondary_y=True
    )

    fig4.show()

    # Insights
    print("\n=== INSIGHTS ===")
    print(f"1. Market concentration: Top {states_to_80} states generate 80% of total revenue")
    print(f"2. Dependence risk: Top 5 states account for {top_5_pct:.1f}% of revenue")

    # Identify underutilized states (revenue below half the per-state average)
    avg_revenue_per_state = total_revenue / len(df_geo)
    underutilized = df_geo[df_geo['total_revenue'] < avg_revenue_per_state * 0.5]

    if len(underutilized) > 0:
        print(f"3. Underutilized states (revenue < 50% of average): {', '.join(underutilized['state_code'].tolist())}")

    # Diversification opportunity: revenue share of the lower half of states
    bottom_50_states = len(df_geo) // 2
    bottom_50_revenue = df_sorted_revenue.tail(bottom_50_states)['total_revenue'].sum()
    bottom_50_pct = (bottom_50_revenue / total_revenue * 100) if total_revenue != 0 else 0

    print(f"4. Bottom {bottom_50_states} states contribute only {bottom_50_pct:.1f}% of revenue - high diversification opportunity")

else:
    print("Insufficient data for ranking analysis")

## 7. ANALYSIS BY BRAZILIAN REGION

In [None]:
print("=== REGIONAL ANALYSIS ===\n")

# Every column the aggregation and the insights below read.
regional_required = ['region', 'state_code', 'total_customers', 'total_revenue', 'avg_ltv',
                     'avg_order_value', 'avg_delivery_days', 'avg_review_score']

if len(df_geo) > 0 and all(col in df_geo.columns for col in regional_required):
    # Aggregate by region: volumes are summed, per-state averages are
    # averaged again (unweighted), and state_code is counted.
    region_analysis = df_geo.groupby('region').agg({
        'total_customers': 'sum',
        'total_revenue': 'sum',
        'avg_ltv': 'mean',
        'avg_order_value': 'mean',
        'avg_delivery_days': 'mean',
        'avg_review_score': 'mean',
        'state_code': 'count'
    }).reset_index()

    region_analysis = region_analysis.rename(columns={'state_code': 'state_count'})

    # Calculate additional metrics
    total_customers_all = region_analysis['total_customers'].sum()
    total_revenue_all = region_analysis['total_revenue'].sum()

    region_analysis['customer_share_pct'] = (region_analysis['total_customers'] / total_customers_all * 100) if total_customers_all != 0 else 0
    region_analysis['revenue_share_pct'] = (region_analysis['total_revenue'] / total_revenue_all * 100) if total_revenue_all != 0 else 0

    # Revenue per customer, element-wise. A Series cannot be used as the
    # condition of `x if cond else y` (its truth value is ambiguous), so
    # regions with zero customers are masked out before dividing and then
    # reported as 0.
    region_customers = region_analysis['total_customers']
    region_analysis['revenue_per_customer'] = (
        region_analysis['total_revenue'] / region_customers.where(region_customers != 0)
    ).fillna(0)

    print("REGIONAL PERFORMANCE METRICS:")
    print("=" * 100)
    print(region_analysis.round(2))

    # idxmax/idxmin fail on an empty frame, so both the leaders and the
    # insights sections are skipped when no region was aggregated.
    has_regions = len(region_analysis) > 0

    # Find best performing regions
    print("\n=== REGIONAL LEADERS ===")

    if has_regions:
        # Region with highest LTV
        region_highest_ltv = region_analysis.loc[region_analysis['avg_ltv'].idxmax(), 'region']
        highest_ltv_value = region_analysis['avg_ltv'].max()
        print(f"Highest LTV region: {region_highest_ltv} (R$ {highest_ltv_value:.2f})")

        # Region with most customers
        region_most_customers = region_analysis.loc[region_analysis['total_customers'].idxmax(), 'region']
        most_customers_value = region_analysis['total_customers'].max()
        print(f"Highest volume region: {region_most_customers} ({most_customers_value:,} customers)")

        # Region with highest AOV
        region_highest_aov = region_analysis.loc[region_analysis['avg_order_value'].idxmax(), 'region']
        highest_aov_value = region_analysis['avg_order_value'].max()
        print(f"Highest AOV region: {region_highest_aov} (R$ {highest_aov_value:.2f})")

        # Region with best delivery (fewest average days)
        region_best_delivery = region_analysis.loc[region_analysis['avg_delivery_days'].idxmin(), 'region']
        best_delivery_value = region_analysis['avg_delivery_days'].min()
        print(f"Fastest delivery region: {region_best_delivery} ({best_delivery_value:.1f} days)")

        # Region with best reviews
        region_best_reviews = region_analysis.loc[region_analysis['avg_review_score'].idxmax(), 'region']
        best_reviews_value = region_analysis['avg_review_score'].max()
        print(f"Highest review score region: {region_best_reviews} ({best_reviews_value:.2f})")

    # Visualization 5: Grouped Bar Chart (2x2 grid, one metric per panel)
    fig5 = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Total Customers', 'Average LTV', 'Average Order Value', 'Revenue per Customer'),
        vertical_spacing=0.15,
        horizontal_spacing=0.15
    )

    # (metric column, trace name, bar colour, y-axis title, row, col)
    regional_panels = (
        ('total_customers', 'Customers', 'skyblue', "Count", 1, 1),
        ('avg_ltv', 'LTV', 'lightgreen', "R$", 1, 2),
        ('avg_order_value', 'AOV', 'salmon', "R$", 2, 1),
        ('revenue_per_customer', 'Revenue/Customer', 'gold', "R$", 2, 2),
    )
    for metric_col, trace_name, bar_color, _axis_title, panel_row, panel_col in regional_panels:
        fig5.add_trace(
            go.Bar(
                x=region_analysis['region'],
                y=region_analysis[metric_col],
                name=trace_name,
                marker_color=bar_color
            ),
            row=panel_row, col=panel_col
        )

    fig5.update_layout(
        title_text='Regional Performance Metrics',
        height=700,
        showlegend=False
    )

    # Update axes labels
    for _metric_col, _trace_name, _bar_color, axis_title, panel_row, panel_col in regional_panels:
        fig5.update_yaxes(title_text=axis_title, row=panel_row, col=panel_col)

    fig5.show()

    # Insights
    print("\n=== INSIGHTS ===")

    if has_regions:
        # Region with the highest total revenue
        top_revenue_idx = region_analysis['total_revenue'].idxmax()
        region_highest_revenue = region_analysis.loc[top_revenue_idx, 'region']
        highest_revenue_value = region_analysis['total_revenue'].max()
        highest_revenue_pct = region_analysis.loc[top_revenue_idx, 'revenue_share_pct']
        print(f"1. Most profitable region: {region_highest_revenue} (R$ {highest_revenue_value:,.0f}, {highest_revenue_pct:.1f}% of total)")

        # Growth potential regions (smallest share of customers)
        region_lowest_penetration = region_analysis.loc[region_analysis['customer_share_pct'].idxmin(), 'region']
        lowest_penetration_pct = region_analysis['customer_share_pct'].min()
        print(f"2. Lowest penetration region: {region_lowest_penetration} ({lowest_penetration_pct:.1f}% of customers)")

        # Operational challenges (slowest average delivery)
        region_slowest_delivery = region_analysis.loc[region_analysis['avg_delivery_days'].idxmax(), 'region']
        slowest_delivery_value = region_analysis['avg_delivery_days'].max()
        print(f"3. Operational challenge: {region_slowest_delivery} has slowest delivery ({slowest_delivery_value:.1f} days)")

        # Lowest average review score
        region_lowest_reviews = region_analysis.loc[region_analysis['avg_review_score'].idxmin(), 'region']
        lowest_reviews_value = region_analysis['avg_review_score'].min()
        print(f"4. Customer satisfaction challenge: {region_lowest_reviews} has lowest review score ({lowest_reviews_value:.2f})")

        # Investment prioritization: regions listed from highest to lowest LTV
        print("5. Investment priority (based on LTV growth potential):")
        region_analysis_sorted = region_analysis.sort_values('avg_ltv', ascending=False)
        for region_row in region_analysis_sorted.itertuples(index=False):
            print(f"   {region_row.region}: R$ {region_row.avg_ltv:.2f} LTV, {region_row.customer_share_pct:.1f}% market share")

else:
    print("Insufficient data for regional analysis")

## 8. CORRELATIONS: LTV VS OPERATIONAL METRICS

In [None]:
print("=== LTV CORRELATION ANALYSIS ===\n")

if len(df_geo) > 0:
    # Select columns for correlation analysis
    correlation_columns = ['avg_ltv', 'total_customers', 'avg_order_value', 
                          'avg_delivery_days', 'avg_review_score', 'total_orders']
    
    # Check which columns exist in the dataframe
    available_columns = [col for col in correlation_columns if col in df_geo.columns]
    
    if len(available_columns) >= 3:  # Need at least 3 columns for meaningful correlation
        # Create correlation matrix
        corr_matrix = df_geo[available_columns].corr()
        
        print("CORRELATION MATRIX:")
        print("=" * 60)
        print(corr_matrix.round(3))
        
        # Visualization 6: Heatmap
        plt.figure(figsize=(10, 8))
        sns.heatmap(corr_matrix, 
                   annot=True, 
                   cmap='coolwarm', 
                   center=0,
                   square=True,
                   linewidths=1,
                   cbar_kws={"shrink": 0.8})
        plt.title('Correlation Matrix: LTV vs Operational Metrics')
        plt.tight_layout()
        plt.show()
        
        # Specific correlation analyses
        print("\n=== SPECIFIC CORRELATION ANALYSES ===")
        
        # A) LTV vs Delivery Time
        if 'avg_ltv' in df_geo.columns and 'avg_delivery_days' in df_geo.columns:
            corr_ltv_delivery = df_geo['avg_ltv'].corr(df_geo['avg_delivery_days'])
            print(f"\nA) LTV vs Delivery Time Correlation: {corr_ltv_delivery:.3f}")
            
            fig6a = px.scatter(
                df_geo,
                x='avg_delivery_days',
                y='avg_ltv',
                trendline='ols',
                hover_name='state_code',
                hover_data=['state_name', 'total_customers'],
                title='LTV vs Delivery Time',
                labels={'avg_delivery_days': 'Average Delivery Days', 'avg_ltv': 'Average LTV (R$)'}
            )
            
            fig6a.update_layout(height=500)
            fig6a.show()
            
            # Get regression results
            from sklearn.linear_model import LinearRegression
            X = df_geo[['avg_delivery_days']].dropna()
            y = df_geo.loc[X.index, 'avg_ltv']
            
            if len(X) > 0 and len(y) > 0:
                model = LinearRegression()
                model.fit(X, y)
                slope = model.coef_[0]
                print(f"   Impact: Each additional delivery day changes LTV by R$ {slope:.2f}")
        
        # B) LTV vs Review Score
        if 'avg_ltv' in df_geo.columns and 'avg_review_score' in df_geo.columns:
            corr_ltv_reviews = df_geo['avg_ltv'].corr(df_geo['avg_review_score'])
            print(f"\nB) LTV vs Review Score Correlation: {corr_ltv_reviews:.3f}")
            
            fig6b = px.scatter(
                df_geo,
                x='avg_review_score',
                y='avg_ltv',
                trendline='ols',
                hover_name='state_code',
                hover_data=['state_name', 'total_customers'],
                title='LTV vs Review Score',
                labels={'avg_review_score': 'Average Review Score', 'avg_ltv': 'Average LTV (R$)'}
            )
            
            fig6b.update_layout(height=500)
            fig6b.show()
            
            # Get regression results
            X = df_geo[['avg_review_score']].dropna()
            y = df_geo.loc[X.index, 'avg_ltv']
            
            if len(X) > 0 and len(y) > 0:
                model = LinearRegression()
                model.fit(X, y)
                slope = model.coef_[0]
                print(f"   Impact: +1 point review score changes LTV by R$ {slope:.2f}")
        
        # C) LTV vs Customer Density (if available)
        if 'avg_ltv' in df_geo.columns and 'customer_density' in df_geo.columns:
            corr_ltv_density = df_geo['avg_ltv'].corr(df_geo['customer_density'])
            print(f"\nC) LTV vs Customer Density Correlation: {corr_ltv_density:.3f}")
            
            fig6c = px.scatter(
                df_geo,
                x='customer_density',
                y='avg_ltv',
                trendline='ols',
                hover_name='state_code',
                hover_data=['state_name', 'total_customers'],
                title='LTV vs Customer Density',
                labels={'customer_density': 'Customer Density (per 100k hab)', 'avg_ltv': 'Average LTV (R$)'}
            )
            
            fig6c.update_layout(height=500)
            fig6c.show()
        
        # Insights
        print("\n=== INSIGHTS ===")
        
        # Find strongest correlation
        if len(corr_matrix) > 0:
            # Exclude self-correlations (diagonal) and flatten the matrix
            corr_values = corr_matrix.unstack()
            corr_values = corr_values[corr_values != 1]  # Remove self-correlations
            
            if len(corr_values) > 0:
                strongest_corr = corr_values.abs().idxmax()
                strongest_value = corr_matrix.loc[strongest_corr[0], strongest_corr[1]]
                
                print(f"1. Strongest correlation: {strongest_corr[0]} vs {strongest_corr[1]} (r={strongest_value:.3f})")
        
        # Find weakest correlation (excluding LTV with itself)
        ltv_correlations = corr_matrix['avg_ltv'].drop('avg_ltv', errors='ignore')
        if len(ltv_correlations) > 0:
            weakest_corr_var = ltv_correlations.abs().idxmin()
            weakest_corr_value = ltv_correlations[weakest_corr_var]
            print(f"2. Weakest LTV correlation: LTV vs {weakest_corr_var} (r={weakest_corr_value:.3f})")
        
        # Operational implications
        if 'avg_delivery_days' in df_geo.columns and 'avg_ltv' in df_geo.columns:
            print(f"3. Delivery optimization ROI: Improving delivery speed could increase LTV")
        
        if 'avg_review_score' in df_geo.columns and 'avg_ltv' in df_geo.columns:
            print(f"4. Review impact: Higher customer satisfaction correlates with higher LTV")
        
        # Causation vs correlation discussion
        print("5. Note: Correlation does not imply causation. Further testing needed to confirm causal relationships.")
        
        # Recommendations based on correlations
        print("6. Recommendations:")
        if 'avg_delivery_days' in available_columns and abs(corr_ltv_delivery) > 0.3:
            print("   - Prioritize delivery optimization in high-LTV states")
        
        if 'avg_review_score' in available_columns and corr_ltv_reviews > 0.3:
            print("   - Implement customer satisfaction programs in low-review states")
        
        # Variables that don't correlate
        low_corr_vars = []
        if 'avg_ltv' in df_geo.columns:
            for col in available_columns:
                if col != 'avg_ltv':
                    corr_value = df_geo['avg_ltv'].corr(df_geo[col])
                    if abs(corr_value) < 0.2:
                        low_corr_vars.append(col)
        
        if low_corr_vars:
            print(f"7. Low correlation variables: {', '.join(low_corr_vars)} - may not be good LTV predictors")
            
    else:
        print("Insufficient columns for correlation analysis")
else:
    print("Insufficient data for correlation analysis")

## 9. STATE SEGMENTATION (CLUSTERING)

In [None]:
print("=== STATE CLUSTERING ANALYSIS ===\n")

# Segments states with K-Means on standardized performance metrics, plots an
# elbow curve and a 2-D PCA projection of the clusters, then prints a profile
# and a suggested strategy for each cluster.
# NOTE(review): the summary below reads 'state_code', 'avg_ltv' and
# 'total_customers' unconditionally - presumably guaranteed by the mart schema.
if len(df_geo) > 0:
    # Candidate features; only the ones actually present in df_geo are used
    features = ['avg_ltv', 'total_customers', 'avg_order_value',
                'avg_delivery_days', 'avg_review_score']
    available_features = [f for f in features if f in df_geo.columns]

    if len(available_features) >= 3:
        # Rows with any missing feature cannot be clustered, so drop them
        clustering_data = df_geo[available_features].dropna().copy()

        if len(clustering_data) > 5:  # Need enough data for clustering
            # Standardize so that no feature dominates the distance metric
            from sklearn.preprocessing import StandardScaler
            scaler = StandardScaler()
            X_scaled = scaler.fit_transform(clustering_data)

            # Visualization 7: Elbow Method
            from sklearn.cluster import KMeans

            # KMeans requires n_clusters <= n_samples, so cap the scanned
            # range by the number of complete rows (the guard above only
            # guarantees 6 of them).
            max_k = min(8, len(clustering_data))
            K_range = range(2, max_k + 1)

            inertias = []
            for k in K_range:
                kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
                kmeans.fit(X_scaled)
                inertias.append(kmeans.inertia_)

            plt.figure(figsize=(10, 6))
            plt.plot(K_range, inertias, 'bx-')
            plt.xlabel('Number of clusters (k)')
            plt.ylabel('Inertia')
            plt.title('Elbow Method for Optimal k')
            plt.grid(True)
            plt.show()

            # Choose k based on elbow (usually k=3 or 4)
            optimal_k = 4  # You can adjust this based on the elbow plot
            print(f"Selected number of clusters: {optimal_k}")

            # Apply K-Means
            kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
            clusters = kmeans.fit_predict(X_scaled)

            # Attach cluster labels to the rows that survived dropna()
            clustering_data['cluster'] = clusters
            df_geo_clustered = df_geo.loc[clustering_data.index].copy()
            df_geo_clustered['cluster'] = clusters

            # Visualization 8: Scatter 2D (PCA)
            from sklearn.decomposition import PCA

            pca = PCA(n_components=2)
            X_pca = pca.fit_transform(X_scaled)

            # Plotly Express only accepts column-name strings (hover_data)
            # when a data_frame is supplied, so put the PCA coordinates and
            # the cluster label into a frame instead of passing raw arrays.
            pca_plot_df = df_geo_clustered.copy()
            pca_plot_df['pc1'] = X_pca[:, 0]
            pca_plot_df['pc2'] = X_pca[:, 1]
            pca_plot_df['cluster_label'] = clusters.astype(str)
            hover_columns = [
                c for c in ['state_name', 'avg_ltv', 'total_customers']
                if c in pca_plot_df.columns
            ]

            fig8 = px.scatter(
                pca_plot_df,
                x='pc1',
                y='pc2',
                color='cluster_label',
                hover_name='state_code',
                hover_data=hover_columns,
                title=f'State Clusters (K={optimal_k}) - PCA Visualization',
                labels={
                    'pc1': 'Principal Component 1',
                    'pc2': 'Principal Component 2',
                    'cluster_label': 'Cluster'
                },
                color_discrete_sequence=px.colors.qualitative.Set1
            )

            fig8.update_layout(height=600)
            fig8.show()

            # Analyze clusters
            print("\n=== CLUSTER ANALYSIS ===")

            # State-level medians are the reference for "high"/"low" below;
            # they do not change per cluster, so compute them once.
            median_customers = df_geo['total_customers'].median()
            median_ltv = df_geo['avg_ltv'].median()

            cluster_summary = []
            for cluster_num in range(optimal_k):
                cluster_states = df_geo_clustered[df_geo_clustered['cluster'] == cluster_num]

                if len(cluster_states) > 0:
                    cluster_info = {
                        'cluster': cluster_num,
                        'size': len(cluster_states),
                        'states': cluster_states['state_code'].tolist(),
                        'avg_ltv': cluster_states['avg_ltv'].mean(),
                        'avg_customers': cluster_states['total_customers'].mean(),
                        'avg_aov': cluster_states['avg_order_value'].mean() if 'avg_order_value' in cluster_states.columns else 0,
                        'avg_delivery': cluster_states['avg_delivery_days'].mean() if 'avg_delivery_days' in cluster_states.columns else 0
                    }
                    cluster_summary.append(cluster_info)

            # Sort clusters by LTV (highest first)
            cluster_summary_sorted = sorted(cluster_summary, key=lambda x: x['avg_ltv'], reverse=True)

            for i, cluster in enumerate(cluster_summary_sorted):
                print(f"\nCluster {cluster['cluster']} (Rank {i+1} by LTV):")
                print(f"  Size: {cluster['size']} states")
                print(f"  States: {', '.join(cluster['states'])}")
                print(f"  Avg LTV: R$ {cluster['avg_ltv']:.2f}")
                print(f"  Avg Customers: {cluster['avg_customers']:.0f}")
                print(f"  Avg AOV: R$ {cluster['avg_aov']:.2f}")
                print(f"  Avg Delivery: {cluster['avg_delivery']:.1f} days")

                # Assign descriptive name: the top-LTV cluster first, then
                # volume, then LTV relative to the state-level medians
                if i == 0:
                    cluster_name = "High Value States"
                elif cluster['avg_customers'] > median_customers:
                    cluster_name = "High Volume States"
                elif cluster['avg_ltv'] < median_ltv:
                    cluster_name = "Optimization Needed"
                else:
                    cluster_name = "Growth Potential"

                print(f"  Profile: {cluster_name}")

            # Insights
            print("\n=== INSIGHTS ===")

            # Identify ideal cluster (highest average LTV)
            ideal_cluster = cluster_summary_sorted[0]
            print(f"1. Ideal cluster identified: Cluster {ideal_cluster['cluster']} ({ideal_cluster['size']} states)")
            print(f"   Characteristics: High LTV (R$ {ideal_cluster['avg_ltv']:.2f}), {ideal_cluster['states']}")

            # Clusters with most potential: below-median volume, above-median LTV
            potential_clusters = [
                cluster for cluster in cluster_summary_sorted
                if cluster['avg_customers'] < median_customers
                and cluster['avg_ltv'] > median_ltv
            ]

            if potential_clusters:
                print(f"2. Cluster with highest growth potential: Cluster {potential_clusters[0]['cluster']}")
                print("   Reason: Low volume but high LTV - perfect for expansion")

            # Clusters requiring attention: above-median volume, below-median LTV
            attention_clusters = [
                cluster for cluster in cluster_summary_sorted
                if cluster['avg_customers'] > median_customers
                and cluster['avg_ltv'] < median_ltv
            ]

            if attention_clusters:
                print(f"3. Cluster requiring attention: Cluster {attention_clusters[0]['cluster']}")
                print("   Reason: High volume but low LTV - optimization needed")

            # Differentiated strategies
            print("4. Differentiated strategies by cluster:")
            for cluster in cluster_summary_sorted:
                if cluster == ideal_cluster:
                    print(f"   Cluster {cluster['cluster']}: Maintain leadership, premium loyalty programs")
                elif cluster in potential_clusters:
                    print(f"   Cluster {cluster['cluster']}: Aggressive marketing, market expansion")
                elif cluster in attention_clusters:
                    print(f"   Cluster {cluster['cluster']}: Customer retention, upsell/cross-sell")
                else:
                    print(f"   Cluster {cluster['cluster']}: Test new approaches, improve operations")

        else:
            print("Insufficient data points for clustering")
    else:
        print("Insufficient features available for clustering")
else:
    print("Insufficient data for clustering analysis")

## 10. MARKET OPPORTUNITY ANALYSIS

In [None]:
print("=== MARKET OPPORTUNITY ANALYSIS ===\n")

# Estimates, per state, the revenue that would be gained if the state's average
# LTV matched the best-performing state in its region, ranks states by a blended
# opportunity score, and prints a top-10 table plus summary insights.
# Adds the columns region_max_ltv, ltv_gap, ltv_gap_pct, revenue_opportunity,
# volume_score, gap_score and opportunity_score to df_geo.

# Every column this cell reads; checked up front so a missing one produces the
# "insufficient data" message instead of a KeyError half-way through.
_opportunity_required_cols = {
    'avg_ltv', 'region', 'total_customers', 'total_revenue',
    'state_code', 'state_name'
}

if len(df_geo) > 0 and _opportunity_required_cols.issubset(df_geo.columns):
    # LTV gap: distance to the best average LTV within the same region
    df_geo['region_max_ltv'] = df_geo.groupby('region')['avg_ltv'].transform('max')
    df_geo['ltv_gap'] = df_geo['region_max_ltv'] - df_geo['avg_ltv']

    # LTV gap as a percentage of current LTV. The zero check must be
    # element-wise: a Series cannot be used in a Python `if` expression.
    # States with avg_ltv == 0 get a gap percentage of 0.
    df_geo['ltv_gap_pct'] = (df_geo['ltv_gap'] / df_geo['avg_ltv'] * 100).where(
        df_geo['avg_ltv'] != 0, 0
    )

    # Revenue opportunity: close the gap for every existing customer
    df_geo['revenue_opportunity'] = df_geo['total_customers'] * df_geo['ltv_gap']

    # Min-max normalized scores; stay at 0 when all values are identical
    # (a zero range would otherwise divide by zero)
    df_geo['volume_score'] = 0.0
    df_geo['gap_score'] = 0.0

    customers_min = df_geo['total_customers'].min()
    customers_max = df_geo['total_customers'].max()
    if customers_max > customers_min:
        df_geo['volume_score'] = (df_geo['total_customers'] - customers_min) / \
                                 (customers_max - customers_min)

    gap_pct_min = df_geo['ltv_gap_pct'].min()
    gap_pct_max = df_geo['ltv_gap_pct'].max()
    if gap_pct_max > gap_pct_min:
        df_geo['gap_score'] = (df_geo['ltv_gap_pct'] - gap_pct_min) / \
                              (gap_pct_max - gap_pct_min)

    # Calculate opportunity score (60% gap, 40% volume)
    df_geo['opportunity_score'] = 0.6 * df_geo['gap_score'] + 0.4 * df_geo['volume_score']

    # Sort by opportunity score
    df_opportunity = df_geo.sort_values('opportunity_score', ascending=False).reset_index(drop=True)

    # Visualization 9: Top 10 Opportunities
    top_10_opportunities = df_opportunity.head(10)

    fig9 = px.bar(
        top_10_opportunities,
        y='state_code',
        x='revenue_opportunity',
        color='ltv_gap_pct',
        orientation='h',
        title='Top 10 Revenue Opportunities by State',
        labels={
            'revenue_opportunity': 'Revenue Opportunity (R$)',
            'state_code': 'State',
            'ltv_gap_pct': 'LTV Gap %'
        },
        color_continuous_scale='Viridis',
        text='revenue_opportunity'
    )

    fig9.update_traces(
        texttemplate='R$ %{text:,.0f}',
        textposition='outside'
    )

    fig9.update_layout(
        height=600,
        yaxis={'categoryorder': 'total ascending'},
        coloraxis_colorbar=dict(title="LTV Gap %"),
        xaxis_title="Revenue Opportunity (R$)",
        yaxis_title="State"
    )

    fig9.show()

    # Create opportunity table
    print("TOP 10 STATES FOR EXPANSION:")
    print("=" * 120)

    opportunity_table = df_opportunity.head(10)[[
        'state_code', 'state_name', 'region',
        'opportunity_score', 'ltv_gap_pct', 'revenue_opportunity',
        'avg_ltv', 'region_max_ltv', 'total_customers'
    ]].copy()

    opportunity_table['current_ltv'] = opportunity_table['avg_ltv']
    opportunity_table['target_ltv'] = opportunity_table['region_max_ltv']
    opportunity_table['roi_estimated'] = opportunity_table['revenue_opportunity'] / opportunity_table['total_customers'] * 100

    # Format the table: numeric rounding first, then currency strings
    display_table = opportunity_table[[
        'state_code', 'opportunity_score', 'ltv_gap_pct',
        'revenue_opportunity', 'current_ltv', 'target_ltv'
    ]].round(2)

    display_table['opportunity_score'] = display_table['opportunity_score'].round(3)
    display_table['revenue_opportunity'] = display_table['revenue_opportunity'].apply(lambda x: f"R$ {x:,.0f}")
    display_table['current_ltv'] = display_table['current_ltv'].apply(lambda x: f"R$ {x:.2f}")
    display_table['target_ltv'] = display_table['target_ltv'].apply(lambda x: f"R$ {x:.2f}")

    print(display_table.to_string(index=False))

    # Calculate total opportunity relative to current revenue
    total_revenue_opportunity = df_geo['revenue_opportunity'].sum()
    total_current_revenue = df_geo['total_revenue'].sum()
    opportunity_pct = (total_revenue_opportunity / total_current_revenue * 100) if total_current_revenue != 0 else 0

    print(f"\nTotal Revenue Opportunity: R$ {total_revenue_opportunity:,.0f}")
    print(f"Percentage of Current Revenue: {opportunity_pct:.1f}%")

    # Insights
    print("\n=== INSIGHTS ===")

    # Top 3 priority states
    top_3 = df_opportunity.head(3)
    top_3_priority = top_3['state_code'].tolist()
    top_3_opportunity = top_3['revenue_opportunity'].sum()
    print(f"1. Top 3 priority states for expansion: {', '.join(top_3_priority)}")
    print(f"   Combined opportunity: R$ {top_3_opportunity:,.0f}")

    # Both segments below share the "above-median opportunity score" filter
    above_median_score = (
        df_opportunity['opportunity_score'] > df_opportunity['opportunity_score'].median()
    )

    # Quick wins (high score, easy execution)
    # Assuming "easy execution" means states with high volume score (existing infrastructure)
    quick_wins = df_opportunity[
        above_median_score & (df_opportunity['volume_score'] > 0.5)
    ].head(5)

    if len(quick_wins) > 0:
        print("2. Quick wins (high opportunity, existing infrastructure):")
        for _, row in quick_wins.iterrows():
            print(f"   {row['state_code']}: R$ {row['revenue_opportunity']:,.0f} opportunity")

    # Long-term opportunities
    long_term = df_opportunity[
        above_median_score & (df_opportunity['volume_score'] < 0.3)
    ].head(3)

    if len(long_term) > 0:
        print("3. Long-term opportunities (high potential, low current volume):")
        for _, row in long_term.iterrows():
            print(f"   {row['state_code']}: R$ {row['revenue_opportunity']:,.0f} opportunity")

    # Investment vs return
    # NOTE(review): the investment is assumed to be 10% of the opportunity, so
    # the ratio printed as "Projected ROI" is 1000% by construction whenever the
    # opportunity is non-zero - it is an assumption, not a data-derived result.
    estimated_investment = total_revenue_opportunity * 0.1  # Assuming 10% investment of opportunity
    estimated_roi = (total_revenue_opportunity / estimated_investment * 100) if estimated_investment != 0 else 0

    print(f"4. Estimated investment needed: R$ {estimated_investment:,.0f}")
    print(f"5. Projected ROI: {estimated_roi:.1f}%")

else:
    print("Insufficient data for market opportunity analysis")

## 11. MARKET PENETRATION ANALYSIS

In [None]:
print("=== MARKET PENETRATION ANALYSIS ===\n")

# Classifies each state into a penetration-vs-LTV quadrant (split at the
# medians), plots the quadrants, and prints per-quadrant strategies.
# Adds the columns penetration_rate and penetration_quadrant to df_geo.
#
# Note: This section requires population data. If not available in the dataset,
# we'll use alternative approaches or skip certain calculations.

if len(df_geo) > 0 and 'avg_ltv' in df_geo.columns:
    # Tracks whether a usable penetration metric could be derived; the
    # analysis below is skipped when it could not.
    penetration_available = True

    if 'customer_density' in df_geo.columns:
        # Use customer_density as proxy for penetration
        df_geo['penetration_rate'] = df_geo['customer_density']
        print("Using customer_density as penetration proxy")
    else:
        # Relative penetration metric: customers as a percentage of the
        # largest state's customer count
        max_customers = df_geo['total_customers'].max()
        if max_customers > 0:
            df_geo['penetration_rate'] = (df_geo['total_customers'] / max_customers) * 100
            print("Using relative customer count as penetration proxy")
        else:
            print("Insufficient data for penetration analysis")
            # Keep the column defined for any later cell that reads it, but
            # do not analyse a constant placeholder.
            df_geo['penetration_rate'] = 0
            penetration_available = False

    if penetration_available:
        # Calculate penetration thresholds
        median_penetration = df_geo['penetration_rate'].median()
        median_ltv = df_geo['avg_ltv'].median()

        # Classify states by penetration-LTV quadrant (vectorized).
        # Rows where either value is NaN match none of the explicit
        # conditions and fall through to the default, as in a row-wise
        # if/elif/else chain.
        high_penetration = df_geo['penetration_rate'] >= median_penetration
        low_penetration = df_geo['penetration_rate'] < median_penetration
        high_ltv = df_geo['avg_ltv'] >= median_ltv
        low_ltv = df_geo['avg_ltv'] < median_ltv

        df_geo['penetration_quadrant'] = np.select(
            [
                high_penetration & high_ltv,
                high_penetration & low_ltv,
                low_penetration & high_ltv,
            ],
            [
                'High Penetration, High LTV',
                'High Penetration, Low LTV',
                'Low Penetration, High LTV',
            ],
            default='Low Penetration, Low LTV'
        )

        # Axis extents, reused for quadrant lines and annotation placement
        penetration_min = df_geo['penetration_rate'].min()
        penetration_max = df_geo['penetration_rate'].max()
        ltv_min = df_geo['avg_ltv'].min()
        ltv_max = df_geo['avg_ltv'].max()

        # Visualization 10: Scatter Plot - Penetration vs LTV
        fig10 = px.scatter(
            df_geo,
            x='penetration_rate',
            y='avg_ltv',
            size='total_revenue',
            color='region',
            hover_name='state_code',
            hover_data=['state_name', 'total_customers', 'penetration_quadrant'],
            title='Market Penetration vs LTV',
            labels={
                'penetration_rate': 'Market Penetration Rate',
                'avg_ltv': 'Average LTV (R$)',
                'total_revenue': 'Total Revenue',
                'region': 'Region'
            },
            size_max=50
        )

        # Add quadrant lines at the two medians
        fig10.add_shape(
            type="line",
            x0=median_penetration,
            y0=ltv_min,
            x1=median_penetration,
            y1=ltv_max,
            line=dict(color="gray", width=1, dash="dash")
        )

        fig10.add_shape(
            type="line",
            x0=penetration_min,
            y0=median_ltv,
            x1=penetration_max,
            y1=median_ltv,
            line=dict(color="gray", width=1, dash="dash")
        )

        # Add quadrant annotations
        fig10.add_annotation(
            x=penetration_max * 0.75,
            y=ltv_max * 0.75,
            text="High Penetration<br>High LTV",
            showarrow=False,
            font=dict(size=10)
        )

        fig10.add_annotation(
            x=penetration_max * 0.75,
            y=ltv_min * 1.25,
            text="High Penetration<br>Low LTV",
            showarrow=False,
            font=dict(size=10)
        )

        fig10.add_annotation(
            x=penetration_min * 1.25,
            y=ltv_max * 0.75,
            text="Low Penetration<br>High LTV",
            showarrow=False,
            font=dict(size=10)
        )

        fig10.add_annotation(
            x=penetration_min * 1.25,
            y=ltv_min * 1.25,
            text="Low Penetration<br>Low LTV",
            showarrow=False,
            font=dict(size=10)
        )

        fig10.update_layout(
            height=600,
            xaxis_title="Market Penetration Rate",
            yaxis_title="Average LTV (R$)"
        )

        fig10.show()

        # Calculate correlation
        penetration_ltv_corr = df_geo['penetration_rate'].corr(df_geo['avg_ltv'])

        # Quadrant analysis
        quadrant_counts = df_geo['penetration_quadrant'].value_counts()

        print("\n=== PENETRATION ANALYSIS ===")
        print(f"Correlation between penetration and LTV: {penetration_ltv_corr:.3f}")
        print("\nQuadrant Distribution:")
        for quadrant, count in quadrant_counts.items():
            print(f"  {quadrant}: {count} states")

        # Detailed quadrant analysis
        print("\n=== QUADRANT INSIGHTS ===")

        for quadrant in ['High Penetration, High LTV', 'High Penetration, Low LTV',
                         'Low Penetration, High LTV', 'Low Penetration, Low LTV']:
            quadrant_states = df_geo[df_geo['penetration_quadrant'] == quadrant]
            if len(quadrant_states) > 0:
                states_list = ', '.join(quadrant_states['state_code'].tolist())
                print(f"\n{quadrant}:")
                print(f"  States: {states_list}")

                if quadrant == 'Low Penetration, High LTV':
                    print("  Strategy: High expansion priority (sweet spot)")
                elif quadrant == 'High Penetration, Low LTV':
                    print("  Strategy: Focus on upsell/cross-sell")
                elif quadrant == 'High Penetration, High LTV':
                    print("  Strategy: Maintain leadership, premium services")
                else:
                    print("  Strategy: Test market fit, consider pilot programs")

        # Insights
        print("\n=== INSIGHTS ===")

        # Identify saturated vs underpenetrated markets
        saturated_states = df_geo[df_geo['penetration_quadrant'] == 'High Penetration, Low LTV']
        underpenetrated_states = df_geo[df_geo['penetration_quadrant'] == 'Low Penetration, High LTV']

        if len(saturated_states) > 0:
            print(f"1. Saturated markets (high penetration, low LTV): {len(saturated_states)} states")
            print(f"   Examples: {', '.join(saturated_states.head(3)['state_code'].tolist())}")

        if len(underpenetrated_states) > 0:
            print(f"2. Underpenetrated markets (low penetration, high LTV): {len(underpenetrated_states)} states")
            print(f"   Examples: {', '.join(underpenetrated_states.head(3)['state_code'].tolist())}")

        # Sweet spot analysis: same quadrant as the underpenetrated markets
        sweet_spot_states = underpenetrated_states
        if len(sweet_spot_states) > 0:
            sweet_spot_revenue = sweet_spot_states['total_revenue'].sum()
            total_revenue = df_geo['total_revenue'].sum()
            sweet_spot_pct = (sweet_spot_revenue / total_revenue * 100) if total_revenue != 0 else 0
            print(f"3. Sweet spot potential: {len(sweet_spot_states)} states with {sweet_spot_pct:.1f}% of current revenue")

        # Strategies by quadrant
        print("4. Strategic approaches by quadrant:")
        print("   - High Penetration, High LTV: Defend position, increase switching costs")
        print("   - High Penetration, Low LTV: Improve customer value, reduce churn")
        print("   - Low Penetration, High LTV: Aggressive acquisition, market education")
        print("   - Low Penetration, Low LTV: Test channels, validate product-market fit")

else:
    print("Insufficient data for penetration analysis")

## 12. STRATEGIC RECOMMENDATIONS BY STATE

In [None]:
print("=== STRATEGIC RECOMMENDATIONS BY STATE ===\n")

if len(df_geo) > 0 and 'avg_ltv' in df_geo.columns and 'total_customers' in df_geo.columns:
    # Define state groups based on percentiles
    ltv_75th = df_geo['avg_ltv'].quantile(0.75)
    ltv_25th = df_geo['avg_ltv'].quantile(0.25)
    
    customers_75th = df_geo['total_customers'].quantile(0.75)
    customers_25th = df_geo['total_customers'].quantile(0.25)
    
    # Classify states into 4 groups
    df_geo['strategy_group'] = 'Unknown'
    
    for idx, row in df_geo.iterrows():
        if row['avg_ltv'] >= ltv_75th and row['total_customers'] >= customers_75th:
            df_geo.at[idx, 'strategy_group'] = 'High Performers'
        elif row['avg_ltv'] >= ltv_25th and row['avg_ltv'] < ltv_75th and row['total_customers'] >= customers_25th:
            df_geo.at[idx, 'strategy_group'] = 'Growth Opportunities'
        elif row['total_customers'] >= customers_75th and row['avg_ltv'] < ltv_25th:
            df_geo.at[idx, 'strategy_group'] = 'Optimization Needed'
        else:
            df_geo.at[idx, 'strategy_group'] = 'Early Stage'
    
    # Create recommendations dataframe
    recommendations = []
    
    for idx, row in df_geo.iterrows():
        rec = {
            'state_code': row['state_code'],
            'state_name': row['state_name'],
            'strategy_group': row['strategy_group'],
            'current_ltv': row['avg_ltv'],
            'total_customers': row['total_customers'],
            'action_primary': '',
            'primary_channel': '',
            'budget_suggested': 0,
            'roi_expected': 0
        }
        
        # Set recommendations based on group
        if row['strategy_group'] == 'High Performers':
            rec['action_primary'] = 'Premium loyalty programs, exclusive offers'
            rec['primary_channel'] = 'Email, App notifications'
            rec['budget_suggested'] = row['total_customers'] * 10  # R$10 per customer
            rec['roi_expected'] = 25  # 25% ROI
            rec['ltv_target'] = row['avg_ltv'] * 1.1  # 10% increase
            
        elif row['strategy_group'] == 'Growth Opportunities':
            rec['action_primary'] = 'Targeted marketing campaigns, improve onboarding'
            rec['primary_channel'] = 'Social media, Search ads'
            rec['budget_suggested'] = row['total_customers'] * 20  # R$20 per customer
            rec['roi_expected'] = 40  # 40% ROI
            rec['ltv_target'] = row['avg_ltv'] * 1.25  # 25% increase
            
        elif row['strategy_group'] == 'Optimization Needed':
            rec['action_primary'] = 'Cross-sell/upsell programs, improve retention'
            rec['primary_channel'] = 'In-app messages, Retargeting'
            rec['budget_suggested'] = row['total_customers'] * 15  # R$15 per customer
            rec['roi_expected'] = 30  # 30% ROI
            rec['ltv_target'] = row['avg_ltv'] * 1.2  # 20% increase
            
        else:  # Early Stage
            rec['action_primary'] = 'Test marketing channels, validate product-market fit'
            rec['primary_channel'] = 'Pilot campaigns, Partnerships'
            rec['budget_suggested'] = row['total_customers'] * 30  # R$30 per customer
            rec['roi_expected'] = 15  # 15% ROI
            rec['ltv_target'] = row['avg_ltv'] * 1.5  # 50% increase (from low base)
        
        recommendations.append(rec)
    
    recommendations_df = pd.DataFrame(recommendations)
    
    # Visualization 11: 2x2 Matrix
    fig11 = px.scatter(
        df_geo,
        x='total_customers',
        y='avg_ltv',
        color='strategy_group',
        hover_name='state_code',
        hover_data=['state_name', 'region', 'avg_order_value'],
        title='Strategic Grouping: Volume vs LTV Matrix',
        labels={
            'total_customers': 'Total Customers',
            'avg_ltv': 'Average LTV (R$)',
            'strategy_group': 'Strategy Group'
        },
        log_x=True,  # Log scale for better visualization
        color_discrete_sequence=['green', 'blue', 'orange', 'red']
    )
    
    # Add group boundary lines
    fig11.add_shape(
        type="line",
        x0=customers_75th,
        y0=df_geo['avg_ltv'].min(),
        x1=customers_75th,
        y1=df_geo['avg_ltv'].max(),
        line=dict(color="gray", width=1, dash="dash")
    )
    
    fig11.add_shape(
        type="line",
        x0=df_geo['total_customers'].min(),
        y0=ltv_75th,
        x1=df_geo['total_customers'].max(),
        y1=ltv_75th,
        line=dict(color="gray", width=1, dash="dash")
    )
    
    fig11.add_shape(
        type="line",
        x0=df_geo['total_customers'].min(),
        y0=ltv_25th,
        x1=df_geo['total_customers'].max(),
        y1=ltv_25th,
        line=dict(color="gray", width=1, dash="dot")
    )
    
    fig11.add_shape(
        type="line",
        x0=customers_25th,
        y0=df_geo['avg_ltv'].min(),
        x1=customers_25th,
        y1=df_geo['avg_ltv'].max(),
        line=dict(color="gray", width=1, dash="dot")
    )
    
    # Add group labels
    fig11.add_annotation(
        x=df_geo['total_customers'].max() * 0.8,
        y=df_geo['avg_ltv'].max() * 0.9,
        text="High Performers",
        showarrow=False,
        font=dict(size=10, color="green")
    )
    
    fig11.add_annotation(
        x=df_geo['total_customers'].max() * 0.8,
        y=(ltv_75th + ltv_25th) / 2,
        text="Growth Opportunities",
        showarrow=False,
        font=dict(size=10, color="blue")
    )
    
    fig11.add_annotation(
        x=df_geo['total_customers'].max() * 0.8,
        y=df_geo['avg_ltv'].min() * 1.1,
        text="Optimization Needed",
        showarrow=False,
        font=dict(size=10, color="orange")
    )
    
    fig11.add_annotation(
        x=df_geo['total_customers'].min() * 1.2,
        y=df_geo['avg_ltv'].max() * 0.9,
        text="Early Stage",
        showarrow=False,
        font=dict(size=10, color="red")
    )
    
    fig11.update_layout(
        height=600,
        xaxis_title="Total Customers (log scale)",
        yaxis_title="Average LTV (R$)"
    )
    
    fig11.show()
    
    # Display recommendations table
    print("STRATEGIC RECOMMENDATIONS BY STATE:")
    print("=" * 120)
    
    display_recs = recommendations_df[[
        'state_code', 'strategy_group', 'current_ltv', 'ltv_target',
        'action_primary', 'primary_channel', 'budget_suggested', 'roi_expected'
    ]].copy()
    
    # Format for display
    display_recs['current_ltv'] = display_recs['current_ltv'].apply(lambda x: f"R$ {x:.2f}")
    display_recs['ltv_target'] = display_recs['ltv_target'].apply(lambda x: f"R$ {x:.2f}")
    display_recs['budget_suggested'] = display_recs['budget_suggested'].apply(lambda x: f"R$ {x:,.0f}")
    display_recs['roi_expected'] = display_recs['roi_expected'].apply(lambda x: f"{x}%")
    
    print(display_recs.to_string(index=False))
    
    # Calculate summary by group
    print("\n=== STRATEGY GROUP SUMMARY ===")
    
    group_summary = recommendations_df.groupby('strategy_group').agg({
        'state_code': 'count',
        'current_ltv': 'mean',
        'total_customers': 'sum',
        'budget_suggested': 'sum',
        'roi_expected': 'mean'
    }).reset_index()
    
    group_summary = group_summary.rename(columns={
        'state_code': 'state_count',
        'current_ltv': 'avg_ltv',
        'roi_expected': 'avg_roi'
    })
    
    print(group_summary.round(2).to_string(index=False))
    
    # Insights
    print("\n=== INSIGHTS ===")
    
    # Priority states for each strategy
    for group in ['High Performers', 'Growth Opportunities', 'Optimization Needed', 'Early Stage']:
        group_states = df_geo[df_geo['strategy_group'] == group]
        if len(group_states) > 0:
            print(f"\n{group}:")
            print(f"  States: {', '.join(group_states['state_code'].tolist())}")
            print(f"  Count: {len(group_states)} states")
            
            if len(group_states) > 0:
                avg_ltv = group_states['avg_ltv'].mean()
                total_customers = group_states['total_customers'].sum()
                print(f"  Avg LTV: R$ {avg_ltv:.2f}")
                print(f"  Total Customers: {total_customers:,}")
    
    # Total budget estimate
    total_budget = recommendations_df['budget_suggested'].sum()
    print(f"\nTotal Suggested Budget: R$ {total_budget:,.0f}")
    
    # ROI by group
    print("\nExpected ROI by Group:")
    for _, row in group_summary.iterrows():
        print(f"  {row['strategy_group']}: {row['avg_roi']:.1f}% ROI")
    
    # Timeline
    print("\nImplementation Timeline:")
    print("  Q1 2025: Focus on 'Optimization Needed' states")
    print("  Q2 2025: Expand 'Growth Opportunities' states")
    print("  Q3 2025: Scale 'High Performers' initiatives")
    print("  Q4 2025: Pilot 'Early Stage' markets")
    
    # Quick wins
    quick_wins = df_geo[
        (df_geo['strategy_group'] == 'Optimization Needed') &
        (df_geo['total_customers'] > df_geo['total_customers'].median())
    ].head(3)
    
    if len(quick_wins) > 0:
        print(f"\nQuick Wins (high impact, existing customers):")
        for _, row in quick_wins.iterrows():
            print(f"  {row['state_code']}: {row['total_customers']:,} customers, R$ {row['avg_ltv']:.2f} LTV")
    
    # Risks
    print("\nKey Risks:")
    print("  1. Over-investment in low-potential 'Early Stage' markets")
    print("  2. Under-investment in high-potential 'Growth Opportunities'")
    print("  3. Neglecting 'High Performers' leading to competitive erosion")
    print("  4. Operational capacity for simultaneous state expansions")
    
    # Resource needs
    print("\nResource Requirements:")
    print("  - Marketing team: +3 FTEs for campaign management")
    print("  - Tech resources: Analytics dashboard development")
    print("  - Operations: Regional customer support expansion")
    
    # Export recommendations
    recommendations_df.to_csv('state_strategic_recommendations.csv', index=False)
    print("\nRecommendations exported to 'state_strategic_recommendations.csv'")
    
else:
    print("Insufficient data for strategic recommendations")

## 13. EXECUTIVE SUMMARY & KEY INSIGHTS

In [None]:
# Executive summary cell: derives headline metrics from the state-level df_geo
# frame, prints report sections A-F, renders the 2x2 summary dashboard (fig12)
# and writes the final CSV exports. Columns that other parts of this cell treat
# as optional (region, revenue_opportunity, avg_delivery_days, avg_review_score,
# state_name, ltv_gap_pct) are probed before use so that a reduced schema
# degrades the report instead of aborting it half-way.
print("=== EXECUTIVE SUMMARY & KEY INSIGHTS ===\n")


def _ltv_slope(frame, feature):
    """Return the least-squares slope of avg_ltv regressed on *feature*.

    Rows where either value is missing are dropped together, so the model
    never receives NaN targets. Returns None when fewer than two complete
    rows remain (a slope is not meaningful from a single point).
    """
    pairs = frame[[feature, 'avg_ltv']].dropna()
    if len(pairs) < 2:
        return None
    # Imported lazily: scikit-learn is only needed for this optional estimate.
    from sklearn.linear_model import LinearRegression
    model = LinearRegression()
    model.fit(pairs[[feature]], pairs['avg_ltv'])
    return model.coef_[0]


def _signed_brl(amount):
    """Format *amount* as a signed R$ value, e.g. '+R$ 3.20' or '-R$ 1.05'."""
    sign = '+' if amount >= 0 else '-'
    return f"{sign}R$ {abs(amount):.2f}"


if not df_geo.empty:
    # Calculate all summary metrics dynamically
    total_states = df_geo['state_code'].nunique()
    total_customers = df_geo['total_customers'].sum()
    avg_ltv_brazil = df_geo['avg_ltv'].mean()
    max_ltv = df_geo['avg_ltv'].max()
    min_ltv = df_geo['avg_ltv'].min()
    ltv_range = max_ltv - min_ltv

    # Top 3 states
    top_3_df = df_geo.nlargest(3, 'avg_ltv')
    top_3_states = top_3_df['state_code'].tolist()
    top_3_values = top_3_df['avg_ltv'].tolist()

    # Bottom 3 states
    bottom_3_df = df_geo.nsmallest(3, 'avg_ltv')
    bottom_3_states = bottom_3_df['state_code'].tolist()
    bottom_3_values = bottom_3_df['avg_ltv'].tolist()

    # Concentration metrics
    df_sorted_revenue = df_geo.sort_values('total_revenue', ascending=False)
    top_5_revenue = df_sorted_revenue.head(5)['total_revenue'].sum()
    total_revenue = df_geo['total_revenue'].sum()
    top_5_revenue_pct = (top_5_revenue / total_revenue * 100) if total_revenue != 0 else 0

    # Regional insights (only when the frame carries a 'region' column)
    if 'region' in df_geo.columns:
        region_analysis = df_geo.groupby('region').agg({
            'avg_ltv': 'mean',
            'total_customers': 'sum',
            'total_revenue': 'sum'
        }).reset_index()
    else:
        region_analysis = pd.DataFrame()

    if not region_analysis.empty:
        best_region = region_analysis.loc[region_analysis['avg_ltv'].idxmax(), 'region']
        best_region_ltv = region_analysis['avg_ltv'].max()

        highest_volume_region = region_analysis.loc[region_analysis['total_customers'].idxmax(), 'region']
        highest_volume = region_analysis['total_customers'].max()

    # Correlation findings
    correlation_findings = []
    if 'avg_delivery_days' in df_geo.columns:
        corr_delivery = df_geo['avg_ltv'].corr(df_geo['avg_delivery_days'])
        correlation_findings.append(f"LTV vs delivery: r={corr_delivery:.3f}")

    if 'avg_review_score' in df_geo.columns:
        corr_review = df_geo['avg_ltv'].corr(df_geo['avg_review_score'])
        correlation_findings.append(f"LTV vs reviews: r={corr_review:.3f}")

    # Market opportunities. Defaults are bound up front because the dashboard
    # KPI card further down reads these names even when the column is absent.
    has_opportunity = 'revenue_opportunity' in df_geo.columns
    total_opportunity = 0
    top_3_opportunity_states = []
    top_3_opportunity_value = 0
    if has_opportunity:
        top_3_opportunity_df = df_geo.nlargest(3, 'revenue_opportunity')
        total_opportunity = df_geo['revenue_opportunity'].sum()
        top_3_opportunity_states = top_3_opportunity_df['state_code'].tolist()
        top_3_opportunity_value = top_3_opportunity_df['revenue_opportunity'].sum()

    # A) Geographic Distribution Summary
    print("A) GEOGRAPHIC DISTRIBUTION SUMMARY:")
    print("-" * 50)
    print(f"Total states analyzed: {total_states}")
    print(f"Total customers analyzed: {total_customers:,}")
    print(f"Average LTV (Brazil): R$ {avg_ltv_brazil:.2f}")
    print(f"LTV range: R$ {min_ltv:.2f} to R$ {max_ltv:.2f} ({ltv_range:.2f} range)")
    # Join whatever values exist instead of indexing [0..2], which raised
    # IndexError when fewer than three states were present.
    top_3_text = ', '.join(f"{value:.2f}" for value in top_3_values)
    bottom_3_text = ', '.join(f"{value:.2f}" for value in bottom_3_values)
    print(f"Top 3 states: {', '.join(top_3_states)} - R$ {top_3_text}")
    print(f"Bottom 3 states: {', '.join(bottom_3_states)} - R$ {bottom_3_text}")
    print(f"Geographic concentration: Top 5 states = {top_5_revenue_pct:.1f}% of revenue")

    # B) Regional Insights
    print("\nB) REGIONAL INSIGHTS:")
    print("-" * 50)
    if not region_analysis.empty:
        print(f"Best performing region: {best_region} (R$ {best_region_ltv:.2f} avg LTV)")
        print(f"Highest volume region: {highest_volume_region} ({highest_volume:,} customers)")

        # Find underserved regions: revenue per customer below the regional mean
        region_analysis['revenue_per_customer'] = region_analysis['total_revenue'] / region_analysis['total_customers']
        underserved = region_analysis[region_analysis['revenue_per_customer'] < region_analysis['revenue_per_customer'].mean()]
        if len(underserved) > 0:
            underserved_regions = underserved['region'].tolist()
            print(f"Underserved regions: {', '.join(underserved_regions)} (opportunity)")
    else:
        print("Regional breakdown unavailable (no region data)")

    # C) Correlation Findings
    print("\nC) CORRELATION FINDINGS:")
    print("-" * 50)
    for finding in correlation_findings:
        print(f"{finding}")

    if 'avg_review_score' in df_geo.columns:
        # Simple regression for review impact
        review_impact = _ltv_slope(df_geo, 'avg_review_score')
        if review_impact is not None:
            print(f"Review score impact: +1 point ≈ {_signed_brl(review_impact)} LTV")

    if 'avg_delivery_days' in df_geo.columns:
        # Simple regression for delivery impact
        delivery_impact = _ltv_slope(df_geo, 'avg_delivery_days')
        if delivery_impact is not None:
            # Removing one day changes LTV by the negated slope. The sign is
            # reported as fitted rather than forced positive with abs(), which
            # would claim a gain even when faster delivery correlates with
            # lower LTV.
            print(f"Delivery optimization: -1 day ≈ {_signed_brl(-delivery_impact)} LTV")

    # D) Market Opportunities
    print("\nD) MARKET OPPORTUNITIES:")
    print("-" * 50)
    if has_opportunity:
        print(f"Total revenue opportunity: R$ {total_opportunity:,.0f}")
        print(f"Top 3 expansion targets: {', '.join(top_3_opportunity_states)}")
        print(f"Combined opportunity: R$ {top_3_opportunity_value:,.0f}")

        # Estimate investment and ROI
        # NOTE(review): investment is defined as a fixed 10% of the opportunity,
        # so this ratio is always 1000% whenever the opportunity is non-zero.
        # It is not an independent ROI estimate - confirm the intended model.
        estimated_investment = total_opportunity * 0.1
        estimated_roi = (total_opportunity / estimated_investment * 100) if estimated_investment != 0 else 0
        print(f"Estimated investment: R$ {estimated_investment:,.0f}")
        print(f"Projected ROI: {estimated_roi:.1f}%")

    # E) Strategic Recommendations
    print("\nE) STRATEGIC RECOMMENDATIONS (Priority Order):")
    print("-" * 50)

    # Generic wording; entries 1 and 2 are replaced below when data allows.
    recommendations_priority = [
        "1. Expand marketing in high-opportunity states  R$ XX opportunity, XX% ROI",
        "2. Optimize delivery in slow-delivery regions  +R$ XX LTV per customer",
        "3. Launch retention program in high-volume, low-LTV states",
        "4. Test new channels in early-stage markets",
        "5. Improve onboarding to increase AOV in optimization-need states"
    ]

    # Fill in dynamic values if available
    if has_opportunity:
        top_opportunity = df_geo.nlargest(1, 'revenue_opportunity').iloc[0]
        recommendations_priority[0] = f"1. Expand marketing in {top_opportunity['state_code']}  R$ {top_opportunity['revenue_opportunity']:,.0f} opportunity, 40% ROI"

    # The slowest region can only be named when both columns exist and at
    # least one delivery time is known (idxmax is undefined on all-NaN data).
    if ('avg_delivery_days' in df_geo.columns and 'region' in df_geo.columns
            and df_geo['avg_delivery_days'].notna().any()):
        slowest_delivery_region = df_geo.loc[df_geo['avg_delivery_days'].idxmax(), 'region']
        recommendations_priority[1] = f"2. Optimize delivery in {slowest_delivery_region} region  +R$ 5-10 LTV per customer"

    # Print recommendations
    for rec in recommendations_priority:
        print(f"{rec}")

    # F) Implementation Roadmap
    # TODO: the XX figures below are unfilled template placeholders.
    print("\nF) IMPLEMENTATION ROADMAP:")
    print("-" * 50)
    print("Q1 2025:")
    print("Focus: High-opportunity state expansion (R$ XX budget)")
    print("Launch: Delivery optimization pilot")
    print("KPIs: +XX% LTV, +XXk customers")
    print("\nQ2 2025:")
    print("Scale: Secondary market expansion")
    print("Deploy: Retention program (Top 10 states)")
    print("KPIs: XX% retention improvement")
    print("\nQ3-Q4 2025:")
    print("Optimize: All major markets")
    print("Explore: Early stage states (pilots)")
    print("Target: R$ XX incremental revenue")

    # Visualization 12: Summary Dashboard
    print("\n=== SUMMARY DASHBOARD ===\n")

    # Create subplots. The top-left cell must be declared as a geo subplot:
    # with the default all-'xy' grid, adding a Choropleth trace raises
    # ValueError and update_geos(row=1, col=1) has nothing to configure.
    fig12 = make_subplots(
        rows=2, cols=2,
        specs=[
            [{'type': 'choropleth'}, {'type': 'xy'}],
            [{'type': 'xy'}, {'type': 'xy'}]
        ],
        subplot_titles=('LTV Heatmap by State', 'Volume vs LTV Analysis',
                       'Top 10 Revenue Opportunities', 'Key Performance Indicators'),
        vertical_spacing=0.15,
        horizontal_spacing=0.15
    )

    # 1. LTV Heatmap (Top left)
    # NOTE(review): 'ISO-3' matches three-letter country codes. If state_code
    # holds Brazilian state abbreviations (as the notebook objectives suggest),
    # nothing will be shaded; drawing states needs locationmode='geojson-id'
    # plus a Brazil-states GeoJSON and featureidkey - TODO confirm and supply.
    fig12.add_trace(
        go.Choropleth(
            locations=df_geo['state_code'],
            z=df_geo['avg_ltv'],
            locationmode='ISO-3',
            colorscale='RdYlGn',
            colorbar=dict(x=0.45, y=0.95, len=0.3),
            showscale=True
        ),
        row=1, col=1
    )

    # 2. Scatter Volume vs LTV (Top right)
    # Bubble area scales with each state's share of the largest revenue; fall
    # back to a constant size when there is no positive revenue to scale by.
    max_state_revenue = df_geo['total_revenue'].max()
    if max_state_revenue > 0:
        bubble_sizes = df_geo['total_revenue'] / max_state_revenue * 50
    else:
        bubble_sizes = 10
    if 'region' in df_geo.columns:
        bubble_colors = df_geo['region'].astype('category').cat.codes
    else:
        bubble_colors = 'blue'

    fig12.add_trace(
        go.Scatter(
            x=df_geo['total_customers'],
            y=df_geo['avg_ltv'],
            mode='markers',
            marker=dict(
                size=bubble_sizes,
                color=bubble_colors,
                showscale=False
            ),
            text=df_geo['state_code'],
            hoverinfo='text+x+y'
        ),
        row=1, col=2
    )

    # 3. Top 10 Revenue Opportunities (Bottom left)
    if has_opportunity:
        top_10_opp = df_geo.nlargest(10, 'revenue_opportunity')
        fig12.add_trace(
            go.Bar(
                x=top_10_opp['revenue_opportunity'],
                y=top_10_opp['state_code'],
                orientation='h',
                marker_color='green'
            ),
            row=2, col=1
        )

    # 4. KPI Cards (Bottom right) - Using text annotations
    # The invisible text trace only establishes a 0-1 coordinate range for the
    # annotation placed in this cell below.
    fig12.add_trace(
        go.Scatter(
            x=[0, 1],
            y=[0, 1],
            mode='text',
            text=[''],
            hoverinfo='none'
        ),
        row=2, col=2
    )

    # Add KPI text annotations
    # The opportunity figure is shown only when it was actually computed.
    opportunity_kpi = f"R$ {total_opportunity:,.0f}" if has_opportunity else "n/a"
    kpi_text = f"""
    <b>KEY KPIs</b><br>
  States: {total_states}<br>
  Avg LTV: R$ {avg_ltv_brazil:.2f}<br>
  Total Customers: {total_customers:,}<br>
  Top 5 Revenue: {top_5_revenue_pct:.1f}%<br>
  Opportunity: {opportunity_kpi}<br>
    """

    # row/col bind the annotation to the bottom-right subplot's own axes, so
    # x/y are positions inside that cell; explicit paper refs would be
    # overridden by row/col and are therefore omitted.
    fig12.add_annotation(
        text=kpi_text,
        x=0.75, y=0.25,
        showarrow=False,
        font=dict(size=12),
        align="left",
        row=2, col=2
    )

    # Update layout
    fig12.update_layout(
        title_text='Geographic LTV Analysis Dashboard',
        height=800,
        showlegend=False
    )

    # Update axes
    fig12.update_geos(
        scope='south america',
        showcountries=True,
        showcoastlines=True,
        projection_type='mercator',
        row=1, col=1
    )

    fig12.update_xaxes(title_text="Total Customers", row=1, col=2)
    fig12.update_yaxes(title_text="Avg LTV (R$)", row=1, col=2)

    fig12.update_xaxes(title_text="Revenue Opportunity (R$)", row=2, col=1)
    fig12.update_yaxes(title_text="State", row=2, col=1)

    fig12.update_xaxes(showticklabels=False, row=2, col=2)
    fig12.update_yaxes(showticklabels=False, row=2, col=2)

    fig12.show()

    # Export final CSVs
    print("\n=== DATA EXPORTS ===")

    # 1. Top opportunities (only the requested columns that actually exist,
    # so a missing optional column no longer aborts the export with KeyError)
    if has_opportunity:
        opportunity_cols = [
            col for col in ['state_code', 'state_name', 'region', 'revenue_opportunity',
                            'ltv_gap_pct', 'avg_ltv', 'total_customers']
            if col in df_geo.columns
        ]
        top_opportunities = df_geo.nlargest(20, 'revenue_opportunity')[opportunity_cols]
        top_opportunities.to_csv('geographic_opportunities.csv', index=False)
        print("✓ Exported: geographic_opportunities.csv")

    # 2. Recommendations (opportunity data is appended when available)
    if 'strategy_group' in df_geo.columns:
        recs_cols = [
            col for col in ['state_code', 'state_name', 'region', 'strategy_group',
                            'avg_ltv', 'total_customers', 'total_revenue',
                            'revenue_opportunity']
            if col in df_geo.columns
        ]
        recs_export = df_geo[recs_cols].copy()
        recs_export.to_csv('state_recommendations.csv', index=False)
        print("✓ Exported: state_recommendations.csv")

    # 3. Correlation matrix (needs at least two of the metric columns)
    corr_cols = [col for col in ['avg_ltv', 'total_customers', 'avg_order_value',
                                'avg_delivery_days', 'avg_review_score'] if col in df_geo.columns]

    if len(corr_cols) >= 2:
        corr_matrix = df_geo[corr_cols].corr()
        corr_matrix.to_csv('ltv_correlations.csv')
        print("✓ Exported: ltv_correlations.csv")

    print("\n=== ANALYSIS COMPLETE ===")

else:
    print("Insufficient data for executive summary")

In [None]:
# Closing banner: recap when the analysis finished and what it covered.
# NOTE(review): the visualization and CSV counts are fixed text, not computed
# from what the earlier cells actually produced.
banner_rule = "=" * 80
finished_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
# len() is never negative, so the row count can be reported directly.
states_analyzed = len(df_geo)

closing_lines = (
    banner_rule,
    "GEOGRAPHIC LTV ANALYSIS - COMPLETED SUCCESSFULLY",
    banner_rule,
    f"Analysis Date: {finished_at}",
    f"Total States Analyzed: {states_analyzed}",
    "Total Visualizations Created: 12",
    "Files Exported: 3 CSV files",
    banner_rule,
)
for closing_line in closing_lines:
    print(closing_line)