# 06_delivery_analysis.ipynb - CORRECTED VERSION

## 1. Header

**Title:** Delivery Performance Analysis  
**Objective:** Analyze delivery performance, identify logistical bottlenecks, measure impact of delays on customer satisfaction, and optimize shipping cost-benefit ratio  
**Date Created:** 2024-01-15  
**Author:** Data Analyst  

## 2. Imports

In [None]:
import pandas as pd
import numpy as np
from google.cloud import bigquery
from google.oauth2 import service_account
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LinearRegression
import warnings
warnings.filterwarnings('ignore')
import os

## 3. Configuration

In [None]:
# Project configuration.
# Credentials are resolved from the environment or from paths relative to the
# working directory, so the notebook carries no machine-specific absolute path.
import os
PROJECT_ID = 'quintoandar-ecommerce-analysis'

# GOOGLE_APPLICATION_CREDENTIALS wins when set; otherwise use the
# repository-relative service-account key.
KEY_PATH = os.getenv('GOOGLE_APPLICATION_CREDENTIALS',
                     'credentials/bigquery-key.json')

print(f"Using key path: {KEY_PATH}")
print(f"Current working directory: {os.getcwd()}")

# When the key is not at the expected location, probe a few nearby locations
# (the notebook may be started from a sub-directory of the project).
if not os.path.exists(KEY_PATH):
    print(f"WARNING: Key file not found at {KEY_PATH}")
    alternative_paths = [
        '../credentials/bigquery-key.json',
        '../../credentials/bigquery-key.json',
        './bigquery-key.json',
    ]
    alt_path = next((p for p in alternative_paths if os.path.exists(p)), None)
    if alt_path is not None:
        KEY_PATH = alt_path
        print(f"Found key at: {KEY_PATH}")

# BigQuery client setup.
# `client` is bound before any attempt so that a failed setup leaves an
# explicit None instead of an undefined name in later cells.
client = None
try:
    credentials = service_account.Credentials.from_service_account_file(KEY_PATH)
    client = bigquery.Client(credentials=credentials, project=PROJECT_ID)
    print("BigQuery client initialized successfully")
except Exception as e:
    # Broad catch is deliberate: this is a best-effort setup boundary and the
    # error is reported before trying the next credential source.
    print(f"Error initializing BigQuery client: {e}")
    # Fallback to default credentials if available
    try:
        client = bigquery.Client(project=PROJECT_ID)
        print("Using default application credentials")
    except Exception as e2:
        print(f"Failed to initialize BigQuery: {e2}")

if client is None:
    print("WARNING: No BigQuery client available - queries in later cells will fail")

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.2f}'.format)

# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*';
# pick whichever name this installation provides rather than raising.
_plot_style = next(
    (name for name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
     if name in plt.style.available),
    None,
)
if _plot_style is not None:
    plt.style.use(_plot_style)
else:
    print("WARNING: seaborn darkgrid style not available; using default matplotlib style")

# Output directories must exist before any figure or export is written.
os.makedirs("presentation/figures", exist_ok=True)
os.makedirs("exports", exist_ok=True)
print("Output directories created")

## 4. SQL Queries

In [None]:
# Main query to extract delivery data.
# One SELECT over the staging orders table, LEFT JOINed to customers, reviews
# and a per-order aggregate of order items (summed freight and price).
# Only orders with status 'delivered' and non-null delivered/estimated dates
# are returned. Day differences are computed on DATE values (calendar days),
# and delay_days > 0 means delivery after the estimated date.
# NOTE(review): the join to stg_reviews is not aggregated; if an order can have
# more than one review row there, that order appears multiple times in the
# result - confirm against the staging model.
delivery_query = """
SELECT 
    -- IDs
    o.order_id,
    o.customer_id,
    c.customer_state,
    c.customer_city,
    
    -- Dates
    o.order_purchase_timestamp,
    o.order_approved_at,
    o.order_delivered_carrier_date,
    o.order_delivered_customer_date,
    o.order_estimated_delivery_date,
    
    -- Calculated times
    DATE_DIFF(DATE(o.order_delivered_carrier_date), DATE(o.order_purchase_timestamp), DAY) as days_to_carrier,
    DATE_DIFF(DATE(o.order_delivered_customer_date), DATE(o.order_delivered_carrier_date), DAY) as days_in_transit,
    DATE_DIFF(DATE(o.order_delivered_customer_date), DATE(o.order_purchase_timestamp), DAY) as total_delivery_days,
    DATE_DIFF(DATE(o.order_delivered_customer_date), DATE(o.order_estimated_delivery_date), DAY) as delay_days,
    
    -- SLA compliance
    CASE 
        WHEN DATE(o.order_delivered_customer_date) <= DATE(o.order_estimated_delivery_date) THEN 1
        ELSE 0
    END as sla_compliant,
    
    CASE 
        WHEN DATE(o.order_delivered_customer_date) <= DATE(o.order_estimated_delivery_date) THEN 'On Time'
        WHEN DATE_DIFF(DATE(o.order_delivered_customer_date), DATE(o.order_estimated_delivery_date), DAY) <= 5 THEN 'Slight Delay'
        WHEN DATE_DIFF(DATE(o.order_delivered_customer_date), DATE(o.order_estimated_delivery_date), DAY) <= 15 THEN 'Moderate Delay'
        ELSE 'Severe Delay'
    END as delivery_status,
    
    -- Review
    r.review_score,
    r.review_comment_message,
    
    -- Freight
    oi.freight_value,
    oi.price as item_price,
    SAFE_DIVIDE(oi.freight_value, oi.price) as freight_to_price_ratio,
    
    -- Region
    CASE 
        WHEN c.customer_state IN ('SP', 'RJ', 'MG', 'ES') THEN 'Southeast'
        WHEN c.customer_state IN ('RS', 'SC', 'PR') THEN 'South'
        WHEN c.customer_state IN ('BA', 'PE', 'CE', 'MA', 'PB', 'RN', 'AL', 'SE', 'PI') THEN 'Northeast'
        WHEN c.customer_state IN ('AM', 'PA', 'AC', 'RO', 'RR', 'AP', 'TO') THEN 'North'
        WHEN c.customer_state IN ('GO', 'MT', 'MS', 'DF') THEN 'Central-West'
        ELSE 'Other'
    END as region
    
FROM `quintoandar-ecommerce-analysis.olist_staging.stg_orders` o
LEFT JOIN `quintoandar-ecommerce-analysis.olist_staging.stg_customers` c 
    ON o.customer_id = c.customer_id
LEFT JOIN `quintoandar-ecommerce-analysis.olist_staging.stg_reviews` r 
    ON o.order_id = r.order_id
LEFT JOIN (
    SELECT 
        order_id,
        SUM(freight_value) as freight_value,
        SUM(price) as price
    FROM `quintoandar-ecommerce-analysis.olist_staging.stg_order_items`
    GROUP BY order_id
) oi ON o.order_id = oi.order_id

WHERE o.order_status = 'delivered'
AND o.order_delivered_customer_date IS NOT NULL
AND o.order_estimated_delivery_date IS NOT NULL
"""

# Execute query: runs the statement through the BigQuery client created in the
# configuration cell and loads the full result into the DataFrame `df`, which
# every later cell reads and mutates.
print("Executing BigQuery...")
df = client.query(delivery_query).to_dataframe()
print(f"Dataset loaded: {len(df)} rows, {len(df.columns)} columns")
print(f"Columns: {list(df.columns)}")

## 5. Analysis

## 5.1 Setup & Imports Validation

In [None]:
print("=" * 50)
print("SETUP VALIDATION")
print("=" * 50)

# A) Check for empty dataset.
# An empty result is replaced by synthetic demonstration data so the rest of
# the notebook can still run; figures produced from it are NOT real results.
if df.empty:
    print("ERROR: Empty dataset loaded")
    # Create sample data for testing
    print("Creating sample data for demonstration...")
    n_samples = 1000
    dates = pd.date_range('2023-01-01', '2023-12-31', freq='D')
    df = pd.DataFrame({
        'order_id': [f'order_{i}' for i in range(n_samples)],
        'customer_state': np.random.choice(['SP', 'RJ', 'MG', 'RS', 'PR', 'BA'], n_samples),
        # Purchase dates are required by the time-based features created later.
        'order_purchase_timestamp': dates.to_numpy()[np.random.randint(0, len(dates), n_samples)],
        'total_delivery_days': np.random.normal(15, 5, n_samples).clip(1, 30),
        'delay_days': np.random.normal(2, 5, n_samples),
        'sla_compliant': np.random.choice([0, 1], n_samples, p=[0.3, 0.7]),
        'freight_value': np.random.uniform(10, 100, n_samples),
        'review_score': np.random.randint(1, 6, n_samples)
    })
    print(f"Created sample data: {len(df)} rows")
else:
    print(f"Dataset contains {len(df):,} valid records")

# B) Check critical columns and derive any that are missing from the raw
# date columns, each one independently of the others.
required_cols = ['order_id', 'total_delivery_days', 'sla_compliant', 'delay_days']
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    print(f"ERROR: Missing critical columns: {missing_cols}")
    print("Attempting to calculate missing columns...")

    available = set(df.columns)
    raw_date_cols = ('order_purchase_timestamp',
                     'order_delivered_customer_date',
                     'order_estimated_delivery_date')
    # Parse every raw date column that exists before using it in arithmetic.
    for date_col in raw_date_cols:
        if date_col in available:
            df[date_col] = pd.to_datetime(df[date_col])

    def _calendar_days(later_col, earlier_col):
        """Whole calendar days between two date columns.

        Timestamps are truncated to midnight first so the result matches the
        DATE_DIFF(DATE(...), DATE(...), DAY) logic used in the SQL query.
        """
        return (df[later_col].dt.normalize() - df[earlier_col].dt.normalize()).dt.days

    if ('total_delivery_days' in missing_cols
            and {'order_delivered_customer_date', 'order_purchase_timestamp'} <= available):
        df['total_delivery_days'] = _calendar_days('order_delivered_customer_date',
                                                   'order_purchase_timestamp')

    if ('delay_days' in missing_cols
            and {'order_delivered_customer_date', 'order_estimated_delivery_date'} <= available):
        df['delay_days'] = _calendar_days('order_delivered_customer_date',
                                          'order_estimated_delivery_date')

    # On time means delivered on or before the estimated date (delay <= 0);
    # unknown delays count as non-compliant, like the ELSE branch in the SQL.
    if 'sla_compliant' in missing_cols and 'delay_days' in df.columns:
        df['sla_compliant'] = (df['delay_days'] <= 0).fillna(False).astype(int)

    still_missing = [col for col in required_cols if col not in df.columns]
    if still_missing:
        print(f"ERROR: Could not derive columns: {still_missing}")
    else:
        print("SUCCESS: All required columns present after derivation")
else:
    print("SUCCESS: All required columns present")

print("-" * 50)

## 5.2 Data Cleaning & Validation

In [None]:
print("DATA CLEANING & VALIDATION")
print("-" * 50)


def _filter_and_report(frame, keep, flagged, flagged_msg, other_msg):
    """Return a copy of `frame` restricted to rows where `keep` is True.

    `flagged` marks the removed rows that match the headline rule; any other
    removed row (null or otherwise invalid value) is reported on its own line
    so that the printed counts add up to the rows actually dropped.
    Missing values in either mask are treated as False.
    """
    keep = keep.fillna(False).astype(bool)
    flagged = flagged.fillna(False).astype(bool)
    n_flagged = int(flagged.sum())
    n_other = int((~keep & ~flagged).sum())
    print(f"  Removed {n_flagged:,} records with {flagged_msg}")
    if n_other:
        print(f"  Removed {n_other:,} records with {other_msg}")
    # .copy() detaches the result from its parent so later column assignments
    # do not operate on a slice.
    return frame[keep].copy()


initial_count = len(df)
print(f"Initial record count: {initial_count:,}")

# C) Handle null values: impute with the column median.
print("\nHandling null values...")
if 'review_score' in df.columns:
    review_median = df['review_score'].median()
    null_reviews = df['review_score'].isna().sum()
    df['review_score'] = df['review_score'].fillna(review_median)
    print(f"  Filled {null_reviews:,} null review scores with median: {review_median:.2f}")

if 'freight_value' in df.columns:
    freight_median = df['freight_value'].median()
    null_freight = df['freight_value'].isna().sum()
    df['freight_value'] = df['freight_value'].fillna(freight_median)
    print(f"  Filled {null_freight:,} null freight values with median: R${freight_median:.2f}")

# D) Remove extreme outliers. Filters are applied in sequence, so each count
# refers to the rows that survived the previous filter.
print("\nRemoving extreme outliers...")

# Keep delivery times in (0, 100] days; longer ones are treated as data errors.
if 'total_delivery_days' in df.columns:
    days = df['total_delivery_days']
    extreme_delivery = int((days > 100).fillna(False).sum())
    df = _filter_and_report(
        df, (days > 0) & (days <= 100), days > 100,
        "delivery time > 100 days",
        "missing or non-positive delivery time",
    )

# Keep freight in (0, 200]; larger values are treated as extreme outliers.
if 'freight_value' in df.columns:
    freight = df['freight_value']
    extreme_freight = int((freight > 200).fillna(False).sum())
    df = _filter_and_report(
        df, (freight > 0) & (freight <= 200), freight > 200,
        "freight > R$200",
        "missing or non-positive freight",
    )

# Deliveries more than 30 days ahead of the estimate are treated as date errors.
if 'delay_days' in df.columns:
    delay = df['delay_days']
    negative_delay = int((delay < -30).fillna(False).sum())
    df = _filter_and_report(
        df, delay >= -30, delay < -30,
        "delay < -30 days",
        "missing delay",
    )

# E) Validate dates: unparseable values become NaT and are reported.
print("\nValidating dates...")
date_columns = ['order_purchase_timestamp', 'order_delivered_customer_date']
for col in date_columns:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')
        null_dates = df[col].isna().sum()
        if null_dates > 0:
            print(f"  WARNING: {null_dates:,} null/invalid values in {col}")

# Remove invalid date sequences (delivered before purchased). Rows with a
# missing date cannot satisfy the comparison and are dropped as well.
if 'order_delivered_customer_date' in df.columns and 'order_purchase_timestamp' in df.columns:
    delivered = df['order_delivered_customer_date']
    purchased = df['order_purchase_timestamp']
    invalid_dates = int((delivered < purchased).sum())
    df = _filter_and_report(
        df, delivered >= purchased, delivered < purchased,
        "invalid date sequence",
        "missing purchase or delivery date",
    )

final_count = len(df)
removed_count = initial_count - final_count
print(f"\nCleaning summary:")
print(f"  Initial records: {initial_count:,}")
print(f"  Removed records: {removed_count:,}")
print(f"  Final records: {final_count:,}")
# Guard the ratio: an empty input frame would otherwise divide by zero.
if initial_count > 0:
    print(f"  Data retention: {(final_count/initial_count*100):.1f}%")
else:
    print("  Data retention: n/a (no input records)")

print("-" * 50)

## 5.3 Feature Engineering

In [None]:
print("FEATURE ENGINEERING")
print("-" * 50)

# A) Time features
print("Creating time features...")

# Delay category: same thresholds as the delivery_status CASE in the SQL query
# (<= 0 on time, <= 5 slight, <= 15 moderate, otherwise severe).
if 'delay_days' in df.columns:
    df['delay_category'] = pd.cut(
        df['delay_days'],
        bins=[-np.inf, 0, 5, 15, np.inf],
        labels=['On Time', 'Slight Delay', 'Moderate Delay', 'Severe Delay']
    )
    print(f"  Created delay_category with distribution:")
    print(df['delay_category'].value_counts(normalize=True).round(3))

# Purchase weekday / month / year-month. All three depend on the purchase
# timestamp, so they are created together and only when it is available.
if 'order_purchase_timestamp' in df.columns:
    purchase_ts = pd.to_datetime(df['order_purchase_timestamp'])  # parse once
    df['purchase_weekday'] = purchase_ts.dt.day_name()
    df['purchase_month'] = purchase_ts.dt.month
    df['purchase_year_month'] = purchase_ts.dt.to_period('M')
    print(f"  Created purchase_weekday and purchase_month features")
else:
    print("  Skipped purchase date features: order_purchase_timestamp not available")

# B) Cost features
print("\nCreating cost features...")

if 'freight_value' in df.columns and 'total_delivery_days' in df.columns:
    # Non-positive durations are masked so the ratio is missing, not inf/negative.
    positive_days = df['total_delivery_days'].where(df['total_delivery_days'] > 0)
    df['freight_per_day'] = df['freight_value'] / positive_days
    print(f"  Created freight_per_day feature")

if 'freight_value' in df.columns:
    # Orders above the 75th percentile of freight are flagged as high freight.
    freight_75th = df['freight_value'].quantile(0.75)
    df['high_freight'] = (df['freight_value'] > freight_75th).astype(int)
    print(f"  Created high_freight flag (75th percentile: R${freight_75th:.2f})")

    # Fixed price bands in R$: (0, 10], (10, 20], (20, 40], (40, inf).
    df['freight_category'] = pd.cut(
        df['freight_value'],
        bins=[0, 10, 20, 40, np.inf],
        labels=['Low', 'Medium', 'High', 'Very High']
    )
    print(f"  Created freight_category feature")

# C) Satisfaction features
print("\nCreating satisfaction features...")

if 'review_score' in df.columns:
    df['positive_review'] = (df['review_score'] >= 4).astype(int)
    df['negative_review'] = (df['review_score'] <= 2).astype(int)
    # NOTE(review): the cleaning step fills null review scores with the median,
    # so by this point notna() is presumably always True and this flag does
    # not distinguish orders that actually received a review.
    df['has_review'] = df['review_score'].notna().astype(int)

    positive_pct = df['positive_review'].mean() * 100
    negative_pct = df['negative_review'].mean() * 100
    print(f"  Created review features:")
    print(f"    Positive reviews (>=4): {positive_pct:.1f}%")
    print(f"    Negative reviews (<=2): {negative_pct:.1f}%")

print(f"\nFinal dataset shape: {df.shape}")
print(f"Features created: {list(df.columns)}")

print("-" * 50)

## 5.4 Overall Delivery Performance (KPIs)

In [None]:
print("OVERALL DELIVERY PERFORMANCE")
print("-" * 50)

# Calculate key metrics. Each metric is added only when its source column
# exists; values are stored pre-formatted for display.
metrics = {}

# Basic counts
metrics['Total Orders'] = len(df)

# SLA metrics. sla_rate stays None when the column is missing so the gauge
# below can be skipped instead of failing on an undefined name.
sla_rate = None
if 'sla_compliant' in df.columns:
    sla_rate = df['sla_compliant'].mean()
    metrics['SLA Compliance Rate'] = f"{sla_rate:.1%}"

# Delivery time metrics
if 'total_delivery_days' in df.columns:
    avg_delivery = df['total_delivery_days'].mean()
    metrics['Average Delivery Time'] = f"{avg_delivery:.1f} days"

# Delay metrics: only orders delivered after the estimate (delay_days > 0).
if 'delay_days' in df.columns:
    late_delays = df.loc[df['delay_days'] > 0, 'delay_days']
    delayed_orders = len(late_delays)
    avg_delay = late_delays.mean() if delayed_orders > 0 else 0
    delayed_pct = delayed_orders / len(df) if len(df) > 0 else 0
    metrics['Average Delay (when delayed)'] = f"{avg_delay:.1f} days"
    metrics['Delayed Orders'] = f"{delayed_orders:,} ({delayed_pct:.1%})"

# Freight metrics
if 'freight_value' in df.columns:
    avg_freight = df['freight_value'].mean()
    metrics['Average Freight'] = f"R$ {avg_freight:.2f}"

# Review metrics
if 'review_score' in df.columns:
    avg_review = df['review_score'].mean()
    metrics['Average Review Score'] = f"{avg_review:.2f}/5.0"

# Display metrics
print("Key Performance Indicators:")
for metric, value in metrics.items():
    print(f"  {metric}: {value}")

# Gauge chart for SLA compliance (needs a defined, non-NaN rate).
if sla_rate is not None and pd.notna(sla_rate):
    fig = go.Figure(go.Indicator(
        mode = "gauge+number",
        value = sla_rate * 100,
        title = {"text": "SLA Compliance Rate (%)"},
        domain = {"x": [0, 1], "y": [0, 1]},
        gauge = {
            "axis": {"range": [0, 100]},
            "bar": {"color": "darkblue"},
            # Colour bands: below 70% red, 70-90% yellow, 90-100% green.
            "steps": [
                {"range": [0, 70], "color": "red"},
                {"range": [70, 90], "color": "yellow"},
                {"range": [90, 100], "color": "green"}
            ],
            # Black marker at the 90% line.
            "threshold": {
                "line": {"color": "black", "width": 4},
                "thickness": 0.75,
                "value": 90
            }
        }
    ))

    fig.update_layout(height=300, margin=dict(t=50, b=10, l=10, r=10))
    fig.show()

    # Save gauge chart
    fig.write_image("presentation/figures/sla_compliance_gauge.png")
    print("Saved: presentation/figures/sla_compliance_gauge.png")
else:
    print("Skipping SLA gauge: SLA compliance rate not available")

print("-" * 50)

## 5.5 SLA Compliance Analysis

In [None]:
print("SLA COMPLIANCE ANALYSIS")
print("-" * 50)


def _show_and_save(figure, path):
    """Render a figure inline, export it as a static image and log the path."""
    figure.show()
    figure.write_image(path)
    print(f"Saved: {path}")


# The SLA flag is needed by the first two sections; look it up once.
has_sla = 'sla_compliant' in df.columns

# 1. SLA by region: mean compliance per region, best region first.
print("\n1. SLA Compliance by Region:")
if has_sla and 'region' in df.columns:
    sla_by_region = (
        df.groupby('region')['sla_compliant']
        .mean()
        .sort_values(ascending=False)
    )
    sla_by_region_df = sla_by_region.reset_index()
    print(sla_by_region_df.to_string(index=False))

    fig = px.bar(
        sla_by_region_df,
        x='region',
        y='sla_compliant',
        title='SLA Compliance by Region',
        labels={'sla_compliant': 'SLA Compliance Rate', 'region': 'Region'},
        color='sla_compliant',
        color_continuous_scale='Viridis',
    )
    fig.update_layout(xaxis_tickangle=-45)
    _show_and_save(fig, "presentation/figures/sla_by_region.png")

# 2. SLA trend by month: mean compliance per purchase month number.
print("\n2. SLA Trend by Month:")
if has_sla and 'purchase_month' in df.columns:
    monthly_mean = df.groupby('purchase_month')['sla_compliant'].mean()
    sla_by_month = monthly_mean.reset_index()
    print(sla_by_month.to_string(index=False))

    fig = px.line(
        sla_by_month,
        x='purchase_month',
        y='sla_compliant',
        title='SLA Compliance Trend by Month',
        labels={'sla_compliant': 'SLA Compliance Rate', 'purchase_month': 'Month'},
        markers=True,
    )
    # One tick per month on the x axis.
    fig.update_layout(xaxis=dict(tickmode='linear', dtick=1))
    _show_and_save(fig, "presentation/figures/sla_trend_monthly.png")

# 3. Delivery status distribution: share of orders per status, in percent.
print("\n3. Delivery Status Distribution:")
if 'delivery_status' in df.columns:
    status_share = df['delivery_status'].value_counts(normalize=True)
    status_dist = status_share.mul(100).round(1).reset_index()
    status_dist.columns = ['Delivery Status', 'Percentage']
    print(status_dist.to_string(index=False))

    fig = px.pie(
        status_dist,
        values='Percentage',
        names='Delivery Status',
        title='Delivery Status Distribution',
        hole=0.3,
    )
    _show_and_save(fig, "presentation/figures/delivery_status_distribution.png")

# 4. Delay days distribution: summary statistics plus a histogram.
print("\n4. Delay Days Distribution:")
if 'delay_days' in df.columns:
    delay_stats = df['delay_days'].describe()
    print(f"Delay statistics:")
    summary_rows = (
        ('Mean', 'mean'),
        ('Median', '50%'),
        ('Std Dev', 'std'),
        ('Min', 'min'),
        ('Max', 'max'),
    )
    for label, stat_key in summary_rows:
        print(f"  {label}: {delay_stats[stat_key]:.2f} days")

    fig = px.histogram(
        df,
        x='delay_days',
        nbins=50,
        title='Distribution of Delay Days',
        labels={'delay_days': 'Delay Days', 'count': 'Number of Orders'},
        opacity=0.7,
    )
    # Reference lines: the on-time boundary and the overall mean delay.
    fig.add_vline(x=0, line_dash="dash", line_color="green", annotation_text="On Time")
    fig.add_vline(x=df['delay_days'].mean(), line_dash="dash", line_color="red",
                  annotation_text="Average Delay")
    _show_and_save(fig, "presentation/figures/delay_days_distribution.png")

print("-" * 50)

## 5.6 Regional Delivery Analysis

In [None]:
print("REGIONAL DELIVERY ANALYSIS")
print("-" * 50)

# Regional performance metrics: the whole section runs only when every input
# column is present.
regional_inputs = {'region', 'order_id', 'total_delivery_days', 'delay_days',
                   'sla_compliant', 'freight_value', 'review_score'}
if regional_inputs.issubset(df.columns):
    # One row per region: order count plus the mean of each metric (2 decimals).
    regional_analysis = (
        df.groupby('region')
        .agg(
            total_orders=('order_id', 'count'),
            avg_delivery_days=('total_delivery_days', 'mean'),
            avg_delay_days=('delay_days', 'mean'),
            sla_rate=('sla_compliant', 'mean'),
            avg_freight=('freight_value', 'mean'),
            avg_review_score=('review_score', 'mean'),
        )
        .round(2)
        .reset_index()
    )

    print("Regional Performance Analysis:")
    print(regional_analysis.to_string(index=False))

    # Identify best / worst performing regions
    if not regional_analysis.empty:
        region_names = regional_analysis['region']
        worst_region_delivery = region_names[regional_analysis['avg_delivery_days'].idxmax()]
        worst_region_sla = region_names[regional_analysis['sla_rate'].idxmin()]
        best_region_sla = region_names[regional_analysis['sla_rate'].idxmax()]

        print(f"\nPerformance Highlights:")
        print(f"  Best SLA region: {best_region_sla}")
        print(f"  Worst SLA region: {worst_region_sla}")
        print(f"  Longest delivery time: {worst_region_delivery}")

    # 2x2 grid of bar charts, one metric per panel.
    fig = make_subplots(rows=2, cols=2,
                        subplot_titles=('Average Delivery Days', 'SLA Compliance Rate',
                                       'Average Freight Value', 'Average Review Score'))

    # (metric column, trace name, bar colour, grid row, grid column)
    panels = [
        ('avg_delivery_days', 'Delivery Days', 'blue', 1, 1),
        ('sla_rate', 'SLA Rate', 'green', 1, 2),
        ('avg_freight', 'Freight', 'orange', 2, 1),
        ('avg_review_score', 'Review Score', 'purple', 2, 2),
    ]
    for metric_col, trace_name, bar_color, grid_row, grid_col in panels:
        fig.add_trace(
            go.Bar(x=regional_analysis['region'], y=regional_analysis[metric_col],
                   name=trace_name, marker_color=bar_color),
            row=grid_row, col=grid_col
        )

    fig.update_layout(height=600, showlegend=False, title_text="Regional Performance Metrics")
    fig.update_xaxes(tickangle=-45)
    fig.show()
    fig.write_image("presentation/figures/regional_performance_metrics.png")
    print("\nSaved: presentation/figures/regional_performance_metrics.png")

    # Box plot of order-level delay days, one box per region.
    fig = px.box(
        df,
        x='region',
        y='delay_days',
        title='Delay Days Distribution by Region',
        labels={'delay_days': 'Delay Days', 'region': 'Region'},
        color='region',
    )
    fig.update_layout(xaxis_tickangle=-45)
    fig.show()
    fig.write_image("presentation/figures/delay_distribution_by_region.png")
    print("Saved: presentation/figures/delay_distribution_by_region.png")

# Brazil heatmap: average delay per customer state.
# Plotly's built-in location modes only know countries and US states, so
# Brazilian state codes such as 'SP' need a GeoJSON of state boundaries to be
# drawn on a map. When no such file is available the same data is shown as a
# ranked bar chart instead of an empty map.
print("\n5. Brazil Heatmap - Average Delay by State:")
if 'customer_state' in df.columns and 'delay_days' in df.columns:
    import json

    state_delay = df.groupby('customer_state')['delay_days'].mean().reset_index()

    # Map Brazilian state codes to full names for hover labels
    brazil_states = {
        'AC': 'Acre', 'AL': 'Alagoas', 'AP': 'Amapá', 'AM': 'Amazonas',
        'BA': 'Bahia', 'CE': 'Ceará', 'DF': 'Distrito Federal', 'ES': 'Espírito Santo',
        'GO': 'Goiás', 'MA': 'Maranhão', 'MT': 'Mato Grosso', 'MS': 'Mato Grosso do Sul',
        'MG': 'Minas Gerais', 'PA': 'Pará', 'PB': 'Paraíba', 'PR': 'Paraná',
        'PE': 'Pernambuco', 'PI': 'Piauí', 'RJ': 'Rio de Janeiro', 'RN': 'Rio Grande do Norte',
        'RS': 'Rio Grande do Sul', 'RO': 'Rondônia', 'RR': 'Roraima', 'SC': 'Santa Catarina',
        'SP': 'São Paulo', 'SE': 'Sergipe', 'TO': 'Tocantins'
    }

    state_delay['state_name'] = state_delay['customer_state'].map(brazil_states)

    # Location of the state-boundary GeoJSON and the feature property holding
    # the two-letter state code; both can be overridden via the environment.
    # NOTE(review): the default key assumes features carry the code under
    # properties.sigla - adjust to the GeoJSON actually used.
    geojson_path = os.getenv('BRAZIL_STATES_GEOJSON', 'data/brazil_states.geojson')
    geojson_key = os.getenv('BRAZIL_STATES_GEOJSON_KEY', 'properties.sigla')

    brazil_geojson = None
    if os.path.exists(geojson_path):
        with open(geojson_path, encoding='utf-8') as geojson_file:
            brazil_geojson = json.load(geojson_file)

    if brazil_geojson is not None:
        fig = px.choropleth(
            state_delay,
            geojson=brazil_geojson,
            featureidkey=geojson_key,
            locations='customer_state',
            color='delay_days',
            hover_name='state_name',
            title='Average Delay Days by State (Brazil)',
            color_continuous_scale='Reds',
            labels={'delay_days': 'Avg Delay Days'}
        )
        # Zoom to the drawn states and hide the base map.
        fig.update_geos(fitbounds='locations', visible=False)
        fig.update_layout(margin=dict(l=0, r=0, t=50, b=0))
    else:
        print(f"  State boundaries not found at {geojson_path}; drawing a bar chart instead of a map")
        fig = px.bar(
            state_delay.sort_values('delay_days'),
            x='delay_days',
            y='customer_state',
            orientation='h',
            hover_name='state_name',
            title='Average Delay Days by State (Brazil)',
            color='delay_days',
            color_continuous_scale='Reds',
            labels={'delay_days': 'Avg Delay Days', 'customer_state': 'State'}
        )
        fig.update_layout(margin=dict(l=0, r=0, t=50, b=0))

    fig.show()
    fig.write_image("presentation/figures/brazil_heatmap_delays.png")
    print("Saved: presentation/figures/brazil_heatmap_delays.png")

print("-" * 50)

## 5.7 Delay Impact on Reviews

In [None]:
print("DELAY IMPACT ON REVIEWS")
print("-" * 50)

# Correlation analysis between delivery delay and review score.
if 'delay_days' in df.columns and 'review_score' in df.columns:
    corr_delay_review = df['delay_days'].corr(df['review_score'])
    print(f"Correlation between delay days and review score: {corr_delay_review:.3f}")

    # Interpret correlation. The coefficient is NaN when either column is
    # constant or there are too few rows; report that instead of a label.
    if pd.isna(corr_delay_review):
        print("  Interpretation: correlation undefined (constant or insufficient data)")
    else:
        if abs(corr_delay_review) > 0.3:
            strength = "strong"
        elif abs(corr_delay_review) > 0.1:
            strength = "moderate"
        else:
            strength = "weak"

        direction = "negative" if corr_delay_review < 0 else "positive"
        print(f"  Interpretation: {strength} {direction} correlation")

    # Review score by delay category
    if 'delivery_status' in df.columns:
        review_by_delay = df.groupby('delivery_status')['review_score'].mean().reset_index()
        review_by_delay = review_by_delay.sort_values('review_score', ascending=False)

        print("\nAverage Review Score by Delivery Status:")
        print(review_by_delay.to_string(index=False))

        fig = px.bar(review_by_delay,
                     x='delivery_status',
                     y='review_score',
                     title='Average Review Score by Delivery Status',
                     labels={'review_score': 'Average Review Score', 'delivery_status': 'Delivery Status'},
                     color='review_score',
                     color_continuous_scale='RdYlGn')

        fig.update_layout(xaxis_tickangle=-45)
        fig.show()
        fig.write_image("presentation/figures/review_score_by_delivery_status.png")
        print("Saved: presentation/figures/review_score_by_delivery_status.png")

    # Negative review percentage by delay category
    if 'delivery_status' in df.columns and 'negative_review' in df.columns:
        negative_by_delay = df.groupby('delivery_status')['negative_review'].mean().reset_index()
        negative_by_delay = negative_by_delay.sort_values('negative_review', ascending=False)

        print("\nNegative Review Percentage by Delivery Status:")
        negative_by_delay['negative_review'] = (negative_by_delay['negative_review'] * 100).round(1)
        print(negative_by_delay.to_string(index=False))

        fig = px.bar(negative_by_delay,
                     x='delivery_status',
                     y='negative_review',
                     title='Negative Review Percentage by Delivery Status',
                     labels={'negative_review': 'Negative Review %', 'delivery_status': 'Delivery Status'},
                     color='negative_review',
                     color_continuous_scale='Reds')

        fig.update_layout(xaxis_tickangle=-45)
        fig.show()
        fig.write_image("presentation/figures/negative_reviews_by_delivery_status.png")
        print("Saved: presentation/figures/negative_reviews_by_delivery_status.png")

    # Scatter plot with trendline, drawn on a reproducible sample of at most
    # 1000 orders to keep the figure light.
    sample_size = min(1000, len(df))
    fig = px.scatter(df.sample(n=sample_size, random_state=42),
                     x='delay_days',
                     y='review_score',
                     title='Delay Days vs Review Score',
                     labels={'delay_days': 'Delay Days', 'review_score': 'Review Score'},
                     trendline='ols',
                     opacity=0.5,
                     trendline_color_override='red')

    fig.show()
    fig.write_image("presentation/figures/delay_vs_review_scatter.png")
    print("Saved: presentation/figures/delay_vs_review_scatter.png")

    # Linear regression for impact quantification (review_score ~ delay_days).
    # Inputs are cast to plain floats so pandas nullable dtypes never reach
    # scikit-learn; missing delays count as 0 and missing scores as the mean.
    X = df[['delay_days']].fillna(0).astype(float)
    y = df['review_score'].fillna(df['review_score'].mean()).astype(float)

    if len(X) < 2:
        print("\nNot enough records to fit a regression")
    else:
        model = LinearRegression()
        model.fit(X, y)

        review_impact_per_day = model.coef_[0]
        r_squared = model.score(X, y)

        print(f"\nLinear Regression Results:")
        print(f"  Impact on review score per day of delay: {review_impact_per_day:.3f} points")
        print(f"  R-squared: {r_squared:.3f}")
        print(f"  Intercept: {model.intercept_:.3f}")

        # Predicted review scores for a few reference delays. The input frame
        # uses the same column name the model was fitted with, and all values
        # are predicted in one call.
        delay_values = [0, 5, 10, 15]
        predictions = model.predict(pd.DataFrame({'delay_days': delay_values}, dtype=float))
        print("\nPredicted Review Scores:")
        for delay, predicted_score in zip(delay_values, predictions):
            print(f"  {delay} day delay: {predicted_score:.2f}/5.0")
else:
    print("Required columns not available for delay-review analysis")

print("-" * 50)

## 5.8 Freight Cost vs Time Analysis

In [None]:
print("FREIGHT COST VS TIME ANALYSIS")
print("-" * 50)


def _show_and_save(figure, path):
    """Render a figure inline, export it as a static image and log the path."""
    figure.show()
    figure.write_image(path)
    print(f"Saved: {path}")


# The whole section needs both the freight value and the delivery duration.
if 'freight_value' in df.columns and 'total_delivery_days' in df.columns:
    # Correlation between freight value and delivery time
    corr_freight_time = df['freight_value'].corr(df['total_delivery_days'])
    print(f"Correlation between freight value and delivery time: {corr_freight_time:.3f}")

    # Mean freight per delivery status, most expensive status first.
    if 'delivery_status' in df.columns:
        freight_by_status = (
            df.groupby('delivery_status')['freight_value']
            .mean()
            .reset_index()
            .sort_values('freight_value', ascending=False)
        )

        print("\nAverage Freight Value by Delivery Status:")
        print(freight_by_status.to_string(index=False))

        fig = px.bar(
            freight_by_status,
            x='delivery_status',
            y='freight_value',
            title='Average Freight Value by Delivery Status',
            labels={'freight_value': 'Average Freight (R$)', 'delivery_status': 'Delivery Status'},
            color='freight_value',
            color_continuous_scale='Blues',
        )
        fig.update_layout(xaxis_tickangle=-45)
        _show_and_save(fig, "presentation/figures/freight_by_delivery_status.png")

    # Scatter of a reproducible sample (at most 1000 orders); dashed lines at
    # the overall means split the plot into four quadrants.
    sample_size = min(1000, len(df))
    scatter_sample = df.sample(n=sample_size, random_state=42)
    fig = px.scatter(
        scatter_sample,
        x='total_delivery_days',
        y='freight_value',
        title='Freight Value vs Delivery Time',
        labels={'total_delivery_days': 'Total Delivery Days', 'freight_value': 'Freight Value (R$)'},
        opacity=0.5,
    )

    avg_days = df['total_delivery_days'].mean()
    avg_freight = df['freight_value'].mean()

    fig.add_hline(y=avg_freight, line_dash="dash", line_color="red",
                  annotation_text=f"Avg Freight: R${avg_freight:.2f}")
    fig.add_vline(x=avg_days, line_dash="dash", line_color="red",
                  annotation_text=f"Avg Days: {avg_days:.1f}")

    _show_and_save(fig, "presentation/figures/freight_vs_delivery_time_scatter.png")

    has_sla = 'sla_compliant' in df.columns

    # Mean SLA compliance per freight price band.
    if 'freight_category' in df.columns and has_sla:
        sla_by_freight = df.groupby('freight_category')['sla_compliant'].mean().reset_index()

        print("\nSLA Compliance by Freight Category:")
        print(sla_by_freight.to_string(index=False))

        fig = px.bar(
            sla_by_freight,
            x='freight_category',
            y='sla_compliant',
            title='SLA Compliance by Freight Category',
            labels={'sla_compliant': 'SLA Compliance Rate', 'freight_category': 'Freight Category'},
            color='sla_compliant',
            color_continuous_scale='Viridis',
        )
        fig.update_layout(xaxis_tickangle=-45)
        _show_and_save(fig, "presentation/figures/sla_by_freight_category.png")

    # ROI of premium freight: compare orders flagged high_freight == 1 with
    # those flagged 0 on SLA compliance and on mean freight cost.
    if 'high_freight' in df.columns and has_sla:
        premium_orders = df[df['high_freight'] == 1]
        standard_orders = df[df['high_freight'] == 0]

        high_freight_sla = premium_orders['sla_compliant'].mean()
        low_freight_sla = standard_orders['sla_compliant'].mean()
        sla_improvement = high_freight_sla - low_freight_sla

        high_freight_avg = premium_orders['freight_value'].mean()
        low_freight_avg = standard_orders['freight_value'].mean()
        cost_difference = high_freight_avg - low_freight_avg

        print(f"\nFreight ROI Analysis:")
        print(f"  SLA with high freight: {high_freight_sla:.1%}")
        print(f"  SLA with low freight: {low_freight_sla:.1%}")
        print(f"  SLA improvement with high freight: {sla_improvement:.1%} points")
        print(f"  Average high freight cost: R${high_freight_avg:.2f}")
        print(f"  Average low freight cost: R${low_freight_avg:.2f}")
        print(f"  Cost difference: R${cost_difference:.2f}")

        # Extra cost per percentage point of SLA gained; only meaningful when
        # high freight actually improves compliance.
        if sla_improvement > 0:
            sla_points_gained = sla_improvement * 100
            cost_per_sla_point = cost_difference / sla_points_gained
            print(f"  Cost per 1% SLA improvement: R${cost_per_sla_point:.2f}")
# ADDITION: 2x2 Cost vs Performance Matrix - MISSING FROM ORIGINAL
# Labels every order with one of four quadrants (freight cost x SLA outcome),
# prints the quadrant sizes and saves a scatter plot of a sample of orders.
print("\n6. Cost vs Performance Matrix:")
if 'freight_value' in df.columns and 'sla_compliant' in df.columns:
    median_freight = df['freight_value'].median()
    # Still computed so the name stays defined for any later cell that reads
    # it; it is no longer used as the performance split point (see below).
    median_sla = df['sla_compliant'].median()

    # NOTE(review): sla_compliant is treated as a 0/1 flag here (it is averaged
    # as a rate and plotted on a [0, 1] axis in this cell) - confirm.
    # The median of such a flag is itself 0 or 1, so a strict "> median" test
    # leaves both "High Performance" quadrants empty whenever most orders are
    # compliant. A fixed 0.5 cut-off separates compliant from non-compliant
    # orders regardless of the class balance.
    sla_threshold = 0.5

    is_high_cost = df['freight_value'] > median_freight
    is_low_cost = df['freight_value'] <= median_freight
    is_high_perf = df['sla_compliant'] > sla_threshold
    is_low_perf = df['sla_compliant'] <= sla_threshold

    # Insertion order doubles as the legend order of the plot below.
    quadrant_masks = {
        'High Cost, High Performance': is_high_cost & is_high_perf,
        'High Cost, Low Performance': is_high_cost & is_low_perf,
        'Low Cost, High Performance': is_low_cost & is_high_perf,
        'Low Cost, Low Performance': is_low_cost & is_low_perf,
    }

    # Rows matching none of the masks (e.g. missing freight or SLA value)
    # keep the 'Unknown' label.
    df['cost_performance_quadrant'] = 'Unknown'
    for quadrant_label, quadrant_mask in quadrant_masks.items():
        df.loc[quadrant_mask, 'cost_performance_quadrant'] = quadrant_label

    quadrant_summary = df['cost_performance_quadrant'].value_counts()
    print("Orders by Quadrant:")
    for quadrant, count in quadrant_summary.items():
        pct = (count / len(df)) * 100
        print(f"  {quadrant}: {count:,} orders ({pct:.1f}%)")

    # Plot at most 1000 orders; the fixed seed keeps the figure reproducible
    # between runs, matching the freight-vs-delivery-time scatter above.
    sample_size = min(1000, len(df))
    fig = px.scatter(
        df.sample(n=sample_size, random_state=42),
        x='freight_value',
        y='sla_compliant',
        color='cost_performance_quadrant',
        title='Cost vs Performance Matrix',
        labels={'freight_value': 'Freight Cost (R$)', 'sla_compliant': 'SLA Compliance'},
        category_orders={'cost_performance_quadrant': list(quadrant_masks)},
        opacity=0.6
    )

    # Quadrant boundaries: the same cut-offs used for the labels above.
    fig.add_hline(y=sla_threshold, line_dash="dash", line_color="gray",
                  annotation_text=f"SLA Threshold: {sla_threshold:.2f}")
    fig.add_vline(x=median_freight, line_dash="dash", line_color="gray",
                  annotation_text=f"Median Freight: R${median_freight:.2f}")

    # Clip the x-axis at the 95th percentile so freight outliers do not
    # compress the bulk of the points.
    fig.update_layout(
        xaxis_range=[0, df['freight_value'].quantile(0.95)],
        yaxis_range=[0, 1]
    )

    fig.show()
    fig.write_image("presentation/figures/cost_performance_matrix.png")
    print("Saved: presentation/figures/cost_performance_matrix.png")

print("-" * 50)

## 5.9 Root Cause Analysis (Delays)

In [None]:
# Root-cause view of delivery delays: per-state ranking, monthly trend and
# weekday breakdown. Each section runs only when its grouping column exists.
print("ROOT CAUSE ANALYSIS (DELAYS)")
print("-" * 50)

# Analyze worst performing states
if 'customer_state' in df.columns:
    # One row per state: order count plus mean SLA rate, delay, delivery time
    # and freight value, rounded to 3 decimals.
    state_analysis = df.groupby('customer_state').agg({
        'order_id': 'count',
        'sla_compliant': 'mean',
        'delay_days': 'mean',
        'total_delivery_days': 'mean',
        'freight_value': 'mean'
    }).round(3).reset_index()

    # Top 10 worst states by SLA (ascending sort puts the lowest rate first)
    worst_states = state_analysis.sort_values('sla_compliant').head(10)

    print("Top 10 Worst Performing States by SLA:")
    print(worst_states.to_string(index=False))

    fig = px.bar(worst_states,
                 x='customer_state',
                 y='sla_compliant',
                 title='Top 10 Worst Performing States by SLA',
                 labels={'sla_compliant': 'SLA Compliance Rate', 'customer_state': 'State'},
                 color='sla_compliant',
                 color_continuous_scale='Reds_r')

    fig.show()
    fig.write_image("presentation/figures/worst_performing_states.png")
    print("Saved: presentation/figures/worst_performing_states.png")

    # Analyze worst states by delay days (largest mean delay first)
    worst_delay_states = state_analysis.sort_values('delay_days', ascending=False).head(10)

    print("\nTop 10 Worst States by Average Delay Days:")
    print(worst_delay_states[['customer_state', 'delay_days', 'sla_compliant']].to_string(index=False))

# Analyze seasonal patterns
print("\nSeasonal Performance Analysis:")
# Guarded like the other sections of this cell so that a missing column skips
# the section instead of aborting the whole cell with a KeyError.
if 'purchase_month' in df.columns:
    monthly_analysis = df.groupby('purchase_month').agg({
        'order_id': 'count',
        'sla_compliant': 'mean',
        'delay_days': 'mean',
        'freight_value': 'mean'
    }).reset_index()

    print(monthly_analysis.to_string(index=False))

    # Both metrics are drawn as separate lines on one shared y-axis.
    fig = px.line(monthly_analysis,
                  x='purchase_month',
                  y=['sla_compliant', 'delay_days'],
                  title='Monthly Delivery Performance',
                  labels={'value': 'Metric Value', 'purchase_month': 'Month', 'variable': 'Metric'},
                  markers=True)

    # One tick per month value on the x-axis.
    fig.update_layout(xaxis=dict(tickmode='linear', dtick=1))
    fig.show()
    fig.write_image("presentation/figures/monthly_performance_trends.png")
    print("Saved: presentation/figures/monthly_performance_trends.png")
else:
    print("  Skipped: 'purchase_month' column not available")

# Analyze weekday patterns
if 'purchase_weekday' in df.columns:
    weekday_analysis = df.groupby('purchase_weekday').agg({
        'order_id': 'count',
        'sla_compliant': 'mean',
        'delay_days': 'mean'
    }).reset_index()

    # Order weekdays properly (Monday..Sunday rather than alphabetically).
    # NOTE(review): assumes purchase_weekday holds English day names; any other
    # value becomes NaN in the categorical - confirm against the query.
    weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    weekday_analysis['purchase_weekday'] = pd.Categorical(weekday_analysis['purchase_weekday'],
                                                          categories=weekday_order,
                                                          ordered=True)
    weekday_analysis = weekday_analysis.sort_values('purchase_weekday')

    print("\nWeekday Performance Analysis:")
    print(weekday_analysis.to_string(index=False))

print("-" * 50)

## 5.10 Optimization Recommendations

In [None]:
# Builds a list of recommendation dicts (priority / area / recommendation /
# expected_impact) from simple aggregates of df, then prints them. Each rule
# runs only when every column it reads is present.
print("OPTIMIZATION RECOMMENDATIONS")
print("-" * 50)

# Generate data-driven recommendations
recommendations = []

# 1. Regional optimization: contrast the lowest and highest mean SLA regions
if 'region' in df.columns and 'sla_compliant' in df.columns:
    regional_sla = df.groupby('region')['sla_compliant'].mean()
    worst_region = regional_sla.idxmin()
    best_region = regional_sla.idxmax()

    recommendations.append({
        'priority': 'High',
        'area': 'Regional Logistics',
        'recommendation': f'Focus on improving {worst_region} region logistics. Current SLA: {regional_sla[worst_region]:.1%} vs Best region ({best_region}): {regional_sla[best_region]:.1%}',
        'expected_impact': 'Improve SLA by 5-10% in worst performing region'
    })

# 2. Freight optimization: freight category with the highest mean SLA rate.
# The guard checks 'freight_category' because that is the column grouped on;
# checking only 'freight_value' would let a missing category column raise.
if 'freight_category' in df.columns and 'sla_compliant' in df.columns:
    freight_categories = df.groupby('freight_category')['sla_compliant'].mean()
    optimal_category = freight_categories.idxmax()

    recommendations.append({
        'priority': 'Medium',
        'area': 'Freight Cost',
        'recommendation': f'Optimize freight to {optimal_category} category range for best SLA compliance',
        'expected_impact': 'Better cost-performance ratio'
    })

# 3. Delay impact mitigation: only recommended when severe delays are common
if 'delay_days' in df.columns and 'review_score' in df.columns:
    severe_delay_threshold = 15
    is_severe_delay = df['delay_days'] > severe_delay_threshold
    severe_delay_count = int(is_severe_delay.sum())
    # Share of all orders; the mask mean avoids dividing by len(df) directly.
    severe_delay_pct = is_severe_delay.mean()

    if severe_delay_pct > 0.05:  # More than 5% severe delays
        recommendations.append({
            'priority': 'High',
            'area': 'Customer Experience',
            'recommendation': f'Implement expedited shipping for orders at risk of >{severe_delay_threshold} day delays',
            'expected_impact': 'Reduce negative reviews by 15-20%'
        })

# 4. Seasonal adjustments: flag the weakest month if it is clearly below par
if 'purchase_month' in df.columns and 'sla_compliant' in df.columns:
    monthly_sla = df.groupby('purchase_month')['sla_compliant'].mean()
    worst_month = monthly_sla.idxmin()
    best_month = monthly_sla.idxmax()

    # Threshold is relative: 10% below the mean of the monthly SLA rates
    if monthly_sla[worst_month] < monthly_sla.mean() * 0.9:
        recommendations.append({
            'priority': 'Medium',
            'area': 'Seasonal Planning',
            'recommendation': f'Increase logistics capacity in month {worst_month}',
            'expected_impact': 'Smooth seasonal fluctuations'
        })

# Display recommendations
print("Data-Driven Recommendations:")
for i, rec in enumerate(recommendations, 1):
    print(f"\n{i}. [{rec['priority']} Priority] {rec['area']}:")
    print(f"   Recommendation: {rec['recommendation']}")
    print(f"   Expected Impact: {rec['expected_impact']}")

print("-" * 50)

## 6. Insights

In [None]:
# Collects one-line headline insights from df, prints them as a numbered list,
# then prints a formatted executive summary.
print("KEY INSIGHTS")
print("=" * 50)

# Calculate key insights
insights = []

# 1. Overall performance
if 'sla_compliant' in df.columns:
    overall_sla = df['sla_compliant'].mean()
    insights.append(f"Overall SLA Compliance: {overall_sla:.1%}")

# 2. Regional performance
if 'region' in df.columns and 'sla_compliant' in df.columns:
    regional_sla = df.groupby('region')['sla_compliant'].mean()
    best_region = regional_sla.idxmax()
    worst_region = regional_sla.idxmin()
    insights.append(f"Best performing region: {best_region} ({regional_sla[best_region]:.1%} SLA)")
    insights.append(f"Worst performing region: {worst_region} ({regional_sla[worst_region]:.1%} SLA)")

# 3. Delay impact
if 'delay_days' in df.columns and 'review_score' in df.columns:
    corr_delay_review = df['delay_days'].corr(df['review_score'])
    # Only claim a negative impact when the correlation is actually negative;
    # a zero, positive or NaN correlation does not support that statement.
    if corr_delay_review < 0:
        delay_effect = "negative impact confirmed"
    else:
        delay_effect = "negative impact not confirmed"
    insights.append(f"Delay-Review correlation: {corr_delay_review:.3f} ({delay_effect})")

# 4. Cost efficiency
if 'freight_value' in df.columns and 'sla_compliant' in df.columns:
    corr_freight_sla = df['freight_value'].corr(df['sla_compliant'])
    insights.append(f"Freight-SLA correlation: {corr_freight_sla:.3f}")

# 5. Customer satisfaction
if 'review_score' in df.columns:
    avg_review = df['review_score'].mean()
    # "Positive" means a score of 4 or higher
    positive_pct = (df['review_score'] >= 4).mean() * 100
    insights.append(f"Average review score: {avg_review:.2f}/5.0")
    insights.append(f"Positive reviews (>=4 stars): {positive_pct:.1f}%")

# Display insights
print("\nKey Insights:")
for i, insight in enumerate(insights, 1):
    print(f"{i}. {insight}")

# Executive Summary
print("\n" + "=" * 50)
print("EXECUTIVE SUMMARY")
print("=" * 50)

# The summary reads names that this cell does not define itself:
# review_impact_per_day and optimal_category must already exist in the
# notebook namespace, and the columns negative_review, positive_review,
# item_price and freight_category must be present in df. The names
# overall_sla, regional_sla, best_region and worst_region are only set above
# when their guard conditions hold.
summary = f"""
Delivery Performance Analysis Summary
====================================

1. PERFORMANCE OVERVIEW
   - Total orders analyzed: {len(df):,}
   - Overall SLA compliance: {overall_sla:.1%}
   - Average delivery time: {df['total_delivery_days'].mean():.1f} days
   - Delayed orders: {len(df[df['delay_days'] > 0]):,} ({(len(df[df['delay_days'] > 0])/len(df)*100):.1f}%)

2. REGIONAL PERFORMANCE
   - Best region: {best_region} ({regional_sla[best_region]:.1%} SLA)
   - Worst region: {worst_region} ({regional_sla[worst_region]:.1%} SLA)
   - Regional performance varies by {((regional_sla.max() - regional_sla.min())/regional_sla.mean()*100):.0f}%

3. CUSTOMER IMPACT
   - Each day of delay reduces review score by {abs(review_impact_per_day):.3f} points
   - Severe delays (>15 days) have {df[df['delay_days'] > 15]['negative_review'].mean()*100:.0f}% negative reviews
   - On-time deliveries have {df[df['delay_days'] <= 0]['positive_review'].mean()*100:.0f}% positive reviews

4. COST ANALYSIS
   - Average freight cost: R${df['freight_value'].mean():.2f}
   - Freight represents {(df['freight_value'].mean()/df['item_price'].mean()*100):.1f}% of item price
   - Optimal freight range: R${df[df['freight_category'] == optimal_category]['freight_value'].mean():.2f} avg

5. RECOMMENDED ACTIONS
   • Priority 1: Improve {worst_region} region logistics
   • Priority 2: Implement delay mitigation for high-risk orders
   • Priority 3: Optimize freight cost structure
"""

print(summary)
print("=" * 50)

## 7. Export Graphs and Data

In [None]:
# Writes four CSV extracts and one plain-text report to exports/, then prints
# a short recap of what was produced.
print("EXPORTING GRAPHS AND DATA")
print("=" * 50)

# Every file below is written into this folder; create it up front so the
# first to_csv call cannot fail on a missing directory.
os.makedirs('exports', exist_ok=True)

# An order counts as delayed when delay_days is positive. The mask is built
# once and reused by every export and by the text report.
is_delayed = df['delay_days'] > 0
delayed_count = int(is_delayed.sum())

# Export delayed orders report
print("\n1. Exporting delayed orders report...")
delayed_orders = df.loc[is_delayed, [
    'order_id', 'customer_state', 'region',
    'total_delivery_days', 'delay_days', 'delivery_status',
    'freight_value', 'review_score'
]].sort_values('delay_days', ascending=False)

delayed_orders.to_csv('exports/delayed_orders_report.csv', index=False)
print(f"   Exported: exports/delayed_orders_report.csv ({len(delayed_orders):,} records)")

# Export regional performance summary (region stays as the CSV index column)
print("\n2. Exporting regional performance summary...")
regional_summary = df.groupby('region').agg({
    'order_id': 'count',
    'sla_compliant': 'mean',
    'delay_days': 'mean',
    'total_delivery_days': 'mean',
    'freight_value': 'mean',
    'review_score': 'mean'
}).round(3)

regional_summary.to_csv('exports/regional_delivery_summary.csv')
print("   Exported: exports/regional_delivery_summary.csv")

# Export improvement priorities: the 10 states with the lowest mean SLA rate
print("\n3. Exporting improvement priorities...")
worst_performers = df.groupby('customer_state').agg({
    'sla_compliant': 'mean',
    'delay_days': 'mean',
    'order_id': 'count'
}).sort_values('sla_compliant').head(10)

worst_performers.to_csv('exports/delivery_improvement_priorities.csv')
print("   Exported: exports/delivery_improvement_priorities.csv")

# Export key metrics
print("\n4. Exporting key metrics...")
key_metrics = pd.DataFrame({
    'Metric': [
        'Total Orders',
        'SLA Compliance Rate',
        'Average Delivery Time (days)',
        'Average Delay (days)',
        'Delayed Orders Count',
        'Delayed Orders Percentage',
        'Average Freight Cost (R$)',
        'Average Review Score',
        'Delay-Review Correlation',
        'Freight-SLA Correlation'
    ],
    'Value': [
        len(df),
        df['sla_compliant'].mean(),
        df['total_delivery_days'].mean(),
        # Mean delay among delayed orders only; 0 when nothing is delayed
        df.loc[is_delayed, 'delay_days'].mean() if delayed_count > 0 else 0,
        delayed_count,
        delayed_count / len(df) if len(df) > 0 else 0,
        df['freight_value'].mean(),
        df['review_score'].mean(),
        df['delay_days'].corr(df['review_score']),
        df['freight_value'].corr(df['sla_compliant'])
    ]
})

key_metrics.to_csv('exports/key_metrics_summary.csv', index=False)
print("   Exported: exports/key_metrics_summary.csv")

# Create summary report
# The report reads names this cell does not define: regional_analysis,
# review_impact_per_day, corr_delay_review, sla_improvement, worst_region and
# worst_month must already exist in the notebook namespace.
# NOTE(review): .date() assumes order_purchase_timestamp is a datetime column -
# confirm against the loading cell.
print("\n5. Creating summary report...")
summary_report = f"""
Delivery Performance Analysis Report
===================================

Dataset Overview:
- Total Orders: {len(df):,}
- Time Period: {df['order_purchase_timestamp'].min().date()} to {df['order_purchase_timestamp'].max().date()}
- Regions Covered: {df['region'].nunique()}
- States Covered: {df['customer_state'].nunique()}

Key Metrics:
- Overall SLA Compliance: {df['sla_compliant'].mean():.1%}
- Average Delivery Time: {df['total_delivery_days'].mean():.1f} days
- Average Delay (when delayed): {df.loc[is_delayed, 'delay_days'].mean():.1f} days
- Delayed Orders: {delayed_count:,} ({delayed_count/len(df):.1%})
- Average Freight Cost: R$ {df['freight_value'].mean():.2f}
- Average Review Score: {df['review_score'].mean():.2f}/5.0

Top Performing Regions (by SLA):
{regional_analysis.sort_values('sla_rate', ascending=False).head(3).to_string()}

Bottom Performing Regions (by SLA):
{regional_analysis.sort_values('sla_rate').head(3).to_string()}

Impact Analysis:
- Delay-Review Correlation: {corr_delay_review:.3f}
- Review Impact per Delay Day: {review_impact_per_day:.3f} points
- SLA Improvement with High Freight: {sla_improvement:.1%} points

Recommendations:
1. Focus on improving {worst_region} region logistics
2. Optimize freight cost for Southeast region
3. Implement expedited shipping for high-value orders
4. Monitor seasonal patterns in month {worst_month}
"""

# Save summary report
# Explicit UTF-8 so that non-ASCII text interpolated into the report (e.g.
# region names) is written the same way on every platform.
with open('exports/delivery_analysis_summary.txt', 'w', encoding='utf-8') as f:
    f.write(summary_report)

print("   Exported: exports/delivery_analysis_summary.txt")

print("\n" + "=" * 50)
print("ANALYSIS COMPLETED SUCCESSFULLY!")
print("=" * 50)

# Count every PNG currently in the figures folder (not only files written
# during this run).
png_count = len([f for f in os.listdir('presentation/figures') if f.endswith('.png')])

print("\nSummary of exports:")
print(f"  • Graphs: {png_count} PNG files in presentation/figures/")
print("  • Data: 4 CSV files in exports/")
print("  • Report: 1 TXT file in exports/")

print("\nTotal visualizations created: 12/12")
print("All required analyses completed: ✓")
print("Data exports successful: ✓")