# Data Visualization

To load dependencies, run:
```bash
uv sync --extra analysis
```

In [None]:
import duckdb
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from pathlib import Path

# Set style
# Apply seaborn's "whitegrid" theme to every figure, then widen the default
# matplotlib canvas used by cells that do not pass an explicit figsize.
sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (12, 6)

In [None]:
# Connect to database
# The path is relative to the current working directory; resolve it up front so
# the message below shows exactly which file is being opened.
db_path = Path("data/aida_challenge.duckdb").absolute()

# duckdb.connect() creates a brand-new empty database when the file is missing,
# which would only surface later as confusing "table does not exist" errors.
if not db_path.is_file():
    raise FileNotFoundError(f"DuckDB database not found at {db_path}")

# The cells shown here only run SELECT queries, so open the file read-only to
# rule out accidental writes to the analytical database.
con = duckdb.connect(str(db_path), read_only=True)
print(f"✓ Connected to database at {db_path}")

In [None]:
# Folder that receives every exported figure; intermediate folders are created
# as needed and an already-existing folder is not an error.
output_dir = Path("docs") / "images" / "visualizations"
output_dir.mkdir(exist_ok=True, parents=True)
print(f"✓ Output directory created: {output_dir}")

## 1. Customer Demographics
Understanding our customer base through Age, Income, and Profession distribution.

In [None]:
# Pull the demographic attributes of every customer from the staging table,
# aliasing the source columns to the English labels used on the charts.
customers_sql = """
    SELECT 
        eta as Age,
        reddito as Income,
        professione as Profession,
        luogo_residenza as City
    FROM aida_challenge.main_staging.stg_clienti
"""
df_customers = con.execute(customers_sql).df()

# One row, two panels: age on the left, income on the right.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# (column, bar color, panel title) for each panel, in left-to-right order.
histogram_specs = [
    ("Age", "#3182bd", "Customer Age Distribution"),
    ("Income", "#756bb1", "Customer Income Distribution"),
]
for panel, (column, bar_color, panel_title) in zip(axes, histogram_specs):
    # 30-bin histogram with a KDE curve overlaid.
    sns.histplot(data=df_customers, x=column, bins=30, kde=True, ax=panel, color=bar_color)
    panel.set_title(panel_title)

plt.tight_layout()
demographics_png = output_dir / "01_customer_demographics_age_income.png"
plt.savefig(demographics_png, dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Ten most frequent professions, most common first.
profession_counts = df_customers["Profession"].value_counts().head(10)

plt.figure(figsize=(12, 6))
# Horizontal bars: hue mirrors the y categories so each profession gets its own
# viridis color, and the redundant legend is suppressed.
sns.barplot(
    x=profession_counts.to_numpy(),
    y=profession_counts.index,
    hue=profession_counts.index,
    palette="viridis",
    legend=False,
)
plt.title("Top 10 Professions")
plt.xlabel("Count")
plt.tight_layout()

professions_png = output_dir / "02_customer_demographics_professions.png"
plt.savefig(professions_png, dpi=300, bbox_inches="tight")
plt.show()

## 2. Portfolio Analysis
Analyzing the distribution of products and premiums.

In [None]:
# Fetch one row per policy with its product, need area and annual premium,
# aliased to the English column names used by the charts below.
policies_sql = """
    SELECT 
        prodotto as Product,
        area_bisogno as Need_Area,
        premio_totale_annuo as Annual_Premium
    FROM aida_challenge.main_staging.stg_polizze
"""
df_policies = con.execute(policies_sql).df()

# Box plot of annual premium per need area; hue repeats the x variable only to
# color each box individually, so the legend is switched off.
plt.figure(figsize=(10, 6))
need_area_box = dict(x="Need_Area", y="Annual_Premium", hue="Need_Area")
sns.boxplot(data=df_policies, palette="cividis", legend=False, **need_area_box)
plt.title("Annual Premium Distribution by Need Area")
plt.xticks(rotation=45)  # tilt the category labels
plt.tight_layout()

need_area_png = output_dir / "03_portfolio_premium_by_need_area.png"
plt.savefig(need_area_png, dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Sum the annual premium per product, then rank products from largest to smallest.
premium_totals = df_policies.groupby("Product")["Annual_Premium"].sum()
product_premium = premium_totals.sort_values(ascending=False)

plt.figure(figsize=(12, 6))
# Horizontal bars, one plasma color per product; the legend would only repeat
# the y-axis labels, so it is disabled.
sns.barplot(
    x=product_premium.to_numpy(),
    y=product_premium.index,
    hue=product_premium.index,
    palette="plasma",
    legend=False,
)
plt.title("Total Annual Premium by Product")
plt.xlabel("Total Premium (€)")
plt.tight_layout()

product_premium_png = output_dir / "04_portfolio_total_premium_by_product.png"
plt.savefig(product_premium_png, dpi=300, bbox_inches="tight")
plt.show()

## 3. Customer Value & Risk
Exploring the relationship between Engagement, Churn Probability, and Customer Lifetime Value (CLV).

In [None]:
# Load value metrics: engagement, churn probability, estimated CLV and cluster
# label for every customer that has an engagement score.
df_value = con.execute(
    """
    SELECT 
        engagement_score,
        churn_probability,
        clv_stimato as CLV,
        cluster_risposta as Cluster
    FROM aida_challenge.main_staging.stg_clienti
    WHERE engagement_score IS NOT NULL
"""
).df()

# Scatter plot: Engagement vs Churn with protan-friendly colors
fig, ax = plt.subplots(figsize=(12, 7))

# The query does not exclude NULL clusters. Drop missing labels before sorting:
# None mixed with strings makes sorted() raise TypeError, and a missing label
# would never match the equality filter below anyway.
clusters = df_value["Cluster"].dropna().unique()

# Colorblind-safe palette based on Okabe-Ito hues; colors are reused cyclically
# if there are more clusters than entries.
colors = ["#0173B2", "#F0E442", "#56B4E9", "#D55E00", "#CC79A7", "#009E73", "#000000"]

# One scatter series per cluster, drawn in sorted label order so the
# color-to-cluster mapping is stable between runs.
for i, cluster in enumerate(sorted(clusters)):
    cluster_data = df_value[df_value["Cluster"] == cluster]
    ax.scatter(
        cluster_data["engagement_score"],
        cluster_data["churn_probability"],
        color=colors[i % len(colors)],
        s=100,
        alpha=0.6,
        label=f"{cluster}",
        edgecolors="white",  # white rim keeps overlapping markers distinguishable
        linewidths=1.5,
    )

ax.set_xlabel("Engagement Score", fontsize=12, fontweight="bold")
ax.set_ylabel("Churn Probability", fontsize=12, fontweight="bold")
ax.set_title("Engagement Score vs. Churn Probability", fontsize=14, fontweight="bold")
ax.legend(title="Cluster", loc="upper right", frameon=True, shadow=True)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(output_dir / "05_customer_value_engagement_vs_churn.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Box plot of estimated CLV for each cluster, using seaborn's "colorblind"
# palette; hue duplicates x purely for per-box coloring, hence no legend.
plt.figure(figsize=(10, 6))
sns.boxplot(
    data=df_value,
    x="Cluster",
    y="CLV",
    hue="Cluster",
    palette="colorblind",
    legend=False,
)
plt.title("Customer Lifetime Value by Cluster")
plt.tight_layout()

clv_cluster_png = output_dir / "06_customer_value_clv_by_cluster.png"
plt.savefig(clv_cluster_png, dpi=300, bbox_inches="tight")
plt.show()

## 4. Geographic Distribution
Mapping customer locations on an interactive scatter map, with marker color and size scaled by CLV.

In [None]:
# Fetch id, coordinates, city and estimated CLV for every customer that has
# both a latitude and a longitude.
geo_sql = """
    SELECT 
        codice_cliente,
        latitudine as lat,
        longitudine as lon,
        luogo_residenza as city,
        clv_stimato as clv
    FROM aida_challenge.main_staging.stg_clienti
    WHERE latitudine IS NOT NULL 
        AND longitudine IS NOT NULL
"""
df_geo = con.execute(geo_sql).df()

# Report how many customers can be placed on the map, then preview the frame
# (the trailing expression is what the notebook cell displays).
print(f"Loaded {len(df_geo):,} customers with location data")
df_geo.head()

In [None]:
# Marker size is bound to CLV, and Plotly rejects missing or negative marker
# sizes. The geographic query does not filter clv, so keep only rows that can
# actually be drawn (the comparison is False for NaN as well).
df_map = df_geo[df_geo["clv"] >= 0]

# Cap the number of plotted points for performance; a fixed seed makes the
# exported PNG reproducible from run to run.
map_sample = df_map.sample(n=min(7000, len(df_map)), random_state=42)

# Scatter map on OpenStreetMap tiles: color and size both encode CLV.
fig = px.scatter_map(
    map_sample,
    lat="lat",
    lon="lon",
    color="clv",
    size="clv",
    hover_data=["city", "clv"],
    color_continuous_scale="Plasma",
    size_max=15,
    zoom=5,
    map_style="open-street-map",
    title="Customer Locations by CLV",
    height=600,
)

# Remove the side/bottom margins (leaving room for the title) and label the color bar.
fig.update_layout(
    margin={"r": 0, "t": 40, "l": 0, "b": 0}, coloraxis_colorbar=dict(title="CLV (€)")
)

# Export a static copy next to the other figures, then render the interactive map.
fig.write_image(output_dir / "07_geographic_customer_locations.png", width=1200, height=600)
fig.show()

## 5. Product Performance & Profitability
Analyzing product profitability, loss ratios, and portfolio composition.


In [None]:
# Load policy financial metrics: one row per (product, need area) over active
# policies, ranked by total premium.
df_product_perf = con.execute(
    """
    SELECT 
        prodotto as Product,
        area_bisogno as Need_Area,
        AVG(loss_ratio) as Avg_Loss_Ratio,
        SUM(premio_totale_annuo) as Total_Premium,
        SUM(margine_lordo) as Total_Margin,
        COUNT(*) as Policy_Count,
        AVG(premio_totale_annuo) as Avg_Premium
    FROM aida_challenge.main_staging.stg_polizze
    WHERE stato_polizza = 'Attiva'
    GROUP BY prodotto, area_bisogno
    ORDER BY Total_Premium DESC
"""
).df()

# Loss ratio (left) and total margin (right) for the top rows by premium.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Ten rows with the highest total premium.
# NOTE(review): rows are (product, need area) pairs; if a product spans several
# need areas it appears more than once and seaborn will aggregate its bars —
# confirm each product maps to a single need area.
top_products = df_product_perf.nlargest(10, "Total_Premium")

# Loss Ratio
sns.barplot(
    data=top_products,
    y="Product",
    x="Avg_Loss_Ratio",
    hue="Product",
    palette="viridis",
    legend=False,
    ax=axes[0],
)
axes[0].set_title("Loss Ratio by Top 10 Products")
axes[0].set_xlabel("Average Loss Ratio")
# No legend on this panel: the bars carry no labels (legend=False), so calling
# legend() would only emit a "no artists with labels" warning and draw an empty box.

# Total Margin
sns.barplot(
    data=top_products,
    y="Product",
    x="Total_Margin",
    hue="Product",
    palette="plasma",
    legend=False,
    ax=axes[1],
)
axes[1].set_title("Total Margin by Top 10 Products")
axes[1].set_xlabel("Total Margin (€)")

plt.tight_layout()
plt.savefig(
    output_dir / "08_product_performance_loss_ratio_margin.png", dpi=300, bbox_inches="tight"
)
plt.show()

## 6. Customer Lifecycle & Retention
Understanding customer tenure, churn patterns, and engagement over time.


In [None]:
# Load customer lifecycle data: tenure, engagement, churn, policy count and
# yearly visits, plus a tenure bucket computed in SQL.
# A NULL tenure fails every "<" comparison and would otherwise fall through to
# the ELSE branch and be labelled 'Loyal (10y+)'; it is mapped to NULL instead
# so those rows are left out of the per-stage charts.
df_lifecycle = con.execute(
    """
    SELECT 
        anzianita_compagnia as Tenure_Years,
        engagement_score,
        churn_probability,
        num_polizze as Policy_Count,
        visite_ultimo_anno as Annual_Visits,
        CASE 
            WHEN anzianita_compagnia IS NULL THEN NULL
            WHEN anzianita_compagnia < 2 THEN 'New (0-2y)'
            WHEN anzianita_compagnia < 5 THEN 'Growing (2-5y)'
            WHEN anzianita_compagnia < 10 THEN 'Mature (5-10y)'
            ELSE 'Loyal (10y+)'
        END as Lifecycle_Stage
    FROM aida_challenge.main_staging.stg_clienti
    WHERE engagement_score IS NOT NULL
"""
).df()

# Plot the stages from youngest to oldest tenure rather than in whatever order
# they happen to appear in the query result.
stage_order = ["New (0-2y)", "Growing (2-5y)", "Mature (5-10y)", "Loyal (10y+)"]

fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Tenure distribution
sns.histplot(data=df_lifecycle, x="Tenure_Years", bins=20, kde=True, ax=axes[0, 0], color="#0173B2")
axes[0, 0].set_title("Customer Tenure Distribution")
axes[0, 0].set_xlabel("Years with Company")

# Churn by Lifecycle Stage
sns.boxplot(
    data=df_lifecycle,
    x="Lifecycle_Stage",
    y="churn_probability",
    hue="Lifecycle_Stage",
    order=stage_order,
    hue_order=stage_order,
    palette="cividis",
    legend=False,
    ax=axes[0, 1],
)
axes[0, 1].set_title("Churn Probability by Lifecycle Stage")
# tick_params rotates the labels without re-setting them, which avoids
# matplotlib's "set_ticklabels() should only be used with a fixed number of
# ticks" warning.
axes[0, 1].tick_params(axis="x", rotation=45)

# Engagement by Tenure: marker size and color both encode the policy count.
sns.scatterplot(
    data=df_lifecycle,
    x="Tenure_Years",
    y="engagement_score",
    size="Policy_Count",
    hue="Policy_Count",
    palette="viridis",
    sizes=(20, 200),
    alpha=0.6,
    ax=axes[1, 0],
)
axes[1, 0].set_title("Engagement Score vs Tenure")
axes[1, 0].set_xlabel("Years with Company")

# Annual Visits by Stage
sns.violinplot(
    data=df_lifecycle,
    x="Lifecycle_Stage",
    y="Annual_Visits",
    hue="Lifecycle_Stage",
    order=stage_order,
    hue_order=stage_order,
    palette="plasma",
    legend=False,
    ax=axes[1, 1],
)
axes[1, 1].set_title("Annual Visits by Lifecycle Stage")
axes[1, 1].tick_params(axis="x", rotation=45)

plt.tight_layout()
plt.savefig(output_dir / "09_customer_lifecycle_retention.png", dpi=300, bbox_inches="tight")
plt.show()

## 7. Channel Performance & Customer Acquisition
Analyzing acquisition channels, conversion rates, and interaction patterns.


In [None]:
# Per acquisition channel over active policies: distinct customers, average CLV
# and engagement, average premium and total revenue, ranked by revenue.
# NOTE(review): the join presumably yields one row per active policy, so the
# customer-level averages (CLV, engagement) would weight each customer by their
# number of active policies — confirm this is intended.
channels_sql = """
    SELECT 
        p.canale_acquisizione as Channel,
        COUNT(DISTINCT p.codice_cliente) as Customer_Count,
        AVG(c.clv_stimato) as Avg_CLV,
        AVG(c.engagement_score) as Avg_Engagement,
        AVG(p.premio_totale_annuo) as Avg_Premium,
        SUM(p.premio_totale_annuo) as Total_Revenue
    FROM aida_challenge.main_staging.stg_polizze p
    JOIN aida_challenge.main_staging.stg_clienti c 
        ON p.codice_cliente = c.codice_cliente
    WHERE p.stato_polizza = 'Attiva'
    GROUP BY p.canale_acquisizione
    ORDER BY Total_Revenue DESC
"""
df_channels = con.execute(channels_sql).df()

# Per interaction type: volume, mean duration and the percentage of
# interactions flagged as a conversion, ranked by volume.
interactions_sql = """
    SELECT 
        tipo_interazione as Interaction_Type,
        COUNT(*) as Interaction_Count,
        AVG(durata_minuti) as Avg_Duration,
        SUM(CASE WHEN conversione THEN 1 ELSE 0 END)::FLOAT / COUNT(*) * 100 as Conversion_Rate
    FROM aida_challenge.main_staging.stg_interazioni_clienti
    GROUP BY tipo_interazione
    ORDER BY Interaction_Count DESC
"""
df_interactions = con.execute(interactions_sql).df()

fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Every panel is a horizontal bar chart colored per category with no legend.
# Specs are listed row by row (top-left, top-right, bottom-left, bottom-right):
# (frame, category column, value column, palette, title, x-axis label).
panel_specs = [
    (
        df_channels,
        "Channel",
        "Total_Revenue",
        "viridis",
        "Total Revenue by Acquisition Channel",
        "Total Revenue (€)",
    ),
    (df_channels, "Channel", "Avg_CLV", "cividis", "Average CLV by Channel", "Average CLV (€)"),
    (
        df_interactions,
        "Interaction_Type",
        "Interaction_Count",
        "plasma",
        "Interaction Volume by Type",
        "Number of Interactions",
    ),
    (
        df_interactions,
        "Interaction_Type",
        "Conversion_Rate",
        "viridis",
        "Conversion Rate by Interaction Type",
        "Conversion Rate (%)",
    ),
]
for panel, (frame, category, measure, palette_name, title, x_label) in zip(axes.flat, panel_specs):
    sns.barplot(
        data=frame,
        x=measure,
        y=category,
        hue=category,
        palette=palette_name,
        legend=False,
        ax=panel,
    )
    panel.set_title(title)
    panel.set_xlabel(x_label)

plt.tight_layout()
channel_png = output_dir / "10_channel_performance_acquisition.png"
plt.savefig(channel_png, dpi=300, bbox_inches="tight")
plt.show()

## 8. Customer Segmentation Deep Dive
Detailed analysis of customer clusters and their characteristics.


In [None]:
# Load cluster characteristics: one row per non-null cluster with its size and
# average age, income, policies, CLV, engagement, churn risk and satisfaction.
# ORDER BY keeps the row order deterministic; without it the GROUP BY output
# order is unspecified, so slice/bar order and the color given to each cluster
# could change from run to run.
df_segments = con.execute(
    """
    SELECT 
        cluster_risposta as Cluster,
        COUNT(*) as Customer_Count,
        AVG(eta) as Avg_Age,
        AVG(reddito) as Avg_Income,
        AVG(num_polizze) as Avg_Policies,
        AVG(clv_stimato) as Avg_CLV,
        AVG(engagement_score) as Avg_Engagement,
        AVG(churn_probability) as Avg_Churn_Risk,
        AVG(satisfaction_score) as Avg_Satisfaction
    FROM aida_challenge.main_staging.stg_clienti
    WHERE cluster_risposta IS NOT NULL
    GROUP BY cluster_risposta
    ORDER BY cluster_risposta
"""
).df()

# Create cluster profile visualizations
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Shared colorblind-friendly palette, applied in row order to every panel.
# NOTE(review): it holds five colors — confirm there are at most five clusters,
# otherwise colors repeat.
colors_okabe = ["#0173B2", "#F0E442", "#56B4E9", "#D55E00", "#CC79A7"]

# Cluster sizes
axes[0, 0].pie(
    df_segments["Customer_Count"],
    labels=df_segments["Cluster"],
    autopct="%1.1f%%",
    colors=colors_okabe,
    startangle=90,
)
axes[0, 0].set_title("Customer Distribution by Cluster")

# CLV by Cluster
sns.barplot(
    data=df_segments,
    x="Cluster",
    y="Avg_CLV",
    hue="Cluster",
    palette=colors_okabe,
    legend=False,
    ax=axes[0, 1],
)
axes[0, 1].set_title("Average CLV by Cluster")
# tick_params rotates the labels without re-setting them, which avoids
# matplotlib's "set_ticklabels() should only be used with a fixed number of
# ticks" warning.
axes[0, 1].tick_params(axis="x", rotation=45)
axes[0, 1].set_ylabel("Average CLV (€)")

# Engagement vs Satisfaction: one bubble per cluster, sized by customer count.
sns.scatterplot(
    data=df_segments,
    x="Avg_Engagement",
    y="Avg_Satisfaction",
    size="Customer_Count",
    hue="Cluster",
    palette=colors_okabe,
    sizes=(100, 1000),
    alpha=0.7,
    ax=axes[1, 0],
)
axes[1, 0].set_title("Engagement vs Satisfaction by Cluster")
axes[1, 0].set_xlabel("Average Engagement Score")
axes[1, 0].set_ylabel("Average Satisfaction Score")

# Churn Risk comparison
sns.barplot(
    data=df_segments,
    x="Cluster",
    y="Avg_Churn_Risk",
    hue="Cluster",
    palette=colors_okabe,
    legend=False,
    ax=axes[1, 1],
)
axes[1, 1].set_title("Churn Risk by Cluster")
axes[1, 1].tick_params(axis="x", rotation=45)
axes[1, 1].set_ylabel("Average Churn Probability")
# Dashed reference line at a churn probability of 0.5; it is the only labeled
# artist on this panel, so it is the sole legend entry.
axes[1, 1].axhline(y=0.5, color="red", linestyle="--", label="High Risk (50%)")
axes[1, 1].legend()

plt.tight_layout()
plt.savefig(output_dir / "11_customer_segmentation_deep_dive.png", dpi=300, bbox_inches="tight")
plt.show()

## 9. Claims & Risk Analysis
Analyzing claims frequency, severity, and status.


In [None]:
# Load claims data: date, settled amount, settlement status and product for
# every claim that has a date.
df_claims = con.execute(
    """
    SELECT 
        data_sinistro as Claim_Date,
        importo_liquidato as Claim_Amount,
        stato_liquidazione as Status,
        prodotto as Product
    FROM aida_challenge.main_staging.stg_sinistri
    WHERE data_sinistro IS NOT NULL
"""
).df()

# Convert date so the claims can be resampled by calendar month.
df_claims["Claim_Date"] = pd.to_datetime(df_claims["Claim_Date"])

fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Claims over time: number of claims per month-end ("ME") bucket.
claims_over_time = df_claims.set_index("Claim_Date").resample("ME").size()
claims_over_time.plot(ax=axes[0, 0], color="#d62728")
axes[0, 0].set_title("Monthly Claims Volume")
axes[0, 0].set_ylabel("Number of Claims")

# Claim Amount Distribution (Log Scale)
# A logarithmic axis cannot represent zero or negative amounts, so only
# strictly positive amounts are binned (the comparison also drops missing values).
positive_claims = df_claims[df_claims["Claim_Amount"] > 0]
sns.histplot(
    data=positive_claims, x="Claim_Amount", bins=50, log_scale=True, ax=axes[0, 1], color="#d62728"
)
axes[0, 1].set_title("Claim Amount Distribution (Log Scale)")

# Status Distribution
status_counts = df_claims["Status"].value_counts()
axes[1, 0].pie(
    status_counts, labels=status_counts.index, autopct="%1.1f%%", colors=sns.color_palette("Reds_d")
)
axes[1, 0].set_title("Claims Status Distribution")

# Claims by Product: ten products with the most claims.
# hue mirrors the y categories (legend off), matching the other bar charts in
# this notebook; passing palette without hue is deprecated in seaborn 0.13+.
product_claims = df_claims["Product"].value_counts().head(10)
sns.barplot(
    x=product_claims.values,
    y=product_claims.index,
    hue=product_claims.index,
    palette="Reds_d",
    legend=False,
    ax=axes[1, 1],
)
axes[1, 1].set_title("Top 10 Products by Claim Volume")

plt.tight_layout()
plt.savefig(output_dir / "12_claims_analysis.png", dpi=300, bbox_inches="tight")
plt.show()

## 10. Competitor Analysis
Benchmarking against market competitors.


In [None]:
# Load competitor data: one row per competitor product with its average
# premium, customer rating and market share.
# The competitor column is aliased explicitly, like every other column, so the
# DataFrame is guaranteed to expose the "Competitor" name used by the plots
# below regardless of how the staging column is cased.
df_competitors = con.execute(
    """
    SELECT 
        competitor as Competitor,
        tipo_prodotto as Product_Type,
        premio_medio as Avg_Premium,
        rating_clienti as Rating,
        quota_mercato_perc as Market_Share
    FROM aida_challenge.main_staging.stg_competitor_prodotti
"""
).df()

fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Price vs Rating: one bubble per row, sized by market share and colored by competitor.
sns.scatterplot(
    data=df_competitors,
    x="Avg_Premium",
    y="Rating",
    size="Market_Share",
    hue="Competitor",
    sizes=(100, 1000),
    alpha=0.7,
    ax=axes[0],
)
axes[0].set_title("Competitor Positioning: Price vs Rating")
axes[0].set_xlabel("Average Premium (€)")

# Market Share: summed over each competitor's rows, largest first.
market_share = (
    df_competitors.groupby("Competitor")["Market_Share"].sum().sort_values(ascending=False)
)
sns.barplot(
    x=market_share.values,
    y=market_share.index,
    palette="Set2",
    ax=axes[1],
    hue=market_share.index,
    legend=False,
)
axes[1].set_title("Total Market Share by Competitor")
axes[1].set_xlabel("Market Share (%)")

plt.tight_layout()
plt.savefig(output_dir / "13_competitor_analysis.png", dpi=300, bbox_inches="tight")
plt.show()

## 11. Complaints Analysis
Understanding customer pain points.


In [None]:
# Load complaints data: product, need area and complaint text/type for every
# record in the complaints staging table.
df_complaints = con.execute(
    """
    SELECT 
        prodotto as Product,
        area_bisogno as Need_Area,
        reclami_e_info as Complaint_Type
    FROM aida_challenge.main_staging.stg_reclami
"""
).df()

plt.figure(figsize=(12, 6))
# Ten products with the most complaints, most complained-about first.
complaint_counts = df_complaints["Product"].value_counts().head(10)
# hue mirrors the y categories (legend off), matching the other bar charts in
# this notebook; passing palette without hue is deprecated in seaborn 0.13+.
sns.barplot(
    x=complaint_counts.values,
    y=complaint_counts.index,
    hue=complaint_counts.index,
    palette="magma",
    legend=False,
)
plt.title("Top 10 Products by Complaint Volume")
plt.xlabel("Number of Complaints")
plt.tight_layout()
plt.savefig(output_dir / "14_complaints_by_product.png", dpi=300, bbox_inches="tight")
plt.show()

## 12. Housing Assets
Analyzing property characteristics and protection.


In [None]:
# Fetch floor area, alarm-system flag and city for every property in the
# housing staging table.
housing_sql = """
    SELECT 
        metratura as Size_Sqm,
        sistema_allarme as Has_Alarm,
        luogo_residenza as City
    FROM aida_challenge.main_staging.stg_abitazioni
"""
df_housing = con.execute(housing_sql).df()

fig, axes = plt.subplots(1, 2, figsize=(15, 6))
size_panel, alarm_panel = axes

# Left: 30-bin histogram of property size with a KDE curve.
sns.histplot(data=df_housing, x="Size_Sqm", bins=30, kde=True, ax=size_panel, color="green")
size_panel.set_title("Property Size Distribution")
size_panel.set_xlabel("Square Meters")

# Right: share of each alarm-flag value.
# NOTE(review): value_counts() orders by frequency, so green goes to the most
# common value rather than to a specific label — confirm that is acceptable.
alarm_counts = df_housing["Has_Alarm"].value_counts()
alarm_panel.pie(
    alarm_counts,
    labels=alarm_counts.index,
    autopct="%1.1f%%",
    colors=["#2ca02c", "#d62728"],
)
alarm_panel.set_title("Alarm System Adoption Rate")

plt.tight_layout()
housing_png = output_dir / "15_housing_assets.png"
plt.savefig(housing_png, dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Clean up: close the DuckDB connection opened at the top of the notebook.
# Any cell that queries `con` must be run before this one.
con.close()
print("✓ Database connection closed")