# Data Visualization

To load dependencies, run:
```bash
uv sync --extra analysis
```

In [None]:
import duckdb
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from pathlib import Path

# Set style
# The theme call must be its own statement; when fused onto the comment line
# it is never executed and plots fall back to matplotlib's default style.
sns.set_theme(style="whitegrid")
# Set the default figure size after the theme call so the theme's own
# rcParams defaults cannot override it.
plt.rcParams["figure.figsize"] = (12, 6)

In [None]:
# Connect to database
# The relative path is resolved against the current working directory.
db_path = Path("data/aida_challenge.duckdb").absolute()

# duckdb.connect() creates a new, empty database when the file is missing,
# which would only surface later as "table does not exist" errors.
# Fail fast with a clear message instead.
if not db_path.is_file():
    raise FileNotFoundError(f"DuckDB database not found at {db_path}")

# This notebook only runs SELECT queries, so open the file read-only to avoid
# taking a write lock or modifying the database by accident.
con = duckdb.connect(str(db_path), read_only=True)
print(f"✓ Connected to database at {db_path}")

## 1. Customer Demographics
Understanding our customer base through the distributions of age, income, and profession.

In [None]:
# Load customer data: demographic columns from the staging customer table,
# aliased to English column names for plotting.
customer_sql = """
    SELECT 
        eta as Age,
        reddito as Income,
        professione as Profession,
        luogo_residenza as City
    FROM aida_challenge.main_staging.stg_clienti
"""
df_customers = con.execute(customer_sql).df()

# One histogram panel per numeric attribute: (column name, bar color).
hist_panels = [("Age", "#3182bd"), ("Income", "#756bb1")]

# Side-by-side panels, left to right in the order listed above.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
for panel_ax, (column, bar_color) in zip(axes, hist_panels):
    sns.histplot(
        data=df_customers,
        x=column,
        bins=30,
        kde=True,
        ax=panel_ax,
        color=bar_color,
    )
    panel_ax.set_title(f"Customer {column} Distribution")

plt.tight_layout()
plt.show()

In [None]:
# Profession Count: horizontal bars for the ten most frequent professions.
profession_counts = df_customers["Profession"].value_counts().head(10)

prof_fig, prof_ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    x=profession_counts.to_numpy(),
    y=profession_counts.index,
    # hue mirrors the y variable so each bar gets its own palette color;
    # the legend would only repeat the axis labels, so it is disabled.
    hue=profession_counts.index,
    palette="viridis",
    legend=False,
    ax=prof_ax,
)
prof_ax.set_title("Top 10 Professions")
prof_ax.set_xlabel("Count")
plt.show()

## 2. Portfolio Analysis
Analyzing the distribution of products and premiums.

In [None]:
# Load policy data: product, need area and annual premium per policy,
# aliased to English column names for plotting.
policy_sql = """
    SELECT 
        prodotto as Product,
        area_bisogno as Need_Area,
        premio_totale_annuo as Annual_Premium
    FROM aida_challenge.main_staging.stg_polizze
"""
df_policies = con.execute(policy_sql).df()

# Premium by Need Area: one box per need area.
# hue mirrors x so every box gets its own palette color; no legend needed.
box_style = {"hue": "Need_Area", "palette": "cividis", "legend": False}

premium_fig, premium_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    data=df_policies,
    x="Need_Area",
    y="Annual_Premium",
    ax=premium_ax,
    **box_style,
)
premium_ax.set_title("Annual Premium Distribution by Need Area")
# Tilt the category labels so longer need-area names do not overlap.
plt.xticks(rotation=45)
plt.show()

In [None]:
# Total Premium by Product: sum the annual premium per product,
# then order products from highest to lowest total.
premium_per_product = df_policies.groupby("Product")["Annual_Premium"].sum()
product_premium = premium_per_product.sort_values(ascending=False)

product_fig, product_ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    x=product_premium.to_numpy(),
    y=product_premium.index,
    # hue mirrors the y variable so each bar gets its own palette color.
    hue=product_premium.index,
    palette="plasma",
    legend=False,
    ax=product_ax,
)
product_ax.set_title("Total Annual Premium by Product")
product_ax.set_xlabel("Total Premium (€)")
plt.show()

## 3. Customer Value & Risk
Exploring the relationship between Engagement, Churn Probability, and Customer Lifetime Value (CLV).

In [None]:
# Load value metrics (only customers that have an engagement score)
df_value = con.execute(
    """
    SELECT 
        engagement_score,
        churn_probability,
        clv_stimato as CLV,
        cluster_risposta as Cluster
    FROM aida_challenge.main_staging.stg_clienti
    WHERE engagement_score IS NOT NULL
"""
).df()

# Okabe-Ito colorblind-safe palette: blue, yellow, sky blue, vermillion,
# reddish purple, bluish green, black. Colors are reused cyclically if there
# are more clusters than palette entries.
colors = ["#0072B2", "#F0E442", "#56B4E9", "#D55E00", "#CC79A7", "#009E73", "#000000"]

# Scatter plot: Engagement vs Churn, one color per cluster
fig, ax = plt.subplots(figsize=(12, 7))

# groupby yields the clusters in sorted order and skips rows whose Cluster is
# NULL. Sorting the raw unique values instead raises TypeError as soon as a
# NULL is mixed in with the cluster labels.
for i, (cluster, cluster_data) in enumerate(df_value.groupby("Cluster")):
    ax.scatter(
        cluster_data["engagement_score"],
        cluster_data["churn_probability"],
        color=colors[i % len(colors)],
        s=100,
        alpha=0.6,
        label=f"{cluster}",
        # White outlines keep overlapping, semi-transparent markers distinguishable.
        edgecolors="white",
        linewidths=1.5,
    )

ax.set_xlabel("Engagement Score", fontsize=12, fontweight="bold")
ax.set_ylabel("Churn Probability", fontsize=12, fontweight="bold")
ax.set_title("Engagement Score vs. Churn Probability", fontsize=14, fontweight="bold")
ax.legend(title="Cluster", loc="upper right", frameon=True, shadow=True)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# CLV Distribution by Cluster: one box per cluster, colored with seaborn's
# "colorblind" palette (hue mirrors x, so the legend is suppressed).
clv_fig, clv_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    data=df_value,
    x="Cluster",
    y="CLV",
    hue="Cluster",
    palette="colorblind",
    legend=False,
    ax=clv_ax,
)
clv_ax.set_title("Customer Lifetime Value by Cluster")
plt.show()

## 4. Geographic Distribution
Mapping customer locations on an interactive scatter map, with marker color and size scaled by CLV.

In [None]:
# Load customer geographic data: customer id, coordinates, city and CLV
# for every customer that has both a latitude and a longitude.
geo_sql = """
    SELECT 
        codice_cliente,
        latitudine as lat,
        longitudine as lon,
        luogo_residenza as city,
        clv_stimato as clv
    FROM aida_challenge.main_staging.stg_clienti
    WHERE latitudine IS NOT NULL 
        AND longitudine IS NOT NULL
"""
df_geo = con.execute(geo_sql).df()

n_located = len(df_geo)
print(f"Loaded {n_located:,} customers with location data")
# Last expression of the cell: the notebook renders the first rows as a preview.
df_geo.head()

In [None]:
# Marker size is driven by CLV and plotly rejects NaN sizes, so customers
# without a CLV estimate are left out of the map.
df_map = df_geo.dropna(subset=["clv"])

# Sample for performance; the fixed seed keeps the map identical across runs.
df_map = df_map.sample(n=min(7000, len(df_map)), random_state=42)

# Interactive map: one marker per sampled customer, colored and sized by CLV.
fig = px.scatter_map(
    df_map,
    lat="lat",
    lon="lon",
    color="clv",
    size="clv",
    hover_data=["city", "clv"],
    color_continuous_scale="Plasma",
    size_max=15,
    zoom=5,
    map_style="open-street-map",
    title="Customer Locations by CLV",
    height=600,
)

# Remove the side and bottom margins, leaving room at the top for the title.
fig.update_layout(
    margin={"r": 0, "t": 40, "l": 0, "b": 0},
    coloraxis_colorbar={"title": "CLV (€)"},
)

fig.show()

In [None]:
# Clean up: close the DuckDB connection now that all queries have run.
closed_message = "✓ Database connection closed"
con.close()
print(closed_message)