# Dashboard Avanzado de Segmentación de Clientes (RFM, Clusters, Insights Dinámicos)

Este dashboard implementa:
- KPIs generales y EDA visual
- Segmentación RFM + diversidad + tópicos de productos
- Clustering, radar, storytelling y recomendaciones
- Tabla interactiva, exportación robusta, predicción de cluster para nuevos clientes
- Paneles de "Hallazgo clave" y "Insights automáticos" accionables
- **Selector dinámico de número de clusters (K)**
- **Filtro de rango de fechas global**
- Mejores prácticas para ejecución fuera de Jupyter (`app.run(debug=True, port=8052)`)

**Requisitos:**
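- `pip install pandas numpy scikit-learn plotly dash dash-bootstrap-components scipy openpyxl jupyter-dash` (`openpyxl` es necesario para que `pd.read_excel` lea archivos `.xlsx`; `jupyter-dash` solo hace falta si se ejecuta dentro de un notebook, el código de este archivo no lo importa).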
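- Archivo `Data_Set_Global.xlsx` con hoja `Pedidos`, que debe incluir las columnas `order_id`, `customer_id`, `order_date`, `registration_date`, `customer_segment`, `product_id`, `product_name`, `quantity`, `unit_price`, `payment_method`, `promised_delivery_time` y `actual_delivery_time`.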

## 1. Importación de librerías y configuración global

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, NMF
from sklearn.cluster import KMeans
import plotly.express as px
import plotly.graph_objects as go
from dash import Dash, dcc, html, Input, Output, State, dash_table, no_update, callback_context
import dash_bootstrap_components as dbc
from scipy.stats import entropy
import datetime as dt
import warnings
warnings.filterwarnings('ignore')

## 2. Carga de datos y preprocesamiento básico
Incluye un filtro de fechas global para todo el dashboard.

In [2]:
# Load the "Pedidos" sheet and derive, in a single pipeline:
#   - total_price: quantity * unit_price per row
#   - order/promised/actual timestamps parsed to datetime
#   - delivery_delay_min: actual minus promised delivery time, in minutes
# Rows without order_id, customer_id or order_date are discarded at the end.
file_path = "Data_Set_Global.xlsx"
pedidos = (
    pd.read_excel(file_path, sheet_name="Pedidos")
    .assign(
        total_price=lambda df: df["quantity"] * df["unit_price"],
        order_date=lambda df: pd.to_datetime(df["order_date"]),
        promised_delivery_time=lambda df: pd.to_datetime(df["promised_delivery_time"]),
        actual_delivery_time=lambda df: pd.to_datetime(df["actual_delivery_time"]),
        # Evaluated after the two conversions above, so both operands are datetimes.
        delivery_delay_min=lambda df: (
            (df["actual_delivery_time"] - df["promised_delivery_time"]).dt.total_seconds() / 60
        ),
    )
    .dropna(subset=["order_id", "customer_id", "order_date"])
    .copy()
)

## 3. Definición de funciones de procesamiento y clustering dinámico
Permite cambiar el número de clusters (K) y aplica los pasos de RFM, diversidad y NMF con los datos filtrados por fechas.

In [3]:
def preparar_segmentacion(pedidos_filtrado, K=3, nmf_topics=10):
    """Build per-customer features, cluster them with K-Means and project them with PCA.

    Features per customer:
      * standardized RFM (days since last order, distinct orders, total spend),
      * base-2 Shannon entropy of the customer's payment-method mix,
      * NMF topic weights of the customer x product purchase-count matrix.

    Args:
        pedidos_filtrado: Order rows. The code reads the columns ``customer_id``,
            ``order_id``, ``order_date``, ``total_price``, ``payment_method``
            and ``product_id``.
        K: Requested number of clusters. It is capped at the number of customers,
            because K-Means cannot fit more clusters than samples.
        nmf_topics: Number of NMF components (product "topics").

    Returns:
        Tuple ``(full_df, cluster_profile, kmeans, scaler_rfm, customer_topics, df_plot)``:
        ``full_df`` is the feature table indexed by ``customer_id`` plus a ``cluster``
        column; ``cluster_profile`` holds per-cluster feature means rounded to 2
        decimals; ``kmeans`` is the fitted model; ``scaler_rfm`` is the scaler fitted
        on raw recency/frequency/monetary; ``customer_topics`` is the NMF weight
        table; ``df_plot`` is ``full_df`` with ``PCA1``/``PCA2`` coordinates.

        Note that ``kmeans`` is fitted on ``full_df`` re-standardized by a second
        ``StandardScaler`` that is not returned; anyone predicting for new rows
        must apply an equivalent scaling first.

    Raises:
        ValueError: If fewer than two customers are present (the 2-component PCA
            and the clustering both need at least two samples).
    """
    if pedidos_filtrado["customer_id"].nunique() < 2:
        raise ValueError("Se requieren al menos 2 clientes para segmentar.")

    # RFM: recency is measured against the day after the latest order in the data.
    snapshot_date = pedidos_filtrado["order_date"].max() + dt.timedelta(days=1)
    rfm = pedidos_filtrado.groupby("customer_id").agg(
        recency = ("order_date", lambda date: (snapshot_date - date.max()).days),
        frequency = ("order_id", "nunique"),
        monetary = ("total_price", "sum")
    )
    scaler_rfm = StandardScaler()
    rfm_z = pd.DataFrame(scaler_rfm.fit_transform(rfm), index=rfm.index, columns=[c+"_z" for c in rfm.columns])

    # Payment-method diversity: entropy of each customer's share of orders per method.
    pay_counts = pedidos_filtrado.groupby(["customer_id", "payment_method"]).size().unstack(fill_value=0)
    pay_probs = pay_counts.div(pay_counts.sum(axis=1), axis=0)
    payment_entropy = pay_probs.apply(lambda row: entropy(row, base=2), axis=1)
    customer_payment_diversity = payment_entropy.rename("payment_entropy").reset_index()

    # Product topics: NMF over the customer x product count matrix.
    cust_prod = pedidos_filtrado.groupby(["customer_id", "product_id"]).size().unstack(fill_value=0)
    nmf = NMF(n_components=nmf_topics, init="random", random_state=42)
    W = nmf.fit_transform(cust_prod)
    customer_topics = pd.DataFrame(W, index=cust_prod.index, columns=[f"topic_{i+1}" for i in range(nmf_topics)])

    # Final feature table; customers missing from a joined table get 0 for its columns.
    full_df = (
        rfm_z
        .join(customer_payment_diversity.set_index("customer_id"))
        .join(customer_topics)
    ).fillna(0)
    X_scaled = StandardScaler().fit_transform(full_df.select_dtypes(include=np.number))

    # Clustering and per-cluster profiles. Cap K so a narrow filter cannot make
    # K-Means fail with n_samples < n_clusters.
    n_clusters = min(K, len(full_df))
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(X_scaled)
    full_df["cluster"] = cluster_labels
    full_df.index.name = "customer_id"
    profile_cols = [c for c in full_df.columns if c.startswith(('recency','frequency','monetary','payment_entropy','topic_'))]
    cluster_profile = full_df.groupby("cluster")[profile_cols].mean().round(2)

    # 2-D projection for plotting. Seeded like NMF and KMeans so that repeated
    # calls on the same data yield the same map even when the randomized SVD
    # solver is selected.
    pca = PCA(n_components=2, random_state=42)
    coords = pca.fit_transform(X_scaled)
    df_plot = full_df.reset_index().copy()
    df_plot['PCA1'] = coords[:,0]
    df_plot['PCA2'] = coords[:,1]
    return full_df, cluster_profile, kmeans, scaler_rfm, customer_topics, df_plot

## 4. Definición de funciones visuales y de insights

In [4]:
def radar_cluster(cluster_id, cluster_profile):
    """Return a radar (polar) figure with one cluster's RFM and payment-entropy means.

    Args:
        cluster_id: Row label of the cluster in ``cluster_profile``.
        cluster_profile: Table with the columns ``recency_z``, ``frequency_z``,
            ``monetary_z`` and ``payment_entropy``.

    Returns:
        A ``plotly.graph_objects.Figure`` holding a single filled polar trace.
    """
    variables = ['recency_z', 'frequency_z', 'monetary_z', 'payment_entropy']
    etiquetas = [nombre.replace('_z', '').capitalize() for nombre in variables]
    valores = cluster_profile.loc[cluster_id, variables].tolist()

    # The first point is repeated at the end so the outline closes on itself.
    traza = go.Scatterpolar(
        r=[*valores, valores[0]],
        theta=[*etiquetas, etiquetas[0]],
        fill='toself',
        name=f"Cluster {cluster_id}",
    )
    figura = go.Figure(data=traza)
    figura.update_layout(
        title=f"Perfil visual del Cluster {cluster_id}",
        showlegend=False,
        polar={"radialaxis": {"visible": True}},
    )
    return figura

def cluster_recommendation(cluster_id):
    """Return a styled panel with the business recommendation for a cluster.

    Clusters 0, 1 and 2 have a dedicated message; any other id gets a generic one.
    """
    mensajes = dict(enumerate([
        "🎉 Recompensa la lealtad con descuentos VIP y cross-selling de productos premium.",
        "📩 Lanza campañas de reactivación personalizadas y ofertas de bienvenida.",
        "🛒 Aprovecha fechas clave, combos y recordatorios de recompra para aumentar frecuencia.",
    ]))
    if cluster_id in mensajes:
        texto = mensajes[cluster_id]
    else:
        texto = "Analiza el comportamiento de este segmento para definir la mejor acción."

    estilo_panel = {"backgroundColor":"#e3f9e5", "padding":"1em", "borderRadius":"8px"}
    contenido = [html.H5("Recomendación de Negocio:"), html.P(texto)]
    return html.Div(contenido, style=estilo_panel)

def cluster_summary_narrative(cluster_id, cluster_profile, cluster_names):
    """Return a short text summary of one cluster's profile.

    Args:
        cluster_id: Row label of the cluster in ``cluster_profile``.
        cluster_profile: Table with ``recency_z``, ``frequency_z``, ``monetary_z``
            and ``payment_entropy`` columns.
        cluster_names: Mapping from cluster id to display name; ids missing from
            it are shown as ``"Cluster <id>"``.

    Returns:
        A string that starts and ends with a newline: a ``###`` heading with the
        cluster name followed by four bullet lines, values formatted to 2 decimals.
    """
    perfil = cluster_profile.loc[cluster_id]
    titulo = cluster_names.get(cluster_id, f"Cluster {cluster_id}")
    lineas = [
        "",  # leading newline
        f"### {titulo}",
        f"• Recency (recencia): {perfil['recency_z']:.2f}",
        f"• Frequency (frecuencia): {perfil['frequency_z']:.2f}",
        f"• Monetary (monto): {perfil['monetary_z']:.2f}",
        f"• Diversidad de pago: {perfil['payment_entropy']:.2f}",
        "",  # trailing newline
    ]
    return "\n".join(lineas)

def actionable_insights(cluster_profile):
    """Turn per-cluster profile means into a list of insight messages.

    For each row of ``cluster_profile`` (one per cluster) four independent rules
    are checked, in this order: active and recent, inactive, high payment
    diversity, above-average spend. A cluster may trigger several or none.

    Returns:
        List of message strings, grouped by cluster in row order.
    """
    hallazgos = []
    for cluster_id, medias in cluster_profile.iterrows():
        reglas = (
            (
                medias['recency_z'] < -0.5 and medias['frequency_z'] > 0,
                f"✅ El cluster {cluster_id} agrupa clientes muy activos y recientes: prioriza cross-selling y programas de fidelidad.",
            ),
            (
                medias['recency_z'] > 0.5 and medias['frequency_z'] < 0,
                f"⚠️ El cluster {cluster_id} muestra clientes inactivos: lanza campañas de reactivación y retención.",
            ),
            (
                medias['payment_entropy'] > 0.8,
                f"ℹ️ El cluster {cluster_id} destaca por alta diversidad de métodos de pago: promueve beneficios por multicanalidad.",
            ),
            (
                medias['monetary_z'] > 0.5,
                f"💰 El cluster {cluster_id} gasta por encima del promedio: considera ofertas exclusivas o upgrades.",
            ),
        )
        hallazgos.extend(texto for cumple, texto in reglas if cumple)
    return hallazgos

## 5. Layout DASH avanzado (filtros, selector K, paneles y exportación)

- Selector global de rango de fechas
- Selector dinámico de número de clusters (K)
- Paneles de insights, hallazgos, storytelling y radar
- Tabla filtrable/exportable, predicción de cluster

In [5]:
# Initial bounds for the date filter: calendar dates of the earliest and latest order.
min_date, max_date = (pedidos['order_date'].agg(extremo).date() for extremo in ('min', 'max'))

# Friendly display names for clusters (rebuilt whenever K changes)
def get_cluster_names(K):
    """Return ``{cluster_id: display_name}`` for ids ``0 .. K-1``.

    The first five ids have a fixed friendly name; ids beyond that fall back
    to ``"Cluster <id>"``.
    """
    etiquetas = (
        "Champions conveniencia 🏆",
        "Dormant básicos 💤",
        "Compradores Ocasionales 🎯",
        "Leales multicanal 💳",
        "Prometedores 💡",
    )
    nombres = {}
    for i in range(K):
        nombres[i] = etiquetas[i] if i < len(etiquetas) else f"Cluster {i}"
    return nombres

# Dash application styled with the MINTY theme from dash-bootstrap-components.
app = Dash(__name__, external_stylesheets=[dbc.themes.MINTY])
# Static page layout. Components declared here without figure/options/data/children
# (graphs, dropdown options, table columns and rows, insight lists, panels) are
# populated by the callbacks whose Outputs target their ids.
app.layout = dbc.Container([
    html.H2("📊 Storytelling: Segmentación de Clientes Avanzado", style={"marginTop": 15}),
    html.Hr(),
    # Global controls: date range bounded by min_date/max_date, and cluster count K (2-5, default 3).
    dbc.Row([
        dbc.Col([
            html.Label('Rango de fechas (todas las visualizaciones):'),
            dcc.DatePickerRange(
                id='date-range',
                min_date_allowed=min_date,
                max_date_allowed=max_date,
                start_date=min_date,
                end_date=max_date
            )
        ], width=4),
        dbc.Col([
            html.Label('Número de clusters (K):'),
            dcc.Slider(id='slider-k', min=2, max=5, step=1, value=3,
                       marks={i:str(i) for i in range(2,6)}, tooltip={"placement":"bottom"})
        ], width=4)
    ]),
    html.Br(),
    # KPI cards container.
    html.Div(id='kpi-panel'),
    html.Br(),
    # Exploratory charts, two per row.
    dbc.Row([
        dbc.Col([dcc.Graph(id='bar-segment')], width=6),
        dbc.Col([dcc.Graph(id='corr-matrix')], width=6),
    ]),
    dbc.Row([
        dbc.Col([dcc.Graph(id='delay-hist')], width=6),
        dbc.Col([dcc.Graph(id='conversion-hist')], width=6),
    ]),
    html.Br(),
    html.Div(id='top5-prod'),
    html.Hr(),
    # Cluster scatter map.
    html.H3("Clusters de clientes (mapa visual)"),
    dcc.Graph(id='fig-pca'),
    html.Br(),
    # Per-cluster detail: selector, text summary, radar chart and recommendation.
    html.H3("Perfiles, recomendaciones y radar de clusters"),
    dcc.Dropdown(id='cluster-desc-dropdown', value=0, clearable=False),
    # whiteSpace "pre-line" keeps the line breaks of the multi-line summary text.
    html.Div(id='cluster-desc-panel', style={"whiteSpace": "pre-line", "padding": "1em", "background": "#f8f8f8", "borderRadius": "10px"}),
    dcc.Graph(id='cluster-radar'),
    html.Div(id='cluster-recommend'),
    html.Br(),
    html.Div([html.H4("🧠 Insights automáticos por cluster"), html.Ul(id='insights-auto')],
             style={"backgroundColor":"#e9f7fd", "padding":"1em", "borderLeft":"6px solid #17a2b8"}),
    html.Hr(),
    # Cluster comparison, evolution over time and key-finding list.
    html.H3("Comparativa, evolución y hallazgos automáticos"),
    dcc.Graph(id='fig-kpi'),
    dcc.Graph(id='fig-evol'),
    html.Div([html.H4("🔎 Hallazgo clave"), html.Ul(id="auto-insight-list")],
            style={"backgroundColor":"#FFF3CD", "padding":"1em", "borderLeft":"6px solid #FFBF00"}),
    html.Hr(),
    # Customer table with a multi-select cluster filter and CSV export.
    html.H3("Explora, filtra y acciona"),
    dcc.Dropdown(id='dropdown-cluster', multi=True, value=[], clearable=True, placeholder='Filtrar por cluster...'),
    html.Button("Exportar a CSV", id="export-btn", n_clicks=0, style={"marginLeft":10}),
    dcc.Download(id="download-clients"),
    dash_table.DataTable(
        id='clientes-table',
        page_size=12, filter_action="native", sort_action="native", style_table={'overflowX': 'auto'}, style_cell={'textAlign': 'center'}
    ),
    html.Hr(),
    # New-customer prediction form. Each dropdown option maps a labelled range to
    # the single numeric value that is sent to the callback.
    html.H3("Predicción de segmento para nuevo cliente"),
    html.P("Completa el perfil para predecir a qué cluster pertenecería un nuevo cliente. Consulta la explicación y sugerencia de acción."),
    dbc.Row([
        dbc.Col([
            dcc.Dropdown(
                id='input-recency_days',
                options=[
                    {'label': '0-7 días', 'value': 4},
                    {'label': '8-30 días', 'value': 19},
                    {'label': '31-90 días', 'value': 60},
                    {'label': '91-180 días', 'value': 135},
                    {'label': '181-365+ días', 'value': 273}
                ],
                placeholder='Días desde última compra'
            )
        ], width=2),
        dbc.Col([
            dcc.Dropdown(
                id='input-frequency',
                options=[
                    {'label': '1 compra', 'value': 1},
                    {'label': '2-3 compras', 'value': 2.5},
                    {'label': '4-6 compras', 'value': 5},
                    {'label': '7-15 compras', 'value': 11},
                    {'label': '16+ compras', 'value': 20}
                ],
                placeholder='Cantidad de compras'
            )
        ], width=2),
        dbc.Col([
            dcc.Dropdown(
                id='input-monetary',
                options=[
                    {'label': '<$100', 'value': 50},
                    {'label': '$100-300', 'value': 200},
                    {'label': '$301-700', 'value': 500},
                    {'label': '$701-1500', 'value': 1100},
                    {'label': '>$1500', 'value': 2000}
                ],
                placeholder='Gasto total ($)'
            )
        ], width=2),
        dbc.Col([
            dcc.Dropdown(
                id='input-n_metodos_pago',
                options=[
                    {'label': '1', 'value': 1},
                    {'label': '2', 'value': 2},
                    {'label': '3 o más', 'value': 3}
                ],
                placeholder='# métodos de pago usados'
            )
        ], width=2),
    ], className="g-2"),
    html.Br(),
    html.Button('Predecir cluster', id='predict-btn', n_clicks=0, className="btn btn-primary"),
    html.Div(id='prediction-output', style={'fontWeight': 'bold', 'paddingTop': '1em'}),
    html.Div(id='prediction-recommend'),
], fluid=True)

## 6. Callbacks Dash principales (con filtros, insights y exportación robusta)

- Todos los gráficos, tablas, insights y paneles se actualizan al cambiar fechas/K o interactuar con el dashboard.

In [6]:
@app.callback(
    [Output('kpi-panel','children'),
     Output('bar-segment','figure'),
     Output('corr-matrix','figure'),
     Output('delay-hist','figure'),
     Output('conversion-hist','figure'),
     Output('top5-prod','children'),
     Output('fig-pca','figure'),
     Output('fig-kpi','figure'),
     Output('fig-evol','figure'),
     Output('cluster-desc-dropdown','options'),
     Output('cluster-desc-dropdown','value'),
     Output('dropdown-cluster','options'),
     Output('clientes-table','columns'),
     Output('clientes-table','data'),
     Output('insights-auto','children'),
     Output('auto-insight-list','children'),
     Output('prediction-output','children'),
     Output('prediction-recommend','children'),
     Output('cluster-radar','figure'),
     Output('cluster-desc-panel','children'),
     Output('cluster-recommend','children')
    ],
    [Input('date-range','start_date'),
     Input('date-range','end_date'),
     Input('slider-k','value'),
     Input('cluster-desc-dropdown','value'),
     Input('dropdown-cluster','value'),
     Input('predict-btn','n_clicks'),
     Input('input-recency_days','value'),
     Input('input-frequency','value'),
     Input('input-monetary','value'),
     Input('input-n_metodos_pago','value')]
)
def dashboard_update(start_date, end_date, K, cluster_dropdown, clusters_filter, n_clicks, rec_days, freq, monet, n_methods):
    """Recompute every panel of the dashboard from the current control values.

    Steps: filter ``pedidos`` by the selected date range, build KPI cards and
    exploratory charts, run ``preparar_segmentacion`` on the filtered orders,
    build the cluster charts, table, insight lists and the selected cluster's
    detail panel, and (once the predict button has been clicked and all four
    profile inputs are set) predict the cluster of a new customer.

    Args:
        start_date, end_date: Bounds of the date filter; the whole end day is included.
        K: Requested number of clusters (capped at the number of customers in range).
        cluster_dropdown: Cluster id selected for the detail panel.
        clusters_filter: Cluster ids (as strings) used to filter the table.
        n_clicks: Click count of the predict button.
        rec_days, freq, monet, n_methods: New-customer profile inputs.

    Returns:
        A 21-tuple in the same order as the ``Output`` list above. When a date
        bound is missing or fewer than two customers fall in the range, every
        element is ``no_update`` so the previous view is kept.
    """
    # Must match the number of Output entries in the decorator.
    n_outputs = 21
    if not start_date or not end_date:
        # The date picker sends None while a bound is cleared; keep the current view.
        return (no_update,) * n_outputs

    # Global date filter. The upper bound is the start of the day AFTER end_date,
    # so orders placed at any time during the last selected day are kept.
    start_ts = pd.to_datetime(start_date)
    end_ts = pd.to_datetime(end_date).normalize() + pd.Timedelta(days=1)
    pedidos_filt = pedidos[(pedidos['order_date'] >= start_ts) & (pedidos['order_date'] < end_ts)].copy()

    n_clientes = pedidos_filt["customer_id"].nunique()
    if n_clientes < 2:
        # Clustering and the 2-component PCA need at least two customers.
        return (no_update,) * n_outputs
    # K-Means cannot fit more clusters than samples.
    K = min(K, n_clientes)

    # KPIs
    kpi_dict = {
        "Total de Clientes Analizados": n_clientes,
        "Órdenes Totales": pedidos_filt["order_id"].nunique(),
        "Productos Únicos": pedidos_filt["product_id"].nunique(),
        "Facturación Total ($)": round(pedidos_filt["total_price"].sum(),2),
        "Periodo Analizado": f"{str(start_date)} a {str(end_date)}"
    }
    kpi_panel = dbc.Row([
        dbc.Col(
            dbc.Card([
                dbc.CardHeader(k),
                dbc.CardBody(html.H5(f"{v:,}" if isinstance(v, (int, float)) else str(v)))
            ]), width=3
        ) for k, v in kpi_dict.items()
    ])

    # Segments, correlation, delays, conversion and top products
    df_seg = pedidos_filt.groupby("customer_segment")[["order_id"]].count().reset_index()
    fig_bar_segment = px.bar(df_seg, x="customer_segment", y="order_id", text="order_id",
                        labels={"order_id": "Órdenes"}, title="Órdenes por segmento de cliente")
    num_cols = pedidos_filt.select_dtypes("number").columns
    corr = pedidos_filt[num_cols].corr().round(2)
    fig_corr = px.imshow(corr, color_continuous_scale='RdBu', title="Matriz de correlación de variables numéricas")
    fig_delay = px.histogram(pedidos_filt, x="delivery_delay_min", nbins=50, title="Demora de entrega (min)")
    agg = (
        pedidos_filt.groupby("customer_id")
        .agg(
            first_order=("order_date", "min"),
            registration_date=("registration_date", "first")
        ).reset_index()
    )
    # registration_date is not converted at load time like the other date columns;
    # convert here so the subtraction works even if it was read as text.
    agg["registration_date"] = pd.to_datetime(agg["registration_date"])
    agg["days_to_first"] = (agg["first_order"] - agg["registration_date"]).dt.days
    fig_conversion = px.histogram(agg, x="days_to_first", nbins=30, title="Días hasta la primera compra")
    N = 5
    topN = (
        pedidos_filt.groupby(["customer_segment", "product_name"])
        .size().groupby(level=0, group_keys=False).nlargest(N)
        .reset_index(name="compras")
    )
    top5_prod_segment = {seg: topN[topN["customer_segment"]==seg][["product_name","compras"]].to_dict('records') for seg in topN["customer_segment"].unique()}
    top5_prod_panel = html.Div([
        html.H5("Top 5 productos por segmento"),
        html.Ul([
            html.Li([
                html.B(f"Segmento {seg}: "),
                ', '.join([f"{d['product_name']} ({d['compras']})" for d in lst])
            ]) for seg, lst in top5_prod_segment.items()
        ])
    ])

    # Dynamic segmentation and clustering
    full_df, cluster_profile, kmeans, scaler_rfm, customer_topics, df_plot = preparar_segmentacion(pedidos_filt, K)
    cluster_names = get_cluster_names(K)

    # PCA scatter map
    fig_pca = px.scatter(df_plot, x='PCA1', y='PCA2', color=df_plot['cluster'].astype(str),
                        title="Mapa visual de los clusters de clientes",
                        labels={"color": "Cluster"}, hover_data=['customer_id'])

    # KPI comparison per cluster. The cluster id is cast to str (as in the scatter
    # above) so Plotly uses one discrete-colored trace per cluster; a numeric
    # color yields a continuous scale and a single trace, which defeats
    # barmode='group'.
    kpi_long = cluster_profile.reset_index().melt(
        id_vars='cluster', value_vars=['recency_z','frequency_z','monetary_z','payment_entropy']
    )
    kpi_long['cluster'] = kpi_long['cluster'].astype(str)
    fig_kpi = px.bar(
        kpi_long,
        x='variable', y='value', color='cluster', barmode='group',
        labels={'value':'Valor Z','variable':'KPI','cluster':'Cluster'},
        title="Comparativa de KPIs promedio por cluster"
    )

    # Monthly number of distinct ordering customers per cluster
    pedidos_ext = pedidos_filt.merge(full_df['cluster'], left_on='customer_id', right_index=True)
    pedidos_ext['month'] = pedidos_ext['order_date'].dt.to_period('M').astype(str)
    evol = pedidos_ext.groupby(['month','cluster'])['customer_id'].nunique().reset_index()
    fig_evol = px.line(evol, x='month', y='customer_id', color='cluster', markers=True,
                       labels={'customer_id':'N° clientes','month':'Mes','cluster':'Cluster'},
                       title="Evolución temporal del tamaño de los clusters")

    def auto_insight(evol):
        """Compare the last month with the previous one and report per-cluster changes."""
        ult = evol[evol['month']==evol['month'].max()]
        # With a single month there is nothing to compare against: every change is 0.
        ant = evol[evol['month']==sorted(evol['month'].unique())[-2]] if len(evol['month'].unique())>1 else ult
        msg = []
        for c in ult['cluster']:
            v1 = ult[ult['cluster']==c]['customer_id'].values[0]
            v0 = ant[ant['cluster']==c]['customer_id'].values[0] if c in ant['cluster'].values else 0
            cambio = v1-v0
            if cambio>0:
                msg.append(f"✅ El cluster {c} creció en {cambio} clientes en el último mes.")
            elif cambio<0:
                msg.append(f"⚠️ ¡Atención! El cluster {c} perdió {-cambio} clientes en el último mes.")
        return msg

    # Options for the table filter dropdown (string values, matched via astype(str) below)
    options_dropdown = [{"label": cluster_names[c], "value": str(c)} for c in sorted(full_df['cluster'].unique())]
    # Options for the detail dropdown (plain int values, matching its default value of 0)
    options_detalle = [{"label": cluster_names[c], "value": int(c)} for c in sorted(cluster_profile.index)]

    # Filterable / exportable table
    table_cols = ["recency_z","frequency_z","monetary_z","payment_entropy"]+list(customer_topics.columns)+["cluster"]
    filtered_df = full_df.reset_index().round(2)
    if clusters_filter:
        filtered_df = filtered_df[filtered_df['cluster'].astype(str).isin(clusters_filter)]
    columns = [{"name": i.replace('_z',' (Z)').replace('_',' ').title(), "id": i} for i in table_cols]
    data = filtered_df[table_cols].to_dict('records')

    # Automatic insights and key findings
    actionable = [html.Li(i) for i in actionable_insights(cluster_profile)]
    hallazgos = [html.Li(m) for m in auto_insight(evol)]

    # Detail panel for the selected cluster; fall back to cluster 0 if the
    # selection does not exist for the current K.
    cluster_val = cluster_dropdown if cluster_dropdown is not None and cluster_dropdown in cluster_profile.index else 0
    radar_fig = radar_cluster(cluster_val, cluster_profile)
    desc_text = cluster_summary_narrative(cluster_val, cluster_profile, cluster_names)
    recommend_panel = cluster_recommendation(cluster_val)

    # New-customer prediction
    pred_text = pred_rec = ""
    if n_clicks and all(v is not None for v in [rec_days, freq, monet, n_methods]):
        arr = np.array([[rec_days, freq, monet]], dtype=float)
        recency_val, frequency_val, monetary_val = scaler_rfm.transform(arr)[0]
        # Assume the customer uses each of the n_methods payment methods equally often.
        probs = np.full(n_methods, 1/n_methods)
        payment_entropy_val = entropy(probs, base=2)
        # No purchase history is available, so all topic weights are 0.
        topics_vals = [0]*customer_topics.shape[1]
        X_new = np.array([[recency_val, frequency_val, monetary_val, payment_entropy_val] + topics_vals])
        # KMeans was fitted on full_df's feature columns after a second
        # standardization inside preparar_segmentacion. Refit that scaler on the
        # same columns so the new row lives in the model's feature space.
        features = full_df.drop(columns="cluster").select_dtypes(include=np.number)
        feature_scaler = StandardScaler().fit(features.to_numpy())
        cluster_pred = kmeans.predict(feature_scaler.transform(X_new))[0]
        pred_text = cluster_summary_narrative(cluster_pred, cluster_profile, cluster_names)
        pred_rec = cluster_recommendation(cluster_pred)

    # All outputs, in the order of the Output list above
    return (
        kpi_panel, fig_bar_segment, fig_corr, fig_delay, fig_conversion, top5_prod_panel, fig_pca, fig_kpi, fig_evol,
        options_detalle, cluster_val,
        options_dropdown, columns, data, actionable, hallazgos,
        pred_text, pred_rec, radar_fig, desc_text, recommend_panel
    )

## 7. Callback de exportación a CSV robusto

In [7]:
@app.callback(
    Output('download-clients', 'data'),
    Input('export-btn', 'n_clicks'),
    State('clientes-table', 'data'),
    prevent_initial_call=True
)
def export_csv(n_clicks, table_data):
    """Send the rows currently held by the customer table as a CSV download.

    Args:
        n_clicks: Click count of the export button.
        table_data: The table's ``data`` property (list of row dicts).

    Returns:
        The download payload for ``clientes_segmentados.csv`` (written without
        the index), or ``no_update`` when the button has not been clicked.
    """
    if not n_clicks:
        return no_update
    filas = pd.DataFrame(table_data)
    return dcc.send_data_frame(filas.to_csv, "clientes_segmentados.csv", index=False)

## 8. Ejecución del dashboard fuera de Jupyter

Abre tu navegador en http://127.0.0.1:8052/

In [8]:
if __name__ == "__main__":
    # Start the Dash server on port 8052 in debug mode when run as a script.
    app.run(port=8052, debug=True)

  File "C:\Users\aresu\ANACONDA\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Users\aresu\ANACONDA\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\aresu\ANACONDA\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\aresu\ANACONDA\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\aresu\ANACONDA\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Users\aresu\ANACONDA\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
