In [1]:
# Step 7: Cluster profiling + auto-recommendations
import pandas as pd
import numpy as np
import os
import json
from scipy.stats import zscore

# Input/output locations, relative to the notebook's working directory
# (edit these if your layout differs).
cluster_csv = "../data_processed/umap_clusters.csv"
out_dir = "../data_processed"
os.makedirs(out_dir, exist_ok=True)  # no-op when the folder already exists

# Read the table that carries the cluster assignments and report its shape.
df = pd.read_csv(cluster_csv)
print(f"Loaded: {df.shape}")

# Pick the features to profile: every numeric column except those that carry
# no behavioural signal -- the UMAP coordinates, the cluster label itself, and
# the customer identifier columns (their medians would be meaningless and
# their magnitude would swamp any absolute comparison made later on).
non_feature_cols = {"umap_x", "umap_y", "cluster_hdbscan", "Customer_ID", "CLIENTNUM"}
numeric_cols = [
    c
    for c in df.select_dtypes(include=[np.number]).columns
    if c not in non_feature_cols
]

# One row per cluster: the median of every profiled feature plus the cluster
# size. Both groupby() and value_counts() skip rows whose label is NaN by
# default, so the two tables share the same index.
cluster_medians = df.groupby("cluster_hdbscan")[numeric_cols].median()
cluster_counts = df["cluster_hdbscan"].value_counts().sort_index()
cluster_profile = cluster_medians.copy()
cluster_profile["count"] = cluster_counts

cluster_profile.to_csv(os.path.join(out_dir, "cluster_profile.csv"))
print("Saved cluster_profile.csv")

# Auto-describe each cluster by comparing its feature medians with the global
# medians. NOTE: the generated sentences say "average", but the baseline used
# here is the global *median* of each feature.
global_median = df[numeric_cols].median()
# A zero baseline has no meaningful relative difference; turning it into NaN
# makes the feature drop out of the ranking instead of producing +/-inf.
baseline = global_median.replace(0, np.nan).abs()

# Naming heuristics, checked in priority order:
# (feature, direction, threshold, name). A rule fires when the feature is one
# of the cluster's top features and its relative difference exceeds the
# threshold in the given direction (+1 = above baseline, -1 = below baseline).
naming_rules = [
    ("TotalTransactionAmount", 1, 0.3, "High Spenders"),
    ("AvgUtilization", 1, 0.25, "Credit-Heavy / High Utilization"),
    ("Total_Revolving_Bal", -1, 0.2, "Low Debt Savers"),
    ("InactiveMonths", 1, 0.3, "Inactive / Low Activity"),
    ("RelationshipCount", 1, 0.3, "Multi-product Engaged"),
]


def _format_value(val):
    """Render a feature value for display.

    Whole numbers and large magnitudes are shown without decimals; small
    fractional values (e.g. ratios between 0 and 1) keep two decimals so they
    are not rounded away to "0".
    """
    if float(val).is_integer() or abs(val) >= 100:
        return f"{val:.0f}"
    return f"{val:.2f}"


cluster_descriptions = {}
for cid, row in cluster_medians.iterrows():
    # Signed relative difference of the cluster median vs. the global median.
    diffs = (row - global_median) / baseline
    # Up to three features with the largest absolute relative difference;
    # features without a usable baseline (NaN) are not ranked at all.
    top_feats = (
        diffs.abs().dropna().sort_values(ascending=False).head(3).index.tolist()
    )

    reasons = []
    for feat in top_feats:
        rel = diffs[feat]
        shown = _format_value(row[feat])
        if rel > 0.25:
            reasons.append(f"{feat} is higher than average ({shown})")
        elif rel < -0.25:
            reasons.append(f"{feat} is lower than average ({shown})")
        else:
            reasons.append(f"{feat}: {shown}")

    # Default name: the dominant feature, or the bare cluster id when no
    # feature could be ranked. The first matching heuristic overrides it.
    name = f"{top_feats[0]}-Focused" if top_feats else f"Cluster {int(cid)}"
    for feat, direction, threshold, label in naming_rules:
        if feat in top_feats and direction * diffs[feat] > threshold:
            name = label
            break

    cluster_descriptions[cid] = {
        "cluster_id": int(cid),
        "count": int(cluster_counts.get(cid, 0)),
        "auto_name": name,
        "top_features": top_feats,
        "reasons": reasons,
    }

# Rule-based recommendation catalog (MVP), keyed by auto-generated cluster
# name. Each offer is written as a (title, summary, cta) row and expanded into
# a dict below; edit the rows to change what a segment is offered.
_offer_fields = ("title", "summary", "cta")
_catalog_rows = {
    "High Spenders": [
        ("Spend Analyzer", "Detailed insights to reduce discretionary spend", "Open Spend Analyzer"),
        ("Short-term Savings Bucket", "Auto-save small % of each transaction", "Enable Saver"),
    ],
    "Credit-Heavy / High Utilization": [
        ("Debt Optimization Plan", "Reduce revolving balance & interest", "See Plan"),
        ("Balance Transfer Offer", "Lower interest transfer options", "Apply"),
    ],
    "Low Debt Savers": [
        ("Fixed Deposit (6M)", "Low-risk interest for steady savers", "Open FD"),
        ("Sweep-in Savings", "Earn more while keeping liquidity", "Activate"),
    ],
    "Inactive / Low Activity": [
        ("Engagement Offer", "Personalized offers to re-activate account", "View Offers"),
        ("Round-up Micro-savings", "Automatically save small amounts", "Turn On"),
    ],
    "Multi-product Engaged": [
        ("Premium Relationship Offer", "Bundle offers & loyalty benefits", "View Bundle"),
        ("Wealth Starter SIP", "Begin small SIPs into mutual funds", "Start SIP"),
    ],
}
default_recs = {
    segment: [dict(zip(_offer_fields, offer)) for offer in offers]
    for segment, offers in _catalog_rows.items()
}

# Assemble the recs.json payload: one entry per detected cluster, keyed by the
# cluster id as a string, carrying the products mapped to the cluster's
# auto-generated name (or a generic starter product when the name has no
# catalog entry) together with the cluster-level reasons.
recs = {}
for cluster_id, info in cluster_descriptions.items():
    label = info["auto_name"]
    products = default_recs.get(label)
    if products is None:
        products = [
            {"title":"Basic Saver Plan","summary":"Starter plan to build savings","cta":"Learn more"}
        ]
    entry = {
        "cluster_name": label,
        "count": info["count"],
        "products": products,
        "explain": info["reasons"],
    }
    recs[str(int(cluster_id))] = entry

# Per-user explanations: for every user, the (up to) two features on which the
# user deviates most from the median of their own cluster.
#
# Deviations are divided by each feature's overall standard deviation before
# ranking. Without that scaling, features measured on large numeric scales
# would win every time over counts and ratios, no matter how typical the user
# is on them. Identifier columns are never treated as features.
feature_cols = [c for c in numeric_cols if c not in ("Customer_ID", "CLIENTNUM")]

# Cluster label per row; a missing label is treated as cluster -1.
labels = df["cluster_hdbscan"].fillna(-1).astype(int)
medians = cluster_medians[feature_cols].copy()
medians.index = medians.index.astype(int)
# Rows whose label has no profiled cluster get the generic "outlier" reason.
has_cluster = labels.isin(medians.index).to_numpy()

# Everything below is positional: row i of each array belongs to row i of df.
values = df[feature_cols].to_numpy(dtype=float)
cluster_center = medians.reindex(labels.to_numpy()).to_numpy(dtype=float)
# A constant feature has zero spread; NaN removes it from the ranking.
spread = df[feature_cols].std().replace(0, np.nan).to_numpy(dtype=float)
deviation = np.abs(values - cluster_center) / spread

# Indices of the two largest deviations per row; NaNs are pushed to the end.
sortable = np.where(np.isnan(deviation), -np.inf, deviation)
top2 = np.argsort(-sortable, axis=1, kind="stable")[:, :2]

# Key each explanation by the customer identifier when one is available,
# otherwise by the DataFrame index.
if "Customer_ID" in df.columns:
    user_keys = df["Customer_ID"].tolist()
elif "CLIENTNUM" in df.columns:
    user_keys = df["CLIENTNUM"].tolist()
else:
    user_keys = df.index.tolist()

explanations = {}
for pos, user_key in enumerate(user_keys):
    if not has_cluster[pos]:
        explanations[user_key] = ["Pattern: atypical / outlier"]
        continue
    user_reasons = []
    for col in top2[pos]:
        if np.isnan(deviation[pos, col]):
            continue  # value missing for this user, or the feature has no spread
        val = float(values[pos, col])
        # Keep two decimals for small fractional values (e.g. ratios) so they
        # are not rounded away to "0"; everything else is shown as an integer.
        spec = ".0f" if val.is_integer() or abs(val) >= 100 else ".2f"
        user_reasons.append(f"{feature_cols[col]}: {val:{spec}}")
    explanations[user_key] = user_reasons

# Write the three JSON artefacts into out_dir, in a fixed order.
json_outputs = (
    ("cluster_map.json", cluster_descriptions),
    ("recs.json", recs),
    ("explanations.json", explanations),
)
for filename, payload in json_outputs:
    target = os.path.join(out_dir, filename)
    with open(target, "w", encoding="utf-8") as handle:
        json.dump(payload, handle, indent=2)

print("Saved cluster_map.json, recs.json, explanations.json in", out_dir)
# Quick visual check of the per-cluster summary (display() is provided by the
# notebook environment).
display(pd.DataFrame.from_dict(cluster_descriptions, orient="index"))


Loaded: (10127, 23)
Saved cluster_profile.csv
Saved cluster_map.json, recs.json, explanations.json in ../data_processed


Unnamed: 0,cluster_id,count,auto_name,top_features,reasons
-1,-1,1175,Avg_Open_To_Buy-Focused,"[Avg_Open_To_Buy, Credit_Limit, AvgUtilization]",[Avg_Open_To_Buy is higher than average (9959)...
0,0,206,High Spenders,"[TotalTransactionAmount, Avg_Open_To_Buy, Cred...",[TotalTransactionAmount is higher than average...
1,1,249,High Spenders,"[TotalTransactionAmount, Avg_Open_To_Buy, Cred...",[TotalTransactionAmount is higher than average...
2,2,160,High Spenders,"[TotalTransactionAmount, Avg_Open_To_Buy, Cred...",[TotalTransactionAmount is higher than average...
3,3,154,High Spenders,"[Avg_Open_To_Buy, Credit_Limit, TotalTransacti...",[Avg_Open_To_Buy is higher than average (8324)...
4,4,238,Credit-Heavy / High Utilization,"[AvgUtilization, RelationshipCount, Avg_Open_T...","[AvgUtilization is higher than average (0), Re..."
5,5,175,High Spenders,"[Avg_Open_To_Buy, Credit_Limit, TotalTransacti...",[Avg_Open_To_Buy is higher than average (8816)...
6,6,7770,ContactsLast12M-Focused,"[ContactsLast12M, Avg_Open_To_Buy, AvgUtilizat...","[ContactsLast12M is higher than average (3), A..."
