In [None]:
# ==============================
# Customer Behavior Clustering
# ==============================

# --- Import libraries ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.manifold import TSNE

# ==============================
# Load dataset
# ==============================
df = pd.read_csv('/content/customer_data.csv', sep='\t')  # adjust separator if needed

# ==============================
# Feature Engineering
# ==============================
# Age is measured against a fixed reference year rather than the system clock.
# It is stored on df but is not part of the clustering `features` below.
current_year = 2025
df['Age'] = current_year - df['Year_Birth']

# Define purchase-related columns
purchase_cols = [
    'MntWines','MntFruits','MntMeatProducts','MntFishProducts',
    'MntSweetProducts','MntGoldProds',
    'NumDealsPurchases','NumWebPurchases','NumCatalogPurchases',
    'NumStorePurchases','NumWebVisitsMonth','Recency','Complain'
]

# Monetary columns that add up to a customer's total spend; defined once so
# the total, the ratios and the feature list cannot drift apart.
spend_cols = [
    'MntWines', 'MntFruits', 'MntMeatProducts',
    'MntFishProducts', 'MntSweetProducts', 'MntGoldProds',
]
ratio_cols = [f'{col}_Ratio' for col in spend_cols]

# Compute total spend and ratios for each product category
df['Total_Spend'] = df[spend_cols].sum(axis=1)

# A customer with zero total spend would yield 0/0 = NaN for every ratio.
# StandardScaler passes NaN through and KMeans then refuses the input, so the
# share of each category is defined as 0 for those rows instead.
has_spend = df['Total_Spend'] != 0
for col, ratio_col in zip(spend_cols, ratio_cols):
    df[ratio_col] = (df[col] / df['Total_Spend']).where(has_spend, 0.0)

# Compute campaign acceptance rate (fraction of the five campaigns accepted)
campaign_cols = ['AcceptedCmp1','AcceptedCmp2','AcceptedCmp3','AcceptedCmp4','AcceptedCmp5']
df['CampaignAcceptanceRate'] = df[campaign_cols].mean(axis=1)

# Define final features for clustering
features = (
    ['Total_Spend']
    + ratio_cols
    + [
        'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',
        'NumStorePurchases', 'NumWebVisitsMonth', 'Recency', 'Complain',
        'CampaignAcceptanceRate',
    ]
)

X = df[features]

# ==============================
# Feature Scaling
# ==============================
# Fit the standardiser on the clustering features, then apply it to the same
# data. `scaler` stays bound to the fitted StandardScaler and `X_scaled` holds
# the transformed feature matrix used by every clustering step below.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# ==============================
# K-Means Clustering
# ==============================
# Number of clusters used for both the K-Means fit and the dendrogram cut.
k = 4

# Fit once and store the per-row cluster label on the dataframe.
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(X_scaled)
df['KMeans_Cluster'] = kmeans.labels_

# Elbow method to visualize optimal clusters:
# refit for k = 1..10 and record the within-cluster sum of squares of each fit.
candidate_ks = range(1, 11)
wcss = [
    KMeans(n_clusters=n_clusters, random_state=42).fit(X_scaled).inertia_
    for n_clusters in candidate_ks
]

plt.figure(figsize=(8, 4))
plt.plot(candidate_ks, wcss, 'bo-')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS (Inertia)')
plt.show()

# ==============================
# Hierarchical Clustering
# ==============================
# Build the agglomerative merge tree once (Ward linkage); it is reused for the
# dendrogram, the silhouette sweep and the final cut.
linked = linkage(X_scaled, 'ward')

plt.figure(figsize=(12, 6))
dendrogram(
    linked,
    orientation='top',
    distance_sort='descending',
    show_leaf_counts=False,
)
plt.title('Hierarchical Clustering Dendrogram')
plt.show()

# Silhouette scores for different cluster counts:
# cut the same tree into at most k_test flat clusters and score each cut.
for k_test in range(2, 11):
    hier_labels = fcluster(linked, k_test, 'maxclust')
    score = silhouette_score(X_scaled, hier_labels)
    print(f"Silhouette score for hierarchical k={k_test}: {score:.3f}")

# Cut dendrogram at the same k used for K-Means so the two labelings are comparable.
df['Hierarchical_Cluster'] = fcluster(linked, k, 'maxclust')

# ==============================
# Clusters visualization with t-SNE
# ==============================

# --- Compute t-SNE embedding ---
# The iteration count is left at its default (1000): scikit-learn renamed the
# `n_iter` keyword to `max_iter` in 1.5 and later removed the old name, so
# passing either one explicitly breaks on some installed versions.
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_tsne = tsne.fit_transform(X_scaled)


def _plot_tsne_clusters(embedding, labels, title):
    """Scatter a 2-D embedding with one colour and legend entry per cluster.

    Args:
        embedding: Array indexed as ``embedding[mask, 0]`` / ``[mask, 1]``;
            columns 0 and 1 are drawn as the x and y coordinates.
        labels: One cluster label per row of ``embedding`` (Series or
            array-like). Converted to a NumPy array so masking is positional.
        title: Figure title.
    """
    label_values = np.asarray(labels)

    plt.figure(figsize=(8, 6))
    # np.unique returns the labels sorted, so legend order and colour
    # assignment follow the label value instead of row order in the data.
    for label in np.unique(label_values):
        in_cluster = label_values == label
        plt.scatter(
            embedding[in_cluster, 0],
            embedding[in_cluster, 1],
            alpha=0.6,
            label=f'Cluster {label}',
        )
    plt.xlabel('t-SNE Dimension 1')
    plt.ylabel('t-SNE Dimension 2')
    plt.title(title)
    plt.legend()
    plt.show()


# --- K-Means t-SNE plot ---
_plot_tsne_clusters(
    X_tsne, df['KMeans_Cluster'], 'K-Means Clusters visualized with t-SNE'
)

# --- Hierarchical t-SNE plot ---
_plot_tsne_clusters(
    X_tsne, df['Hierarchical_Cluster'], 'Hierarchical Clusters visualized with t-SNE'
)

# ==============================
# Cluster Summary
# ==============================
# Per-cluster mean of every clustering feature, first for the K-Means labels
# and then for the hierarchical labels.
kmeans_profile = df.groupby('KMeans_Cluster')[features].mean()
print("K-Means Cluster Summary:\n", kmeans_profile)

hierarchical_profile = df.groupby('Hierarchical_Cluster')[features].mean()
print("\nHierarchical Cluster Summary:\n", hierarchical_profile)

# ==============================
# Interactive Cluster Comparison
# ==============================
# Prepare K-Means cluster summary
# Mean of each feature per K-Means cluster, without 'Recency', with the
# cluster id moved into a trailing 'Cluster' column.
kmeans_summary = (
    df.groupby('KMeans_Cluster')[features]
    .mean()
    .drop(columns='Recency')
    .reset_index()
)
kmeans_summary['Cluster'] = kmeans_summary.pop('KMeans_Cluster')

# Scale features for plotting: each metric is rescaled across the cluster
# means so bars of different metrics share one axis.
# NOTE(review): this rebinds `scaler`; after this point the name no longer
# refers to the StandardScaler that produced X_scaled.
scaler = MinMaxScaler()
kmeans_features = kmeans_summary.drop(columns='Cluster')
scaled_means = scaler.fit_transform(kmeans_features)
kmeans_scaled = pd.DataFrame(scaled_means, columns=kmeans_features.columns)
kmeans_scaled['Cluster'] = kmeans_summary['Cluster']

# Define custom cluster names, keyed by K-Means label 0..3.
# NOTE(review): presumably chosen by inspecting one particular fit — confirm
# the names still match whenever k or the clustering changes.
kmeans_names = dict(enumerate([
    "K0 Moderate Spenders",
    "K1 Low Spenders",
    "K2 Occasional Shoppers",
    "K3 Big Spenders",
]))

# Plot function for interactive comparison
def plot_cluster_comparison_interactive(df_scaled, df_raw, title, cluster_names):
    """Show an interactive grouped bar chart comparing clusters per metric.

    Bar heights come from ``df_scaled``; the matching values from ``df_raw``
    are attached to each bar and shown in the hover tooltip.

    Args:
        df_scaled: DataFrame with a ``Cluster`` column plus one column per
            metric, holding the values drawn as bar heights.
        df_raw: DataFrame with a ``Cluster`` column and metric columns,
            holding the values shown on hover. The two frames are joined on
            (Cluster, Metric) with an inner merge, so a metric or cluster
            present in only one of them is not plotted.
        title: Chart title.
        cluster_names: Mapping from cluster id to display name (anything
            accepted by ``Series.map``). Ids that have no name are displayed
            as ``"Cluster <id>"``.

    Returns:
        None. The figure is rendered via ``fig.show()``.
    """
    # Long format: one row per (cluster, metric) pair in each frame.
    scaled_long = df_scaled.melt(id_vars='Cluster', var_name='Metric', value_name='ScaledValue')
    raw_long = df_raw.melt(id_vars='Cluster', var_name='Metric', value_name='RawValue')
    df_melt = pd.merge(scaled_long, raw_long, on=['Cluster', 'Metric'])

    # Series.map yields NaN for ids missing from cluster_names, which would
    # strip those bars of their cluster identity. Fall back to a generic
    # label so every cluster stays visible and distinguishable.
    cluster_ids = df_melt['Cluster']
    display_names = cluster_ids.map(cluster_names).astype(object)
    fallback_names = 'Cluster ' + cluster_ids.astype(str)
    df_melt['Cluster'] = display_names.where(display_names.notna(), fallback_names)

    fig = px.bar(
        df_melt,
        x='Metric',
        y='ScaledValue',
        color='Cluster',
        barmode='group',
        title=title,
        labels={'ScaledValue': 'Scaled Value', 'Metric': 'Metric'},
        hover_data={'RawValue': ':.2f', 'ScaledValue': ':.2f', 'Cluster': True}
    )

    fig.update_traces(marker_line_width=0.5, marker_line_color="black", opacity=0.9)
    fig.update_layout(
        xaxis_tickangle=-45,
        yaxis_title='Scaled Value',
        legend_title='Cluster',
        uniformtext_minsize=8,
        uniformtext_mode='hide',
        bargap=0.4,
        bargroupgap=0.25
    )
    fig.show()

# Render the K-Means comparison: scaled cluster means as bar heights, the
# unscaled means in the hover tooltip.
plot_cluster_comparison_interactive(
    df_scaled=kmeans_scaled,
    df_raw=kmeans_summary,
    title='K-Means Cluster Comparison (Scaled)',
    cluster_names=kmeans_names,
)