In [None]:
# %% [markdown]
# # Customer Segmentation Project
# ## Notebook 03: K-Means Clustering
#
# This notebook applies K-Means clustering to identify customer segments based on RFM metrics.

In [None]:
# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

import warnings
# Silence every warning for the rest of the notebook to keep cell output tidy.
# NOTE(review): this is a blanket filter, so deprecation/convergence warnings
# raised by pandas, seaborn or scikit-learn are hidden as well.
warnings.filterwarnings('ignore')

In [None]:
# Import project modules
import sys
sys.path.append('../src')

from utils import save_plot
from clustering import (
    prepare_rfm_for_clustering,
    find_optimal_clusters,
    apply_kmeans,
    visualize_clusters
)

In [None]:
# Notebook-wide presentation settings: no cap on the number of DataFrame
# columns shown, a 1000-character display width, and the seaborn dark-grid
# matplotlib style.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

plt.style.use('seaborn-v0_8-darkgrid')

In [None]:
# %% [markdown]
# ### 1. Load RFM Data

In [None]:
# %%
# Read the RFM table from the processed-data folder (path is relative to the
# notebook's working directory).
rfm_csv_path = '../data/processed/rfm_table.csv'
rfm_table = pd.read_csv(rfm_csv_path)

In [None]:
# Text overview of the loaded table (shape, column names, dtypes), followed by
# a rich preview of the first rows.
load_report = "\n".join([
    "=== RFM Data Loaded ===",
    f"Shape: {rfm_table.shape}",
    f"\nColumns: {rfm_table.columns.tolist()}",
    f"\nData Types:\n{rfm_table.dtypes}",
    "\nFirst 5 rows:",
])
print(load_report)
display(rfm_table.head())

In [None]:
# Count missing values per column; if any exist, report the affected columns
# and drop every row that contains at least one missing value.
missing = rfm_table.isnull().sum()

if missing.sum() > 0:
    print(f"\nMissing values found:\n{missing[missing > 0]}")
    # Rebuild a contiguous 0..n-1 index after dropping rows. Later steps pair
    # the DataFrame with array-like outputs (scaled features, cluster labels)
    # by position, so a gapped index would make row labels and positions
    # disagree.
    rfm_table = rfm_table.dropna().reset_index(drop=True)
    print("Missing values removed.")

In [None]:
# %% [markdown]
# ### 2. Prepare Data for Clustering

In [None]:
# %%
# Build the clustering inputs from the three RFM columns. The project helper
# returns three objects, unpacked here as the scaled feature matrix, the
# scaler, and a log-transformed frame.
# NOTE(review): the exact transform/scaling steps live in src/clustering —
# confirm there.
rfm_feature_columns = ['recency', 'frequency', 'monetary']
scaled_features, scaler, rfm_log = prepare_rfm_for_clustering(
    rfm_table, columns=rfm_feature_columns
)

In [None]:
# Compare the input/output shapes, then preview the first five scaled rows
# with readable column headers.
preparation_report = "\n".join([
    "=== Data Preparation Complete ===",
    f"Original RFM shape: {rfm_table.shape}",
    f"Scaled features shape: {scaled_features.shape}",
    "\nSample scaled features (first 5 rows):",
])
print(preparation_report)

scaled_preview = pd.DataFrame(
    scaled_features[:5],
    columns=['recency', 'frequency', 'monetary'],
)
display(scaled_preview)

In [None]:
# One histogram per RFM metric of the log-transformed frame, laid out side by
# side; the figure is saved through the project helper and then shown.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for ax, metric in zip(axes, ('recency', 'frequency', 'monetary')):
    ax.hist(rfm_log[metric], bins=20, edgecolor='black', alpha=0.7)
    ax.set_title(f'Log-transformed {metric.capitalize()}', fontweight='bold')
    ax.set_xlabel(metric)
    ax.set_ylabel('Frequency')

plt.tight_layout()
save_plot(fig, 'log_transformed_distributions.png')
plt.show()

In [None]:
# %% [markdown]
# ### 3. Determine Optimal Number of Clusters

In [None]:
# %%
# Search candidate cluster counts up to max_k=10 with the project helper; its
# three return values are unpacked as the inertia values, the suggested k and
# a figure.
cluster_search = find_optimal_clusters(scaled_features, max_k=10)
inertia_values, optimal_k, fig = cluster_search

In [None]:
# Report the cluster count suggested by the search above.
print(
    "\n=== Optimal Clusters Analysis ===",
    f"Suggested optimal k: {optimal_k}",
    sep="\n",
)

In [None]:
# Tabulate the inertia recorded for each candidate k.
# NOTE(review): assumes the first inertia value corresponds to k=1 — confirm
# against find_optimal_clusters.
candidate_ks = range(1, len(inertia_values) + 1)
inertia_df = pd.DataFrame({'k': candidate_ks, 'inertia': inertia_values})

print("\nInertia values:")
display(inertia_df)

plt.show()

In [None]:
# %% [markdown]
# ### 4. Apply K-Means Clustering

In [None]:
# %%
# Fit K-Means with the suggested cluster count through the project helper.
# random_state=42 is passed to the helper (presumably forwarded to the
# underlying estimator for reproducibility — confirm in src/clustering).
kmeans_result = apply_kmeans(
    scaled_features, n_clusters=optimal_k, random_state=42
)
kmeans_model, cluster_labels, cluster_centers = kmeans_result

In [None]:
# Add the cluster label of every row as a 'cluster' column and store it with
# the categorical dtype.
rfm_table = (
    rfm_table
    .assign(cluster=cluster_labels)
    .astype({'cluster': 'category'})
)

In [None]:
# Announce the result, then count how many rows landed in each cluster
# (ordered by cluster label).
print(
    "\n=== Clustering Complete ===",
    f"Number of clusters: {optimal_k}",
    sep="\n",
)

rows_per_cluster = rfm_table['cluster'].value_counts()
cluster_distribution = rows_per_cluster.sort_index()
print("Cluster distribution:", cluster_distribution, sep="\n")

In [None]:
# Share of all rows held by each cluster, as a percentage rounded to one
# decimal, shown next to the absolute counts.
total_rows = len(rfm_table)
cluster_percentage = (cluster_distribution / total_rows * 100).round(1)

summary_columns = {
    'count': cluster_distribution,
    'percentage': cluster_percentage,
}
cluster_summary = pd.DataFrame(summary_columns)

print("\nCluster summary:")
display(cluster_summary)

In [None]:
# %% [markdown]
# ### 5. Visualize Clusters

In [None]:
# %%
# Wrap the scaled features in a labelled DataFrame and hand it, together with
# the labels and centers, to the project plotting helper. The helper's two
# return values are unpacked as a figure and `pca`.
scaled_features_df = pd.DataFrame(
    scaled_features,
    columns=['recency', 'frequency', 'monetary'],
)
fig, pca = visualize_clusters(scaled_features_df, cluster_labels, cluster_centers)

plt.show()

In [None]:
# %% [markdown]
# ### 6. Analyze Cluster Characteristics

In [None]:
# %%
# Section banner for the per-cluster statistics that follow.
section_banner = "=== Cluster Characteristics ==="
print(section_banner)

In [None]:
# Per-cluster descriptive statistics (mean, std, min, max) for each RFM
# metric, plus the number of customer_id entries per cluster, rounded to two
# decimals.
#
# `observed=False` is passed explicitly because 'cluster' is a categorical
# column: pandas has deprecated relying on the implicit default of `observed`
# for categorical group keys, so pinning it keeps the result stable across
# pandas versions and avoids the FutureWarning.
cluster_characteristics = rfm_table.groupby('cluster', observed=False).agg({
    'recency': ['mean', 'std', 'min', 'max'],
    'frequency': ['mean', 'std', 'min', 'max'],
    'monetary': ['mean', 'std', 'min', 'max'],
    'customer_id': 'count'
}).round(2)

In [None]:
# Replace the column labels with flat names such as 'recency_mean'. The names
# are generated metric-by-metric, statistic-by-statistic, followed by
# 'customer_count', so the order must match the aggregation that built the
# frame.
flat_stat_columns = [
    f'{metric}_{stat}'
    for metric in ('recency', 'frequency', 'monetary')
    for stat in ('mean', 'std', 'min', 'max')
]
cluster_characteristics.columns = flat_stat_columns + ['customer_count']

In [None]:
# Each cluster's share of all counted customers, as a percentage rounded to
# one decimal.
customer_counts = cluster_characteristics['customer_count']
share_of_customers = customer_counts / customer_counts.sum() * 100
cluster_characteristics['percentage'] = share_of_customers.round(1)

display(cluster_characteristics)

In [None]:
# Label the cluster centers: one row per cluster ('Cluster 0', 'Cluster 1',
# ...), one column per RFM feature.
center_row_labels = list(map('Cluster {}'.format, range(optimal_k)))
center_column_labels = ['recency', 'frequency', 'monetary']
cluster_centers_df = pd.DataFrame(
    cluster_centers,
    index=center_row_labels,
    columns=center_column_labels,
)

In [None]:
# Annotated heatmap of the cluster centers (features as rows, clusters as
# columns), saved through the project helper and then shown.
# NOTE(review): center=0 is combined with the sequential 'YlOrRd' colormap;
# a diverging colormap may read better for centered features.
centers_fig, centers_ax = plt.subplots(figsize=(10, 6))
heatmap_options = dict(
    annot=True,
    cmap='YlOrRd',
    center=0,
    fmt='.2f',
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
sns.heatmap(cluster_centers_df.T, ax=centers_ax, **heatmap_options)
centers_ax.set_title(
    'Cluster Centers (Scaled Features)', fontsize=14, fontweight='bold'
)
centers_fig.tight_layout()
save_plot(centers_fig, 'cluster_centers_heatmap.png')
plt.show()

In [None]:
# %% [markdown]
# ### 7. Compare RFM Segments with K-Means Clusters

In [None]:
# %%
# Compare the existing RFM segments with the K-Means clusters.
# The whole comparison is kept in a single cell so that every step stays
# inside the column-existence guard: splitting an indented `if` body across
# cells makes each continuation cell fail with "unexpected indent" and lets
# the cross-tabulation run even when 'rfm_segment' is absent.
if 'rfm_segment' in rfm_table.columns:
    print("\n=== Cross-tabulation: RFM Segments vs K-Means Clusters ===")

    # Absolute counts per (segment, cluster) pair, with row/column totals.
    cross_tab = pd.crosstab(
        rfm_table['rfm_segment'],
        rfm_table['cluster'],
        margins=True,
        margins_name='Total'
    )
    display(cross_tab)

    # Row-normalised view: for each RFM segment, the percentage of its rows
    # that fall into each cluster (every row sums to 100).
    cross_tab_percentage = pd.crosstab(
        rfm_table['rfm_segment'],
        rfm_table['cluster'],
        normalize='index'
    ) * 100

    plt.figure(figsize=(12, 8))
    sns.heatmap(
        cross_tab_percentage,
        annot=True,
        fmt='.1f',
        cmap='Blues',
        cbar_kws={'label': 'Percentage (%)'}
    )
    plt.title(
        'RFM Segments Distribution Across K-Means Clusters',
        fontsize=14,
        fontweight='bold'
    )
    plt.xlabel('K-Means Cluster')
    plt.ylabel('RFM Segment')
    plt.tight_layout()
    save_plot(plt.gcf(), 'rfm_vs_clusters_heatmap.png')
    plt.show()
else:
    # Make the skip visible instead of silently producing no output.
    print("\nColumn 'rfm_segment' not found - skipping RFM segment comparison.")

In [None]:
# %% [markdown]
# ### 8. Export Clustered Data

In [None]:
# %%
# Write the table, now including the 'cluster' column, to the processed-data
# folder without the row index.
segments_csv_path = '../data/processed/customer_segments.csv'
rfm_table.to_csv(segments_csv_path, index=False)

In [None]:
# Confirm the export: output location, final dimensions and column names,
# followed by a rich preview of the first rows.
row_count, column_count = rfm_table.shape
export_report = "\n".join([
    "\n=== Clustered Data Exported ===",
    "File saved: ../data/processed/customer_segments.csv",
    f"Rows: {row_count}, Columns: {column_count}",
    f"\nFinal columns: {rfm_table.columns.tolist()}",
    "\nSample of final data:",
])
print(export_report)
display(rfm_table.head())