# Customer Segmentation with KMeans

This notebook mirrors `kmeans_customer_segmentation.py`, walking through exploratory analysis, feature scaling and model selection, clustering and profiling, and evaluation.

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

# Locations used by this notebook: where report artifacts go and where the raw data lives.
OUTPUT_DIR = "reports"
DATA_PATH = "Mall_Customers.csv"

# Ensure the report directory exists; exist_ok avoids an error on re-runs.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load the customer table and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

## Exploratory Analysis

In [None]:
# Univariate distributions: one histogram (with KDE overlay) per feature.
# Each entry is (column in df, figure title); the column name doubles as the x-axis label.
histogram_specs = [
    ("Annual Income (k$)", "Annual Income Distribution"),
    ("Spending Score (1-100)", "Spending Score Distribution"),
]
for column, title in histogram_specs:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(df[column], bins=20, kde=True, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(column)
    ax.set_ylabel("Count")
    fig.tight_layout()
    plt.show()

# Bivariate view of the two features before any scaling or clustering.
fig, ax = plt.subplots(figsize=(6, 5))
sns.scatterplot(
    data=df,
    x="Annual Income (k$)",
    y="Spending Score (1-100)",
    s=60,
    ax=ax,
)
ax.set_title("Income vs Spending (Raw)")
ax.set_xlabel("Annual Income (k$)")
ax.set_ylabel("Spending Score (1-100)")
fig.tight_layout()
plt.show()

## Scaling and Model Selection

In [None]:
# Cluster on income and spending score only.
feature_cols = ["Annual Income (k$)", "Spending Score (1-100)"]
X = df[feature_cols]

# Standardise the features (zero mean, unit variance) before running KMeans.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# WCSS (inertia) is recorded for k = 1..10. Silhouette is only recorded for
# k in cluster_range (2..10), because it is undefined for a single cluster.
cluster_range = range(2, 11)
wcss = []
silhouette_scores = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, init="k-means++", n_init=10, random_state=42).fit(X_scaled)
    wcss.append(kmeans.inertia_)
    if k not in cluster_range:
        continue
    silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))

# Elbow curve: within-cluster sum of squares against the number of clusters.
elbow_fig, elbow_ax = plt.subplots()
elbow_ax.plot(range(1, 11), wcss, marker="o")
elbow_ax.set_title("Elbow Method (WCSS)")
elbow_ax.set_xlabel("Number of Clusters")
elbow_ax.set_ylabel("WCSS")
elbow_fig.tight_layout()
plt.show()

# Silhouette curve over the same candidates, starting at k = 2.
sil_fig, sil_ax = plt.subplots()
sil_ax.plot(list(cluster_range), silhouette_scores, marker="o")
sil_ax.set_title("Silhouette Score by K")
sil_ax.set_xlabel("Number of Clusters")
sil_ax.set_ylabel("Silhouette Score")
sil_fig.tight_layout()
plt.show()

## Clustering and Profiling

In [None]:
# Fit the final 5-cluster model on the standardised features and attach the
# resulting labels to the original (unscaled) frame for plotting and profiling.
kmeans = KMeans(n_clusters=5, init="k-means++", n_init=10, random_state=42)
y_kmeans = kmeans.fit_predict(X_scaled)
df["Cluster"] = y_kmeans

# Segments are drawn in the original units so the axes stay interpretable.
plt.figure(figsize=(8, 6))
sns.scatterplot(
    x="Annual Income (k$)",
    y="Spending Score (1-100)",
    hue="Cluster",
    palette="Set2",
    data=df,
    s=100,
)
plt.title("Customer Segments (KMeans)")
plt.xlabel("Annual Income (k$)")
plt.ylabel("Spending Score (1-100)")
# A bare plt.legend() rebuilds the legend and discards the "Cluster" title that
# seaborn attached to the hue legend, so the title is passed explicitly.
plt.legend(title="Cluster")
plt.tight_layout()
plt.show()

# Per-cluster profile: mean of each feature (rounded to 2 decimals) in original units.
cluster_profile = (
    df.groupby("Cluster")[feature_cols]
    .mean()
    .round(2)
    .reset_index()
)
# Cluster sizes, ordered by cluster label, joined onto the profile table.
cluster_counts = df["Cluster"].value_counts().sort_index().reset_index()
cluster_counts.columns = ["Cluster", "Count"]
cluster_profile = cluster_profile.merge(cluster_counts, on="Cluster", how="left")
cluster_profile

## Evaluation

In [None]:
# Internal validity metrics for the final clustering, all computed on the
# standardised features against the fitted labels.
sil_score, db_index, ch_score = (
    scorer(X_scaled, y_kmeans)
    for scorer in (silhouette_score, davies_bouldin_score, calinski_harabasz_score)
)

# Summary table: one row per metric, values rounded to 4 decimals.
metric_names = ["silhouette", "davies_bouldin", "calinski_harabasz"]
metric_values = [round(score, 4) for score in (sil_score, db_index, ch_score)]
metrics_df = pd.DataFrame({"metric": metric_names, "value": metric_values})
metrics_df