# In [None]:
# =========================================================
# Mall Customers — EDA Dashboard + K-Means Clustering
# =========================================================
# Author: <your name>
# Goal: Clean EDA dashboard (matplotlib/seaborn) + clustering baseline
# =========================================================

# -----------------------------
# 0. Imports & Settings
# -----------------------------
# NOTE(review): `display(...)` is called throughout this script but never
# imported; presumably this runs inside Jupyter/IPython where it is provided
# automatically -- confirm before running as a plain script.
import warnings
# Install a global "ignore" filter so no warnings are shown in the output.
# This hides *all* warnings, including deprecation notices from the libraries.
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Global plot styling: seaborn "whitegrid" style with the "talk" context,
# and a default figure resolution of 120 DPI for every figure created below.
sns.set(style="whitegrid", context="talk")
plt.rcParams["figure.dpi"] = 120

# -----------------------------
# 1. Load Data
# -----------------------------
# The CSV is read by relative path, so it must sit in the working directory.
df = pd.read_csv("Mall_Customers.csv")

# Report the (rows, columns) shape, then preview the first five rows.
n_rows, n_cols = df.shape
print("Shape:", (n_rows, n_cols))
display(df.head(n=5))

# -----------------------------
# 2. Quick Data Audit
# -----------------------------
print("\n--- Info ---")
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in display(): that would emit a stray "None" after the report.
df.info()

print("\n--- Missing values ---")
# Per-column count of missing (NaN) entries.
display(df.isna().sum())

# Column groups reused by the summaries below and by later sections
# (e.g. the correlation heatmap reads `numeric_cols`).
numeric_cols = ["Age", "Annual Income (k$)", "Spending Score (1-100)"]
cat_cols = ["Gender"]

print("\n--- Numeric Summary ---")
display(df[numeric_cols].describe())

print("\n--- Categorical Summary ---")
# One frequency table per categorical column.
for c in cat_cols:
    display(df[c].value_counts())

# -----------------------------
# 3. Univariate EDA (Subplots)
# -----------------------------
# One row of three panels: Age histogram, Gender counts, Income histogram.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
age_ax, gender_ax, income_ax = axes

sns.histplot(df["Age"], bins=20, kde=True, ax=age_ax)
age_ax.set_title("Age Distribution")

sns.countplot(x="Gender", data=df, ax=gender_ax)
gender_ax.set_title("Gender Count")

sns.histplot(df["Annual Income (k$)"], bins=20, kde=True, ax=income_ax)
income_ax.set_title("Annual Income Distribution")

fig.tight_layout()
plt.show()

# Spending Score gets its own, smaller standalone figure.
plt.figure(figsize=(6, 4))
spending_scores = df["Spending Score (1-100)"]
sns.histplot(spending_scores, bins=20, kde=True)
plt.title("Spending Score Distribution")
plt.tight_layout()
plt.show()

# -----------------------------
# 4. Bivariate EDA
# -----------------------------
# Left panel: income vs. spending score coloured by gender.
# Right panel: spending score distribution per gender.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
scatter_ax, box_ax = axes

sns.scatterplot(
    data=df,
    x="Annual Income (k$)",
    y="Spending Score (1-100)",
    hue="Gender",
    ax=scatter_ax,
)
scatter_ax.set_title("Income vs Spending Score")

sns.boxplot(
    data=df,
    x="Gender",
    y="Spending Score (1-100)",
    ax=box_ax,
)
box_ax.set_title("Spending Score by Gender")

fig.tight_layout()
plt.show()

# Optional: Age vs Spending Score, drawn on a separate figure.
plt.figure(figsize=(6, 5))
age_score_ax = sns.scatterplot(
    data=df,
    x="Age",
    y="Spending Score (1-100)",
    hue="Gender",
)
age_score_ax.set_title("Age vs Spending Score")
plt.tight_layout()
plt.show()

# -----------------------------
# 5. Correlation Heatmap
# -----------------------------
# Pairwise correlations between the numeric columns, annotated to 2 decimals.
numeric_frame = df[numeric_cols]
corr = numeric_frame.corr()

plt.figure(figsize=(6, 4))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="Blues")
plt.title("Correlation Heatmap")
plt.tight_layout()
plt.show()

# -----------------------------
# 6. K-Means Clustering (Baseline)
# -----------------------------
# Baseline feature set: the classic 2-D view (income vs. spending score).
# Age is left out here; a three-feature variant can be compared separately.
baseline_cols = ["Annual Income (k$)", "Spending Score (1-100)"]
features = df[baseline_cols].copy()

# Standardize the features: K-Means is distance-based, so unscaled columns
# with larger ranges would otherwise dominate.
scaler = StandardScaler()
X = scaler.fit_transform(features)

# Sweep k = 2..10, recording inertia (for the elbow plot) and the
# silhouette score of each fitted model.
ks = [*range(2, 11)]
inertias, silhouettes = [], []

for k in ks:
    km = KMeans(n_clusters=k, random_state=42, n_init="auto")
    labels = km.fit_predict(X)
    sil_score = silhouette_score(X, labels)
    inertias.append(km.inertia_)
    silhouettes.append(sil_score)

# Draw the two model-selection curves, each on its own figure:
# first the elbow plot (inertia), then the silhouette score, both against k.
k_selection_curves = (
    (inertias, "Inertia", "Elbow Plot (Inertia vs k)"),
    (silhouettes, "Silhouette Score", "Silhouette Score vs k"),
)
for curve_values, curve_ylabel, curve_title in k_selection_curves:
    plt.figure(figsize=(6, 4))
    plt.plot(ks, curve_values, marker="o")
    plt.xlabel("k")
    plt.ylabel(curve_ylabel)
    plt.title(curve_title)
    plt.tight_layout()
    plt.show()

# Choose k at the silhouette maximum (first peak wins on ties).
# Override `best_k` manually here if the elbow plot suggests otherwise.
best_idx = int(np.argmax(silhouettes))
best_k = ks[best_idx]
print(f"Selected k (silhouette peak): {best_k}")

# Refit on the scaled features with the chosen k.
kmeans = KMeans(n_clusters=best_k, random_state=42, n_init="auto")
cluster_labels = kmeans.fit_predict(X)

# Store the assignment on the main frame and map the centroids back from
# standardized space to the original feature units.
df["Cluster"] = cluster_labels
centroids = scaler.inverse_transform(kmeans.cluster_centers_)
centroids_df = pd.DataFrame(data=centroids, columns=features.columns)
display(centroids_df)

print("\n--- Cluster Counts ---")
cluster_counts = df["Cluster"].value_counts().sort_index()
display(cluster_counts)

print("\n--- Cluster Feature Means ---")
profile_cols = ["Age", "Annual Income (k$)", "Spending Score (1-100)"]
cluster_means = df.groupby("Cluster")[profile_cols].mean().round(2)
display(cluster_means)

# 2D visualization of clusters (Income vs Spending Score)
plt.figure(figsize=(7, 6))
sns.scatterplot(
    x="Annual Income (k$)", y="Spending Score (1-100)",
    hue="Cluster", palette="tab10", data=df, s=70,
    # "Cluster" is a numeric column, so with the default legend="auto"
    # seaborn may switch to a "brief" legend that lists only a sample of
    # values when there are many clusters. Force one entry per cluster.
    legend="full",
)
# Overlay the centroids (already converted back to original units) as
# large "X" markers on the same axes.
plt.scatter(
    centroids_df["Annual Income (k$)"], centroids_df["Spending Score (1-100)"],
    s=250, marker="X", edgecolor="black", label="Centroid"
)
plt.title(f"K-Means Clusters (k={best_k})")
# Rebuild the legend so it includes the "Centroid" entry added above.
plt.legend()
plt.tight_layout()
plt.show()

# -----------------------------
# 7. (Optional) Add Age as 3rd feature and compare
# -----------------------------
# Same silhouette sweep over `ks`, but on Age + Income + Spending Score,
# standardized with a fresh scaler.
three_feature_cols = ["Age", "Annual Income (k$)", "Spending Score (1-100)"]
features3 = df[three_feature_cols]
scaler3 = StandardScaler()
X3 = scaler3.fit_transform(features3)

sil3 = []
for k in ks:
    km3 = KMeans(n_clusters=k, random_state=42, n_init="auto")
    labs3 = km3.fit_predict(X3)
    score3 = silhouette_score(X3, labs3)
    sil3.append(score3)

# Silhouette curve for the three-feature variant.
plt.figure(figsize=(6, 4))
plt.plot(ks, sil3, marker="o")
plt.xlabel("k")
plt.ylabel("Silhouette Score (3 features)")
plt.title("Silhouette Score vs k (Age + Income + Score)")
plt.tight_layout()
plt.show()

# -----------------------------
# 8. Save artifacts
# -----------------------------
# Write the labelled customer table and the centroid table to the working
# directory, both without the index column.
artifacts = (
    (df, "mall_customers_with_clusters.csv"),
    (centroids_df, "cluster_centroids.csv"),
)
for frame, out_name in artifacts:
    frame.to_csv(out_name, index=False)
print("Saved: mall_customers_with_clusters.csv, cluster_centroids.csv")
