In [None]:

# import subprocess, sys

# def install(pkg):
#     try:
#         __import__(pkg.replace("-","_").replace("scikit_learn","sklearn"))
#     except ImportError:
#         subprocess.check_call([sys.executable, "-m", "pip", "install", pkg, "-q"])

# for p in ["pandas","numpy","matplotlib","seaborn","scikit-learn","scipy"]:
#     install(p)

import pandas as pd
import numpy as np
# import matplotlib as plt
# matplotlib.use("TkAgg")
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import (
    silhouette_score, calinski_harabasz_score, davies_bouldin_score
)
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage


In [None]:

# ============================================================
# STEP 1: DOWNLOAD REAL-WORLD DATA
# ============================================================

# "Wholesale customers data.csv", read directly from the UCI archive URL.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00292/Wholesale%20customers%20data.csv"

df = pd.read_csv(url)

# Report the frame's size, its column names, and a five-row preview.
n_rows, n_cols = df.shape
print(f" Shape: {n_rows} rows × {n_cols} columns")
print("\nColumns: " + str(df.columns.tolist()))
print("\nFirst 5 rows:")
print(df.head().to_string())



 Downloaded successfully!
 Shape: 440 rows × 8 columns

Columns: ['Channel', 'Region', 'Fresh', 'Milk', 'Grocery', 'Frozen', 'Detergents_Paper', 'Delicassen']

First 5 rows:
   Channel  Region  Fresh  Milk  Grocery  Frozen  Detergents_Paper  Delicassen
0        2       3  12669  9656     7561     214              2674        1338
1        2       3   7057  9810     9568    1762              3293        1776
2        2       3   6353  8808     7684    2405              3516        7844
3        1       3  13265  1196     4221    6404               507        1788
4        2       3  22615  5410     7198    3915              1777        5185


In [None]:

# ============================================================
# STEP 2: EXPLORATORY DATA ANALYSIS (EDA)
# ============================================================

# 2.1 Basic info: dtypes, null counts, duplicate rows, summary statistics
for heading, report in (
    ("Data Types", df.dtypes),
    ("Missing Values", df.isnull().sum()),
):
    print(f"\n--- {heading} ---")
    print(report.to_string())

n_duplicates = df.duplicated().sum()
print(f"\n--- Duplicates: {n_duplicates} ---")

print("\n--- Statistics ---")
print(df.describe().round(2).to_string())

# 2.2 Channel & Region distribution (value counts of the two label columns)
for label_col in ("Channel", "Region"):
    print(f"\n--- {label_col} Distribution ---")
    print(df[label_col].value_counts().to_string())

# 2.3 Basic plots — the six spending columns, one subplot each on a 2x3 grid
spending_cols = ["Fresh", "Milk", "Grocery", "Frozen",
                 "Detergents_Paper", "Delicassen"]

# Histograms
fig, axes = plt.subplots(2, 3, figsize=(14, 8))
axes = axes.flatten()
for ax, col in zip(axes, spending_cols):
    ax.hist(df[col], bins=25, color="steelblue", edgecolor="black")
    ax.set_title(col)
    ax.set_xlabel("Spending")
    ax.set_ylabel("Count")
plt.suptitle("Spending Distribution (Before Cleaning)", fontweight="bold")
plt.tight_layout()
plt.savefig("01_distributions.png", dpi=100)
plt.show()

# Boxplots
fig, axes = plt.subplots(2, 3, figsize=(14, 8))
axes = axes.flatten()
for ax, col in zip(axes, spending_cols):
    ax.boxplot(df[col], vert=True)
    ax.set_title(col)
    ax.set_ylabel("Spending")
plt.suptitle("Boxplots — Outlier Check", fontweight="bold")
plt.tight_layout()
plt.savefig("02_boxplots.png", dpi=100)
plt.show()

# Correlation between the spending columns
print("\n--- Correlation Matrix ---")
spending_corr = df[spending_cols].corr().round(2)
print(spending_corr.to_string())


--- Data Types ---
Channel             int64
Region              int64
Fresh               int64
Milk                int64
Grocery             int64
Frozen              int64
Detergents_Paper    int64
Delicassen          int64

--- Missing Values ---
Channel             0
Region              0
Fresh               0
Milk                0
Grocery             0
Frozen              0
Detergents_Paper    0
Delicassen          0

--- Duplicates: 0 ---

--- Statistics ---
       Channel  Region      Fresh      Milk   Grocery    Frozen  Detergents_Paper  Delicassen
count   440.00  440.00     440.00    440.00    440.00    440.00            440.00      440.00
mean      1.32    2.54   12000.30   5796.27   7951.28   3071.93           2881.49     1524.87
std       0.47    0.77   12647.33   7380.38   9503.16   4854.67           4767.85     2820.11
min       1.00    1.00       3.00     55.00      3.00     25.00              3.00        3.00
25%       1.00    2.00    3127.75   1533.00   2153.00    74

In [None]:

# ============================================================
# STEP 3: DATA CLEANING
# ============================================================
# Produces, from `df`:
#   df_clean        spending columns, de-duplicated and IQR-capped
#   labels_channel  Channel values for the rows kept in df_clean
#   labels_region   Region values for the rows kept in df_clean
#   df_log          log1p of df_clean
#   df_scaled       StandardScaler output of df_log (fitted `scaler` is kept)

df_clean = df.copy()

# 3.1 Drop Channel & Region (we want unsupervised — no labels)
# But keep them aside for later verification
labels_channel = df_clean["Channel"].copy()
labels_region = df_clean["Region"].copy()

df_clean = df_clean.drop(columns=["Channel", "Region"])
print(" 1. Dropped Channel & Region (keeping for verification)")

# 3.2 Check & remove duplicates
# The labels must be filtered with the SAME row mask as the data. Slicing the
# labels to the new length would keep the first N labels rather than the
# labels of the surviving rows, misaligning them whenever a row is dropped.
before = len(df_clean)
is_first_occurrence = ~df_clean.duplicated()  # same rows drop_duplicates() keeps
df_clean = df_clean[is_first_occurrence].reset_index(drop=True)
labels_channel = labels_channel[is_first_occurrence].reset_index(drop=True)
labels_region = labels_region[is_first_occurrence].reset_index(drop=True)
after = len(df_clean)
assert len(labels_channel) == after and len(labels_region) == after
print(f" 2. Removed {before - after} duplicates")

# 3.3 Handle missing values (reported only; nothing is imputed or dropped here)
missing = df_clean.isnull().sum().sum()
print(f" 3. Missing values: {missing}")

# 3.4 Handle outliers using IQR capping
# Values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are clipped to the fence,
# not removed, so the row count is unchanged.
print(f"\n--- Outlier Capping (IQR method) ---")
for col in df_clean.columns:
    Q1 = df_clean[col].quantile(0.25)
    Q3 = df_clean[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    outliers_count = ((df_clean[col] < lower) | (df_clean[col] > upper)).sum()
    df_clean[col] = df_clean[col].clip(lower=lower, upper=upper)
    print(f"   {col:>20s}: {outliers_count:>3d} outliers capped")

print(f"\n 4. All outliers capped using IQR")

# 3.5 Log transformation (spending data is skewed)
# log1p is applied element-wise to every column at once.
df_log = np.log1p(df_clean)
print(" 5. Log transformation applied (skewness fix)")

# 3.6 Feature scaling
# `scaler` stays fitted on df_log so later cells can transform new rows.
scaler = StandardScaler()
df_scaled = pd.DataFrame(
    scaler.fit_transform(df_log),
    columns=df_log.columns
)
print(" 6. StandardScaler applied")

print(f"\n--- Cleaned & Scaled Data (first 5 rows) ---")
print(df_scaled.head().round(3).to_string())
print(f"\nFinal shape: {df_scaled.shape}")

In [None]:

# ============================================================
# STEP 4: FIND OPTIMAL CLUSTERS
# ============================================================

# 4.1 Dendrogram — Ward linkage on the scaled features, truncated to the
# last 30 merged clusters.
print("\n--- Generating Dendrogram ---")
linkage_matrix = linkage(df_scaled, method="ward", metric="euclidean")

plt.figure(figsize=(14, 7))
dendrogram_options = dict(
    truncate_mode="lastp",
    p=30,
    leaf_rotation=90,
    leaf_font_size=10,
    show_contracted=True,
)
dendrogram(linkage_matrix, **dendrogram_options)
plt.title("Dendrogram (Ward Linkage)", fontweight="bold", fontsize=14)
plt.xlabel("Sample Index / Cluster Size")
plt.ylabel("Distance")
# Two reference cut heights drawn as dashed horizontal lines.
for cut_height, cut_color in ((10, "red"), (15, "orange")):
    plt.axhline(y=cut_height, color=cut_color, linestyle="--",
                label=f"Cut at distance={cut_height}")
plt.legend()
plt.tight_layout()
plt.savefig("03_dendrogram.png", dpi=100)
plt.show()
print(" Dendrogram saved")

# 4.2 Silhouette Score for different K values
# For each K a Ward agglomerative model is fitted and three internal
# validity metrics are stored under sil_scores[K].
print("\n--- Silhouette Scores for K=2 to K=10 ---")
sil_scores = {}

for n_clusters in range(2, 11):
    candidate = AgglomerativeClustering(n_clusters=n_clusters, linkage="ward")
    candidate_labels = candidate.fit_predict(df_scaled)
    metrics = {
        "Silhouette": silhouette_score(df_scaled, candidate_labels),
        "Calinski": calinski_harabasz_score(df_scaled, candidate_labels),
        "Davies_Bouldin": davies_bouldin_score(df_scaled, candidate_labels),
    }
    sil_scores[n_clusters] = metrics
    print(f"   K={n_clusters:>2d}  |  Silhouette: {metrics['Silhouette']:.4f}  |  "
          f"Calinski-Harabasz: {metrics['Calinski']:.1f}  |  "
          f"Davies-Bouldin: {metrics['Davies_Bouldin']:.4f}")

# Plot silhouette scores
ks = list(sil_scores)
sils = [metrics["Silhouette"] for metrics in sil_scores.values()]

plt.figure(figsize=(10, 5))
plt.plot(ks, sils, "bo-", linewidth=2, markersize=8)
plt.xlabel("Number of Clusters (K)")
plt.ylabel("Silhouette Score")
plt.title("Silhouette Score vs K", fontweight="bold")
plt.xticks(ks)
# Write each score just above its marker.
for position, n_clusters in enumerate(ks):
    score = sils[position]
    plt.annotate(f"{score:.3f}", (n_clusters, score), textcoords="offset points",
                 xytext=(0, 10), ha="center")
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig("04_silhouette_scores.png", dpi=100)
plt.show()

# Find best K: the K with the highest silhouette score (first one on ties)
best_k = max(ks, key=lambda n: sil_scores[n]["Silhouette"])
best_silhouette = sil_scores[best_k]["Silhouette"]
print(f"\n Best K = {best_k} (Silhouette = {best_silhouette:.4f})")

In [None]:

# ============================================================
# STEP 5: FIT WITH DIFFERENT LINKAGES & COMPARE
# ============================================================
# Fits one AgglomerativeClustering model per linkage method at K=best_k,
# scores each on df_scaled, and keeps the one with the highest silhouette
# score in best_model / best_labels / best_score / best_linkage.

linkages = ["ward", "complete", "average", "single"]
linkage_results = []

print(f"\n{'Linkage':<12} {'Silhouette':>12} {'Calinski-H':>12} "
      f"{'Davies-B':>12} {'Status':>10}")
print("-" * 65)

best_model = None
best_labels = None
best_score = float("-inf")  # any real silhouette score beats this
best_linkage = ""

for link in linkages:
    try:
        # Every linkage is constructed the same way; no per-linkage special case.
        agg = AgglomerativeClustering(n_clusters=best_k, linkage=link)
        cluster_labels = agg.fit_predict(df_scaled)

        sil = silhouette_score(df_scaled, cluster_labels)
        ch = calinski_harabasz_score(df_scaled, cluster_labels)
        db = davies_bouldin_score(df_scaled, cluster_labels)

        linkage_results.append({
            "Linkage": link,
            "Silhouette": round(sil, 4),
            "Calinski_Harabasz": round(ch, 2),
            "Davies_Bouldin": round(db, 4),
            "Labels": cluster_labels
        })

        # "BEST" marks a running best at the time the row is printed; the
        # final winner is reported after the loop.
        status = ""
        if sil > best_score:
            best_score = sil
            best_model = agg
            best_labels = cluster_labels
            best_linkage = link
            status = " BEST"

        print(f"{link:<12} {sil:>12.4f} {ch:>12.1f} {db:>12.4f} {status:>10}")

    except Exception as e:
        # Best-effort comparison: a failing linkage is reported and skipped.
        print(f"{link:<12} {'ERROR':>12} — {str(e)}")

print("-" * 65)

# Without this guard a total failure would surface later as an unrelated
# KeyError when the (empty) results table is built.
if best_labels is None:
    raise RuntimeError(
        "No linkage method produced a clustering; see the errors printed above."
    )

print(f"\n Best Linkage: {best_linkage.upper()}")
print(f"   Silhouette Score: {best_score:.4f}")
print(f"   Number of Clusters: {best_k}")

# Compare linkages in a table (the per-row label arrays are left out)
results_df = pd.DataFrame(linkage_results).drop(columns=["Labels"])
print(f"\n--- Linkage Comparison Table ---")
print(results_df.to_string(index=False))


In [None]:

# ============================================================
# STEP 6: ASSIGN CLUSTERS & ANALYZE
# ============================================================

# Attach the winning cluster labels to a copy of the cleaned data
df_result = df_clean.copy()
df_result["Cluster"] = best_labels

# 6.1 Cluster sizes
print("\n--- Cluster Sizes ---")
cluster_counts = df_result["Cluster"].value_counts().sort_index()
n_customers = len(df_result)
for cl, cnt in cluster_counts.items():
    print(f"   Cluster {cl}: {cnt} customers ({cnt/n_customers*100:.1f}%)")

# 6.2 Cluster means (spending profile)
print("\n--- Cluster Spending Profiles (Mean) ---")
cluster_means = df_result.groupby("Cluster").mean().round(1)
print(cluster_means.to_string())

# 6.3 Cluster medians
print("\n--- Cluster Spending Profiles (Median) ---")
cluster_medians = df_result.groupby("Cluster").median().round(1)
print(cluster_medians.to_string())

# 6.4 Describe each cluster
# A feature is HIGH when the cluster mean exceeds 1.2x the overall mean and
# LOW when it is under 0.8x; a feature is never listed as both.
print("\n--- Cluster Descriptions ---")
overall_mean = df_clean.mean()
for cl in sorted(df_result["Cluster"].unique()):
    in_cluster = df_result["Cluster"] == cl
    cluster_data = df_result[in_cluster]
    cl_mean = cluster_data[spending_cols].mean()
    print(f"\n   CLUSTER {cl} ({len(cluster_data)} customers):")

    high_features = [
        col for col in spending_cols
        if cl_mean[col] > overall_mean[col] * 1.2
    ]
    low_features = [
        col for col in spending_cols
        if col not in high_features and cl_mean[col] < overall_mean[col] * 0.8
    ]

    for tag, features in (("HIGH spending", high_features),
                          ("LOW  spending", low_features)):
        if features:
            print(f"     {tag}: {', '.join(features)}")


In [None]:

# ============================================================
# STEP 7: VISUALIZATIONS (Simple & Basic)
# ============================================================
# Each figure is saved to a numbered PNG in the working directory and shown.

# 7.1 PCA for 2D visualization
# The scaled features are projected onto two principal components purely
# for plotting; the clustering itself was done on df_scaled.
pca = PCA(n_components=2)
df_pca = pca.fit_transform(df_scaled)

# One colour per cluster id; ten entries cover K up to 10.
colors = ["red", "blue", "green", "orange", "purple",
          "brown", "pink", "gray", "cyan", "magenta"]

plt.figure(figsize=(10, 7))
for cl in sorted(df_result["Cluster"].unique()):
    mask = best_labels == cl
    plt.scatter(df_pca[mask, 0], df_pca[mask, 1],
                c=colors[cl], label=f"Cluster {cl}",
                s=50, edgecolors="black", linewidths=0.5, alpha=0.7)
plt.xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}% variance)")
plt.ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}% variance)")
plt.title(f"Agglomerative Clustering (K={best_k}, {best_linkage})",
          fontweight="bold")
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig("05_clusters_pca.png", dpi=100)
plt.show()

# 7.2 Cluster size bar chart
plt.figure(figsize=(8, 5))
bars = plt.bar(cluster_counts.index, cluster_counts.values,
               color=[colors[i] for i in cluster_counts.index],
               edgecolor="black")
plt.xlabel("Cluster")
plt.ylabel("Number of Customers")
plt.title("Cluster Sizes", fontweight="bold")
plt.xticks(cluster_counts.index)
# Print each count just above its bar.
for bar, val in zip(bars, cluster_counts.values):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 2,
             str(val), ha="center", fontweight="bold")
plt.tight_layout()
plt.savefig("06_cluster_sizes.png", dpi=100)
plt.show()

# 7.3 Cluster spending profile (one bar chart per spending column)
fig, axes = plt.subplots(2, 3, figsize=(16, 10))
axes = axes.flatten()

for i, col in enumerate(spending_cols):
    cl_means = df_result.groupby("Cluster")[col].mean()
    axes[i].bar(cl_means.index, cl_means.values,
                color=[colors[c] for c in cl_means.index],
                edgecolor="black")
    axes[i].set_title(f"{col}", fontweight="bold")
    axes[i].set_xlabel("Cluster")
    axes[i].set_ylabel("Mean Spending")
    axes[i].set_xticks(cl_means.index)

plt.suptitle("Average Spending per Cluster", fontweight="bold", fontsize=14)
plt.tight_layout()
plt.savefig("07_spending_profiles.png", dpi=100)
plt.show()

# 7.4 Compare with actual Channel labels
print(f"\n--- Cluster vs Actual Channel (Verification) ---")
comparison = pd.crosstab(
    best_labels[:len(labels_channel)],
    labels_channel[:len(best_labels)],
    rownames=["Cluster"],
    colnames=["Channel"]
)
print(comparison.to_string())

# Draw on an explicitly created Axes: DataFrame.plot opens a NEW figure when
# no `ax` is given, which would leave an empty figure behind and ignore the
# requested figsize.
fig_channel, ax_channel = plt.subplots(figsize=(8, 5))
comparison.plot(kind="bar", edgecolor="black", rot=0, ax=ax_channel)
ax_channel.set_title("Clusters vs Actual Channel", fontweight="bold")
ax_channel.set_xlabel("Cluster")
ax_channel.set_ylabel("Count")
# Legend names are looked up per channel code so they stay correct even if a
# channel is missing from the crosstab; unknown codes fall back to the code.
channel_names = {1: "Hotel/Restaurant", 2: "Retail"}
ax_channel.legend(
    title="Channel",
    labels=[channel_names.get(code, str(code)) for code in comparison.columns],
)
fig_channel.tight_layout()
fig_channel.savefig("08_cluster_vs_channel.png", dpi=100)
plt.show()

# 7.5 Linkage comparison bar chart (best linkage in green, the rest in gray)
plt.figure(figsize=(10, 5))
link_names = [r["Linkage"] for r in linkage_results]
link_sils = [r["Silhouette"] for r in linkage_results]
bar_colors = ["green" if name == best_linkage else "gray" for name in link_names]

bars = plt.bar(link_names, link_sils, color=bar_colors, edgecolor="black")
plt.xlabel("Linkage Method")
plt.ylabel("Silhouette Score")
plt.title("Linkage Comparison", fontweight="bold")
for bar, val in zip(bars, link_sils):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.005,
             f"{val:.4f}", ha="center", fontweight="bold")
plt.tight_layout()
plt.savefig("09_linkage_comparison.png", dpi=100)
plt.show()

print(" All plots saved!")


In [None]:

# ============================================================
# STEP 8: PREDICT NEW DATA
# ============================================================



def predict_new_customer(fresh, milk, grocery, frozen,
                         detergents_paper, delicassen):
    """Assign a new customer to the nearest existing cluster.

    The six spending values go through the same pipeline as the training
    data (IQR capping, ``log1p``, the fitted ``scaler``) and are then compared
    with the per-cluster mean of ``df_scaled`` using Euclidean distance.

    Relies on notebook-level state: ``df_clean``, ``scaler``, ``df_scaled``
    and ``best_labels``.

    Parameters
    ----------
    fresh, milk, grocery, frozen, detergents_paper, delicassen : number
        Spending per category. Must not be negative.

    Returns
    -------
    predicted_cluster
        Label of the cluster whose centroid is closest.
    distances : dict
        Cluster label -> distance to that centroid, rounded to 4 decimals.

    Raises
    ------
    ValueError
        If any spending value is negative. The IQR lower fence can itself be
        negative, so such a value would survive capping and make ``log1p``
        return NaN/-inf, silently producing a meaningless prediction.
    """
    spending = {
        "Fresh": fresh,
        "Milk": milk,
        "Grocery": grocery,
        "Frozen": frozen,
        "Detergents_Paper": detergents_paper,
        "Delicassen": delicassen
    }

    # Validate before touching any shared state.
    negative = [name for name, value in spending.items() if value < 0]
    if negative:
        raise ValueError(
            "Spending values must be non-negative; got negative "
            + ", ".join(negative)
        )

    new_data = pd.DataFrame([spending])

    # Apply same transformations
    # 1. IQR capping. The fences are recomputed from df_clean, which at this
    #    point already holds the capped training values.
    for col in new_data.columns:
        Q1 = df_clean[col].quantile(0.25)
        Q3 = df_clean[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        new_data[col] = new_data[col].clip(lower=lower, upper=upper)

    # 2. Log transform
    new_data_log = new_data.apply(np.log1p)

    # 3. Scale with the scaler fitted on the training data
    new_data_scaled = pd.DataFrame(
        scaler.transform(new_data_log),
        columns=new_data_log.columns
    )
    point = new_data_scaled.values[0]

    # 4. Find nearest cluster using centroid distance.
    #    Agglomerative clustering has no predict(); the centroid of each
    #    cluster in scaled space is used as a stand-in.
    centroids = df_scaled.copy()
    centroids["Cluster"] = best_labels
    cluster_centers = centroids.groupby("Cluster").mean()

    # One vectorised Euclidean norm per centroid row.
    gaps = np.linalg.norm(cluster_centers.to_numpy() - point, axis=1)
    distances = {
        cl: round(float(gap), 4)
        for cl, gap in zip(cluster_centers.index, gaps)
    }

    predicted_cluster = min(distances, key=distances.get)

    return predicted_cluster, distances


# Example predictions: (title, spending inputs) pairs run through
# predict_new_customer in order, each printed with its centroid distances.
example_customers = [
    ("High Fresh Spender",
     dict(fresh=20000, milk=3000, grocery=4000,
          frozen=2000, detergents_paper=500, delicassen=1000)),
    ("High Grocery Spender",
     dict(fresh=5000, milk=15000, grocery=25000,
          frozen=1000, detergents_paper=12000, delicassen=2000)),
    ("Low Spender",
     dict(fresh=3000, milk=2000, grocery=2000,
          frozen=500, detergents_paper=300, delicassen=400)),
]

for example_no, (title, spend) in enumerate(example_customers, start=1):
    print(f"\n--- Example Prediction {example_no}: {title} ---")
    cluster, dists = predict_new_customer(**spend)
    print(f"   Input: Fresh={spend['fresh']}, Milk={spend['milk']}, "
          f"Grocery={spend['grocery']}, "
          f"Frozen={spend['frozen']}, Detergents={spend['detergents_paper']}, "
          f"Delicassen={spend['delicassen']}")
    print(f"   Distances: {dists}")
    print(f"    Predicted Cluster: {cluster}")


In [None]:

# ============================================================
# STEP 9: COMPLETE EVALUATION SUMMARY
# ============================================================
# Recomputes the three internal validity metrics for the selected clustering
# and reports them together with cluster sizes and the linkage comparison.

final_sil = silhouette_score(df_scaled, best_labels)
final_ch = calinski_harabasz_score(df_scaled, best_labels)
final_db = davies_bouldin_score(df_scaled, best_labels)

# The metrics were previously computed and discarded; report them.
print(f"\n--- Final Model: {best_linkage} linkage, K={best_k} ---")
print(f"  Silhouette Score        : {final_sil:.4f}")
print(f"  Calinski-Harabasz Score : {final_ch:.1f}")
print(f"  Davies-Bouldin Score    : {final_db:.4f}")

print("\n--- Cluster Sizes ---")
for cl in sorted(df_result["Cluster"].unique()):
    cnt = len(df_result[df_result["Cluster"] == cl])
    pct = cnt / len(df_result) * 100
    print(f"  Cluster {cl}: {cnt:>4d} customers ({pct:>5.1f}%) ")

print("\n--- Linkage Comparison ---")
for r in linkage_results:
    # A visible tag: a whitespace-only marker cannot be told apart on screen.
    marker = " BEST" if r["Linkage"] == best_linkage else ""
    print(f"  {r['Linkage']:<10s}: Silhouette={r['Silhouette']:.4f}{marker} ")


In [None]:

# ============================================================
# STEP 10: SAVE RESULTS
# ============================================================

banner = "=" * 60
print("\n" + banner)
print(" STEP 10: SAVE RESULTS")
print(banner)

# Clustered data: cleaned spending columns plus the Cluster label
df_result.to_csv("clustered_customers.csv", index=False)
print(" Saved: clustered_customers.csv")

# Cluster profiles: per-cluster mean spending (index kept as the cluster id)
cluster_means.to_csv("cluster_profiles.csv")
print(" Saved: cluster_profiles.csv")

# Evaluation: linkage comparison metrics without the per-row label arrays
eval_df = pd.DataFrame(linkage_results).drop(columns=["Labels"])
eval_df.to_csv("evaluation_results.csv", index=False)

# List the figure files written by the earlier plotting cells
for plot_file in (
    "01_distributions.png",
    "02_boxplots.png",
    "03_dendrogram.png",
    "04_silhouette_scores.png",
    "05_clusters_pca.png",
    "06_cluster_sizes.png",
    "07_spending_profiles.png",
    "08_cluster_vs_channel.png",
    "09_linkage_comparison.png",
):
    print("   " + plot_file)