# In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler

# Step 1: Data Pre-processing
# Read the dataset; the CSV path is relative, so it must sit in the working directory.
df = pd.read_csv("Mall_Customers.csv")

# Preview the leading rows of the loaded table.
print("Data Head:", df.head(), sep="\n")

# Clustering uses only the income and spending-score columns.
feature_columns = ['Annual Income (k$)', 'Spending Score (1-100)']
features = df[feature_columns]

# Standardize both columns (zero mean, unit variance) so neither dominates
# the distance computations used by the clustering algorithms below.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Step 2: Data Preparation (Train-Test Split)
# Note: In unsupervised learning, a split isn't always needed. Here we use it to simulate a validation process.
# Row positions are split together with the features so that each test row can
# later be located inside a clustering that was fitted on the full dataset.
# Passing the extra array does not change which rows land in train/test for a
# given random_state.
row_positions = np.arange(scaled_features.shape[0])
X_train, X_test, train_positions, test_positions = train_test_split(
    scaled_features, row_positions, test_size=0.3, random_state=42
)
print("\nTrain and Test shapes:", X_train.shape, X_test.shape)

# Step 3: Apply Machine Learning Algorithms (Clustering)

# Define number of clusters
n_clusters = 5

# 3a. KMeans Clustering
# Fit on the training rows only; test rows are assigned to the nearest learned centroid.
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(X_train)
kmeans_labels_train = kmeans.labels_
kmeans_labels_test = kmeans.predict(X_test)

# 3b. Agglomerative (Hierarchical) Clustering
agglo = AgglomerativeClustering(n_clusters=n_clusters)
agglo_labels_train = agglo.fit_predict(X_train)
# Note: AgglomerativeClustering does not have a predict() method. For evaluation on test data, we refit on the full dataset.
agglo_full = AgglomerativeClustering(n_clusters=n_clusters)
agglo_full_labels = agglo_full.fit_predict(scaled_features)
# Take the labels of the actual test rows from the full clustering. The previous
# indexing expression always evaluated to [0], i.e. a single label for row 0.
# NOTE: these label ids come from the full-data fit and are not aligned with the
# ids in agglo_labels_train, which come from a separate fit.
agglo_labels_test = agglo_full_labels[test_positions]

# Step 4: Evaluate the Models using Silhouette Score
# Training split: score each model against the labels it produced while fitting.
silhouette_kmeans = silhouette_score(X_train, kmeans_labels_train)
silhouette_agglo = silhouette_score(X_train, agglo_labels_train)

print("\nSilhouette Score on Training Data:")
for model_name, score in (
    ("KMeans:", silhouette_kmeans),
    ("Agglomerative Clustering:", silhouette_agglo),
):
    print(model_name, score)

# Full dataset: KMeans assigns every row to a centroid learned from the training
# split, while the agglomerative labels come from the model refitted on all rows.
kmeans_full_labels = kmeans.predict(scaled_features)
silhouette_kmeans_full = silhouette_score(scaled_features, kmeans_full_labels)
silhouette_agglo_full = silhouette_score(scaled_features, agglo_full_labels)

print("\nSilhouette Score on Full Data:")
for model_name, score in (
    ("KMeans:", silhouette_kmeans_full),
    ("Agglomerative Clustering:", silhouette_agglo_full),
):
    print(model_name, score)

# Step 5: Cross-Validation (Repeated Splitting) for Clustering Evaluation
# Each of the 5 shuffled folds refits both models on that fold's training rows
# and records the silhouette score on those same rows. The held-out rows of a
# fold are not used.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
silhouette_kmeans_cv = []
silhouette_agglo_cv = []

for fold_rows, _ in kf.split(scaled_features):
    fold_data = scaled_features[fold_rows]

    # KMeans: fit() returns the estimator, so the labels can be read off directly.
    fold_kmeans_labels = KMeans(n_clusters=n_clusters, random_state=42).fit(fold_data).labels_
    silhouette_kmeans_cv.append(silhouette_score(fold_data, fold_kmeans_labels))

    # Agglomerative clustering on the same fold.
    fold_agglo_labels = AgglomerativeClustering(n_clusters=n_clusters).fit_predict(fold_data)
    silhouette_agglo_cv.append(silhouette_score(fold_data, fold_agglo_labels))

# Summarize the per-fold scores as mean and (population) standard deviation.
kmeans_cv_mean, kmeans_cv_std = np.mean(silhouette_kmeans_cv), np.std(silhouette_kmeans_cv)
agglo_cv_mean, agglo_cv_std = np.mean(silhouette_agglo_cv), np.std(silhouette_agglo_cv)

print("\nCross-Validation Silhouette Scores (mean ± std):")
print("KMeans: {:.3f} ± {:.3f}".format(kmeans_cv_mean, kmeans_cv_std))
print("Agglomerative Clustering: {:.3f} ± {:.3f}".format(agglo_cv_mean, agglo_cv_std))

# Optional: Visualize KMeans Clusters on the Full Dataset
# Scatter the two scaled features, colouring each point by the cluster that the
# train-fitted KMeans model assigns to it.
income_scaled = scaled_features[:, 0]
spending_scaled = scaled_features[:, 1]
point_clusters = kmeans.predict(scaled_features)

fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(income_scaled, spending_scaled, c=point_clusters, cmap='viridis', alpha=0.6)
ax.set_title("KMeans Clusters")
ax.set_xlabel("Annual Income (scaled)")
ax.set_ylabel("Spending Score (scaled)")
plt.show()


# Output: FileNotFoundError: [Errno 2] No such file or directory: 'Mall_Customers.csv'