Wholesale Customers Dataset


In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

# Load the wholesale customers table from the CSV in the working directory.
data = pd.read_csv('Wholesale_customers_data.csv')

# Report how many missing values each column contains.
missing_per_column = data.isnull().sum()
print(missing_per_column)

# Standardise every column before clustering; this must happen before the
# 'Cluster' column is appended below so that only the loaded columns are scaled.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# Fit a 3-cluster K-Means model with a fixed seed and keep its assignments.
kmeans = KMeans(n_clusters=3, random_state=42).fit(scaled_data)
labels = kmeans.labels_

# Store each row's cluster assignment alongside its raw values.
data['Cluster'] = labels

# Scatter the first two standardised columns, coloured by cluster.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    x=scaled_data[:, 0],
    y=scaled_data[:, 1],
    hue=labels,
    palette='viridis',
    ax=ax,
)
ax.set_title('K-Means Clustering (k=3)')
plt.show()

# Model-selection sweeps over the number of clusters.
#
# The models and label arrays fitted inside the sweeps are deliberately NOT
# bound to the module-level names `kmeans` / `labels`: those hold the k=3 fit
# from above and are read again further down (centroids, comparison plots,
# silhouette comparison). Rebinding them here would silently swap in the last
# sweep model (k=10).

# Elbow method: within-cluster sum of squares (inertia) for k = 1..10.
elbow_ks = range(1, 11)
wcss = [
    KMeans(n_clusters=k, random_state=42).fit(scaled_data).inertia_
    for k in elbow_ks
]

plt.figure(figsize=(10, 6))
plt.plot(elbow_ks, wcss, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

# Silhouette analysis: the score needs at least two clusters, so start at k=2.
silhouette_ks = range(2, 11)
silhouette_scores = []
for k in silhouette_ks:
    sweep_labels = KMeans(n_clusters=k, random_state=42).fit_predict(scaled_data)
    silhouette_scores.append(silhouette_score(scaled_data, sweep_labels))

plt.figure(figsize=(10, 6))
plt.plot(silhouette_ks, silhouette_scores, marker='o')
plt.title('Silhouette Scores')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Score')
plt.show()

# Map the fitted centroids from standardised space back to the original units.
# NOTE(review): this reads whichever model `kmeans` is bound to at this point;
# confirm it is the model whose assignments are stored in data['Cluster'].
centroids = kmeans.cluster_centers_
centroids_original = scaler.inverse_transform(centroids)

# Label the centroid columns with every column of `data` except the last one
# (the last column at this point is the appended 'Cluster' assignment).
feature_names = data.columns[:-1]
cluster_centers = pd.DataFrame(centroids_original, columns=feature_names)
print(cluster_centers)

# Bar chart of the per-cluster mean of every column, grouped by 'Cluster'.
cluster_means = data.groupby('Cluster').mean()
cluster_means.plot(kind='bar', figsize=(12, 8))
plt.title('Cluster Profiles based on Mean Spending')
plt.ylabel('Mean Spending')
plt.show()

# Agglomerative clustering on the standardised data using Ward linkage.
linked = linkage(scaled_data, method='ward')

# Cut the tree so that at most 3 flat clusters are produced, and store the
# assignment next to the raw values.
hc_labels = fcluster(linked, t=3, criterion='maxclust')
data['HC_Cluster'] = hc_labels

# Draw the full merge tree.
dendrogram_options = {
    'orientation': 'top',
    'distance_sort': 'ascending',
    'show_leaf_counts': True,
}
plt.figure(figsize=(12, 8))
dendrogram(linked, **dendrogram_options)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Sample index')
plt.ylabel('Distance')
plt.show()

# Side-by-side comparison of the K-Means and hierarchical cluster assignments.
#
# The K-Means assignments are read from data['Cluster'] (the k=3 fit stored
# when the model was first trained) rather than from the module-level
# `labels`, so the comparison does not depend on what `labels` was last bound
# to and always matches the 3 clusters requested from the hierarchical cut.
kmeans_labels = data['Cluster'].to_numpy()

fig, (ax_kmeans, ax_hc) = plt.subplots(1, 2, figsize=(12, 6))

sns.scatterplot(
    x=scaled_data[:, 0],
    y=scaled_data[:, 1],
    hue=kmeans_labels,
    palette='viridis',
    ax=ax_kmeans,
)
ax_kmeans.set_title('K-Means Clustering')

sns.scatterplot(
    x=scaled_data[:, 0],
    y=scaled_data[:, 1],
    hue=hc_labels,
    palette='viridis',
    ax=ax_hc,
)
ax_hc.set_title('Hierarchical Clustering')

plt.show()

# Silhouette score of each assignment, computed on the same standardised data.
kmeans_silhouette = silhouette_score(scaled_data, kmeans_labels)
hc_silhouette = silhouette_score(scaled_data, hc_labels)

print(f"K-Means Silhouette Score: {kmeans_silhouette}")
print(f"Hierarchical Clustering Silhouette Score: {hc_silhouette}")
