
Implementing K-Means Clustering on Customer Segments
Task: Apply K-Means clustering to the Mall Customers dataset to segment customers based on their annual income and spending score. Visualize the resulting clusters.


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Segment customers with K-Means on income and spending score, then plot the
# clusters and their centroids in the original (unscaled) units.

# The CSV is read from the notebook's working directory.
data = pd.read_csv('Mall_Customers.csv')

print(data.head())

# Only these two columns are used as clustering features.
income_col = 'Annual Income (k$)'
score_col = 'Spending Score (1-100)'
X = data[[income_col, score_col]]

# Standardize the features before clustering so neither column dominates the
# distance computation purely because of its numeric range.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Single source of truth for the cluster count; it drives both the model and
# the plotting loop below.
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
kmeans_clusters = kmeans.fit_predict(X_scaled)

data['Cluster'] = kmeans_clusters

plt.figure(figsize=(10, 7))

colors = ['red', 'blue', 'green', 'purple', 'orange']

# One scatter call per cluster so each cluster gets its own colour and legend
# entry. The colour list is cycled so a larger n_clusters cannot index past it.
for i in range(n_clusters):
    members = X.loc[data['Cluster'] == i]
    plt.scatter(members[income_col],
                members[score_col],
                s=100, c=colors[i % len(colors)], label=f'Cluster {i}')

# The model was fitted on scaled data, so its centres are mapped back to the
# original units before being drawn on the raw-valued axes.
centroids = scaler.inverse_transform(kmeans.cluster_centers_)
plt.scatter(centroids[:, 0], centroids[:, 1], s=300, c='black', marker='x', label='Centroids')

plt.title('Customer Segments (K-Means Clustering)')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()

Optimal Number of Clusters: Elbow Method and Silhouette Score
Task: Use the Elbow Method and Silhouette Score to find the optimal number of clusters for the Mall Customers dataset. Discuss the criteria for selecting the number of clusters.


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

# Evaluate candidate cluster counts with two diagnostics: the elbow curve
# (model inertia per k) and the average silhouette score per k.

data = pd.read_csv('Mall_Customers.csv')

feature_cols = ['Annual Income (k$)', 'Spending Score (1-100)']
X = data[feature_cols]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

k_range = range(1, 11)

# --- Elbow method: fit one model per k and record its inertia ---
inertia = []
for n_clusters in k_range:
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit(X_scaled)
    inertia.append(kmeans.inertia_)

fig_elbow, ax_elbow = plt.subplots(figsize=(10, 6))
ax_elbow.plot(k_range, inertia, marker='o')
ax_elbow.set_title('Elbow Method for Optimal Number of Clusters')
ax_elbow.set_xlabel('Number of Clusters')
ax_elbow.set_ylabel('Inertia')
ax_elbow.set_xticks(k_range)
ax_elbow.grid(True)
plt.show()

# --- Silhouette score: skip k=1, which is the single-cluster case ---
silhouette_k = k_range[1:]
silhouette_scores = []
for n_clusters in silhouette_k:
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    clusters = kmeans.fit_predict(X_scaled)
    silhouette_scores.append(silhouette_score(X_scaled, clusters))

fig_sil, ax_sil = plt.subplots(figsize=(10, 6))
ax_sil.plot(silhouette_k, silhouette_scores, marker='o')
ax_sil.set_title('Silhouette Scores for Optimal Number of Clusters')
ax_sil.set_xlabel('Number of Clusters')
ax_sil.set_ylabel('Silhouette Score')
ax_sil.set_xticks(silhouette_k)
ax_sil.grid(True)
plt.show()


Cluster Profiling and Insights
Task: Analyze the characteristics of the clusters formed in the Mall Customers dataset. Provide insights into the customer segments based on their spending behavior and income levels.


In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Profile the five K-Means clusters: print a per-cluster summary table and
# draw the segments next to a bar chart of the summary statistics.

data = pd.read_csv('Mall_Customers.csv')

# Only income and spending score are used as clustering features.
X = data[['Annual Income (k$)', 'Spending Score (1-100)']]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
data['Cluster'] = kmeans.fit_predict(X_scaled)

# numeric_only=True restricts the mean to numeric columns. Without it,
# pandas >= 2.0 raises if the CSV carries any non-numeric column
# (NOTE(review): e.g. a gender column -- confirm against the dataset).
cluster_profile = data.groupby('Cluster').mean(numeric_only=True)

# Per-cluster mean/std of both features plus the number of customers.
cluster_summary = data.groupby('Cluster').agg({
    'Annual Income (k$)': ['mean', 'std'],
    'Spending Score (1-100)': ['mean', 'std'],
    'CustomerID': 'count'
}).reset_index()

# Flatten the two-level column index produced by the multi-function agg.
cluster_summary.columns = ['Cluster', 'Income Mean', 'Income Std', 'Score Mean', 'Score Std', 'Number of Customers']

print(cluster_summary)

# Create both panels up front and pass the axes explicitly. DataFrame.plot
# opens a brand-new figure when no `ax` is given, which would leave the
# right-hand panel of this figure empty.
fig, (ax_scatter, ax_bar) = plt.subplots(1, 2, figsize=(14, 7))

sns.scatterplot(data=data, x='Annual Income (k$)', y='Spending Score (1-100)',
                hue='Cluster', palette='tab10', s=100, ax=ax_scatter)
ax_scatter.set_title('Customer Segments by Annual Income and Spending Score')
ax_scatter.set_xlabel('Annual Income (k$)')
ax_scatter.set_ylabel('Spending Score (1-100)')

cluster_summary.set_index('Cluster').plot(kind='bar', rot=0, ax=ax_bar)
ax_bar.set_title('Cluster Profiles')
ax_bar.set_xlabel('Cluster')
ax_bar.set_ylabel('Mean and Std')
ax_bar.grid(True)

fig.tight_layout()
plt.show()


Hierarchical Clustering for Customer Segmentation
Task: Implement hierarchical clustering on the Mall Customers dataset. Compare the clusters formed with those obtained from K-Means and discuss the differences.


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

# Compare Ward hierarchical clustering with K-Means on the same scaled
# features: dendrogram, per-cluster summaries, and side-by-side scatter plots.


def _summarize_by(frame, label_col):
    """Return per-cluster mean/std of income and score plus the member count.

    Groups ``frame`` by ``label_col`` and replaces the two-level column index
    produced by the aggregation with six flat, readable column names.
    """
    summary = frame.groupby(label_col).agg({
        'Annual Income (k$)': ['mean', 'std'],
        'Spending Score (1-100)': ['mean', 'std'],
        'CustomerID': 'count'
    }).reset_index()
    summary.columns = ['Cluster', 'Income Mean', 'Income Std', 'Score Mean', 'Score Std', 'Number of Customers']
    return summary


data = pd.read_csv('Mall_Customers.csv')

feature_cols = ['Annual Income (k$)', 'Spending Score (1-100)']
X = data[feature_cols]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Build the merge tree with Ward linkage on the scaled features.
linkage_matrix = linkage(X_scaled, method='ward')

# Truncated dendrogram: only the last 12 merged nodes are drawn.
plt.figure(figsize=(12, 8))
dendrogram(
    linkage_matrix,
    truncate_mode='lastp',
    p=12,
    show_leaf_counts=True,
    show_contracted=True,
)
plt.title('Dendrogram for Hierarchical Clustering')
plt.xlabel('Number of points in node')
plt.ylabel('Distance')
plt.show()

# Use the same cluster count for both methods so the results are comparable.
num_clusters = 5

hierarchical_clusters = fcluster(linkage_matrix, num_clusters, criterion='maxclust')
data['Hierarchical_Cluster'] = hierarchical_clusters

kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
data['KMeans_Cluster'] = kmeans.fit_predict(X_scaled)

kmeans_summary = _summarize_by(data, 'KMeans_Cluster')
hierarchical_summary = _summarize_by(data, 'Hierarchical_Cluster')

print("K-Means Clustering Summary:")
print(kmeans_summary)
print("\nHierarchical Clustering Summary:")
print(hierarchical_summary)

# Left panel: K-Means labels; right panel: hierarchical labels.
panels = [
    ('KMeans_Cluster', 'K-Means Clustering'),
    ('Hierarchical_Cluster', 'Hierarchical Clustering'),
]
fig, axes = plt.subplots(1, 2, figsize=(14, 7))
for ax, (label_col, panel_title) in zip(axes, panels):
    sns.scatterplot(data=data, x='Annual Income (k$)', y='Spending Score (1-100)',
                    hue=label_col, palette='tab10', s=100, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('Annual Income (k$)')
    ax.set_ylabel('Spending Score (1-100)')

fig.tight_layout()
plt.show()


Visualizing Clusters with PCA
Task: Apply PCA to the Mall Customers dataset to reduce its dimensionality. Visualize the clusters from both K-Means and hierarchical clustering in the PCA-reduced space.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import linkage, fcluster

# Project the scaled features with PCA and show the K-Means and hierarchical
# cluster labels side by side in the projected coordinates.

data = pd.read_csv('Mall_Customers.csv')

feature_cols = ['Annual Income (k$)', 'Spending Score (1-100)']
X = data[feature_cols]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# NOTE(review): X has two columns and n_components is 2, so this projection
# keeps every dimension (a change of axes rather than a reduction).
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

data['PCA1'], data['PCA2'] = X_pca[:, 0], X_pca[:, 1]

# Both clusterings are fitted on the scaled features, not on the PCA output.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
data['KMeans_Cluster'] = kmeans.fit_predict(X_scaled)

num_clusters = 5
linkage_matrix = linkage(X_scaled, method='ward')
data['Hierarchical_Cluster'] = fcluster(linkage_matrix, num_clusters, criterion='maxclust')

# Left panel: K-Means labels; right panel: hierarchical labels.
panels = [
    ('KMeans_Cluster', 'K-Means Clustering in PCA Space'),
    ('Hierarchical_Cluster', 'Hierarchical Clustering in PCA Space'),
]
fig, axes = plt.subplots(1, 2, figsize=(14, 7))
for ax, (label_col, panel_title) in zip(axes, panels):
    sns.scatterplot(data=data, x='PCA1', y='PCA2', hue=label_col,
                    palette='tab10', s=100, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('PCA Component 1')
    ax.set_ylabel('PCA Component 2')

fig.tight_layout()
plt.show()
