# Task 2: Customer Segmentation

This notebook implements customer segmentation using K-Means clustering on the Mall Customers dataset.

- Load and explore the Mall Customers dataset
- Perform data preprocessing and scaling
- Apply Elbow Method and Silhouette Analysis for optimal cluster selection
- Implement K-Means clustering
- Visualize clusters and analyze customer segments
- Bonus: DBSCAN clustering comparison
- Generate business insights and recommendations

> Dataset: Mall Customers. Only the Annual Income (k$) and Spending Score (1-100) columns are used as clustering features.


In [None]:
# Imports and setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import warnings
# Install a blanket "ignore" filter: no category or module is given, so every
# warning raised after this point is suppressed.
# NOTE(review): this also hides deprecation/convergence warnings emitted by the
# libraries used below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Set style for better plots
# Select matplotlib's 'default' style first, then make seaborn's "husl" palette
# the active colour palette. Presumably the order matters (a later style change
# could override the palette) — keep the two calls in this sequence.
plt.style.use('default')
sns.set_palette("husl")


In [None]:
# Read the Mall Customers CSV into the working DataFrame
df = pd.read_csv('data/Mall_Customers.csv')

# Report the frame's dimensions and column names, one per line
print(
    f"Dataset shape: {df.shape}",
    f"Columns: {df.columns.tolist()}",
    sep="\n",
)

# Trailing expression so the notebook renders a preview of the first rows
df.head()


In [None]:
# Data preprocessing and feature selection
# Candidate header spellings for each feature, checked in order; the first one
# that exists in df.columns is used.
income_cols = ['Annual Income (k$)', 'Annual_Income_(k$)', 'Annual_Income', 'AnnualIncome']
spending_cols = ['Spending Score (1-100)', 'Spending_Score_(1-100)', 'Spending_Score', 'SpendingScore']

income_col = next((col for col in income_cols if col in df.columns), None)
spending_col = next((col for col in spending_cols if col in df.columns), None)

# Fail fast with an actionable message when a feature column cannot be
# resolved; otherwise df[[None, ...]] would raise an opaque KeyError below.
missing_features = [
    label
    for label, resolved in (("annual income", income_col), ("spending score", spending_col))
    if resolved is None
]
if missing_features:
    raise KeyError(
        f"Could not find a column for: {', '.join(missing_features)}. "
        f"Available columns: {df.columns.tolist()}"
    )

print(f"Using columns: {income_col}, {spending_col}")
# Work on a copy so later changes to df do not affect the feature matrix.
X = df[[income_col, spending_col]].copy()

# Feature scaling
# Standardise both features so neither dominates the distance computations
# used by the clustering algorithms in the following cells. The fitted scaler
# is kept so cluster centres can be mapped back to original units later.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print("Features scaled using StandardScaler")


In [None]:
# Find optimal clusters and apply K-Means
# Sweep candidate values of k, recording for each one the inertia (used for the
# elbow plot) and the mean silhouette score (used to pick the best k).
inertias = []
silhouette_scores = []
K_range = range(2, 11)
fitted_models = {}  # k -> fitted KMeans, so the winner need not be refit

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))
    fitted_models[k] = kmeans

# The k with the highest silhouette score wins (first one on ties).
best_idx = int(np.argmax(silhouette_scores))
optimal_k = K_range[best_idx]
print(f"Optimal number of clusters: {optimal_k}")

# Show the diagnostics behind the choice: the elbow curve and the silhouette
# curve, with the selected k marked on the latter.
fig, (ax_elbow, ax_silhouette) = plt.subplots(1, 2, figsize=(12, 4))

ax_elbow.plot(list(K_range), inertias, marker='o')
ax_elbow.set_title('Elbow Method')
ax_elbow.set_xlabel('Number of clusters (k)')
ax_elbow.set_ylabel('Inertia')

ax_silhouette.plot(list(K_range), silhouette_scores, marker='o')
ax_silhouette.axvline(optimal_k, color='red', linestyle='--',
                      label=f'Selected k = {optimal_k}')
ax_silhouette.set_title('Silhouette Analysis')
ax_silhouette.set_xlabel('Number of clusters (k)')
ax_silhouette.set_ylabel('Silhouette score')
ax_silhouette.legend()

fig.tight_layout()
plt.show()

# Apply final clustering
# Reuse the model already fitted for optimal_k during the sweep: it was built
# with the same data, seed and n_init, so refitting would reproduce it exactly.
kmeans_final = fitted_models[optimal_k]
cluster_labels = kmeans_final.labels_
df['Cluster'] = cluster_labels

print(f"Silhouette Score: {silhouette_scores[best_idx]:.3f}")


In [None]:
# Visualize clusters
# Fit DBSCAN before drawing so the plotting code below only renders results.
# DBSCAN labels points it considers noise as -1.
dbscan = DBSCAN(eps=0.6, min_samples=5)
dbscan_labels = dbscan.fit_predict(X_scaled)

# K-Means centres are in scaled space; map them back to the original units so
# they can be drawn on the same axes as the raw features.
centroids_original = scaler.inverse_transform(kmeans_final.cluster_centers_)

fig, (ax_kmeans, ax_dbscan) = plt.subplots(1, 2, figsize=(12, 5))

# K-Means clusters
scatter1 = ax_kmeans.scatter(X[income_col], X[spending_col],
                             c=cluster_labels, cmap='viridis', alpha=0.7, s=50)
ax_kmeans.scatter(centroids_original[:, 0], centroids_original[:, 1],
                  c='red', marker='x', s=200, linewidths=3, label='Centroids')
ax_kmeans.set_title(f'K-Means Clustering (k={optimal_k})')
ax_kmeans.set_xlabel('Annual Income (k$)')
ax_kmeans.set_ylabel('Spending Score (1-100)')
ax_kmeans.legend()
# Cluster ids are discrete: tick the colorbar at the actual labels only,
# instead of the fractional ticks a continuous colorbar would show.
fig.colorbar(scatter1, ax=ax_kmeans, label='Cluster',
             ticks=np.unique(cluster_labels))

# DBSCAN comparison
scatter2 = ax_dbscan.scatter(X[income_col], X[spending_col],
                             c=dbscan_labels, cmap='plasma', alpha=0.7, s=50)
ax_dbscan.set_title('DBSCAN Clustering')
ax_dbscan.set_xlabel('Annual Income (k$)')
ax_dbscan.set_ylabel('Spending Score (1-100)')
fig.colorbar(scatter2, ax=ax_dbscan, label='Cluster',
             ticks=np.unique(dbscan_labels))

fig.tight_layout()
plt.show()

# Summarise the DBSCAN result numerically; the noise label (-1) is not a cluster.
n_noise = int(np.sum(dbscan_labels == -1))
n_dbscan_clusters = len(np.unique(dbscan_labels)) - (1 if n_noise > 0 else 0)
print(f"DBSCAN found {n_dbscan_clusters} clusters and {n_noise} noise points")

print("Analysis complete! Check the visualizations and cluster characteristics.")
