# Customer Segmentation - ML Experimentation

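This notebook experiments with K-Means clustering for customer segmentation: it loads the customer dataset, standardizes four features (age, annual income, spending score, purchase frequency), compares cluster counts using inertia and silhouette score, then trains and visualizes the final model.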

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Plot appearance: seaborn 'whitegrid' theme and a 12x6 default figure size
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## Load Data

In [None]:
# Read the customer CSV into `df`, report its (rows, columns) shape, and preview the first rows
csv_path = '../data/customers.csv'
df = pd.read_csv(csv_path)
dataset_shape = df.shape
print(f"Dataset shape: {dataset_shape}")
df.head(5)

## Exploratory Data Analysis

In [None]:
# Descriptive statistics of the dataset (pandas `describe` defaults)
summary_stats = df.describe()
summary_stats

In [None]:
# Number of missing entries in each column
df.isna().sum()

## Feature Scaling

In [None]:
# Columns used as clustering inputs
features = ['Age', 'Annual_Income', 'Spending_Score', 'Purchase_Frequency']

# Fail fast on missing values in the selected columns. Without this check a NaN
# would only surface later, when KMeans is fit on X_scaled, far from its cause.
missing_per_feature = df[features].isnull().sum()
if missing_per_feature.any():
    raise ValueError(
        "Missing values found in clustering features; clean or impute them first:\n"
        f"{missing_per_feature[missing_per_feature > 0]}"
    )

# Feature matrix as a NumPy array, one column per entry in `features`
X = df[features].to_numpy()

# Standardize each feature so that no single column dominates the distance computation
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("Features scaled successfully")

## Elbow Method and Silhouette Analysis

In [None]:
# Sweep candidate cluster counts, recording inertia and silhouette score for each k
K_range = range(2, 11)
inertias, silhouette_scores = [], []

for k in K_range:
    # fit() returns the estimator itself, so construction and fitting are chained
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X_scaled)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))

# Draw both diagnostics side by side: inertia (elbow curve) left, silhouette right
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(15, 5))
diagnostic_panels = (
    (ax1, inertias, 'bo-', 'Inertia', 'Elbow Method'),
    (ax2, silhouette_scores, 'ro-', 'Silhouette Score', 'Silhouette Score vs K'),
)
for panel_ax, panel_values, line_format, y_label, panel_title in diagnostic_panels:
    panel_ax.plot(K_range, panel_values, line_format)
    panel_ax.set_xlabel('Number of Clusters (k)')
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(panel_title)
    panel_ax.grid(True)

plt.tight_layout()
plt.show()

# The k with the highest silhouette score is taken as the optimal cluster count
best_position = int(np.argmax(silhouette_scores))
optimal_k = K_range[best_position]
print(f"Optimal number of clusters: {optimal_k}")

## Train Final Model

In [None]:
# Fit the final K-Means model using the cluster count chosen above
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42).fit(X_scaled)
clusters = kmeans.labels_  # per-row cluster assignment from the fitted model

# Store each row's cluster assignment as a new 'Cluster' column on the dataframe
df['Cluster'] = clusters

# Report clustering quality metrics for the final model
final_silhouette = silhouette_score(X_scaled, clusters)
final_inertia = kmeans.inertia_
print(f"Silhouette Score: {final_silhouette:.3f}")
print(f"Inertia: {final_inertia:.2f}")

## Cluster Analysis

In [None]:
# Per-cluster feature means, plus the number of rows assigned to each cluster
rows_by_cluster = df.groupby('Cluster')
cluster_stats = rows_by_cluster[features].mean().assign(Count=rows_by_cluster.size())
cluster_stats

## Visualizations

In [None]:
# Scatter plots of feature pairs, coloured by assigned cluster.
# Each panel is described by (x column, y column, title); axis labels are the
# column names with underscores replaced by spaces.
scatter_panels = [
    ('Age', 'Annual_Income', 'Age vs Annual Income'),
    ('Annual_Income', 'Spending_Score', 'Income vs Spending Score'),
    ('Age', 'Spending_Score', 'Age vs Spending Score'),
    ('Purchase_Frequency', 'Spending_Score', 'Purchase Frequency vs Spending Score'),
]

fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# axes.flat walks the 2x2 grid row by row, matching the order of scatter_panels
for ax, (x_col, y_col, title) in zip(axes.flat, scatter_panels):
    points = ax.scatter(df[x_col], df[y_col], c=df['Cluster'], cmap='viridis', alpha=0.6)
    ax.set_xlabel(x_col.replace('_', ' '))
    ax.set_ylabel(y_col.replace('_', ' '))
    ax.set_title(title)
    # Legend mapping colours to cluster labels, so each panel is readable on its own
    ax.legend(*points.legend_elements(), title='Cluster')

plt.tight_layout()
plt.show()