# Market Segmentation with K-Means and Hierarchical Clustering

In [None]:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.utils import resample

# Data placeholder (MD_x) - Replace with your data
# X = np.load('path_to_your_data.npy')  # Example placeholder for loading data
# Simulated stand-in data (150 rows, 10 columns). The generator is seeded so
# the example produces the same data, and hence the same plots, on every run.
X = np.random.default_rng(1234).random((150, 10))

# Fit k-means for 2 to 8 segments, keeping each solution's inertia and labels.
k_values = range(2, 9)
inertia_values = []
labels = []

for k in k_values:
    # n_init=10 restarts guard against poor local optima; the fixed
    # random_state keeps the centroid initialisation repeatable.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=1234)
    kmeans.fit(X)
    inertia_values.append(kmeans.inertia_)
    labels.append(kmeans.labels_)

# Scree plot: inertia (sum of squared distances to the closest centroid)
# against the number of segments.
plt.plot(k_values, inertia_values, marker='o')
plt.xlabel('Number of Segments')
plt.ylabel('Sum of Squared Within-Cluster Distances')
plt.title('Scree Plot for K-means Clustering')
plt.show()


In [None]:

# For stability analysis, perform bootstrapping
def bootstrap_stability(X, n_clusters, n_bootstrap=100):
    """Estimate how stable a k-means solution is under bootstrap resampling.

    A reference k-means model is fitted once on ``X``. For every bootstrap
    replicate a new model is fitted on a resampled copy of ``X`` and then
    used to assign each row of the original ``X`` to a cluster. The adjusted
    Rand index (ARI) between that assignment and the reference assignment is
    recorded.

    Args:
        X: Data matrix passed to ``KMeans`` and ``resample``.
        n_clusters: Number of clusters to extract.
        n_bootstrap: Number of bootstrap replicates to draw.

    Returns:
        A list of ``n_bootstrap`` ARI values, one per replicate.
    """
    # Reference partition of the original data. Every replicate is compared
    # against this, so scores are comparable across replicates.
    reference_labels = KMeans(
        n_clusters=n_clusters, n_init=10, random_state=1234
    ).fit_predict(X)

    ari_scores = []
    for _ in range(n_bootstrap):
        # No random_state is given, so each call draws a different sample.
        X_sample = resample(X)
        kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=1234)
        kmeans.fit(X_sample)
        # Label the ORIGINAL rows with the bootstrap model. Comparing the
        # model's training labels with themselves would always give ARI 1.0.
        labels = kmeans.predict(X)
        ari_scores.append(adjusted_rand_score(reference_labels, labels))
    return ari_scores

# Run the bootstrap stability check for a four-cluster solution.
ari_scores = bootstrap_stability(X, n_clusters=4)

# Summarise the resulting ARI values in a single box plot.
stability_ax = plt.gca()
stability_ax.boxplot(ari_scores)
stability_ax.set_xlabel('Cluster Stability')
stability_ax.set_ylabel('Adjusted Rand Index')
stability_ax.set_title('Bootstrap Stability for k=4 Clusters')
plt.show()


In [None]:

from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering

import pandas as pd

# Build the linkage matrix for X using Ward's criterion.
Z = linkage(X, method='ward')

# Draw the merge tree on a 10x7 inch figure.
plt.figure(figsize=(10, 7))
dendrogram(Z)
plt.gca().set_title('Dendrogram for Hierarchical Clustering (Ward method)')
plt.show()

# Extract a three-segment solution with agglomerative clustering.
agg_clustering = AgglomerativeClustering(n_clusters=3)
agg_labels = agg_clustering.fit(X).labels_

# Segment sizes: number of rows assigned to each cluster label.
pd.Series(agg_labels).value_counts()
