In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

In [None]:
# Read the raw customer table; the path is relative to the working directory.
csv_path = 'customer_segmentation_data.csv'
df_original = pd.read_csv(csv_path)
df_original.head()

FileNotFoundError: [Errno 2] No such file or directory: 'customer_segmentation_data.csv'

In [None]:
# Build a new frame without the 'id' column; df_original itself is not modified.
df_dropped = df_original.drop('id', axis=1)
df_dropped

In [None]:
# List the column labels that remain after 'id' was dropped.
df_dropped.columns

In [None]:
# Count of missing values in each column.
df_dropped.isna().sum()

In [None]:
# Number of rows that exactly repeat an earlier row.
df_dropped.duplicated().sum()

In [None]:
# One-hot encode the categorical columns and swap them for indicator columns.
categorical_cols = ["gender", "preferred_category"]

enc = OneHotEncoder(handle_unknown="ignore", drop=None, sparse_output=False)
X = enc.fit_transform(df_dropped[categorical_cols])
cols = enc.get_feature_names_out(categorical_cols)

# Re-attach the indicators to the data. The index has to come from df_dropped:
# `df` is only created by this very statement, so referring to `df.index` here
# raises NameError on a fresh kernel.
encoded = pd.DataFrame(X, columns=cols, index=df_dropped.index).astype(int)
df = df_dropped.join(encoded).drop(columns=categorical_cols)
df


In [None]:
# df_dummy = pd.get_dummies(df_dropped, columns=['gender','preferred_category'], drop_first=True, dtype=int)
# df_dummy

In [None]:
# Pairwise correlation between the columns of the encoded frame.
# TODO: group income into bins.
df.corr()
# EDA

In [None]:
# Only income and spending_score are standardised and used for clustering.
# Other numeric candidates, currently left out: age, membership_years,
# purchase_frequency, last_purchase_amount.
numeric_cols = ['income', 'spending_score']

# Indicator columns produced by the one-hot encoder.
# NOTE(review): 'gender_Female' is capitalised to match the other encoder
# output names ('gender_Male', 'gender_Other'); it was spelled 'gender_female'
# before — confirm against enc.get_feature_names_out().
cat_cols = [
    'gender_Male', 'gender_Other', 'gender_Female',
    'preferred_category_Electronics', 'preferred_category_Groceries',
    'preferred_category_Home & Garden', 'preferred_category_Sports',
    'preferred_category_Clothing',
]

# Scale on a copy so the unscaled encoded frame `df` stays available.
df_scaled = df.copy()

scaler = StandardScaler()
df_scaled[numeric_cols] = scaler.fit_transform(df_scaled[numeric_cols])
# Maybe also scale membership_years, purchase_frequency, last_purchase_amount


In [None]:
# Inspect the frame after the numeric columns were standardised.
df_scaled

In [None]:
# Feature matrix for clustering: only the standardised numeric columns.
X_scaled = df_scaled.loc[:, numeric_cols]

# K-Means

In [None]:
# Within-cluster sum of squares (inertia) for k = 1..10, used for the elbow plot.
k_values = range(1, 11)
wcss = [
    KMeans(n_clusters=k, init='k-means++', random_state=42).fit(X_scaled).inertia_
    for k in k_values
]

plt.plot(k_values, wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()


In [None]:
# Four clusters (k=4) seems to be the best choice.
kmeans = KMeans(
    n_clusters=4,
    init='k-means++',
    max_iter=500,
    random_state=42,
)
y = kmeans.fit_predict(X_scaled)

# Keep the fitted labels under their own name for the cells below.
labels_kmeans = kmeans.labels_
labels_kmeans

In [None]:
# Silhouette score of the k-means labelling on the scaled features.
score = silhouette_score(X_scaled, labels_kmeans)
print(score)

In [None]:
# Store each row's k-means label in a new 'Clusters' column of the scaled frame.
labels_kmeans = kmeans.labels_
df_scaled['Clusters'] = labels_kmeans

df_scaled.head()

In [None]:
## Summary statistics of the clusters
# Number of observations in each cluster
sns.countplot(x='Clusters', data=df_scaled)

In [None]:
# Cluster vs mean spending_score
#sns.barplot(data=df_scaled, x='Clusters', y='spending_score', errorbar=None, estimator='mean')

In [None]:
# Cluster vs Mean Income
#sns.barplot(data=df_scaled, x='Clusters', y='income', errorbar=None, estimator='mean')

In [None]:
# Numerical summary: per-cluster mean of every column, rounded to 2 decimals.
# (A former `df_scaled.astype(float)` statement was removed: its result was
# never assigned, so it had no effect on the summary.)
df_scaled.groupby('Clusters').mean().round(2)

- Cluster 0 - low income, low spend (−0.93, −0.92)
- Cluster 1 - high income, low spend (+0.82, −0.83)
- Cluster 2 - low income, high spend (−0.79, +0.83)
- Cluster 3 - high income, high spend (+0.94, +0.93)

In [None]:
# Next Step: Visualize observations of that particular cluster
# Cluster 1
#df_scaled.groupby()

In [None]:
# Attach each row's k-means label (from kmeans.fit_predict) to the original table.
df_original['Kmeans_Cluster'] = y
df_original

In [None]:
# Export the k-means result without the hierarchical-clustering labels.
# 'HC_Cluster' is only added to df_original further down the notebook, so on a
# fresh top-to-bottom run the column does not exist yet; errors='ignore' makes
# the drop succeed whether or not it is present (instead of raising KeyError).
df_original = df_original.drop(columns='HC_Cluster', errors='ignore')
df_original.to_csv("Kmeans_Clusters.csv", index=False, sep=";")

In [None]:
# Scatter plot of the observations, coloured by k-means cluster
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(
    X_scaled['income'],
    X_scaled['spending_score'],
    c=labels_kmeans,
    cmap='viridis',
    s=50,
)

# Overlay the fitted centroids (first coordinate on x, second on y)
centers = kmeans.cluster_centers_
ax.scatter(
    centers[:, 0],
    centers[:, 1],
    c='red',
    s=200,
    alpha=0.7,
    marker='X',
    label='Centroids',
)

ax.set_title('K-Means Clustering (Income vs Spending Score)')
ax.set_xlabel('Income')
ax.set_ylabel('Spending Score')
ax.legend()
plt.show()

# Hierarchical Clustering

In [None]:
import scipy.cluster.hierarchy as sch

# Build the Ward linkage on the scaled features, then draw its dendrogram.
linkage_matrix = sch.linkage(X_scaled, method='ward')
dendrogram = sch.dendrogram(linkage_matrix)

plt.title('Dendrogram')
plt.xlabel('Customers')
plt.ylabel('Euclidean distances')
plt.show()

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative clustering into 5 groups with Ward linkage on Euclidean distances.
hc = AgglomerativeClustering(
    n_clusters=5,
    metric='euclidean',
    linkage='ward',
)
y_hc = hc.fit_predict(X_scaled)


In [None]:
# Start from the scaled frame, replace the k-means 'Clusters' column with the
# hierarchical labels (one new column only).
df = (
    df_scaled
    .drop(columns=['Clusters'])
    .assign(HC_Cluster=lambda frame: pd.Series(y_hc, index=frame.index).astype(int))
)
df

In [None]:
# Summary statistics: per-cluster mean of every column, rounded to 2 decimals.
# (A former `df.astype(float)` statement was removed: its result was never
# assigned, so it had no effect on the summary.)
df.groupby('HC_Cluster').mean().round(2)

- Cluster 0 - low income, high spend (−0.82, +0.80)
- Cluster 1 - average income, low spend (+0.22, −0.69)
- Cluster 2 - high income, high spend (+0.99, +0.92)
- Cluster 3 - low income, low spend (−1.01, −1.15)
- Cluster 4 - high income, low spend (+1.32, −0.82)

Focus on Cluster 4, keep up with Cluster 2, and give more attention to Cluster 1.

In [None]:
# Observations that belong to hierarchical cluster 2 (independent copy)
in_cluster_2 = df['HC_Cluster'] == 2
df2 = df.loc[in_cluster_2].copy()
df2


In [None]:
# Count plots of every preferred-category indicator within cluster 2

preferred_categories = [
    'preferred_category_Clothing',
    'preferred_category_Electronics',
    'preferred_category_Groceries',
    'preferred_category_Home & Garden',
    'preferred_category_Sports',
]

# One figure per indicator column.
for category_col in preferred_categories:
    plt.figure(figsize=(6, 4))
    sns.countplot(x=category_col, data=df2, palette='pastel')
    plt.title(f'Distribution of {category_col}', fontsize=12)
    plt.xlabel(category_col)
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
# Cluster 4 (high income, low spend)
# Think of strategies that could increase the purchase rate
in_cluster_4 = df['HC_Cluster'] == 4
df4 = df.loc[in_cluster_4].copy()
df4

# One count plot per preferred-category indicator within cluster 4

for category_col in preferred_categories:
    plt.figure(figsize=(6, 4))
    sns.countplot(x=category_col, data=df4, palette='pastel')
    plt.title(f'Distribution of {category_col}', fontsize=12)
    plt.xlabel(category_col)
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
# Cluster centroids = mean position of the members of each hierarchical cluster

# Human-readable name for every cluster id
cluster_labels = {
    0: "Low income, high spend",
    1: "Avg income, low spend",
    2: "High income, high spend",
    3: "Low income, low spend",
    4: "High income, low spend"
}

# First and second feature columns as plain arrays
x1 = X_scaled.iloc[:, 0].to_numpy()
x2 = X_scaled.iloc[:, 1].to_numpy()

# (cluster id, mean of x1, mean of x2) for each distinct label in y_hc
centroids = [
    (cluster_id, x1[y_hc == cluster_id].mean(), x2[y_hc == cluster_id].mean())
    for cluster_id in np.unique(y_hc)
]

plt.figure(figsize=(8, 6))
plt.scatter(x1, x2, c=y_hc, cmap='rainbow', s=50)
plt.title('Agglomerative Clustering (2D)')
plt.xlabel(X_scaled.columns[0])
plt.ylabel(X_scaled.columns[1])

# Mark each centroid and write the cluster's name just above it
for cluster_id, cx, cy in centroids:
    plt.scatter(cx, cy, s=200, c='black', marker='x')
    plt.text(
        cx, cy, cluster_labels[cluster_id],
        fontsize=10, ha='center', va='bottom', color='black', weight='bold',
    )

plt.show()

In [None]:
# A/B Test
# F-Test

In [None]:
# Expose two different groups (Cluster 1 and Cluster 2) to the same market offer (e.g. a subscription) and see if there's any difference.
# Or run a controlled experiment for a ... (TODO: complete this note)

In [None]:
# Attach each row's hierarchical-clustering label to the original table.
df_original['HC_Cluster'] = y_hc
df_original

In [None]:
# Export the labelled table as a semicolon-separated CSV, without the index column.
df_original.to_csv("HC_Clusters.csv", index=False, sep=";")

In [None]:


# Profile of the hierarchical clusters: mean of each numeric attribute plus a
# row count taken from each of the two categorical columns.
# (A stray closing brace after the aggfunc dict made this cell a SyntaxError.)
pivot_1 = df_original.pivot_table(
    index='HC_Cluster',
    values=['age', 'income', 'spending_score', 'membership_years',
            'purchase_frequency', 'last_purchase_amount', 'gender', 'preferred_category'],
    aggfunc={
        'age': 'mean',
        'income': 'mean',
        'spending_score': 'mean',
        'membership_years': 'mean',
        'purchase_frequency': 'mean',
        'last_purchase_amount': 'mean',
        'gender': 'count',               # count for the last two columns
        'preferred_category': 'count',
    },
)

pivot_1

In [None]:
# Profile of the k-means clusters: mean of each numeric attribute plus a row
# count taken from each of the two categorical columns.
# (A stray closing brace after the aggfunc dict made this cell a SyntaxError.)
pivot_2 = df_original.pivot_table(
    index='Kmeans_Cluster',
    values=['age', 'income', 'spending_score', 'membership_years',
            'purchase_frequency', 'last_purchase_amount', 'gender', 'preferred_category'],
    aggfunc={
        'age': 'mean',
        'income': 'mean',
        'spending_score': 'mean',
        'membership_years': 'mean',
        'purchase_frequency': 'mean',
        'last_purchase_amount': 'mean',
        'gender': 'count',               # count for the last two columns
        'preferred_category': 'count',
    },
)

pivot_2