# Environment setup and dataset loading

In [None]:
import numpy as np
import pandas as pd
import datetime
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import colors
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt, numpy as np
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import AgglomerativeClustering
from matplotlib.colors import ListedColormap
from sklearn import metrics
import warnings
import sys
# Silence library warnings unless the interpreter was started with explicit
# warning options (sys.warnoptions is empty when none were given).
if not sys.warnoptions:
    warnings.simplefilter("ignore")

# Seed NumPy's global random generator so code drawing from it is repeatable.
np.random.seed(42)

import os

# Print the full path of every file found below the Kaggle input directory.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)


: 

Note: update the file path depending on where your dataset is. 

In [None]:
# Location of the raw CSV; change this when running outside Kaggle.
DATA_PATH = "/kaggle/input/customer-behavior-analytcis/customer_behavior_analytcis.csv"

# Load the dataset into the working DataFrame used by the rest of the notebook.
df = pd.read_csv(DATA_PATH)

## Understanding Dataset

In [None]:
# Show the dataset dimensions as (rows, columns).
df.shape

In [None]:
# Preview the first rows to check column names and value formats.
df.head()

In [None]:
# Print the frequency table of each column, one column after another.
for column_name in df.columns:
    frequencies = df[column_name].value_counts()
    print(frequencies)
    

# Data Cleaning

In [None]:
# Summarise each column's dtype and non-null count to spot missing values.
df.info()

**From the above output, we can conclude and note that:**

- There are missing values in total_purchases, avg_cart_value, and product_click.
- customer_id (dtype: object) is not parsed as int/ float.
- So we will need to format customer_id into numeric form.

In [None]:
# Fill the missing values of every column that has any with that column's median.
#
# The filled Series is assigned back to the frame instead of calling
# ``df[column].fillna(..., inplace=True)``: the in-place form works on an
# intermediate Series (chained assignment), which pandas deprecates and which
# no longer updates ``df`` once Copy-on-Write is enabled.
for column in df.columns:
    if df[column].isnull().any():
        df[column] = df[column].fillna(df[column].median())

# Data preprocessing

In [None]:
# Convert customer_id to an integer by keeping the first run of digits found
# in each value.
#
# - The pattern is a raw string so ``\d`` is not treated as an (invalid)
#   string escape sequence.
# - ``expand=False`` makes ``str.extract`` return a Series rather than a
#   one-column DataFrame, which is the natural right-hand side for a column
#   assignment.
# - The dtype guard makes the cell safe to re-run: once the column is numeric
#   the ``.str`` accessor is no longer available and the conversion is skipped.
if not pd.api.types.is_numeric_dtype(df['customer_id']):
    df['customer_id'] = (
        df['customer_id'].str.extract(r'(\d+)', expand=False).astype(int)
    )

In [None]:
# Summary statistics of the numeric columns after cleaning.
df.describe()

In [None]:
# Standardise every column of a copy of df so the raw values in df stay intact.
# NOTE(review): the copy still contains customer_id, so the identifier is
# scaled (and later fed to PCA) together with the behavioural features —
# confirm this is intended.
ds = df.copy()
scaler = StandardScaler().fit(ds)
scaled_values = scaler.transform(ds)
scaled_ds = pd.DataFrame(scaled_values, columns=ds.columns)
print("All features are now scaled")

In [None]:
# Summary statistics of the standardised features (useful for spotting extreme values).
scaled_ds.describe()

Note\
discount_counts seems to be spread further from its mean than the other features: max > 3 and min < -3.

## Dimension Reduction

### PCA

In [None]:
# Reduce the scaled data to three principal components and keep them in a
# DataFrame whose columns are named col1..col3.
pca = PCA(n_components=3).fit(scaled_ds)
PCA_ds = pd.DataFrame(
    pca.transform(scaled_ds),
    columns=["col1", "col2", "col3"],
)
# Transposed summary: one row per component.
PCA_ds.describe().T

In [None]:
# Coordinates of every observation in the reduced space; x, y and z are reused
# by the cluster plots further down.
x, y, z = PCA_ds["col1"], PCA_ds["col2"], PCA_ds["col3"]

# Unlabelled 3D scatter of the three components.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection="3d")
ax.scatter(x, y, z, marker="o", c="blue")
ax.set_title("A 3D Projection Of Data In The Reduced Dimension")
plt.show()

The 3D projection shows the data separating into three clusters.

# EDA

In [None]:
# Histogram of every numeric column (50 bins each) to inspect the distributions.
df.hist(figsize=(16, 20), bins=50, xlabelsize=8, ylabelsize=8)

None of the features appears to be normally distributed in the histograms above.

In [None]:
# Draw one box plot per column of df on a grid that is num_cols panels wide.
# seaborn (sns) and pyplot (plt) come from the notebook's import cell.
columns_to_plot = df.columns

num_cols = 5
# At least one row so plt.subplots never receives a zero-sized grid.
num_rows = max(1, int(np.ceil(len(columns_to_plot) / num_cols)))

# squeeze=False always returns a 2-D array of axes, so flatten() works for
# every grid shape.
fig, axes = plt.subplots(
    nrows=num_rows,
    ncols=num_cols,
    figsize=(15, 4 * num_rows),
    squeeze=False,
)
axes = axes.flatten()

for i, column in enumerate(columns_to_plot):
    sns.boxplot(y=df[column], ax=axes[i])
    axes[i].set_title(f"Boxplot - {column}")
    axes[i].grid(False)

# The grid usually has more panels than there are columns; hide the spare
# ones instead of showing empty axes.
for spare_ax in axes[len(columns_to_plot):]:
    spare_ax.set_visible(False)

plt.tight_layout()
plt.show()

discount_counts has outliers, as shown in its box plot ("Boxplot - discount_counts").

In [None]:
# Apply a warm background to every figure drawn from this cell onwards.
sns.set(rc={"axes.facecolor":"#FFF9ED","figure.facecolor":"#FFF9ED"})
pallet = ["#682F2F", "#9E726F", "#D6B2B1", "#B9C0C9", "#9F8A78", "#F3AB60"]
# Colormap built from the same six colours; the 3D cluster scatter plots later
# in the notebook pass it as their cmap.
cmap = colors.ListedColormap(pallet)
To_Plot = ["total_purchases","avg_cart_value", "total_time_spent", "product_click", "discount_counts", "customer_id"]
print("Reletive Plot Of Features")
# pairplot is a figure-level function that creates its own figure, so no
# plt.figure() call precedes it (that only produced an extra empty figure).
# No palette is passed because a palette has no effect without a hue variable.
sns.pairplot(scaled_ds[To_Plot])
plt.show()

product_click seems to have outliers\
discount_counts seems to have outliers

The avg_cart_value vs total_purchases pair clearly shows the dataset clustering into 3 categories\
The avg_cart_value vs product_click pair clearly shows the dataset clustering into 3 categories\
The avg_cart_value vs total_time_spent pair clearly shows the dataset clustering into 3 categories

In [None]:
# Pairwise correlations of the scaled features, drawn as an annotated heatmap
# with a diverging colour scale centred on zero.
corrmat = scaled_ds.corr()
plt.figure(figsize=(20, 20))
sns.heatmap(
    corrmat,
    annot=True,
    center=0,
    cmap="coolwarm",
)

Note\
total_purchases has a strong positive linear relationship with discount_counts\
total_time_spent has a strong positive linear relationship with product_click\
total_time_spent has a strong negative linear relationship with total_purchases


# Model Selection

## Clustering

Steps involved in the Clustering

- Elbow Method to determine the number of clusters to be formed
- Clustering via different models
- Examining the clusters formed via scatter plot


In [None]:
# Fit the elbow visualizer on the PCA components to pick the number of
# clusters, then render its plot.
elbow_estimator = KMeans()
Elbow_M = KElbowVisualizer(elbow_estimator, k=10)
Elbow_M.fit(PCA_ds)
Elbow_M.show()

The elbow plot above suggests that there are 3 clusters.

In [None]:
# Separate copy of df that receives the per-model label columns
# (e.g. Clusters_MeanShift, Clusters_GMM) added by the following cells.
data = df.copy()

## Mean Shift Clustering

In [None]:
from sklearn.cluster import MeanShift
from sklearn.metrics import silhouette_score

# Cluster the PCA components with Mean Shift and store the labels on `data`.
mean_shift = MeanShift()
yhat_MeanShift = mean_shift.fit_predict(PCA_ds)
data['Clusters_MeanShift'] = yhat_MeanShift

# 3D scatter of the components coloured by the Mean Shift labels.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d', label="bla")
ax.scatter(x, y, z, c=data['Clusters_MeanShift'], cmap=cmap, s=40, marker='o')
ax.set_title("The Plot Of The Clusters")
plt.show()

# Silhouette score of the Mean Shift labels on the PCA components.
score = silhouette_score(PCA_ds, data['Clusters_MeanShift'])
print(f'Silhouette Score: {score}')

## Gaussian Mixture Model (GMM)

In [None]:
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture

# Fit a three-component Gaussian mixture on the PCA components and store the
# predicted component of each row on `data`.
gmm = GaussianMixture(n_components=3, random_state=42)
yhat_GMM = gmm.fit_predict(PCA_ds)
data['Clusters_GMM'] = yhat_GMM

# 3D scatter of the components coloured by the GMM labels.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d', label="bla")
ax.scatter(x, y, z, c=data['Clusters_GMM'], cmap=cmap, s=40, marker='o')
ax.set_title("The Plot Of The Clusters")
plt.show()

# Silhouette score of the GMM labels on the PCA components.
score = silhouette_score(PCA_ds, data['Clusters_GMM'])
print(f'Silhouette Score: {score}')

## K Means Clustering

In [None]:
from sklearn.metrics import silhouette_score

# Feature matrix: the three principal components only. Selecting them by name
# keeps the "Clusters" label column (written to PCA_ds below) out of both the
# model input and the silhouette computation, so the score is comparable with
# the other models and the cell gives the same result when re-run.
pca_features = PCA_ds[["col1", "col2", "col3"]]

kmeans = KMeans(n_clusters=3, random_state=42)
PCA_ds["Clusters"] = df['Clusters'] = kmeans.fit_predict(pca_features)

# 3D scatter of the components coloured by the K-Means labels.
fig = plt.figure(figsize=(10,8))
ax = plt.subplot(111, projection='3d', label="bla")
ax.scatter(x, y, z, s=40, c=PCA_ds["Clusters"], marker='o', cmap = cmap )
ax.set_title("The Plot Of The Clusters")
plt.show()

# Silhouette score on the features alone (labels must not be part of X).
score = silhouette_score(pca_features, df['Clusters'])
print(f'Silhouette Score: {score}')

## Agglomerative Clustering model

In [None]:
from sklearn.metrics import silhouette_score

# Feature matrix: the three principal components only. PCA_ds may already
# carry a "Clusters" column written by an earlier clustering cell; selecting
# the component columns by name keeps those labels from being used as a
# feature here and from inflating the silhouette score.
pca_features = PCA_ds[["col1", "col2", "col3"]]

AC = AgglomerativeClustering(n_clusters=3)
yhat_AC = AC.fit_predict(pca_features)
# Store the final labels on both frames; the evaluation plots read df["Clusters"].
# Cluster ids are arbitrary, so re-check which id matches which customer
# segment whenever this cell's input changes.
PCA_ds["Clusters"] = yhat_AC
df["Clusters"]= yhat_AC

# 3D scatter of the components coloured by the agglomerative labels.
fig = plt.figure(figsize=(10,8))
ax = plt.subplot(111, projection='3d', label="bla")
ax.scatter(x, y, z, s=40, c=PCA_ds["Clusters"], marker='o', cmap = cmap )
ax.set_title("The Plot Of The Clusters")
plt.show()

# Silhouette score on the features alone (labels must not be part of X).
score = silhouette_score(pca_features, df['Clusters'])
print(f'Silhouette Score: {score}')

Selected Model -  Agglomerative Clustering

Agglomerative Clustering was chosen for its ability to identify customer segments of varying shapes and densities, without assuming spherical clusters like K-Means. Its hierarchical nature, which can be visualized through a dendrogram (not plotted in this notebook), provides valuable insights into the relationships between clusters and supports interpretable segmentation strategies. Given the potential for non-linear relationships in customer behavior, this flexibility makes Agglomerative Clustering a robust approach for identifying distinct customer groups within the e-commerce platform data.




# Model Evaluation

In [None]:
# Colours shared by the cluster evaluation plots below.
pal = ["#682F2F","#B9C0C9", "#9F8A78","#F3AB60"]

# Number of customers assigned to each cluster.
pl = sns.countplot(data=df, x="Clusters", palette=pal)
pl.set_title("Distribution Of The Clusters")
plt.show()

The clusters seem to be fairly evenly distributed.

In [None]:
# Time spent against purchases, one point per customer, coloured by cluster.
pl = sns.scatterplot(
    data=df,
    x="total_time_spent",
    y="total_purchases",
    hue="Clusters",
    palette=pal,
)
pl.set_title("Cluster's Profile Based On total_time_spent and total_purchases")
plt.legend()
plt.show()

total_time_spent vs  total_purchases shows the clusters pattern

cluster 0: high total_time_spent & low total_purchases\
cluster 1: low total_time_spent & high total_purchases\
cluster 2: average total_time_spent & average total_purchases


In [None]:
# Distribution of total purchases per cluster: a faint swarm of the individual
# customers with a boxen plot drawn over it.
plt.figure()
pl = sns.swarmplot(data=df, x="Clusters", y="total_purchases", color="#CBEDDD", alpha=0.5)
pl = sns.boxenplot(data=df, x="Clusters", y="total_purchases", palette=pal)
pl.set_title("Total Purchases")
plt.show()

From the above plot, it can be clearly seen that cluster 1 makes the most purchases, followed by cluster 2.

In [None]:
# Distribution of discount_counts within each cluster.
plt.figure()
pl = sns.boxenplot(data=df, x="Clusters", y="discount_counts", palette=pal)
pl.set_title("Discount Counts")
plt.show()

Cluster 1 (bargain hunters) claims the highest discount counts.

In [None]:
# Distribution of product_click within each cluster.
plt.figure()
pl = sns.boxenplot(data=df, x="Clusters", y="product_click", palette=pal)
pl.set_title("Number of Product Clicks")
plt.show()

Cluster 0 (window shoppers) views a large number of products, whilst the other clusters have moderate product clicks.

In [None]:
# Distribution of avg_cart_value within each cluster.
plt.figure()
pl = sns.boxenplot(data=df, x="Clusters", y="avg_cart_value", palette=pal)
pl.set_title("Average Cart Values")
plt.show()

Cluster 2 can be identified as the high spenders: it has high average cart values, as described in the question, which supports our clustering. The other two groups have moderate average cart values.

# Conclusion

## Identifying Clusters

The clusters follow the patterns given in the question\
cluster 0: Window Shoppers \
cluster 1: Bargain Hunters\
cluster 2: High Spenders