import libraries

In [None]:
import pandas as pd                                                 # data processing    
from tqdm import tqdm                                               # progress bar
from sklearn.preprocessing import OneHotEncoder                     # features normalization
import numpy as np

from kmodes.kprototypes import KPrototypes
from sklearn.cluster import KMeans                                  # clustering
from sklearn.cluster import MiniBatchKMeans                         # clustering low-cost
from sklearn.metrics import silhouette_score as ss 
from sklearn.metrics import silhouette_samples                      # model config
from yellowbrick.cluster import SilhouetteVisualizer                # model metrics viz

from sklearn.preprocessing import MinMaxScaler, StandardScaler      # features normalization
from sklearn.compose import ColumnTransformer                       # features normlaization
from sklearn.decomposition import PCA                               # dimension reduction

import matplotlib.pyplot as plt                                     # data viz
import seaborn as sns                                               # data viz    
from yellowbrick.cluster import KElbowVisualizer                    # model config
import pandasql as ps
import pickle
import pprint
from uuid import uuid4

%matplotlib inline

In [None]:
# Random UUID identifying this notebook run. It is stored in the config as
# "run_id" and embedded in the names of the files the notebook writes.
key = uuid4()
key

notebook configuration

In [None]:
# Settings shared by every cell of the notebook.
config = {
    "n_clusters": 6,  # number of clusters, found with the elbow technique
    "random_state": 29,  # seed, for reproducibility
    "max_iter": 200,  # maximum number of iterations per run
    "n_init": 10,  # number of runs
    "n_components": 2,  # value passed to PCA as `n_components`
    "save_model": True,  # save both the clustering model and the PCA clustering model
    "sample": True,  # load the sample data instead of the full CSV
    "find_k": False,  # run KElbowVisualizer
    "plot": True,  # draw the exploratory plots
    "run_id": key,  # identifier of this run
}

get data - full csv or 0.05% sample.


In [None]:
# Pick the input file: the sample when config["sample"] is truthy,
# the full CSV otherwise.
if config.get("sample"):
    data_fn = "sample_KaDo.csv"
else:
    data_fn = "Kado.csv"

df = pd.read_csv(f"../data/{data_fn}")
df.shape

## Data cleaning

- check for the presence of NA/null values
- drop the rows belonging to under-represented "FAMILLE" values
- replace each product's price with its median price

In [None]:
# Keep every row whose family is not one of the two excluded families.
mask = ~df["FAMILLE"].isin(["MULTI FAMILLES", "SANTE NATURELLE"])
# Take an explicit copy of the filtered rows: `df` gets a column assigned
# further down (PRIX_NET is overwritten), and assigning into a filtered
# slice of the original frame triggers pandas' SettingWithCopyWarning.
df = df[mask].copy()

In [None]:
# Bar chart of the 50 largest PRIX_NET values (only when plotting is enabled).
if config.get("plot"):
    df['PRIX_NET'].nlargest(50).plot.bar()
    plt.title('50 most expensive products')
    plt.xlabel('Product')
    plt.ylabel('Price')
    # One bar per row: hide the x ticks and their labels.
    plt.tick_params(axis='x', which='both', bottom=False, labelbottom=False)
    plt.show()

In [None]:
# Bar chart of the 50 smallest PRIX_NET values (only when plotting is enabled).
if config.get("plot"):
    df['PRIX_NET'].nsmallest(50).plot.bar()
    plt.title('50 cheapest products')
    plt.xlabel('Product')
    plt.ylabel('Price')
    # One bar per row: hide the x ticks and their labels.
    plt.tick_params(axis='x', which='both', bottom=False, labelbottom=False)
    plt.show()

In [None]:
# Replace every PRIX_NET by the median PRIX_NET of the rows sharing its LIBELLE.
median_price = df.groupby('LIBELLE')['PRIX_NET'].transform('median')
df['PRIX_NET'] = median_price

In [None]:
# if config.get("plot"):
#     df['PRIX_NET'].nlargest(50).plot(kind="bar")
#     plt.xlabel('Product')
#     plt.ylabel('Median prices')
#     plt.title('50 most expensive products with median prices')
#     plt.tick_params(
#         axis='x',          
#         which='both',     
#         bottom=False,     
#         labelbottom=False)

#     plt.show()

In [None]:
# if config.get("plot"):
#     df['PRIX_NET'].nsmallest(50).plot(kind="bar")
#     plt.xlabel('Product')
#     plt.ylabel('Median prices')
#     plt.title('50 cheapest products with median prices')
#     plt.tick_params(
#         axis='x',
#         which='both',
#         bottom=False,
#         labelbottom=False)

#     plt.show()

In [None]:
# Headline figures of the cleaned dataset.
n_clients = df.CLI_ID.nunique()
n_tickets = df.TICKET_ID.nunique()
n_rows, n_cols = df.shape

print(f"""
Number of client: {n_clients:,}
Number of transactions: {n_tickets:,}
Number of product: {n_rows}
Shape: {n_rows:,} - {n_cols}
""")

## data extraction

Features are extracted from the dataset to build the model. 
We use the percentage of money each customer spends per family, since the family is the grouping with the lowest cardinality.

In [None]:
# Sum of PRIX_NET for every (CLI_ID, FAMILLE) pair.
df_grouped = df.groupby(['CLI_ID', 'FAMILLE'])
df_sales = df_grouped['PRIX_NET'].sum().reset_index()

# Wide layout: one row per CLI_ID, one column per FAMILLE.
df_pivot = df_sales.pivot(index='CLI_ID', columns='FAMILLE', values='PRIX_NET')

# Share (in %) of each FAMILLE in the row total; missing values become 0.
customer_totals = df_pivot.sum(axis=1)
df_pct = df_pivot.div(customer_totals, axis=0).mul(100).fillna(0)

# Prefix every column name with "% ".
df_pct.columns = [f"% {family}" for family in df_pct.columns]
df_pct

## normalization

No need, since all values are already on the same scale (%).

In [None]:
# Work on a copy so that later changes to df_features (a "Cluster" column is
# added to it) leave df_pct untouched.
df_features = df_pct.copy()

## Clustering

The idea is to find the right number of clusters in our dataset.  
Then train the clustering model and analyze it.  
We'll use the silhouette score as metric.

In [None]:
# The KElbowVisualizer package helps us find the optimal number of clusters.
# The search only runs when config["find_k"] is truthy.

if config.get('find_k'):
    # 0 is a valid seed, so the fallback of 125 must only apply when no seed
    # is configured (`config.get(...) or 125` would also replace a seed of 0).
    seed = config.get("random_state")
    model = MiniBatchKMeans(
        n_init=config.get("n_init") or 10,
        random_state=125 if seed is None else seed,
        max_iter=config.get("max_iter") or 100,
    )

    # Score the candidate values of k given by `k=(4,9)` with the silhouette metric.
    visualizer = KElbowVisualizer(model, k=(4,9), metric="silhouette")
    visualizer.fit(df_features)
    visualizer.show()

with the elbow method we now have a strong hint for the number of clusters


In [None]:
# 0 is a valid seed, so the fallback of 125 must only apply when no seed is
# configured (`config.get(...) or 125` would also replace a seed of 0).
seed = config.get("random_state")
kmeanss = MiniBatchKMeans(
    n_clusters=config.get("n_clusters") or 6,
    n_init=config.get("n_init") or 10,
    random_state=125 if seed is None else seed,
    max_iter=config.get("max_iter") or 100,
)

# Silhouette score visualisation: written to a PNG named after the run id
# when config["save_model"] is truthy, displayed otherwise.
visualizer = SilhouetteVisualizer(kmeanss, colors='yellowbrick')
visualizer.fit(df_features)
if config.get("save_model"):
    visualizer.show(outpath=f"silhouette_score_clustering_{config.get('run_id')}.png")
else:
    visualizer.show()

# Fit the model on the features and attach each row's cluster label,
# aligned on the index of df_features.
labels = kmeanss.fit_predict(df_features)
df_features['Cluster'] = pd.Series(labels, index=df_features.index)

In [None]:
# Pickle the clustering model when config["save_model"] is truthy.
# File name: <prefix>_<n_clusters>_<random_state>_<run_id>.pkl
if config.get("sample"):
    filename = "sample_clustering_model"
else:
    filename = "clustering_model"
filename = f"{filename}_{config.get('n_clusters')}_{config.get('random_state')}"

if config.get("save_model"):
    with open(f"{filename}_{config.get('run_id')}.pkl", "wb") as f:
        pickle.dump(kmeanss, f)


In [None]:

# Display the feature table.
df_features

In [None]:
# Number of rows of df_features per cluster (only when plotting is enabled).
if config.get("plot"):
    cluster_counts_ax = sns.countplot(x=df_features["Cluster"])
    cluster_counts_ax.set(title="clusters distribution")

## Dimension reduction

Since we have a lot of dimensions, we apply PCA in order to visualize the datapoints and clusters in a human interpretable way.

In [None]:
# problem during PCA and kmeans pca
# sklearn: Coordinates of cluster centers. If the algorithm stops before fully converging (see tol and max_iter), these will not be consistent with labels_.
# - 'n_components' controls how much variance the result keeps - a low value obviously gives cleaner clusters
# 1- compute the eigenvalues to find a good PCA 'n_components'
# 2- use the resulting value for the PCA

# kmeans_pca.cluster_centers_ 

# PCA DEBUG
# from sklearn.decomposition import PCA
# import numpy as np

# # Create an instance of PCA
# pca = PCA()

# # Fit the PCA model to your data
# pca.fit(transactions_data)

# # Get the explained variance ratio for each component
# explained_variance = pca.explained_variance_ratio_

# # Select the number of components that retain 95% of the variance
# n_components = np.argmax(np.cumsum(explained_variance) >= 0.95) + 1

# # Set the n_components parameter of PCA to the selected number
# pca = PCA(n_components=n_components)

# from sklearn.decomposition import PCA
# import matplotlib.pyplot as plt

# # Create an instance of PCA
# pca = PCA()

# # Fit the PCA model to your data
# pca.fit(transactions_data)

# # Get the explained variance ratio for each component
# explained_variance = pca.explained_variance_ratio_

# # Plot the explained variance ratio against the number of components
# plt.plot(range(1, len(explained_variance)+1), explained_variance)
# plt.xlabel('Number of components')
# plt.ylabel('Explained variance ratio')
# plt.show()

# # Select the number of components where the explained variance ratio starts to decrease
# n_components = int(input("Enter the number of components: "))

# # Set the n_components parameter of PCA to the selected number
# pca = PCA(n_components=n_components)

In [None]:
# 0 is a valid seed, so the fallback of 125 must only apply when no seed is
# configured (`config.get(...) or 125` would also replace a seed of 0).
seed = config.get("random_state")
random_state = 125 if seed is None else seed

# Rescale the features (without the "Cluster" label column) before the PCA.
scaler = MinMaxScaler()
data_rescaled = scaler.fit_transform(df_features.drop("Cluster", axis=1))

# Keep the fitted PCA in a variable so it stays available after the cell,
# and seed it so the projection is reproducible when a randomized solver is used.
pca = PCA(
    n_components=config.get("n_components") or 0.55,
    random_state=random_state,
)
reduced_data = pca.fit_transform(data_rescaled)

# Cluster the reduced data with the same settings as the main model.
kmeans_pca = MiniBatchKMeans(
    n_clusters=config.get("n_clusters") or 6,
    n_init=config.get("n_init") or 10,
    random_state=random_state,
    max_iter=config.get("max_iter") or 100,
)
labels_pca = kmeans_pca.fit_predict(reduced_data)

In [None]:
# Pickle the model trained on the PCA-reduced data when config["save_model"]
# is truthy. File name: <prefix>_<n_clusters>_<random_state>_<run_id>.pkl
if config.get("sample"):
    filename = "sample_pca_clustering_model"
else:
    filename = "pca_clustering_model"
filename = f"{filename}_{config.get('n_clusters')}_{config.get('random_state')}_{config.get('run_id')}"

if config.get("save_model"):
    with open(f"{filename}.pkl", "wb") as f:
        pickle.dump(kmeans_pca, f)


In [None]:
# Scatter plot of the first two columns of the reduced data, one colour per
# cluster label, with the cluster centres drawn in black on top.
centroids = kmeans_pca.cluster_centers_
u_labels = np.unique(labels_pca)

for cluster_id in u_labels:
    in_cluster = labels_pca == cluster_id
    plt.scatter(reduced_data[in_cluster, 0], reduced_data[in_cluster, 1], label=cluster_id)

plt.scatter(centroids[:, 0], centroids[:, 1], s=80, color='k')
plt.legend()
plt.show()

In [None]:
# Inertia of the model fitted on the full features, then of the model fitted
# on the PCA-reduced data.
print(f"\n{kmeanss.inertia_}\n{kmeans_pca.inertia_}\n")


# Analysis 
 https://towardsdatascience.com/common-mistakes-in-cluster-analysis-and-how-to-avoid-them-eb960116d773   
 One fast and simple solution is to calculate the mean values of each feature per cluster (tbl. 2).



In [None]:
# checkpoint
# df_features.to_csv("../data/df_labelled.csv")

# Attach each row's cluster label to the rows of df (left join on CLI_ID)
# and write the labelled data to a CSV named after the run id.
cluster_by_client = df_features.reset_index()[["Cluster", "CLI_ID"]]
df_merged = df.merge(cluster_by_client, on="CLI_ID", how="left")

output_path = f"../data/df_labelled_final_{config.get('run_id')}.csv"
df_merged.to_csv(output_path, index=False)
df_merged

In [None]:
# Separate copy of the labelled features, used by the analysis cells below.
df_test = df_features.copy()

In [None]:
# Mean value of every "% <family>" column within each cluster.
by_cluster = df_test.groupby("Cluster")

capilaire = by_cluster['% CAPILLAIRES'].mean()
hygiene = by_cluster['% HYGIENE'].mean()
maqu = by_cluster['% MAQUILLAGE'].mean()
parfum = by_cluster['% PARFUMAGE'].mean()
soins_corps = by_cluster['% SOINS DU CORPS'].mean()
soins_vis = by_cluster['% SOINS DU VISAGE'].mean()
solaire = by_cluster['% SOLAIRES'].mean()


In [None]:
# One bar chart per family: mean share of the family in each cluster.
edgecolor = "red"
colors = ['black', 'red', 'green', 'blue', 'cyan', "yellow"]

fig, axs = plt.subplots(4, 2, figsize=(15, 15))
fig.tight_layout(pad=3.0)

# (series, title) pairs, in the order the panels fill the grid (row by row).
panels = [
    (capilaire, "mean value of % Capillaire by cluster"),
    (hygiene, "mean value of % Hygiene by cluster"),
    (maqu, "mean value of % Maquillage by cluster"),
    (parfum, "mean value of % parfum by cluster"),
    (soins_corps, "mean value of % Soin corps by cluster"),
    (soins_vis, "mean value of % Soins visage by cluster"),
    (solaire, "mean value of % Solaire by cluster"),
]

for panel_ax, (series, title) in zip(axs.flat, panels):
    panel_ax.bar(series.index, series.values, color=colors, edgecolor=edgecolor)
    panel_ax.set_title(title)
    panel_ax.set(xlabel='cluster')

# Seven panels on a 4x2 grid: remove the unused last axes.
fig.delaxes(axs[3, 1])


In [None]:
# Total PRIX_NET per customer, placed next to the customer's features and
# cluster label (aligned on the CLI_ID index). The per-customer value is a
# sum rather than a mean so that `total_sales` really holds totals and
# summing it per cluster yields the cluster's total sales.
sales_per_client = df.groupby("CLI_ID")["PRIX_NET"].sum()
total_sales = pd.concat([df_test, sales_per_client], axis=1)

In [None]:
# Sum and mean of the PRIX_NET column of total_sales within each cluster,
# drawn side by side.
prix_by_cluster = total_sales.groupby("Cluster")['PRIX_NET']
total_by_cluster = prix_by_cluster.sum()
mean_by_cluster = prix_by_cluster.mean()

fig, axes = plt.subplots(1, 2, layout="constrained")
left_ax, right_ax = axes

sns.barplot(x=total_by_cluster.index, y=total_by_cluster.values, ax=left_ax).set(title="Total sale by cluster")
sns.barplot(x=mean_by_cluster.index, y=mean_by_cluster.values, ax=right_ax).set(title="Mean sale by cluster")