In [None]:
import pandas as pd
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
from pathlib import Path
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.decomposition import PCA
import seaborn as sns

# Use matplotlib's "ggplot" style sheet for the figures drawn in this notebook.
plt.style.use("ggplot")

In [None]:
# Directory that holds the preprocessed CSV exports.
processed = Path("data/processed")

# Read the customer table from disk and preview its first rows.
customers = pd.read_csv(processed.joinpath("unique_customer_orders.csv"))
customers.head()

In [None]:
# Inspect the dtype pandas inferred for each column of the loaded CSV.
customers.dtypes

In [None]:
# Scatter of `delivery_delay` against `recency` for the rows of `customers`.
sns.scatterplot(data=customers, x="recency", y="delivery_delay");

In [None]:
# Scatter of `number_of_orders` against `recency` for the rows of `customers`.
sns.scatterplot(data=customers, x="recency", y="number_of_orders");

In [None]:
# Preview the first rows of `customers` again (presumably to recall the
# available columns before choosing the clustering features).
customers.head()

In [None]:
# Names of the three `customers` columns used as the RFM feature set.
RFM = [
    "recency",
    "frequency",
    "amount",
]

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the RFM columns (zero mean, unit variance per column) so that no
# single feature dominates the distances computed by KMeans.
# `.to_numpy()` is the accessor pandas recommends over `.values`.
X_scaled = StandardScaler().fit_transform(customers[RFM].to_numpy())

In [None]:
from sklearn.cluster import KMeans

# Seeded base estimator handed to the elbow visualizer; the candidate numbers
# of clusters are described by the `k` argument.
model = KMeans(random_state=0, n_init="auto")
visualizer = KElbowVisualizer(
    model,
    k=(4, 40),
    timings=False,
)

In [None]:
# Fit the elbow visualizer on the standardized RFM data and render the curve.
# `show()` replaces `poof()`, which Yellowbrick deprecated in its 1.0 release.
visualizer.fit(X_scaled)
visualizer.show();

In [None]:
# KMeans with 14 clusters — presumably the k read off the elbow plot (TODO confirm).
model = KMeans(n_clusters=14, n_init="auto", random_state=0)

In [None]:
# Silhouette plot for the KMeans model defined above, fitted on the
# standardized RFM data.
visualizer = SilhouetteVisualizer(model)

visualizer.fit(X_scaled)
# `show()` replaces `poof()`, which Yellowbrick deprecated in its 1.0 release.
visualizer.show();

In [None]:
# Project the standardized data onto its first two principal components so the
# clusters can be drawn in 2-D.
# random_state pins the projection when scikit-learn picks a randomized SVD
# solver, matching the seeded KMeans used elsewhere in this notebook.
pca = PCA(n_components=2, random_state=0)
X_pca = pca.fit_transform(X_scaled)
X_pca.shape

In [None]:
# Fit the KMeans model on the standardized data and assign a cluster label to
# every row.
model.fit(X_scaled)
pred_labels = model.predict(X_scaled)

# Draw the two PCA components, coloured by the predicted cluster label.
plt.figure(figsize=(16, 9))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=pred_labels, alpha=1)

In [None]:

# Interactive 3-D scatter of the three standardized feature columns, coloured
# by the predicted cluster label.
px.scatter_3d(
    x=X_scaled[:, 0],
    y=X_scaled[:, 1],
    z=X_scaled[:, 2],
    color=pred_labels,
    opacity=0.1,
    width=1500,
    height=1000,
)

In [None]:
import math
import numpy as np

# Parameter names that `view_clustering` appends to a subplot title whenever
# the plotted model's `get_params()` contains them.
watch_params = ["gamma", "alpha", "kernel"]

def view_clustering(projectors, X, hue, p=1.):
    """Plot the 2-D output of several projectors side by side, coloured by `hue`.

    Every projector is fitted on the same leading slice of `X` and drawn on its
    own subplot (at most three per row). Each subplot title is the projector's
    class name, followed by those of its parameters whose names appear in the
    module-level `watch_params` list.

    Parameters
    ----------
    projectors : sequence
        Objects exposing `get_params()` and `fit_transform(X)`. The first two
        columns of the transformed output are plotted.
    X : array-like
        Data to project; must expose `.shape` and support row slicing.
    hue : array-like
        One value per row of `X`, used to colour the points.
    p : float, default 1.0
        Fraction of the dataset to use. The first `int(X.shape[0] * p)` rows
        are kept (a leading slice, not a random sample).

    Raises
    ------
    ValueError
        If `projectors` is empty or if `p` selects no rows.
    """
    if len(projectors) == 0:
        raise ValueError("projectors must contain at least one model")

    n = int(X.shape[0] * p)
    if n <= 0:
        raise ValueError(f"p={p} selects no rows out of {X.shape[0]}")

    X, hue = X[:n], hue[:n]

    max_cols = 3
    n_rows = math.ceil(len(projectors) / max_cols)
    n_cols = min(len(projectors), max_cols)

    # squeeze=False always yields a 2-D array of Axes, even for a 1x1 grid.
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(24, n_rows * 10), squeeze=False)
    # ":g" avoids float artefacts such as "7.000000000000001%".
    fig.suptitle(f"All model evaluations from {p * 100:g}% of the dataset ({X.shape[0]} datapoints)")

    axes = axs.ravel()
    for projector, ax in zip(projectors, axes):
        watched = [
            f"{name}:{value}"
            for name, value in projector.get_params().items()
            if name in watch_params
        ]
        title = type(projector).__name__
        if watched:
            title += " - (" + " ".join(watched) + ")"
        ax.set_title(title)

        X_out = projector.fit_transform(X)
        sns.scatterplot(x=X_out.T[0], y=X_out.T[1], hue=hue, ax=ax)

    # Hide the grid cells left over when the number of projectors is not a
    # multiple of max_cols, so no empty axes are drawn.
    for ax in axes[len(projectors):]:
        ax.set_visible(False)

In [None]:
from sklearn.decomposition import KernelPCA

# Linear PCA plus RBF-kernel PCA for five gamma values spread logarithmically
# between 1e-2 and 1e2. Every projector is seeded so that solvers relying on
# random initialisation give the same plots on each run.
projectors = [
    PCA(n_components=2, random_state=0),
    *[
        KernelPCA(n_components=2, kernel="rbf", gamma=g, n_jobs=-1, random_state=0)
        for g in np.logspace(-2, 2, num=5)
    ],
]

# Only the first 10% of the rows are projected (see the `p` argument).
view_clustering(projectors, X_scaled, hue=pred_labels, p=0.1)

On peut maintenant utiliser cette fonction pour visualiser de différentes façons nos clusters (qui ne sont pas encore évidents).

On peut maintenant ajouter d'autres variables (colonnes) à notre KMeans, puis analyser les prédictions.

In [None]:
# Preview the first rows of `customers` again (presumably to pick the extra
# columns to add to the RFM features).
customers.head()

In [None]:
def show_elbows(features_list, k=(4, 25)):
    """Draw one KMeans elbow plot per feature set.

    For each list of column names, the matching columns of the module-level
    `customers` frame are standardized and passed to a `KElbowVisualizer`
    wrapping a fresh, seeded KMeans.

    Parameters
    ----------
    features_list : iterable of list of str
        Each item lists the `customers` columns to cluster on.
    k : default (4, 25)
        Forwarded unchanged as the visualizer's `k` argument (the candidate
        numbers of clusters).
    """
    for features in features_list:
        print("Feature visualisations : \n", features)
        # Local name chosen so it does not shadow the notebook-level `X_scaled`.
        features_scaled = StandardScaler().fit_transform(customers[features].to_numpy())

        elbow = KElbowVisualizer(
            KMeans(n_init="auto", random_state=0),
            k=k,
            timings=False,
        )
        elbow.fit(features_scaled)
        # `show()` replaces `poof()`, which Yellowbrick deprecated in its 1.0 release.
        elbow.show()

In [None]:
# Elbow plots for the RFM features alone, then for RFM extended with each
# group of extra columns listed below.
show_elbows([
    RFM + extras
    for extras in (
        [],
        ["delivery_delay"],
        ["estimation_error"],
        ["number_of_orders"],
        ["respected_ratio"],
        ["lat"],
        ["lng"],
        ["lat", "lng"],
    )
])