In [None]:
import pandas as pd
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
from pathlib import Path
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.decomposition import PCA, KernelPCA
import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder, StandardScaler
import umap
from src.view import view_clusters, view_projection
import numpy as np
import squarify

plt.style.use("ggplot")

In [None]:
# Folder containing the preprocessed CSV files.
processed = Path("data") / "processed"

# Customer table used throughout the first iteration of the analysis.
customers = pd.read_csv(processed.joinpath("unique_customer_orders2.csv"))
customers.head()

On va déjà plotter la RFM pour voir s'il y a une différence par rapport à avant

In [None]:
# Names of the three RFM columns reused by the cells below.
RFM = [
    "recency",
    "amount",
    "number_of_orders",
]

In [None]:
customers[RFM]["recency"]

In [None]:
# Restrict the frame to the three RFM columns for plotting.
customers_rfm = customers[RFM]

# Label each 3D scene axis with its RFM feature, e.g. "recency (x)".
scene_axes = {
    f"{axis}axis": {"title": {"text": f"{feature} ({axis})"}}
    for axis, feature in zip(["x", "y", "z"], RFM)
}

fig = px.scatter_3d(
    x=customers_rfm[RFM[0]],
    y=customers_rfm[RFM[1]],
    z=customers_rfm[RFM[2]],
    opacity=0.3,
    color=customers["wealthy"],
)
# The leftover placeholder `xaxis_title="AAA"` was dropped: the axis titles
# of this 3D figure are set through `scene`.
fig.update_layout(width=1400, height=800, scene=scene_axes)

On peut donc essayer de rajouter des features au fur et à mesure parmi celles extraites

On va commencer par scaler et encoder nos données si besoin

In [None]:
# Replace the "frequent_cat" column with the integer codes produced by LabelEncoder.
# NOTE: the column is overwritten in place, so the original values are only
# recoverable through `encoder` (or by reloading the CSV).
encoder = LabelEncoder()

customers["frequent_cat"] = encoder.fit_transform(customers["frequent_cat"])
customers.head()

In [None]:
customers[["amount"]].describe()

In [None]:
customers[["recency"]].describe()

In [None]:
# sns.pairplot creates its own figure, so a prior plt.title() call would title
# an unrelated figure; the title is therefore set on the returned grid.
pair_grid = sns.pairplot(customers[RFM + ["respected_ratio", "estimation_error", "freight_value", "review_score"]])
# y > 1 keeps the title above the top row of subplots instead of overlapping it.
pair_grid.figure.suptitle("Pairplot for some features and RFM", y=1.02);

On peut maintenant refaire notre clustering avec les nouvelles features pour voir si elles aident à identifier des clusters.

Plus tard on s'occupera de regarder les outliers.

In [None]:
# Additional (non-RFM) feature columns considered for clustering.
cols = [
    "delivery_delay",
    "estimation_error",
    "number_of_orders",
    "respected_ratio",
    "lat",
    "lng",
    "freight_value",
    "price",
    "review_answer_delay",
    "review_score",
    "review_level",
]

In [None]:
# RFM and `cols` can share column names; selecting a name twice would
# duplicate that column and double its weight in every distance computation.
# dict.fromkeys removes duplicates while keeping the original order.
features = list(dict.fromkeys(RFM + cols))

X_scaled = StandardScaler().fit_transform(customers[features])
X_scaled.shape

In [None]:
# t-SNE perplexities on a log scale: 3 * 10**(-1..2), i.e. roughly 0.3, 3, 30, 300.
tsne_perplexities = np.logspace(-1, 2, num=4) * 3

# Projectors to compare: linear PCA, RBF kernel PCA, then one t-SNE per perplexity.
models = [
    PCA(random_state=0),
    KernelPCA(kernel="rbf", random_state=0),
]
models.extend(
    TSNE(perplexity=p, n_jobs=-1, random_state=0) for p in tsne_perplexities
)

In [None]:
view_projection(models, X_scaled, hue=customers["wealthy"], p=0.04)

In [None]:
view_projection(models, X_scaled, hue=customers["frequent_cat"], p=0.04)

In [None]:
customers.describe()

In [None]:
# Extra features added on top of RFM. "number_of_orders" is not repeated here
# because RFM already contains it; listing it twice would duplicate the column
# and double its weight after scaling.
extra_features = ["respected_ratio", "freight_value", "price", "review_score", "review_level", "estimation_error", "delivery_delay"]

# dict.fromkeys keeps the selection duplicate-free even if the lists overlap.
X_scaled = StandardScaler().fit_transform(customers[list(dict.fromkeys(RFM + extra_features))])

In [None]:
view_projection(models, X_scaled, hue=customers["wealthy"], p=0.04)

In [None]:
# One UMAP projector per neighbourhood size (2, 6, 10, 14, 18).
umap_projectors = [
    umap.UMAP(n_neighbors=n, random_state=0)
    for n in range(2, 20, 4)
]
view_projection(umap_projectors, X_scaled, hue=customers["wealthy"], p=0.05)

In [None]:
# Compare two t-SNE projections that differ only in perplexity (30 vs 300).
# NOTE(review): recent scikit-learn releases rename TSNE's `n_iter` to `max_iter` — confirm against the installed version.
view_projection([TSNE(n_iter=2000, perplexity=30, random_state=0), TSNE(n_iter=2000, perplexity=300, random_state=0)], X_scaled, hue=customers["wealthy"], p=0.05)

In [None]:
# The two projectors reused by the clustering cells below: one t-SNE and one UMAP.
# NOTE(review): recent scikit-learn releases rename TSNE's `n_iter` to `max_iter` — confirm against the installed version.
favorite_projectors = [
    TSNE(n_iter=1200, perplexity=30, random_state=0, n_jobs=-1),
    umap.UMAP(n_neighbors=16, random_state=0, n_jobs=-1)
]

In [None]:
# For each projector, score KMeans with the mean silhouette coefficient for
# k in [4, 20) on a 5000-row subset of the projected data.
#
# `k` and `timings` are KElbowVisualizer parameters: SilhouetteVisualizer does
# not accept them, so they were silently dropped and only the estimator's
# default number of clusters was ever plotted. KElbowVisualizer with
# metric="silhouette" performs the intended sweep.
for projector in favorite_projectors:
    print(type(projector).__name__)

    y = projector.fit_transform(X_scaled[:5000])

    # Dedicated axes so successive projectors do not draw over each other.
    fig, ax = plt.subplots()
    visualizer = KElbowVisualizer(
        KMeans(n_init="auto", random_state=0),  # seeded for reproducible scores
        ax=ax,
        k=(4, 20),
        metric="silhouette",
        timings=False,
    )
    visualizer.fit(y)
    # show() replaces the deprecated poof() and matches the other cells.
    visualizer.show()

In [None]:
# For each projector: embed the first 1000 rows, pick k with the elbow method,
# then plot the KMeans clusters in the 2D embedding.
for projector in favorite_projectors:
    print(type(projector).__name__)

    y = projector.fit_transform(X_scaled[:1000])

    plt.figure()
    visualizer = KElbowVisualizer(KMeans(n_init="auto", random_state=0), k=(4, 20), timings=False)
    visualizer.fit(y)
    visualizer.show()

    k = visualizer.elbow_value_
    # elbow_value_ is None when no elbow is detected; KMeans(n_clusters=None)
    # would raise, so skip the cluster plot for this projector instead.
    if k is None:
        print("No elbow detected: skipping the cluster plot.")
        continue

    # Same KMeans configuration as the one used to select k.
    model = KMeans(n_clusters=k, n_init="auto", random_state=0)
    labels = model.fit_predict(y)

    fig, ax = plt.subplots()
    sns.scatterplot(x=y[:, 0], y=y[:, 1], hue=labels, ax=ax)

In [None]:
# For each projector: embed the full scaled dataset, pick k with the elbow
# method, then plot the KMeans clusters in the 2D embedding.
for projector in favorite_projectors:
    print(type(projector).__name__)

    y = projector.fit_transform(X_scaled)

    plt.figure()
    # `n_jobs` is not a KElbowVisualizer parameter, so it is no longer passed.
    visualizer = KElbowVisualizer(KMeans(n_init="auto", random_state=0), k=(4, 20), timings=False)
    visualizer.fit(y)
    visualizer.show()

    k = visualizer.elbow_value_
    # elbow_value_ is None when no elbow is detected; KMeans(n_clusters=None)
    # would raise, so skip the cluster plot for this projector instead.
    if k is None:
        print("No elbow detected: skipping the cluster plot.")
        continue

    # Same KMeans configuration as the one used to select k.
    model = KMeans(n_clusters=k, n_init="auto", random_state=0)
    labels = model.fit_predict(y)

    fig, ax = plt.subplots()
    sns.scatterplot(x=y[:, 0], y=y[:, 1], hue=labels, ax=ax)

Je vais donc revenir aux bases et faire cela avec le moins de features possible pour rester dans un point de vue métier.

Ici on peut très bien critiquer mon travail en disant qu'il y a trop de features, ce qui empêche de distinguer les clients intéressants des non intéressants.

In [None]:
customers.columns

In [None]:
# Columns selected for this run: RFM only (the extra columns are left commented out).
# Note that `cols` is the same list object as `RFM`, not a copy.
cols = RFM # + ["respected_ratio", "review_score"]
X = customers[cols]
X_scaled = StandardScaler().fit_transform(X)

In [None]:
# For each projector: embed the scaled data, pick k in [2, 14) with the elbow
# method, then plot the KMeans clusters in the 2D embedding.
for projector in favorite_projectors:
    print(type(projector).__name__)

    y = projector.fit_transform(X_scaled)

    plt.figure()
    # `n_jobs` is not a KElbowVisualizer parameter, so it is no longer passed.
    visualizer = KElbowVisualizer(KMeans(n_init="auto", random_state=0), k=(2, 14), timings=False)
    visualizer.fit(y)
    visualizer.show()

    k = visualizer.elbow_value_
    # elbow_value_ is None when no elbow is detected; KMeans(n_clusters=None)
    # would raise, so skip the cluster plot for this projector instead.
    if k is None:
        print("No elbow detected: skipping the cluster plot.")
        continue

    # Same KMeans configuration as the one used to select k.
    model = KMeans(n_clusters=k, n_init="auto", random_state=0)
    labels = model.fit_predict(y)

    fig, ax = plt.subplots()
    sns.scatterplot(x=y[:, 0], y=y[:, 1], hue=labels, ax=ax)

In [None]:
# Three-component UMAP embedding of the scaled features, shown as a 3D scatter.
umap_3d = umap.UMAP(n_neighbors=3, n_components=3, random_state=0, n_jobs=-1)
y = umap_3d.fit_transform(X_scaled)

px.scatter_3d(
    x=y[:, 0],
    y=y[:, 1],
    z=y[:, 2],
    opacity=0.3,
)

In [None]:
# Elbow study on the scaled features with the visualizer's default k range.
elbow_estimator = KMeans(random_state=0)
model = KElbowVisualizer(elbow_estimator).fit(X_scaled)
model.show()

In [None]:
# Four-cluster KMeans on the scaled features.
# The seed is fixed so the cluster assignment is reproducible between runs,
# like the other seeded KMeans models in this notebook.
model = KMeans(n_clusters=4, n_init="auto", random_state=0)
model.fit(X_scaled)

In [None]:
# 3D scatter of the first three scaled features, coloured by KMeans label.
px.scatter_3d(
    x=X_scaled[:, 0],
    y=X_scaled[:, 1],
    z=X_scaled[:, 2],
    color=model.labels_,
    opacity=0.3,
)

# Itération 2

J'ai l'impression que certaines analyses ne servent à rien. Je vais essayer d'utiliser une méthode RFM que j'ai vue sur Kaggle : elle consiste à créer, à partir des données, une pré-segmentation indiquant si l'on doit ou non se préoccuper d'un client, ce qui nous permet de différencier les bons clients des mauvais.

In [None]:
# Rebind `customers` to the third export; the previously loaded frame
# (including its label-encoded "frequent_cat" column) is discarded.
customers = pd.read_csv(processed / "unique_customer_orders3.csv")
customers.head()

In [None]:
customers["segment"].describe()

On peut donc refaire les clustering intéressants mais en coloriant par la segmentation

In [None]:
customers["segment"].value_counts()

In [None]:
# Treemap of the segment sizes.
# `count` holds the number of customers per segment, `norm` the matching share.
count = customers["segment"].value_counts()
norm = count / count.sum()
# Tile label: "<segment> ( <share in %, 2 decimals>% )".
labels = pd.Series(count.index).str.cat(" ( " + (norm * 100).round(2).astype(str).values + "% )")
# NOTE(review): six colours are hard-coded — confirm this matches the number of segments.
squarify.plot(count, label=labels, color = ['gold', 'teal', 'steelblue', 'limegreen', 'darkorange', 'coral'])

In [None]:
X_scaled = StandardScaler().fit_transform(customers[RFM])
X_scaled

In [None]:
view_projection(favorite_projectors, X_scaled, hue=customers["segment"])

On remarque que les clients fidèles sont bien éparpillés dans les clusters

C'est déjà pas mal. Voici le résultat d'une étude du meilleur K avec KMeans

In [None]:
# Elbow study for k in [2, 14) on the scaled RFM features.
model = KMeans(random_state=0, n_init="auto")

visualizer = KElbowVisualizer(model, k=range(2, 14), timings=False).fit(X_scaled)
visualizer.show();

In [None]:
customers

In [None]:
# Candidate feature sets appended to RFM, from smallest to largest.
extra_feature_sets = [
    ["frequency"],
    ["frequency", "freight_value"],
    ["frequency", "review_level", "review_score"],
    ["delivery_delay"],
    ["frequency", "review_level", "review_score", "freight_value", "delivery_delay", "respected_ratio"],
]

# For each candidate set: project coloured by segment, then run an elbow study.
for cols in extra_feature_sets:
    selected = customers[RFM + cols]
    X_scaled = StandardScaler().fit_transform(selected)
    view_projection(favorite_projectors, X_scaled, hue=customers["segment"])

    plt.figure()
    model = KMeans(random_state=0, n_init="auto")
    visualizer = KElbowVisualizer(model, k=range(2, 10), timings=False)
    visualizer.fit(X_scaled)
    visualizer.show()

In [None]:
# Reference clustering: 4-cluster KMeans on standardized recency and amount.
# Selected columns
data = customers

cols = ["recency", "amount"]
# `scaler` is kept because later cells reuse it to transform other data with the same fit.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(data[cols])

model = KMeans(n_init="auto", n_clusters=4, random_state=0)
model.fit(X_scaled)

# X_umap = favorite_projectors[1].fit_transform(X_scaled)
# plt.figure()
# sns.scatterplot(x=X_umap.T[0],y=X_umap.T[1], hue=data["segment"])
# plt.figure()
# sns.scatterplot(x=X_umap.T[0],y=X_umap.T[1], hue=model.labels_, palette=sns.color_palette("viridis", as_cmap=True))

# Labels of this fit, in `customers` row order; used as the reference partition
# by the ARI cells below.
customers_true_labels = model.labels_

In [None]:
from src.dataset import get_data_until
from sklearn.metrics import adjusted_rand_score
from tqdm import tqdm

In [None]:
# ARI between the reference clustering and a KMeans refit on the data returned
# by get_data_until(month=m), for m = 1..24.
#
# adjusted_rand_score compares the two label vectors position by position, so
# both must describe the same customers in the same order. The reference labels
# are therefore keyed by customer id and looked up in the row order of each
# snapshot, instead of relying on `customers` and the snapshot sharing an order.
reference_labels = pd.Series(
    customers_true_labels,
    index=customers["customer_unique_id"].to_numpy(),
)
assert reference_labels.index.is_unique, "customer_unique_id must be unique in `customers`"

scores = []

for m in tqdm(range(1, 25)):
    # Some customers present in the full dataset are missing from this snapshot.
    data_m = get_data_until(month=m)

    # Keep only snapshot customers that have a reference label, in snapshot order.
    data_m = data_m[data_m.customer_unique_id.isin(reference_labels.index)]
    customers_labels = reference_labels.loc[data_m.customer_unique_id].to_numpy()

    # Transform with the scaler fitted on the reference data.
    X_scaled = scaler.transform(data_m[cols])

    model.fit(X_scaled)

    # ARI between the reference labels and the refit labels.
    scores.append(adjusted_rand_score(customers_labels, model.labels_))

scores

In [None]:
# Line plot of the ARI scores collected so far.
fig, ax = plt.subplots()
ax.set_title("ARI scores per month difference")
ax.plot(scores)

In [None]:
# ARI between the reference clustering and a KMeans refit on the data returned
# by get_data_until(days=d), for d = 30, 37, ..., 86.
#
# adjusted_rand_score compares the two label vectors position by position, so
# both must describe the same customers in the same order. The reference labels
# are therefore keyed by customer id and looked up in the row order of each
# snapshot, instead of relying on `customers` and the snapshot sharing an order.
reference_labels = pd.Series(
    customers_true_labels,
    index=customers["customer_unique_id"].to_numpy(),
)
assert reference_labels.index.is_unique, "customer_unique_id must be unique in `customers`"

scores = []

for d in tqdm(range(30, 90, 7)):
    # Some customers present in the full dataset are missing from this snapshot.
    data_m = get_data_until(days=d)

    # Keep only snapshot customers that have a reference label, in snapshot order.
    data_m = data_m[data_m.customer_unique_id.isin(reference_labels.index)]
    customers_labels = reference_labels.loc[data_m.customer_unique_id].to_numpy()

    # Transform with the scaler fitted on the reference data.
    X_scaled = scaler.transform(data_m[cols])

    model.fit(X_scaled)

    # Store (days, ARI) pairs so the scores can be plotted against the offset.
    scores.append((d, adjusted_rand_score(customers_labels, model.labels_)))

for (d, score) in scores:
    print(f"{d=} ; {score=}")

In [None]:
# Print every (days, ARI) pair collected in `scores`.
# NOTE(review): this repeats the printout at the end of the previous cell.
for (d, score) in scores:
    print(f"{d=} ; {score=}")

In [None]:
get_data_until(days=65)

In [None]:
# Split the (days, ARI) pairs into two columns and plot ARI against days.
X = np.array(scores)
days, scores_d = X[:, 0], X[:, 1]

plt.title("ARI scores per day difference")
plt.plot(days, scores_d)

In [None]:
# Day-by-day ARI between the reference clustering and a KMeans refit on the
# data returned by get_data_until(days=d), for d = 50..64.
#
# adjusted_rand_score compares the two label vectors position by position, so
# both must describe the same customers in the same order. The reference labels
# are therefore keyed by customer id and looked up in the row order of each
# snapshot, instead of relying on `customers` and the snapshot sharing an order.
reference_labels = pd.Series(
    customers_true_labels,
    index=customers["customer_unique_id"].to_numpy(),
)
assert reference_labels.index.is_unique, "customer_unique_id must be unique in `customers`"

scores = []

for d in tqdm(range(50, 65, 1)):
    # Some customers present in the full dataset are missing from this snapshot.
    data_m = get_data_until(days=d)

    # Keep only snapshot customers that have a reference label, in snapshot order.
    data_m = data_m[data_m.customer_unique_id.isin(reference_labels.index)]
    customers_labels = reference_labels.loc[data_m.customer_unique_id].to_numpy()

    # Transform with the scaler fitted on the reference data.
    X_scaled = scaler.transform(data_m[cols])

    model.fit(X_scaled)

    # Store (days, ARI) pairs so the scores can be plotted against the offset.
    scores.append((d, adjusted_rand_score(customers_labels, model.labels_)))

X = np.array(scores)
days = X[:, 0]
scores_d = X[:, 1]

plt.title("ARI scores per day difference")
plt.plot(days, scores_d)

In [None]:
# Recency and amount only: the two columns used for the X1 / X2 comparison below.
RM = ["recency", "amount"]

In [None]:
X2 = get_data_until(days=56)

In [None]:
# Rows of `customers` whose id also appears in X2, renumbered from 0.
# NOTE(review): rows keep the order of `customers`, which may differ from X2's order — confirm.
X1 = customers[customers.customer_unique_id.isin(X2.customer_unique_id)].reset_index(drop=True)

In [None]:
X2

In [None]:
X1

In [None]:
# Row-by-row check that X1 and X2 list the same customer at each position.
# NOTE(review): the ARI computed further down compares labels positionally, so this should be all True — confirm.
(X1.customer_unique_id == X2.customer_unique_id).value_counts()

In [None]:
X1[RM].describe()

In [None]:
X2[RM].describe()

In [None]:
# Standardize both frames with the already-fitted `scaler` (no refit), so they share one scale.
X1_std = scaler.transform(X1[RM])
X2_std = scaler.transform(X2[RM])

In [None]:
X1_std

In [None]:
X2_std

In [None]:
# Fit the same KMeans object on each standardized frame and compare the two partitions.
model.fit(X1_std)
LX1 = model.labels_
model.fit(X2_std)
LX2 = model.labels_

# NOTE(review): the two label vectors are compared position by position, so this
# is only meaningful if X1 and X2 hold the same customers in the same row order — confirm.
adjusted_rand_score(LX1, LX2)

In [None]:
pca = PCA(n_components=2, random_state=0)

In [None]:
pca.fit(X1_std)

In [None]:
X1_pca = pca.transform(X1_std)
X2_pca = pca.transform(X2_std)

In [None]:
sns.scatterplot(x=X1_pca.T[0], y=X1_pca.T[1], hue=LX1)

In [None]:
sns.scatterplot(x=X2_pca.T[0], y=X2_pca.T[1], hue=LX2)

In [None]:
customers.columns

In [None]:
# Wider feature list used by the correlation heatmap and the t-SNE projection below.
cols = ['recency', 'delivery_delay', 'estimation_error',
       'number_of_orders', 'respected_ratio', 'amount', 'lat', 'lng',
       'frequency', 'freight_value', 'review_answer_delay',
       'review_score', 'review_level']

# Rebinds `scaler` to a fresh, unfitted instance; the scaler fitted earlier is no longer reachable by this name.
scaler = StandardScaler()

In [None]:
# Pairwise correlation matrix of the selected columns, drawn as a heatmap.
corr = customers[cols].corr()

sns.heatmap(corr)

In [None]:
X = customers[cols]
X_std = scaler.fit_transform(X)

In [None]:
# 2D t-SNE of the standardized features; the PCA pre-reduction below is left disabled.
# pca = PCA(n_components=6)
# X_pca = pca.fit_transform(X_std)
# print("Reduced dimension")
projector = TSNE(n_components=2, n_jobs=-1, random_state=0, n_iter_without_progress=200, perplexity=100)
# fit() stores the result on the estimator; later cells read `projector.embedding_`.
projector.fit(X_std)

In [None]:
projector.embedding_

In [None]:
y = projector.embedding_.T

In [None]:
sns.scatterplot(x=y[0], y=y[1], hue=customers["segment"])

In [None]:
customers.columns

In [None]:
# Feature subset for the last clustering experiment.
# The trailing '' entry was removed: it is not a column name, and selecting it
# with `customers.loc[:, cols]` raises a KeyError unless a column is literally
# named ''.
cols = ['amount', 'frequency', 'estimation_error',
        'delivery_delay', 'review_level']

In [None]:
# Refit the scaler on a row subset of the selected columns.
# NOTE: `.loc[:10000]` slices by index label and includes the end label; the
# plotting cells below use the same slice for `hue`, which keeps them aligned.
X_std = scaler.fit_transform(customers.loc[:10000, cols])

In [None]:
# Elbow study for k in [2, 12) on the standardized subset.
elbow_kmeans = KMeans(random_state=0, n_init="auto")
viz = KElbowVisualizer(elbow_kmeans, k=(2, 12), timings=False)
viz.fit(X_std)
viz.show();

In [None]:
# Five-cluster KMeans on the standardized subset; keep the fitted labels.
model = KMeans(n_clusters=5, random_state=0, n_init="auto")
labels = model.fit(X_std).labels_

In [None]:
# 2D t-SNE of the standardized subset (the earlier perplexity=60 variant is kept commented out).
#projector = TSNE(n_components=2, n_jobs=-1, random_state=0, perplexity=60, learning_rate=100)
projector = TSNE(n_components=2, n_jobs=-1, random_state=0, perplexity=50, learning_rate=100)
projector.fit(X_std)
# Transposed so that y[0] and y[1] are the two embedding coordinates.
y = projector.embedding_.T

In [None]:
# Scatter of the t-SNE embedding coloured by segment (the KMeans-label colouring is kept commented out).
# The hue uses the same `.loc[:10000]` slice as the data that was embedded.
#sns.scatterplot(x=y[0], y=y[1], hue=labels)
sns.scatterplot(x=y[0], y=y[1], hue=customers.loc[:10000, "segment"])

In [None]:
# Two-component PCA of the standardized subset, coloured by segment.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_std)
y = X_pca.T
first_component, second_component = y
sns.scatterplot(
    x=first_component,
    y=second_component,
    hue=customers.loc[:10000, "segment"],
)