In [None]:
import numpy as np
import pandas as pd
import os
import sqlite3
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import datetime
import matplotlib.pyplot as plt

In [None]:
# Load the per-customer feature table exported by the exploration notebook.
rfm_data = pd.read_csv('/kaggle/input/bigot-benjamin-2-notebook-exploration-012025/customers_features.csv')

display(rfm_data.head())
# DataFrame.info() prints its report and returns None, so it is called directly
# instead of being wrapped in display() (which would render a stray "None").
rfm_data.info()

In [None]:
db_path = "/kaggle/input/olist-database/olist.db"

# Open the database read-only through a URI: a plain sqlite3.connect(path)
# silently creates a new empty database when the path is wrong, which would
# only surface later as "no such table" errors. With mode=ro the connection
# fails immediately if the file is missing, and the input dataset cannot be
# modified by accident.
conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)

# List the tables available in the database.
tables_query = "SELECT name FROM sqlite_master WHERE type='table';"
tables = pd.read_sql_query(tables_query, conn)
# Print the label (display() on a str shows its quoted repr) and render the
# table list with the rich DataFrame display.
print("Tables disponibles dans la base de données :")
display(tables)

# Load the full customers and orders tables into memory.
customers_data = pd.read_sql_query("SELECT * FROM customers", conn)
orders_data = pd.read_sql_query("SELECT * FROM orders", conn)

In [None]:
def create_features_for_date(conn, date_limit):
    """
    Build per-customer RFM features plus the mean review score, using only
    orders whose purchase timestamp is <= ``date_limit``.

    Parameters
    ----------
    conn : DB-API connection (sqlite3)
        Connection to a database exposing the ``customers``, ``orders``,
        ``order_pymts`` and ``order_reviews`` tables.
    date_limit : str or datetime-like
        Cut-off date. It is bound as a SQL parameter (its ``str()`` form) and
        also parsed with ``pd.to_datetime`` to compute recency.

    Returns
    -------
    pandas.DataFrame
        One row per customer with columns ``recency`` (whole days between
        ``date_limit`` and the customer's last order), ``frequency`` (number
        of distinct orders), ``monetary`` (sum of payment values) and
        ``avg_review_score`` (mean of the per-order review scores; NaN when
        the customer has no review).
    """
    # Payments and reviews are aggregated per order *before* being joined.
    # Joining the raw tables multiplies rows (one per payment x review), which
    # inflates COUNT(order_id) and SUM(payment_value). Orders without any
    # payment are still excluded (inner join); orders without a review are kept.
    query = """
    SELECT
        customers.customer_id,
        MAX(orders.order_purchase_timestamp) AS last_order_date,
        COUNT(DISTINCT orders.order_id) AS frequency,
        SUM(order_payment_totals.payment_total) AS monetary,
        AVG(order_review_means.review_score) AS avg_review_score
    FROM customers
    JOIN orders ON customers.customer_id = orders.customer_id
    JOIN (
        SELECT order_id, SUM(payment_value) AS payment_total
        FROM order_pymts
        GROUP BY order_id
    ) AS order_payment_totals ON orders.order_id = order_payment_totals.order_id
    LEFT JOIN (
        SELECT order_id, AVG(review_score) AS review_score
        FROM order_reviews
        GROUP BY order_id
    ) AS order_review_means ON orders.order_id = order_review_means.order_id
    WHERE orders.order_purchase_timestamp <= ?
    GROUP BY customers.customer_id
    """
    # Bind the date as a parameter instead of interpolating it into the SQL text.
    rfm = pd.read_sql_query(query, conn, params=(str(date_limit),))

    # Turn the last order date into "recency": days elapsed since the last order.
    date_limit_dt = pd.to_datetime(date_limit)
    rfm['last_order_date'] = pd.to_datetime(rfm['last_order_date'])
    rfm['recency'] = (date_limit_dt - rfm['last_order_date']).dt.days

    # Keep only the RFM columns and the mean review score.
    rfm = rfm[['recency', 'frequency', 'monetary', 'avg_review_score']]

    return rfm

In [None]:
# Look up the earliest and latest purchase timestamps stored in the orders
# table, to know which date range the simulation can cover.
query = """
SELECT 
    MIN(order_purchase_timestamp) AS first_order_date,
    MAX(order_purchase_timestamp) AS last_order_date
FROM orders;
"""

# Single-row result: first_order_date / last_order_date.
dates = pd.read_sql_query(sql=query, con=conn)
display(dates)

In [None]:
# Reference date T0 of the simulation.
start_date = "2018-04-17"

# Columns fed to the scaler and to KMeans, in a fixed order.
FEATURE_COLUMNS = ['recency', 'frequency', 'monetary', 'avg_review_score']

# Build the RFM features as of T0.
rfm_F0 = create_features_for_date(conn, start_date)
# Customers without any review get the mean score. Assign the result back
# instead of calling fillna(inplace=True) on a column selection: that chained
# in-place form raises a FutureWarning in pandas 2.x and does not modify the
# DataFrame once Copy-on-Write is enabled.
rfm_F0['avg_review_score'] = rfm_F0['avg_review_score'].fillna(rfm_F0['avg_review_score'].mean())

# Standardise the features; this scaler is part of the initial model M0.
scaler_M0 = StandardScaler().fit(rfm_F0[FEATURE_COLUMNS])
rfm_F0_scaled = scaler_M0.transform(rfm_F0[FEATURE_COLUMNS])

# Fit the initial KMeans model (M0).
kmeans_M0 = KMeans(n_clusters=5, random_state=42).fit(rfm_F0_scaled)

In [None]:
# Simulation window: one snapshot per week from T0 (start_date) to end_date.
end_date = "2018-10-17"
date_interval = datetime.timedelta(days=7)

current_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
end_date_dt = datetime.datetime.strptime(end_date, "%Y-%m-%d")
ari_scores = []

# Columns fed to the scalers and to KMeans, in a fixed order.
FEATURE_COLUMNS = ['recency', 'frequency', 'monetary', 'avg_review_score']

# Walk through the window one week at a time.
while current_date <= end_date_dt:
    date_label = current_date.strftime("%Y-%m-%d")
    print(f"Calcul des clusters pour la date {date_label}")

    # Build the RFM + avg_review_score features as of the current date.
    rfm_Fi = create_features_for_date(conn, date_label)

    # Fill missing review scores with the snapshot mean. The result is assigned
    # back: fillna(inplace=True) on a column selection is chained assignment,
    # which warns in pandas 2.x and is a no-op under Copy-on-Write. The former
    # "column missing" fallback was removed: it read the very column it had
    # just found missing (KeyError), and the function always returns it.
    rfm_Fi['avg_review_score'] = rfm_Fi['avg_review_score'].fillna(rfm_Fi['avg_review_score'].mean())

    # Old pipeline M0: scale with the T0 scaler, predict with the T0 model.
    rfm_Fi_scaled = scaler_M0.transform(rfm_Fi[FEATURE_COLUMNS])
    Ci_init = kmeans_M0.predict(rfm_Fi_scaled)

    # Retrained pipeline M1: a real retrain refits the scaler too. Reusing the
    # T0 scaler here would feed M1 stale means/variances and understate drift.
    scaler_M1 = StandardScaler().fit(rfm_Fi[FEATURE_COLUMNS])
    rfm_Fi_rescaled = scaler_M1.transform(rfm_Fi[FEATURE_COLUMNS])
    kmeans_M1 = KMeans(n_clusters=5, random_state=42).fit(rfm_Fi_rescaled)
    Ci_new = kmeans_M1.labels_

    # ARI between the old model's assignments (M0) and the retrained ones (M1)
    # on the same customers; ARI is invariant to label permutation.
    ari = adjusted_rand_score(Ci_init, Ci_new)
    ari_scores.append((date_label, ari))
    print(f"Date : {date_label} | ARI : {ari:.4f}")

    # Move on to the next week.
    current_date += date_interval

# Plot the ARI over time against the 0.8 threshold.
ari_df = pd.DataFrame(ari_scores, columns=["Date", "ARI"])
plt.figure(figsize=(12, 6))
plt.plot(ari_df["Date"], ari_df["ARI"], marker="o", label="ARI")
plt.axhline(y=0.8, color="r", linestyle="--", label="Seuil ARI = 0.8")
plt.xticks(rotation=45)
plt.xlabel("Date")
plt.ylabel("ARI")
plt.title("Évolution de l'ARI dans le temps")
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Split the (date, ARI) pairs collected by the simulation into two tuples.
# NOTE(review): this rebinds `dates`, which an earlier cell uses for the
# first/last order date DataFrame — confirm nothing later relies on that.
dates, scores = zip(*ari_scores)

# Draw the ARI trajectory with the 0.8 threshold as a dashed reference line.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(dates, scores, marker='o')
ax.axhline(0.8, color='r', linestyle='--', label="Seuil ARI = 0.8")
ax.set_title("Évolution de l'ARI dans le temps")
ax.set_xlabel("Date")
ax.set_ylabel("ARI")
# Vertical tick labels so the weekly date strings do not overlap.
ax.tick_params(axis='x', rotation=90)
ax.legend()
ax.grid()
fig.tight_layout()
plt.show()