# Prédiction de la probabilité de réachat

L’objectif de ce notebook est de développer un modèle supervisé permettant
d’anticiper la probabilité qu’un client effectue un nouvel achat, à partir
de ses comportements passés et de son segment d’appartenance.

Ce modèle vise à orienter les actions marketing et commerciales, notamment
les campagnes de fidélisation et de réactivation.


In [1]:
import pandas as pd
from pathlib import Path

PROJECT_DIR = Path.cwd().parent
INTERIM_DIR = PROJECT_DIR / "data" / "interim"

orders_clean = pd.read_parquet(INTERIM_DIR / "orders_clean.parquet")
client_clusters = pd.read_parquet(
    INTERIM_DIR / "client_features_with_clusters.parquet"
)


In [2]:
# calcule du nombre de commande par clients
orders_per_client = (
    orders_clean
    .groupby("customer_unique_id")
    .size()
    .reset_index(name="nb_orders")
)


In [3]:
# creationde la cible
orders_per_client["repurchase"] = (
    orders_per_client["nb_orders"] > 1
).astype(int)

orders_per_client["repurchase"].value_counts(normalize=True)


repurchase
0    0.969997
1    0.030003
Name: proportion, dtype: float64

In [4]:
# construction du dataset de modelisation
repurchase_data = (
    client_clusters
    .merge(
        orders_per_client[["customer_unique_id", "repurchase"]],
        on="customer_unique_id",
        how="left"
    )
)

repurchase_data["repurchase"] = repurchase_data["repurchase"].fillna(0)
repurchase_data.shape


(93358, 11)

In [5]:
# definition de X et Y
y = repurchase_data["repurchase"]


In [6]:
# varaibles explicatives 
features_repurchase = [
    "cluster",
    "recency",
    "frequency",
    "monetary",
    "avg_basket",
    "avg_review_score",
    "avg_delivery_delay",
    "nb_categories"
]

X = repurchase_data[features_repurchase]


In [7]:
# Traine=test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


# Modélisation

In [9]:
# regression logistique pondéréé
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

log_reg = LogisticRegression(
    max_iter=1000,
    class_weight="balanced",
    random_state=42
)

log_reg.fit(X_train, y_train)

y_pred_lr = log_reg.predict(X_test)
y_proba_lr = log_reg.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred_lr))
print("ROC AUC :", roc_auc_score(y_test, y_proba_lr))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18112
           1       1.00      1.00      1.00       560

    accuracy                           1.00     18672
   macro avg       1.00      1.00      1.00     18672
weighted avg       1.00      1.00      1.00     18672

ROC AUC : 1.0


In [10]:
# Random Forest pondéré
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    min_samples_leaf=50,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred_rf))
print("ROC AUC :", roc_auc_score(y_test, y_proba_rf))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18112
           1       1.00      1.00      1.00       560

    accuracy                           1.00     18672
   macro avg       1.00      1.00      1.00     18672
weighted avg       1.00      1.00      1.00     18672

ROC AUC : 1.0


Option 1

In [11]:
cutoff_date = pd.Timestamp("2018-06-01")

# Commandes après T
orders_after_T = orders[
    orders["order_purchase_timestamp"] > cutoff_date
]

# Clients qui rachètent après T
repurchasers = (
    orders_after_T
    .merge(customers[["customer_id", "customer_unique_id"]], on="customer_id")
    ["customer_unique_id"]
    .unique()
)

# Création de la cible
client_features["repurchase_after_T"] = (
    client_features["customer_unique_id"]
    .isin(repurchasers)
    .astype(int)
)

client_features["repurchase_after_T"].value_counts(normalize=True)


NameError: name 'orders' is not defined

In [None]:
orders_before_T = orders[
    orders["order_purchase_timestamp"] <= cutoff_date
]


In [None]:
X = features clients calculées AVANT T
y = repurchase_after_T
