In [20]:
import os
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import mlflow
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, roc_auc_score, classification_report


In [21]:
# Configurer MLflow pour stockage local
mlflow.set_tracking_uri("file://" + os.path.abspath("./mlruns"))
mlflow.set_experiment("CLV_Predictive_Project")

2025/05/19 01:06:35 INFO mlflow.tracking.fluent: Experiment with name 'CLV_Predictive_Project' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///Users/akaoui/Desktop/Github/ml-projects/CLV/mlruns/757971774486516805', creation_time=1747609595118, experiment_id='757971774486516805', last_update_time=1747609595118, lifecycle_stage='active', name='CLV_Predictive_Project', tags={}>

In [24]:
# -------------------------------------------
# 1. MOCK DATA GENERATION
# -------------------------------------------
def generate_mock_data(n_customers=1000, start_date='2021-01-01', months=36, seed=42):
    """
    1. Génère un DataFrame clients avec date de 1er achat aléatoire
    2. Simule transactions (montant exponentiel + upsell binaire)
    3. Ajoute mois de cohorte = période de first_purchase_date
    """

    np.random.seed(seed)
    customers = pd.DataFrame({'customer_id': range(1, n_customers+1)})
    start = pd.to_datetime(start_date)
    end = start + pd.DateOffset(months=months)
    customers['first_purchase_date'] = customers['customer_id'].apply(
        lambda x: start + timedelta(days=np.random.randint(0, (end-start).days))
    )
    # Cohorte = mois (YYYY-MM) de 1er achat
    customers['cohort_month'] = customers['first_purchase_date'].dt.to_period('M')

    # Simuler transactions
    transactions = []
    for _, row in customers.iterrows():
        cid, first = row['customer_id'], row['first_purchase_date']
        n_tx = np.random.poisson(5)
        for _ in range(n_tx):
            tx_date = first + timedelta(days=np.random.exponential(scale=90))
            if tx_date > end: continue
            amount = np.random.exponential(scale=100)
            upsell = np.random.binomial(1, 0.2)
            transactions.append({'customer_id': cid, 'tx_date': tx_date,
                                 'amount': amount, 'upsell': upsell})
    tx_df = pd.DataFrame(transactions)
    tx_df['tx_date'] = pd.to_datetime(tx_df['tx_date'])
    return customers, tx_df

# Exemple d'utilisation
customers, transactions = generate_mock_data()
#print(f"Clients: {len(customers)}, Transactions: {len(transactions)}")
print("Mock data: {} customers, {} transactions".format(
    customers.shape[0], transactions.shape[0]
))

Mock data: 1000 customers, 4609 transactions


In [25]:
print(customers.head)
display(transactions.head)

<bound method NDFrame.head of      customer_id first_purchase_date cohort_month
0              1          2023-05-11      2023-05
1              2          2023-11-11      2023-11
2              3          2021-05-02      2021-05
3              4          2022-04-12      2022-04
4              5          2021-11-27      2021-11
..           ...                 ...          ...
995          996          2021-05-08      2021-05
996          997          2021-02-11      2021-02
997          998          2021-02-10      2021-02
998          999          2023-07-23      2023-07
999         1000          2021-02-03      2021-02

[1000 rows x 3 columns]>


<bound method NDFrame.head of       customer_id                    tx_date      amount  upsell
0               1 2023-05-24 05:58:09.957665  230.277153       1
1               1 2023-07-31 21:15:49.356088   91.758372       0
2               1 2023-05-28 08:29:44.094693  245.820956       0
3               1 2023-06-23 11:31:09.575574   73.171693       0
4               2 2023-12-06 08:02:24.261277   49.314226       0
...           ...                        ...         ...     ...
4604          999 2023-11-16 08:58:11.165026    5.053437       0
4605         1000 2021-03-05 19:16:42.816585   36.583519       0
4606         1000 2021-04-07 20:50:57.156885    1.177164       0
4607         1000 2021-10-05 03:00:13.647492   37.183147       0
4608         1000 2021-04-01 16:16:29.288050   82.483656       1

[4609 rows x 4 columns]>

In [26]:
# -------------------------------------------
# 2. COHORT SEGMENTATION
# -------------------------------------------
# Compute cohort metrics
cohort_counts = customers.groupby('cohort_month').size().reset_index(name='n_customers')
print(cohort_counts.head)

<bound method NDFrame.head of    cohort_month  n_customers
0       2021-01           35
1       2021-02           30
2       2021-03           26
3       2021-04           33
4       2021-05           31
5       2021-06           25
6       2021-07           23
7       2021-08           27
8       2021-09           23
9       2021-10           19
10      2021-11           19
11      2021-12           23
12      2022-01           23
13      2022-02           23
14      2022-03           23
15      2022-04           27
16      2022-05           30
17      2022-06           23
18      2022-07           28
19      2022-08           20
20      2022-09           23
21      2022-10           33
22      2022-11           30
23      2022-12           33
24      2023-01           32
25      2023-02           28
26      2023-03           39
27      2023-04           33
28      2023-05           36
29      2023-06           25
30      2023-07           27
31      2023-08           33
32      2023-

In [27]:
# -------------------------------------------
# 3. FEATURE ENGINEERING
# -------------------------------------------
def build_features(customers, transactions, observation_end):
    """
    - Filtre transactions <= observation_end
    - Calcule Recency (jours), Frequency (# tx), Monetary (somme)
    - Normalise chaque métrique
    - Compte upsell_count et propension = upsell_count/frequency
    - One-hot encode cohort_month
    """
    # Filter transactions until observation_end
    tx = transactions[transactions['tx_date'] <= observation_end].copy()
    # RFM features
    agg = tx.groupby('customer_id').agg({
        'tx_date': [lambda x: (observation_end - x.max()).days,
                    lambda x: x.count()],
        'amount': 'sum'
    })
    agg.columns = ['recency_days', 'frequency', 'monetary']
    agg = agg.reset_index()
    
    # Normalize
    for col in ['recency_days', 'frequency', 'monetary']:
        agg[f'{col}_norm'] = (agg[col] - agg[col].mean()) / agg[col].std()
    
    # Merge cohort and upsell count
    upsell_count = tx.groupby('customer_id')['upsell'].sum().reset_index(name='upsell_count')
    agg = agg.merge(upsell_count, on='customer_id', how='left').fillna(0)

    # Probability of upsell: upsell_count / frequency
    agg['upsell_propensity'] = agg['upsell_count'] / agg['frequency']
    agg['upsell_propensity'] = agg['upsell_propensity'].fillna(0)

    # Merge cohort
    agg = agg.merge(customers[['customer_id', 'cohort_month']], on='customer_id', how='left')
    # One-hot encode cohorts
    cohort_dummies = pd.get_dummies(agg['cohort_month'].astype(str), prefix='cohort')
    agg = pd.concat([agg, cohort_dummies], axis=1)
    
    return agg

# Build features as of a cutoff date
observation_end = datetime(2023, 12, 31)
features = build_features(customers, transactions, observation_end)
print("Features shape:", features.shape)

Features shape: (972, 46)


In [28]:
# -------------------------------------------
# 4. TARGET DEFINITION: FUTURE CLV
# -------------------------------------------

def compute_targets(transactions, observation_end, horizon_days=365):
    """
    - CLV: somme des montants entre (observation_end, observation_end+horizon)
    - upsell_next_year: 1 si au moins un upsell durant cette période
    """
    start = observation_end
    end = observation_end + timedelta(days=horizon_days)
    future = transactions[(transactions['tx_date']>start)&(transactions['tx_date']<=end)]
    clv = future.groupby('customer_id')['amount'].sum().reset_index(name='future_clv')
    ups = future.groupby('customer_id')['upsell'].max().reset_index(name='upsell_next_year')
    return clv.merge(ups, on='customer_id', how='outer').fillna(0)

targets = compute_targets(transactions, observation_end)
print(targets.head())
print("Target distribution:", targets['future_clv'].describe())


   customer_id  future_clv  upsell_next_year
0          175    35.18473                 0
Target distribution: count     1.00000
mean     35.18473
std           NaN
min      35.18473
25%      35.18473
50%      35.18473
75%      35.18473
max      35.18473
Name: future_clv, dtype: float64


In [30]:
# -------------------------------------------
# 5. MODEL TRAINING: PREDICTIVE CLV
# -------------------------------------------

def train_clv_model(features, targets):
    X = features.drop(columns=['customer_id'])
    y = targets['future_clv']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    with mlflow.start_run(run_name='train_clv'):
        rf = RandomForestRegressor(n_estimators=100, random_state=42)
        rf.fit(X_train, y_train)
        preds = rf.predict(X_test)
        mse = mean_squared_error(y_test, preds)
        mlflow.log_metric('clv_mse', mse)
        mlflow.sklearn.log_model(rf, 'model_clv')
        print(f"[CLV] MSE: {mse:.2f}")
    return rf
# -------------------------------------------
# 6. UPSALE PROPENSITY CLASSIFICATION
# -------------------------------------------

def train_upsell_model(features, targets):
    X = features.drop(columns=['customer_id'])
    y = targets['upsell_next_year']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    with mlflow.start_run(run_name='train_upsell'):
        clf = LogisticRegression(max_iter=1000)
        clf.fit(X_train, y_train)
        probs = clf.predict_proba(X_test)[:,1]
        auc = roc_auc_score(y_test, probs)
        mlflow.log_metric('upsell_auc', auc)
        mlflow.sklearn.log_model(clf, 'model_upsell')
        print(f"[Upsell] AUC: {auc:.2f}")
    return clf

# Lancement des trainings
clv_model = train_clv_model(features, targets)
upsell_model = train_upsell_model(features, targets)


ValueError: Found input variables with inconsistent numbers of samples: [972, 1]

In [None]:
# -------------------------------------------
# 7. CONCLUSION ET SUITES POSSIBLES
# -------------------------------------------
# - Affiner le modèle avec hyperopt ou GridSearchCV
# - Intégration MLflow
# - Validation par retour terrain (expérimentations A/B)

In [None]:
# -------------------------------------------
# 5. MLOPS LOCAL SANS GCP
# -------------------------------------------

# 5.1 Containerisation : créer un Dockerfile
# ------------------------------------------------
# FROM python:3.9-slim
# WORKDIR /app
# COPY . /app
# RUN pip install -r requirements.txt
# ENTRYPOINT ["python", "clv_predictive_mlops_project.py"]

# 5.2 Orchestration : exemple de DAG Airflow
# ------------------------------------------------
# Placez ce fichier dans AIRFLOW_HOME/dags/clv_dag.py
# -------------------------------------------
# from airflow import DAG
# from airflow.operators.python import PythonOperator
# from datetime import datetime, timedelta
# 
# default_args = {'start_date': datetime(2023,1,1), 'retries': 1, 'retry_delay': timedelta(minutes=5)}
# with DAG('clv_pipeline', schedule_interval='@monthly', default_args=default_args) as dag:
#     task_gen = PythonOperator(task_id='generate_data', python_callable=generate_mock_data)
#     task_feat = PythonOperator(task_id='build_features', python_callable=build_features,
#                                 op_kwargs={'customers': customers, 'transactions': transactions, 'observation_end': observation_end})
#     task_target = PythonOperator(task_id='compute_targets', python_callable=compute_targets,
#                                 op_kwargs={'transactions': transactions, 'observation_end': observation_end})
#     task_clv = PythonOperator(task_id='train_clv', python_callable=train_clv_model,
#                               op_kwargs={'features': features, 'targets': targets})
#     task_upsell = PythonOperator(task_id='train_upsell', python_callable=train_upsell_model,
#                                  op_kwargs={'features': features, 'targets': targets})
#     task_gen >> task_feat >> task_target >> [task_clv, task_upsell]

# 5.3 Projet MLflow : MLproject (à placer racine)
# ------------------------------------------------
# name: CLV_Predictive_Project
# conda_env: conda.yaml
# entry_points:
#   main:
#     command: "python clv_predictive_mlops_project.py"

# 5.4 Déploiement local de modèle via MLflow
# ------------------------------------------------
# mlflow models serve -m ./mlruns/0/xxx/artifacts/model_clv -p 1234

# 5.5 Monitoring : Evidently pour drift detection
# ------------------------------------------------
# Installer: pip install evidently
# Créer un dashboard dans un notebook ou service web

# Fin du projet MLOps local