In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("../../data/df_rfm.csv")

# Display the first few rows of the data
df

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer_ID,Country,TotalSales
0,489434,85048,15CM_CHRISTMAS_GLASS_BALL_20_LIGHTS,12,2009-12-01 07:45:00,6.95,13085,United_Kingdom,83.40
1,489434,22041,"RECORD_FRAME_7""_SINGLE_SIZE",48,2009-12-01 07:45:00,2.10,13085,United_Kingdom,100.80
2,489434,21232,STRAWBERRY_CERAMIC_TRINKET_BOX,24,2009-12-01 07:45:00,1.25,13085,United_Kingdom,30.00
3,489434,22064,PINK_DOUGHNUT_TRINKET_POT,24,2009-12-01 07:45:00,1.65,13085,United_Kingdom,39.60
4,489434,21871,SAVE_THE_PLANET_MUG,24,2009-12-01 07:45:00,1.25,13085,United_Kingdom,30.00
...,...,...,...,...,...,...,...,...,...
689173,581587,22613,PACK_OF_20_SPACEBOY_NAPKINS,12,2011-12-09 12:50:00,0.85,12680,France,10.20
689174,581587,22899,CHILDREN'S_APRON_DOLLY_GIRL,6,2011-12-09 12:50:00,2.10,12680,France,12.60
689175,581587,23254,CHILDRENS_CUTLERY_DOLLY_GIRL,4,2011-12-09 12:50:00,4.15,12680,France,16.60
689176,581587,23255,CHILDRENS_CUTLERY_CIRCUS_PARADE,4,2011-12-09 12:50:00,4.15,12680,France,16.60


In [3]:
# montre moi toute les lignes de CUSTOMER_ID = 12346

df[df['Customer_ID'] == 12346]

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer_ID,Country,TotalSales
59832,499763,20682,RED_SPOTTY_CHILDS_UMBRELLA,1,2010-03-02 13:08:00,3.25,12346,United_Kingdom,3.25
59833,499763,20679,EDWARDIAN_PARASOL_RED,1,2010-03-02 13:08:00,5.95,12346,United_Kingdom,5.95
153054,513774,21524,DOORMAT_SPOTTY_HOME_SWEET_HOME,1,2010-06-28 13:53:00,7.49,12346,United_Kingdom,7.49
153055,513774,22692,DOORMAT_WELCOME_TO_OUR_HOME,1,2010-06-28 13:53:00,7.49,12346,United_Kingdom,7.49
153056,513774,22660,DOORMAT_I_LOVE_LONDON,1,2010-06-28 13:53:00,7.49,12346,United_Kingdom,7.49
153057,513774,22687,DOORMAT_CHRISTMAS_VILLAGE,1,2010-06-28 13:53:00,7.49,12346,United_Kingdom,7.49
153058,513774,22691,DOORMAT_WELCOME_SUNRISE,1,2010-06-28 13:53:00,7.49,12346,United_Kingdom,7.49
153059,513774,48111,DOORMAT_3_SMILEY_CATS,1,2010-06-28 13:53:00,7.49,12346,United_Kingdom,7.49
153060,513774,22690,DOORMAT_HOME_SWEET_HOME_BLUE,1,2010-06-28 13:53:00,7.49,12346,United_Kingdom,7.49
153061,513774,21523,DOORMAT_FANCY_FONT_HOME_SWEET_HOME,1,2010-06-28 13:53:00,7.49,12346,United_Kingdom,7.49


In [4]:
from datetime import timedelta

# Convertir InvoiceDate de l'objet au format datetime
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

# Obtenir la date maximale
snapshot_date = df['InvoiceDate'].max() + timedelta(days=1)

# Agréger les données (RFM : Récence, Fréquence, Valeur monétaire)
rfm = df.groupby(['Customer_ID']).agg({
    'InvoiceDate': lambda x: (snapshot_date - x.max()).days,  # Calcul de la récence
    'Invoice': 'count',  # Comptage du nombre de transactions (fréquence)
    'TotalSales': 'sum'  # Somme des ventes (valeur monétaire)
})

# Renommer les colonnes
rfm.rename(columns={'InvoiceDate': 'Recency',
                   'Invoice': 'Frequency',
                   'TotalSales': 'MonetaryValue'}, inplace=True)

# Afficher les premières lignes des données
rfm.head()



Unnamed: 0_level_0,Recency,Frequency,MonetaryValue
Customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12346,529,20,144.02
12347,2,210,4141.22
12348,75,46,1658.40
12349,19,151,3001.96
12350,310,13,258.00
...,...,...,...
18283,4,922,2478.85
18284,432,25,391.28
18285,661,9,331.20
18286,477,53,886.35


In [5]:
# ajoute les country pour chaque customer_id
df_country = df[['Customer_ID', 'Country']].drop_duplicates()

# Merge the two dataframes
rfm = pd.merge(rfm, df_country, on='Customer_ID')

# Display the first few rows of the data
rfm

Unnamed: 0,Customer_ID,Recency,Frequency,MonetaryValue,Country
0,12346,529,20,144.02,United_Kingdom
1,12347,2,210,4141.22,Iceland
2,12348,75,46,1658.40,Finland
3,12349,19,151,3001.96,Italy
4,12350,310,13,258.00,Norway
...,...,...,...,...,...
5758,18283,4,922,2478.85,United_Kingdom
5759,18284,432,25,391.28,United_Kingdom
5760,18285,661,9,331.20,United_Kingdom
5761,18286,477,53,886.35,United_Kingdom


In [6]:
# met le rfm dans un fichier csv
rfm.to_csv("../../data/df_rfm_model.csv" , index = 'Customer_ID')