In [25]:
import pandas as pd

In [26]:
# Load the invoice export: invoice and customer identifiers are read as text
# (customer codes such as 02213019 start with a zero), InvoiceDate as datetime.
df = pd.read_csv(
    'RFM_ht_data.csv',
    dtype={'InvoiceNo': 'str', 'CustomerCode': 'str'},
    parse_dates=['InvoiceDate'],
)

In [27]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,InvoiceNo,CustomerCode,InvoiceDate,Amount
0,C0011810010001,19067290,2020-09-01,1716.0
1,C0011810010017,13233933,2020-09-01,1489.74
2,C0011810010020,99057968,2020-09-01,151.47
3,C0011810010021,80007276,2020-09-01,146.72
4,C0011810010024,13164076,2020-09-01,104.0


In [28]:
# Confirm the column types produced by the dtype / parse_dates options.
df.dtypes

InvoiceNo               object
CustomerCode            object
InvoiceDate     datetime64[ns]
Amount                 float64
dtype: object

In [29]:
def _quartile_score(values, low_is_best):
    """Score a numeric Series from 1 (best) to 4 (worst) against its own quartiles.

    Parameters
    ----------
    values : pandas.Series
        Numeric metric, one entry per customer.
    low_is_best : bool
        If True, values at or below the 25th percentile score 1 and values
        above the 75th percentile score 4. If False the scale is reversed
        (at or below the 25th percentile -> 4, above the 75th -> 1).

    Returns
    -------
    pandas.Series
        int64 scores aligned with ``values``.
    """
    cuts = values.quantile([0.25, 0.5, 0.75])
    # Count the thresholds for which `x <= cut` does NOT hold (0..3).
    # Written as a negated `<=` rather than `>` so that values that fail every
    # comparison (e.g. NaN) land in the last bucket, exactly like an
    # if / elif / else chain of `x <= cut` tests would place them.
    exceeded = sum((~values.le(cut)).astype('int64') for cut in cuts)
    return exceeded + 1 if low_is_best else 4 - exceeded


def rfm_segmentation(df, current_date=None):
    """Build a per-customer RFM (recency, frequency, monetary) table.

    Parameters
    ----------
    df : pandas.DataFrame
        Invoice-level data with the columns ``CustomerCode``, ``InvoiceNo``,
        ``InvoiceDate`` (datetime) and ``Amount``.
    current_date : optional
        Reference date for recency. Anything ``pd.Timestamp`` accepts
        (Timestamp, datetime, date, ISO string). Defaults to the latest
        ``InvoiceDate`` found in ``df``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``CustomerCode`` with the columns
        ``days_since_last_purchase``, ``n_orders`` (distinct invoices),
        ``sum_purchases``, the quartile scores ``recency``, ``frequency``,
        ``money`` (1 = best, 4 = worst) and ``RFM_class``, the three scores
        concatenated as a string such as ``'143'``.

    Notes
    -----
    ``df`` itself is not modified.
    """
    # Reference date: the latest invoice in the data unless the caller supplies
    # one; normalise so strings and plain dates work in the subtraction below.
    if current_date is None:
        current_date = df['InvoiceDate'].max()
    else:
        current_date = pd.Timestamp(current_date)

    # Per customer: last purchase date, number of distinct invoices, total spend.
    data = df.groupby('CustomerCode').agg(
        last_purchase=('InvoiceDate', 'max'),
        n_orders=('InvoiceNo', 'nunique'),
        sum_purchases=('Amount', 'sum'),
    )

    # Whole days elapsed between the reference date and the last purchase.
    data['days_since_last_purchase'] = (current_date - data['last_purchase']).dt.days

    # Recency: fewer days is better. Frequency / monetary: larger is better.
    data['recency'] = _quartile_score(data['days_since_last_purchase'], low_is_best=True)
    data['frequency'] = _quartile_score(data['n_orders'], low_is_best=False)
    data['money'] = _quartile_score(data['sum_purchases'], low_is_best=False)

    # RFM_class is the three scores glued together, e.g. 1, 4, 3 -> '143'.
    data['RFM_class'] = (
        data['recency'].astype(str)
        + data['frequency'].astype(str)
        + data['money'].astype(str)
    )

    # Selecting the output columns also leaves the helper `last_purchase`
    # column behind; the index is already named 'CustomerCode' by the groupby.
    return data[['days_since_last_purchase', 'n_orders', 'sum_purchases',
                 'recency', 'frequency', 'money', 'RFM_class']]


In [30]:
# Segment every customer; with no current_date given, recency is measured
# from the latest InvoiceDate in df.
rfm_segmentation(df)

Unnamed: 0_level_0,days_since_last_purchase,n_orders,sum_purchases,recency,frequency,money,RFM_class
CustomerCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
02213019,19,1,1609.20,4,4,3,443
02213042,22,3,9685.48,4,2,1,421
02213071,29,1,415.00,4,4,4,444
02213088,23,1,305.00,4,4,4,444
02213092,25,1,1412.88,4,4,3,443
...,...,...,...,...,...,...,...
99099927,10,1,961.10,3,4,3,343
99099936,0,1,1521.78,1,4,3,143
99099959,8,2,1444.56,2,3,3,233
99099963,19,1,3018.91,4,4,2,442
