In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme first; the rcParams overrides below are set afterwards.
sns.set()
# Ask the inline backend to render figures as SVG.
get_ipython().run_line_magic('config', "InlineBackend.figure_format = 'svg'")
# Default figure size and colormap for the plots in this notebook.
plt.rcParams.update({
    'figure.figsize': (8, 5),
    'image.cmap': 'viridis',
})

In [10]:
# Load the invoice export; InvoiceDate is parsed into datetimes at read time.
df = pd.read_csv(
    'RFM_ht_data.csv',
    parse_dates=['InvoiceDate'],
    low_memory=False,
)

In [11]:
# Preview the loaded invoices (the rendered table shows only the first and last rows).
df

Unnamed: 0,InvoiceNo,CustomerCode,InvoiceDate,Amount
0,C0011810010001,19067290,2020-09-01,1716.00
1,C0011810010017,13233933,2020-09-01,1489.74
2,C0011810010020,99057968,2020-09-01,151.47
3,C0011810010021,80007276,2020-09-01,146.72
4,C0011810010024,13164076,2020-09-01,104.00
...,...,...,...,...
332725,S0081810310459,14092500,2020-09-30,3801.87
332726,S0081810310461,99065678,2020-09-30,5769.88
332727,S0081810310462,19029918,2020-09-30,736.88
332728,S0081810310463,13020033,2020-09-30,1475.20


In [13]:
# Inspect the dtype pandas assigned to each column on load.
df.dtypes

InvoiceNo               object
CustomerCode            object
InvoiceDate     datetime64[ns]
Amount                 float64
dtype: object

#### Какое максимальное кол-во покупок было совершено одним пользователем?

In [16]:
# Number of invoices per customer, largest first; the top row answers the question.
(
    df.groupby('CustomerCode', as_index=False)['InvoiceNo']
      .count()
      .sort_values(by='InvoiceNo', ascending=False)
)

Unnamed: 0,CustomerCode,InvoiceNo
89388,19057820,204
44594,13215452,113
10347,13032521,106
97077,19080880,99
119951,99003061,90
...,...,...
58910,13272861,1
58911,13272871,1
58913,13272875,1
58914,13272878,1


#### RFM analysis

In [30]:
# Reference date for recency: the most recent invoice in the whole dataset.
# Computed once up front instead of once per customer inside the aggregation.
snapshot_date = df['InvoiceDate'].max()

# One row per customer with the three RFM metrics.
rfmTable = (
    df.groupby('CustomerCode')
      .agg(
          recency=('InvoiceDate', 'max'),       # last invoice date; turned into days below
          frequency=('InvoiceNo', 'size'),      # Frequency: number of invoices
          monetary_value=('Amount', 'sum'),     # Monetary value: total amount over all invoices
      )
      # Recency: whole days between the customer's last invoice and the reference date.
      .assign(recency=lambda t: (snapshot_date - t['recency']).dt.days.astype(int))
      .reset_index()
)

In [31]:
# Preview the per-customer recency / frequency / monetary_value table.
rfmTable

Unnamed: 0,CustomerCode,recency,frequency,monetary_value
0,02213019,19,1,1609.20
1,02213042,22,3,9685.48
2,02213071,29,1,415.00
3,02213088,23,1,305.00
4,02213092,25,1,1412.88
...,...,...,...,...
123728,99099927,10,1,961.10
123729,99099936,0,1,1521.78
123730,99099959,8,2,1444.56
123731,99099963,19,1,3018.91


In [32]:
# Quartile cut-offs for the three RFM metrics only.
# The metric columns are selected explicitly so the string CustomerCode column
# stays out of the computation: older pandas dropped non-numeric columns
# silently, while pandas 2.x (numeric_only=False by default) raises on them.
quantiles = rfmTable[['recency', 'frequency', 'monetary_value']].quantile(q=[0.25, 0.5, 0.75])

In [33]:
# Work on an independent copy so that adding the quartile columns below does
# not also modify rfmTable (plain assignment would only create an alias).
rfmSegmentation = rfmTable.copy()

In [34]:

def RClass(value, parameter_name, quantiles_table):
    """Map a value to a quartile class from 1 to 4, lowest values first.

    Args:
        value: The number to classify.
        parameter_name: Key selecting the cut-offs inside ``quantiles_table``.
        quantiles_table: Mapping indexable as ``table[parameter_name][q]`` for
            ``q`` in 0.25, 0.50 and 0.75.

    Returns:
        1 if ``value`` is at or below the 0.25 cut-off, 2 if at or below the
        0.50 cut-off, 3 if at or below the 0.75 cut-off, otherwise 4.
    """
    cutoffs = quantiles_table[parameter_name]
    for quartile_class, level in enumerate((0.25, 0.50, 0.75), start=1):
        if value <= cutoffs[level]:
            return quartile_class
    # Above every cut-off (or not comparable, e.g. NaN).
    return 4


def FMClass(value, parameter_name, quantiles_table):
    """Map a value to a quartile class from 4 to 1, lowest values getting 4.

    Args:
        value: The number to classify.
        parameter_name: Key selecting the cut-offs inside ``quantiles_table``.
        quantiles_table: Mapping indexable as ``table[parameter_name][q]`` for
            ``q`` in 0.25, 0.50 and 0.75.

    Returns:
        4 if ``value`` is at or below the 0.25 cut-off, 3 if at or below the
        0.50 cut-off, 2 if at or below the 0.75 cut-off, otherwise 1.
    """
    cutoffs = quantiles_table[parameter_name]
    if value <= cutoffs[0.25]:
        return 4
    if value <= cutoffs[0.50]:
        return 3
    if value <= cutoffs[0.75]:
        return 2
    # Above every cut-off (or not comparable, e.g. NaN).
    return 1


In [43]:
# Score every customer on each RFM dimension against the quartile cut-offs.
rfmSegmentation['R_Quartile'] = rfmSegmentation['recency'].apply(
    RClass, parameter_name='recency', quantiles_table=quantiles)
rfmSegmentation['F_Quartile'] = rfmSegmentation['frequency'].apply(
    FMClass, parameter_name='frequency', quantiles_table=quantiles)
rfmSegmentation['M_Quartile'] = rfmSegmentation['monetary_value'].apply(
    FMClass, parameter_name='monetary_value', quantiles_table=quantiles)

# Concatenate the three scores (R, F, M order) into one label such as '443'.
rfmSegmentation['RFMClass'] = (
    rfmSegmentation['R_Quartile'].astype(str)
    + rfmSegmentation['F_Quartile'].astype(str)
    + rfmSegmentation['M_Quartile'].astype(str)
)

In [44]:
# Preview the table with the quartile scores and the combined RFMClass label.
rfmSegmentation

Unnamed: 0,CustomerCode,recency,frequency,monetary_value,R_Quartile,F_Quartile,M_Quartile,RFMClass
0,02213019,19,1,1609.20,4,4,3,443
1,02213042,22,3,9685.48,4,2,1,421
2,02213071,29,1,415.00,4,4,4,444
3,02213088,23,1,305.00,4,4,4,444
4,02213092,25,1,1412.88,4,4,3,443
...,...,...,...,...,...,...,...,...
123728,99099927,10,1,961.10,3,4,3,343
123729,99099936,0,1,1521.78,1,4,3,143
123730,99099959,8,2,1444.56,2,3,3,233
123731,99099963,19,1,3018.91,4,4,2,442


#### Какая верхняя граница у суммы покупок у пользователей с классом 4 в подсегменте М? (Другими словами: пользователи, у которых сумма покупок от 0 до Х попадают в 4 класс в подсегменте М)

In [45]:
# Customers in M class 4, largest total first; the top row shows the upper bound.
in_m_class_4 = rfmSegmentation['M_Quartile'] == 4
rfmSegmentation.loc[in_m_class_4].sort_values(by='monetary_value', ascending=False)

Unnamed: 0,CustomerCode,recency,frequency,monetary_value,R_Quartile,F_Quartile,M_Quartile,RFMClass
113796,70024801,15,2,765.00,3,3,4,334
94193,19071804,4,1,765.00,2,4,4,244
11025,13036504,13,2,765.00,3,3,4,334
11336,13038952,19,1,765.00,4,4,4,444
107006,35039364,2,1,765.00,1,4,4,144
...,...,...,...,...,...,...,...,...
13696,13052590,11,1,9.50,3,4,4,344
73931,19006298,15,1,5.00,3,4,4,344
35219,13164125,28,1,2.64,4,4,4,444
108808,35076038,17,1,0.00,4,4,4,444


#### Какая нижняя граница у количества покупок у пользователей с классом 1 в подсегменте F?

In [46]:
# Customers in F class 1, fewest purchases first; the top row shows the lower bound.
in_f_class_1 = rfmSegmentation['F_Quartile'] == 1
rfmSegmentation.loc[in_f_class_1].sort_values(by='frequency')

Unnamed: 0,CustomerCode,recency,frequency,monetary_value,R_Quartile,F_Quartile,M_Quartile,RFMClass
6,02213129,1,4,2199.00,1,1,2,112
58850,13272658,3,4,8738.05,2,1,1,211
58870,13272725,0,4,4822.24,1,1,1,111
58882,13272763,7,4,1326.08,2,1,3,213
58886,13272778,0,4,8941.10,1,1,1,111
...,...,...,...,...,...,...,...,...
119951,99003061,0,90,398759.35,1,1,1,111
97077,19080880,27,99,169930.72,4,1,1,411
10347,13032521,0,106,389309.92,1,1,1,111
44594,13215452,0,113,85334.40,1,1,1,111


#### Какое максимальное количество дней может пройти с момента последней покупки для того, чтобы пользователь попал в класс 2 в подсегменте R?

In [47]:
# Customers in R class 2, longest gap first; the top row shows the maximum days.
in_r_class_2 = rfmSegmentation['R_Quartile'] == 2
rfmSegmentation.loc[in_r_class_2].sort_values(by='recency', ascending=False)

Unnamed: 0,CustomerCode,recency,frequency,monetary_value,R_Quartile,F_Quartile,M_Quartile,RFMClass
60761,13280815,8,1,1159.33,2,4,3,243
49753,13234867,8,1,1972.00,2,4,2,242
49664,13234527,8,2,1165.00,2,3,3,233
49619,13234312,8,2,650.00,2,3,4,234
49596,13234215,8,1,5106.76,2,4,1,241
...,...,...,...,...,...,...,...,...
71821,18116243,3,1,6029.58,2,4,1,241
44878,13216509,3,2,4634.50,2,3,1,231
71840,18116543,3,5,18656.49,2,1,1,211
99451,19088392,3,8,5676.69,2,1,1,211


#### Сколько пользователей попало в сегменты 111 и 311?

In [50]:
# Non-null count per column for the customers in segment 111.
rfmSegmentation.loc[rfmSegmentation['RFMClass'] == '111'].count()

CustomerCode      9705
recency           9705
frequency         9705
monetary_value    9705
R_Quartile        9705
F_Quartile        9705
M_Quartile        9705
RFMClass          9705
dtype: int64

In [51]:
# Non-null count per column for the customers in segment 311.
rfmSegmentation.loc[rfmSegmentation['RFMClass'] == '311'].count()

CustomerCode      1609
recency           1609
frequency         1609
monetary_value    1609
R_Quartile        1609
F_Quartile        1609
M_Quartile        1609
RFMClass          1609
dtype: int64

#### В каком RFM-сегменте наибольшее и в каком наименьшее количество пользователей? Сколько пользователей попало в самый малочисленный сегмент?

In [55]:
# Customers per RFM segment, smallest segment first and largest last.
(
    rfmSegmentation
    .groupby('RFMClass', as_index=False)
    .agg(users=('CustomerCode', 'count'))
    .sort_values(by='users')
)

Unnamed: 0,RFMClass,users
51,414,2
35,314,33
3,114,60
19,214,60
55,424,63
...,...,...
16,211,5847
47,344,6593
62,443,6729
0,111,9705
