## Create RFM dataset

In [1]:
# Import libraries
import numpy as np 
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt

In [2]:
# Load the cleaned transaction data from the Excel workbook into a DataFrame
customer_data = pd.read_excel(
    io=r'C:\Users\argyr\OneDrive\Υπολογιστής\Diplvm\Giftware dataset\clean_data.xlsx'
)
# Preview the first rows to confirm the file was read as expected
customer_data.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01,6.95,13085,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01,6.75,13085,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01,6.75,13085,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01,2.1,13085,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01,1.25,13085,United Kingdom


In [3]:
# Dimensions of the loaded data as a (rows, columns) tuple
customer_data.shape

(800909, 8)

In [11]:
# The monetary value of a customer is summed from per-row totals, so first add a
# 'TotalValue' column holding Quantity multiplied by Price for every row
customer_data["TotalValue"] = customer_data["Quantity"].mul(customer_data["Price"])

In [12]:
# Recency is measured relative to the most recent invoice, so look up the
# latest date present in the 'InvoiceDate' column and show it
last_transaction_date = customer_data.loc[:, "InvoiceDate"].max()
print(last_transaction_date)

2011-12-09 00:00:00


In [13]:
# Set today_date to one day after the last transaction date (12/10/2011 for this
# dataset). It is derived from last_transaction_date instead of being hard-coded so
# the recency calculation stays correct if the input data changes.
# normalize() resets the time to midnight, so today_date is always a whole calendar day.
today_date = last_transaction_date.normalize() + pd.Timedelta(days=1)

In [14]:
# Group by Customer ID and calculate the raw RFM values:
#   InvoiceDate -> days between today_date and the customer's latest invoice (recency)
#   Invoice     -> number of distinct invoices (frequency)
#   TotalValue  -> sum of TotalValue over all of the customer's rows (monetary)
# Built-in aggregations are used instead of per-group Python lambdas so pandas can
# use its optimized groupby paths; the resulting columns and values are the same.
rfm = customer_data.groupby("Customer ID").agg(
    {"InvoiceDate": "max", "Invoice": "nunique", "TotalValue": "sum"}
)
# Convert each customer's latest invoice date into whole days before today_date
rfm["InvoiceDate"] = (today_date - rfm["InvoiceDate"]).dt.days
rfm.head()

Unnamed: 0_level_0,InvoiceDate,Invoice,TotalValue
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12346,432,12,368.36
12347,3,8,4921.53
12348,76,5,2019.4
12349,19,5,4419.49
12350,311,1,334.4


In [15]:
# Relabel the three aggregated columns, in order, with their RFM metric names
rfm = rfm.set_axis(["Recency", "Frequency", "Monetary"], axis="columns")
rfm.head()

Unnamed: 0_level_0,Recency,Frequency,Monetary
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12346,432,12,368.36
12347,3,8,4921.53
12348,76,5,2019.4
12349,19,5,4419.49
12350,311,1,334.4


In [16]:
# Dimensions of the RFM table as (rows, columns); there is one row per Customer ID group
rfm.shape

(5848, 3)

### Create RFM score

In [21]:
# Calculate RFM scores
# pd.qcut() splits each metric into 5 quantile bins and labels every customer 1-5.
# Recency labels run 5 -> 1, so the lowest day counts receive the highest score.
# Frequency is ranked with method="first" before binning: tied values get distinct
# ranks in order of appearance, so every value passed to qcut is unique.
rfm = rfm.assign(
    Recency_Score=pd.qcut(rfm["Recency"], q=5, labels=[5, 4, 3, 2, 1]),
    Frequency_Score=pd.qcut(
        rfm["Frequency"].rank(method="first"), q=5, labels=[1, 2, 3, 4, 5]
    ),
    Monetary_Score=pd.qcut(rfm["Monetary"], q=5, labels=[1, 2, 3, 4, 5]),
)
rfm.head()

Unnamed: 0_level_0,Recency,Frequency,Monetary,Recency_Score,Frequency_Score,Monetary_Score
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
12346,432,12,368.36,1,5,2
12347,3,8,4921.53,5,4,5
12348,76,5,2019.4,3,3,4
12349,19,5,4419.49,5,3,5
12350,311,1,334.4,2,1,2


In [22]:
# Join the three scores, as strings, in Recency-Frequency-Monetary order to form
# a single combined code per customer (e.g. "545")
rfm["RFM_Score"] = rfm["Recency_Score"].astype(str).str.cat(
    [rfm["Frequency_Score"].astype(str), rfm["Monetary_Score"].astype(str)]
)
rfm.head()

Unnamed: 0_level_0,Recency,Frequency,Monetary,Recency_Score,Frequency_Score,Monetary_Score,RFM_Score
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
12346,432,12,368.36,1,5,2,152
12347,3,8,4921.53,5,4,5,545
12348,76,5,2019.4,3,3,4,334
12349,19,5,4419.49,5,3,5,535
12350,311,1,334.4,2,1,2,212
