## Create RFM dataset

In [1]:
# Import libraries
import numpy as np 
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
# from sklearn.preprocessing import StandardScaler  <--  this is to normalize data before applying Kmeans
from sklearn.cluster import KMeans

%matplotlib ipympl


In [2]:
# Load the cleaned transactions workbook into a DataFrame and preview the first rows
clean_data_path = r'C:\Users\argyr\OneDrive\Υπολογιστής\Diplvm\Giftware dataset\clean_data.xlsx'
customer_data = pd.read_excel(clean_data_path)
customer_data.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01,6.95,13085,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01,6.75,13085,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01,6.75,13085,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01,2.1,13085,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01,1.25,13085,United Kingdom


In [3]:
# Size of the loaded data as (number of rows, number of columns)
customer_data.shape

(800909, 8)

In [4]:
# The monetary value per customer is built from a per-row 'TotalValue',
# i.e. Quantity multiplied by Price for every row of customer_data
customer_data["TotalValue"] = customer_data["Quantity"].mul(customer_data["Price"])

In [5]:
# Recency is measured relative to the most recent purchase in the data,
# so look up the latest InvoiceDate and show it
last_transaction_date = customer_data.loc[:, "InvoiceDate"].max()
print(last_transaction_date)

2011-12-09 00:00:00


In [6]:
# Use the day after the last transaction date (2011-12-10 for this dataset) as today_date.
# It is derived from last_transaction_date instead of being typed in by hand, so it
# stays correct if the underlying data is refreshed.
# normalize() drops any time-of-day, so today_date always falls at midnight
# (the same value the previous hard-coded date had).
today_date = pd.Timestamp(last_transaction_date).normalize() + pd.Timedelta(days=1)

In [7]:
# Group by Customer ID and calculate the raw RFM inputs, using pandas' built-in
# aggregations rather than per-group Python lambdas:
#   InvoiceDate -> latest purchase date per customer (converted to Recency below)
#   Invoice     -> number of non-null Invoice rows per customer (Frequency)
#   TotalValue  -> sum of TotalValue per customer (Monetary)
# NOTE(review): "count" counts every invoice row, so one invoice containing many
# products contributes many to Frequency; if Frequency is meant to be the number of
# distinct purchases, "nunique" would be the aggregation to use — confirm intent.
rfm = customer_data.groupby(["Customer ID"], as_index=False).agg(
    {"InvoiceDate": "max",
     "Invoice": "count",
     "TotalValue": "sum"})

# Recency: whole days elapsed between each customer's latest purchase and today_date
rfm["InvoiceDate"] = (today_date - rfm["InvoiceDate"]).dt.days

In [8]:
# Give the aggregated columns their RFM names and preview the result
rfm = rfm.rename(columns={
    "InvoiceDate": "Recency",
    "Invoice": "Frequency",
    "TotalValue": "Monetary",
})
rfm.head()

Unnamed: 0,Customer ID,Recency,Frequency,Monetary
0,12346,432,34,368.36
1,12347,3,222,4921.53
2,12348,76,51,2019.4
3,12349,19,179,4419.49
4,12350,311,17,334.4


In [9]:
# Store all three RFM measures as floats; Monetary is additionally rounded to 2 decimals
rfm = rfm.astype({"Recency": float, "Frequency": float, "Monetary": float})
rfm = rfm.round({"Monetary": 2})

In [10]:
# Size of the RFM table as (number of rows, number of columns); one row per Customer ID group
rfm.shape

(5848, 4)

In [11]:
# Save the RFM table as CSV, without the DataFrame index.
# to_csv returns None when it is given a file path, so the result is not assigned
# (the former `rfm_values` variable was always None).
rfm.to_csv(r'C:\Users\argyr\OneDrive\Υπολογιστής\Diplvm\Giftware dataset\rfm_values.csv', index=False)