In [1]:
# Library

# Basic Operation
import numpy as np
import pandas as pd

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris

from statsmodels.stats.weightstats import ztest

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Load the raw transaction data (one row per order) from the project-relative CSV.
df = pd.read_csv(filepath_or_buffer="data/rfm_data.csv")

In [5]:
# Inspect column names, dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   CustomerID          1000 non-null   int64  
 1   PurchaseDate        1000 non-null   object 
 2   TransactionAmount   1000 non-null   float64
 3   ProductInformation  1000 non-null   object 
 4   OrderID             1000 non-null   int64  
 5   Location            1000 non-null   object 
dtypes: float64(1), int64(2), object(3)
memory usage: 47.0+ KB


## PREPROCESSING
### GET RFM VALUES

In [6]:
# Reference date for recency: the most recent purchase date in the dataset.
# PurchaseDate is still an object (string) column at this point, so parse it
# BEFORE taking the maximum. A max over the raw strings is lexicographic and
# only agrees with the chronological maximum for zero-padded ISO dates.
max_date = pd.to_datetime(df["PurchaseDate"]).max()

In [7]:
# Broadcast the reference date (max_date) to every row as a LastPurchase column
# so it can be subtracted from PurchaseDate row by row.
# Note: this is the latest date across the whole dataset, not each customer's
# own last purchase.
df["LastPurchase"] = max_date

In [8]:
# Overwrite PurchaseDate with its parsed datetime version (it is loaded as an
# object/string column) so that date arithmetic can be done on it.
df["PurchaseDate"] = pd.to_datetime(df["PurchaseDate"])

In [9]:
# Recency: whole days between the reference date (LastPurchase) and each
# transaction's PurchaseDate. `.dt.days` converts the timedelta to an integer.
df["Recency"] = (df["LastPurchase"] - df["PurchaseDate"]).dt.days

In [10]:
# Frequency: number of non-null OrderID rows per customer.
# The result has two columns: CustomerID and Frequency.
freq = df.groupby("CustomerID", as_index=False).agg(Frequency=("OrderID", "count"))

In [11]:
# Attach each customer's Frequency to every one of their transaction rows in df
# (inner join on CustomerID, which is the default for merge).
df_new = df.merge(freq, on="CustomerID")

In [12]:
# Monetary: total TransactionAmount per customer.
# The result has two columns: CustomerID and Monetary.
mone = df.groupby("CustomerID", as_index=False).agg(Monetary=("TransactionAmount", "sum"))

In [13]:
# Attach each customer's Monetary total to the frame that already carries
# Frequency (inner join on CustomerID), giving the final per-transaction RFM table.
df_rfm = df_new.merge(mone, on="CustomerID")

In [14]:
# Preview the first rows to confirm the Recency, Frequency and Monetary columns exist.
df_rfm.head()

Unnamed: 0,CustomerID,PurchaseDate,TransactionAmount,ProductInformation,OrderID,Location,LastPurchase,Recency,Frequency,Monetary
0,8814,2023-04-11,943.31,Product C,890075,Tokyo,2023-06-10,60,1,943.31
1,2188,2023-04-11,463.7,Product A,176819,London,2023-06-10,60,1,463.7
2,4608,2023-04-11,80.28,Product A,340062,New York,2023-06-10,60,1,80.28
3,2559,2023-04-11,221.29,Product A,239145,London,2023-06-10,60,1,221.29
4,9482,2023-04-11,739.56,Product A,194545,Paris,2023-06-10,60,1,739.56


In [15]:
# Persist the RFM-enriched table for later use.
# NOTE(review): `index` is left at its default, so the row index is written as an
# unnamed first column; readers of this file need index_col=0 (or this call
# needs index=False) -- confirm which is intended.
# NOTE(review): assumes the "result/" directory already exists -- confirm.
df_rfm.to_csv("result/df_rfm_cleaned.csv")

In [17]:
# Quartile cut points (25th, 50th and 75th percentiles) of the three RFM metrics.
df_rfm.loc[:, ["Recency", "Frequency", "Monetary"]].quantile(q=[0.25, 0.5, 0.75])

Unnamed: 0,Recency,Frequency,Monetary
0.25,15.0,1.0,283.925
0.5,32.0,1.0,566.71
0.75,45.0,1.0,805.3725


In [18]:
# Quintile cut points (20th, 40th, 60th and 80th percentiles) of the three RFM
# metrics, i.e. the boundaries that split each metric into five bins.
df_rfm.loc[:, ["Recency", "Frequency", "Monetary"]].quantile(q=[0.2, 0.4, 0.6, 0.8])

Unnamed: 0,Recency,Frequency,Monetary
0.2,12.0,1.0,224.45
0.4,25.0,1.0,456.494
0.6,37.0,1.0,668.238
0.8,47.2,1.0,848.916


![](https://blog.rsquaredacademy.com/img/rfm_segments_table.png)