In [1]:
import pandas as pd
import datetime as dt

In [2]:
# Read the cleaned transaction export and convert InvoiceDate from text to
# datetime so date arithmetic works in the cells that follow.
df = pd.read_csv("clean_retail_data.csv").assign(
    InvoiceDate=lambda frame: pd.to_datetime(frame["InvoiceDate"])
)


In [3]:
# Sanity-check the loaded frame before aggregating.
# This cell previously repeated the CSV load from the cell above verbatim; the
# second read produced an identical `df`, so it has been replaced with a check
# that the columns used by the RFM aggregation below are actually present.
required_columns = {"CustomerID", "InvoiceDate", "Invoice", "TotalPrice"}
missing_columns = required_columns - set(df.columns)
assert not missing_columns, (
    f"clean_retail_data.csv is missing required columns: {sorted(missing_columns)}"
)


In [4]:
# Reference point for recency: one day after the latest invoice in the data,
# so the most recent purchase gets a recency of at least one day.
latest_invoice = df["InvoiceDate"].max()
snapshot_date = latest_invoice + dt.timedelta(days=1)


In [5]:
# Build one row per customer with the three RFM inputs:
#   Recency   - whole days between snapshot_date and the customer's last invoice date
#   Frequency - number of distinct invoices
#   Monetary  - sum of TotalPrice
# Named aggregation labels each output column directly, so no positional
# renaming of the columns is needed afterwards.
rfm = (
    df.groupby("CustomerID")
    .agg(
        Recency=("InvoiceDate", lambda dates: (snapshot_date - dates.max()).days),
        Frequency=("Invoice", "nunique"),
        Monetary=("TotalPrice", "sum"),
    )
    .reset_index()
)

rfm.head()


Unnamed: 0,CustomerID,Recency,Frequency,Monetary
0,12346,326,12,77556.46
1,12347,2,8,5633.32
2,12348,75,5,2019.4
3,12349,19,4,4428.69
4,12350,310,1,334.4


In [9]:
# RFM scores are derived from quantile bins (q=4) of each metric.
# duplicates="drop" merges bins whose edges coincide, so a heavily tied metric
# can end up with fewer than four bins instead of raising an error.

# Recency score:
# Bin codes grow with the number of days since the last purchase, which is the
# opposite of "better", so the codes are flipped: the most recent customers get
# the highest score and the least recent get 1.
recency_bins = pd.qcut(rfm["Recency"], q=4, duplicates="drop")
recency_codes = recency_bins.cat.codes

rfm["R"] = recency_codes.max() - recency_codes + 1


In [10]:
# Frequency score:
# More invoices is better, so the ordinal bin code is used directly (shifted
# to start at 1). Because duplicate bin edges are dropped, a skewed frequency
# distribution can yield fewer than four bins, in which case the top score
# is below 4 while the ordering of customers is still preserved.
frequency_bins = pd.qcut(rfm["Frequency"], q=4, duplicates="drop")
rfm["F"] = frequency_bins.cat.codes + 1


In [11]:
# Monetary score:
# Higher total spend is better, so the ordinal bin code (shifted to start at 1)
# is the score. Duplicate bin edges are dropped rather than raising.
monetary_bins = pd.qcut(rfm["Monetary"], q=4, duplicates="drop")
rfm["M"] = monetary_bins.cat.codes + 1


In [12]:
# Composite RFM score used for segmentation: the row-wise sum of the three
# component scores.
score_columns = ["R", "F", "M"]
rfm["RFM_Score"] = rfm[score_columns].sum(axis=1)


In [13]:
# Summary statistics of the three component scores, used to check their
# ranges and spread.
component_scores = ["R", "F", "M"]
rfm[component_scores].describe()


Unnamed: 0,R,F,M
count,5878.0,5878.0,5878.0
mean,2.511058,1.671317,2.5
std,1.122631,0.814095,1.118281
min,1.0,1.0,1.0
25%,2.0,1.0,1.25
50%,3.0,1.0,2.5
75%,4.0,2.0,3.75
max,4.0,3.0,4.0


In [14]:
# Count customers per frequency score, ordered by score, to see how many
# frequency bins survived after duplicate bin edges were dropped.
rfm["F"].value_counts(sort=False).sort_index()


F
1    3231
2    1348
3    1299
Name: count, dtype: int64

In [15]:
# Write the RFM feature table to CSV without the positional index column.
output_path = "rfm_table.csv"
rfm.to_csv(output_path, index=False)
