## Import Libraries

In [1]:
# Cell 1: Import Libraries
# Grouped as standard library first, then third-party.
import datetime as dt  # timedelta arithmetic for the snapshot date
import os  # portable path construction for the data files

import pandas as pd  # Excel I/O, groupby aggregations and merges

print("Libraries imported successfully.")

Libraries imported successfully.


## Load Cleaned Data

In [2]:
# Cell 2: Load Cleaned Data
# The preprocessed workbook lives in the 'data' folder, addressed relative to
# the notebooks directory.
data_folder = '../data'
cleaned_file_name = 'online_retail_dataset_preprocessing.xlsx'
cleaned_file_path = os.path.join(data_folder, cleaned_file_name)

print(f"Loading cleaned data from: {cleaned_file_path}")

# Read the workbook into the DataFrame that every later cell aggregates from.
df_cleaned = pd.read_excel(io=cleaned_file_path)

# Quick visual check: dimensions plus the leading rows.
print(f"Cleaned data loaded. Shape: {df_cleaned.shape}")
print("First 5 rows of cleaned data (with correct TotalPrice):")
display(df_cleaned.head(n=5))

Loading cleaned data from: ../data\online_retail_dataset_preprocessing.xlsx
Cleaned data loaded. Shape: (779425, 9)
First 5 rows of cleaned data (with correct TotalPrice):


Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,TotalPrice
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom,83.4
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,81.0
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,81.0
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom,100.8
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom,30.0


## Determine Snapshot Date for RFM Analysis

In [3]:
# Cell 3: Determine Snapshot Date for RFM Analysis
# Recency is measured against a fixed reference point (the "snapshot date").
# It is set to the most recent InvoiceDate in the data shifted forward by one
# day; the time-of-day of that latest invoice is carried over unchanged.
latest_invoice_date = df_cleaned['InvoiceDate'].max()
snapshot_date = latest_invoice_date + dt.timedelta(days=1)

print(f"The latest invoice date in the dataset is: {latest_invoice_date}")
print(f"The RFM analysis snapshot date is: {snapshot_date}")

The latest invoice date in the dataset is: 2011-12-09 12:50:00
The RFM analysis snapshot date is: 2011-12-10 12:50:00


## Calculate Recency

In [5]:
# Calculate Recency
# Recency = days elapsed between a customer's most recent invoice and the
# snapshot date. Lower values mean the customer purchased more recently.
# Note: snapshot_date keeps a time-of-day and `.dt.days` keeps only the
# whole-day component of the difference, so the value counts completed
# 24-hour periods rather than calendar days.
recency_df = (
    df_cleaned
    .groupby('Customer ID')['InvoiceDate']
    .max()                       # latest invoice timestamp per customer
    .rename('LastPurchaseDate')
    .reset_index()               # Customer ID back to a regular column
    .assign(
        Recency=lambda frame: (snapshot_date - frame['LastPurchaseDate']).dt.days
    )
)

print("Recency calculation complete. First 5 rows:")
display(recency_df.head())

Recency calculation complete. First 5 rows:


Unnamed: 0,Customer ID,LastPurchaseDate,Recency
0,12346.0,2011-01-18 10:01:00,326
1,12347.0,2011-12-07 15:52:00,2
2,12348.0,2011-09-25 13:13:00,75
3,12349.0,2011-11-21 09:51:00,19
4,12350.0,2011-02-02 16:01:00,310


In [9]:
# Largest Recency value in recency_df (the customer inactive for the longest time).
recency_df['Recency'].max()

np.int64(739)

In [10]:
# Smallest Recency value in recency_df (the most recently active customer).
recency_df['Recency'].min()

np.int64(1)

In [11]:
# Average Recency, in days, across all customers in recency_df.
recency_df['Recency'].mean()

np.float64(201.331915617557)

In [20]:
# (rows, columns) of recency_df; one row per Customer ID group.
recency_df.shape

(5878, 3)

## Calculate Frequency

In [6]:
# Calculate Frequency
# Frequency = number of distinct invoices per customer. Rows that share the
# same value in the 'Invoice' column are counted once, so a multi-line order
# contributes a single purchase. Higher values mean a more frequent buyer.
frequency_df = (
    df_cleaned
    .groupby('Customer ID')['Invoice']
    .nunique()
    .rename('Frequency')
    .reset_index()
)

print("Frequency calculation complete. First 5 rows:")
display(frequency_df.head())

Frequency calculation complete. First 5 rows:


Unnamed: 0,Customer ID,Frequency
0,12346.0,12
1,12347.0,8
2,12348.0,5
3,12349.0,4
4,12350.0,1


In [8]:
# Fewest distinct invoices recorded for any single customer.
frequency_df['Frequency'].min()

np.int64(1)

In [7]:
# Most distinct invoices recorded for any single customer.
frequency_df['Frequency'].max()

np.int64(398)

In [12]:
# Average number of distinct invoices per customer.
frequency_df['Frequency'].mean()

np.float64(6.289384144266758)

In [19]:
# (rows, columns) of frequency_df; one row per Customer ID group.
frequency_df.shape

(5878, 2)

## Calculate Monetary

In [13]:
# Cell 6: Calculate Monetary
# Monetary = sum of 'TotalPrice' over all of a customer's rows; higher values
# mean a bigger spender. 'TotalPrice' is taken from the cleaned file as-is
# (presumably Quantity * Price computed during preprocessing - TODO confirm).
monetary_df = (
    df_cleaned
    .groupby('Customer ID')['TotalPrice']
    .sum()
    .rename('Monetary')
    .reset_index()
)

print("Monetary calculation complete. First 5 rows:")
display(monetary_df.head())

Monetary calculation complete. First 5 rows:


Unnamed: 0,Customer ID,Monetary
0,12346.0,77556.46
1,12347.0,4921.53
2,12348.0,2019.4
3,12349.0,4428.69
4,12350.0,334.4


In [18]:
# (rows, columns) of monetary_df; one row per Customer ID group.
monetary_df.shape

(5878, 2)

In [14]:
# Highest total spend accumulated by any single customer.
monetary_df['Monetary'].max()

np.float64(580987.04)

In [15]:
# Lowest total spend accumulated by any single customer.
monetary_df['Monetary'].min()

np.float64(2.95)

In [16]:
# Average total spend per customer.
monetary_df['Monetary'].mean()

np.float64(2955.9040945899965)

## Combine RFM Features

In [22]:
# Combine RFM Features
# Join the per-customer Recency, Frequency and Monetary frames on 'Customer ID'
# into one table with a single row per customer.
#
# The three inputs are built in separate cells, so a stale or partially re-run
# frame would otherwise go unnoticed: a left merge quietly fills NaN for keys
# missing on the right and duplicates rows for repeated keys. Both cases are
# turned into explicit errors below.

# Start with Recency (LastPurchaseDate is not needed in the feature table)
rfm_df = recency_df[['Customer ID', 'Recency']]

# Attach Frequency; validate='one_to_one' raises pandas.errors.MergeError if
# 'Customer ID' is duplicated in either frame.
rfm_df = pd.merge(
    rfm_df, frequency_df, on='Customer ID', how='left', validate='one_to_one'
)

# Attach Monetary under the same uniqueness guarantee
rfm_df = pd.merge(
    rfm_df, monetary_df, on='Customer ID', how='left', validate='one_to_one'
)

# A NaN here means some customer in recency_df had no match in frequency_df or
# monetary_df, i.e. the inputs were not derived from the same df_cleaned.
assert rfm_df[['Recency', 'Frequency', 'Monetary']].notna().all().all(), (
    "RFM merge produced missing values; re-run the Recency, Frequency and "
    "Monetary cells in order before combining."
)

print("Combined RFM features:")
display(rfm_df.head())
print(f"Shape of RFM DataFrame: {rfm_df.shape}")
print("RFM DataFrame Info:")
rfm_df.info()
print("\nDescriptive statistics of RFM features:")
display(rfm_df.describe())

Combined RFM features:


Unnamed: 0,Customer ID,Recency,Frequency,Monetary
0,12346.0,326,12,77556.46
1,12347.0,2,8,4921.53
2,12348.0,75,5,2019.4
3,12349.0,19,4,4428.69
4,12350.0,310,1,334.4


Shape of RFM DataFrame: (5878, 4)
RFM DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5878 entries, 0 to 5877
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Customer ID  5878 non-null   float64
 1   Recency      5878 non-null   int64  
 2   Frequency    5878 non-null   int64  
 3   Monetary     5878 non-null   float64
dtypes: float64(2), int64(2)
memory usage: 183.8 KB

Descriptive statistics of RFM features:


Unnamed: 0,Customer ID,Recency,Frequency,Monetary
count,5878.0,5878.0,5878.0,5878.0
mean,15315.313542,201.331916,6.289384,2955.904095
std,1715.572666,209.338707,13.009406,14440.852688
min,12346.0,1.0,1.0,2.95
25%,13833.25,26.0,1.0,342.28
50%,15314.5,96.0,3.0,867.74
75%,16797.75,380.0,7.0,2248.305
max,18287.0,739.0,398.0,580987.04


In [23]:
# Dimensions, total cell count and column labels of the combined RFM table.
(rfm_df.shape, rfm_df.size, rfm_df.columns)

((5878, 4),
 23512,
 Index(['Customer ID', 'Recency', 'Frequency', 'Monetary'], dtype='object'))

In [24]:
# Cell 8: Save RFM Features
# Persist the combined RFM table as an Excel workbook in the same 'data' folder
# the cleaned input was read from, so later modelling steps can load it.
output_data_folder = '../data'  # relative to the notebooks directory
output_file_name = 'rfm_features.xlsx'
output_file_path = os.path.join(output_data_folder, output_file_name)

print(f"Attempting to save RFM features to: {output_file_path}")

# index=False: the row index is not written out as an extra column.
rfm_df.to_excel(excel_writer=output_file_path, index=False)

print(f"RFM features successfully saved to '{output_file_path}'.")

Attempting to save RFM features to: ../data\rfm_features.xlsx
RFM features successfully saved to '../data\rfm_features.xlsx'.
