In [5]:
# 1. Load the RFM table and prepare
import pandas as pd
import numpy as np

# Load the customer-level RFM table you created in Phase 2
rfm_path = "../data_clean/customer_rfm.csv"
rfm_df = pd.read_csv(rfm_path, parse_dates=['first_purchase_date', 'last_purchase_date', 'analysis_date'])

print(f"Loaded {rfm_df.shape[0]:,} customers")
rfm_df.head()

Loaded 4,337 customers


Unnamed: 0,CustomerID,recency_days,frequency,monetary,customer_lifespan_years,first_purchase_date,last_purchase_date,analysis_date
0,14646,1,73,280206.02,0.967123,2010-12-20 10:09:00,2011-12-08 12:12:00,2011-12-09 12:50:00
1,18102,0,60,259657.3,1.005479,2010-12-07 16:42:00,2011-12-09 11:50:00,2011-12-09 12:50:00
2,17450,8,46,187406.07,0.983562,2010-12-07 09:23:00,2011-12-01 13:29:00,2011-12-09 12:50:00
3,14911,1,201,143825.06,1.019178,2010-12-01 14:05:00,2011-12-08 15:54:00,2011-12-09 12:50:00
4,12415,24,21,124914.53,0.857534,2011-01-06 11:12:00,2011-11-15 14:22:00,2011-12-09 12:50:00


In [6]:
# Cell 2: Safe quintile function (handles duplicates gracefully)
def safe_qcut(series, q=5, labels=None):
    """
    Wrapper around pd.qcut that handles duplicate edges by dropping them.
    Falls back to integer labels if needed.
    """
    try:
        return pd.qcut(series, q=q, labels=labels, duplicates='drop')
    except ValueError:
        # If still too many duplicates, use rank-based binning
        ranks = series.rank(method='min')
        bins = np.linspace(0, ranks.max(), q + 1)
        return pd.cut(ranks, bins=bins, labels=labels, include_lowest=True)

# Test it quickly
print("Testing on frequency:")
print(rfm_df['frequency'].value_counts().head(10))  # Many customers have frequency=1 → causes duplicates

Testing on frequency:
frequency
1     1493
2      835
3      507
4      388
5      242
6      172
7      143
8       98
9       68
10      54
Name: count, dtype: int64


In [7]:
# Cell 3: Apply quintiles with reverse scoring for Recency
# Recency: lower days = better → reverse labels (5 = most recent)
rfm_df['R_quintile'] = safe_qcut(rfm_df['recency_days'], q=5, labels=[5, 4, 3, 2, 1])

# Frequency: higher = better
rfm_df['F_quintile'] = safe_qcut(rfm_df['frequency'], q=5, labels=[1, 2, 3, 4, 5])

# Monetary: higher = better
rfm_df['M_quintile'] = safe_qcut(rfm_df['monetary'], q=5, labels=[1, 2, 3, 4, 5])

# Convert to string for concatenation
rfm_df['RFM_Score'] = (
    rfm_df['R_quintile'].astype(int).astype(str) +
    rfm_df['F_quintile'].astype(int).astype(str) +
    rfm_df['M_quintile'].astype(int).astype(str)
)

print("Quintile distribution:")
print(rfm_df[['R_quintile', 'F_quintile', 'M_quintile']].apply(pd.Series.value_counts).sort_index())
print("\nSample RFM scores:")
rfm_df[['CustomerID', 'recency_days', 'frequency', 'monetary', 'RFM_Score']].head(10)

Quintile distribution:
   R_quintile  F_quintile  M_quintile
1         861        1493         868
2         866         835         867
3         863         507         867
4         840         802         867
5         907         700         868

Sample RFM scores:


Unnamed: 0,CustomerID,recency_days,frequency,monetary,RFM_Score
0,14646,1,73,280206.02,555
1,18102,0,60,259657.3,555
2,17450,8,46,187406.07,555
3,14911,1,201,143825.06,555
4,12415,24,21,124914.53,455
5,14156,9,55,117379.63,555
6,17511,2,31,91062.38,555
7,16029,38,62,72882.09,355
8,16684,4,28,66653.56,555
9,14096,4,17,65164.79,555


In [8]:
# Cell 4: Assign human-readable segments
def assign_rfm_segment(rfm_score):
    score = int(rfm_score)  # e.g., '555' → 555
    r = int(rfm_score[0])
    f = int(rfm_score[1])
    m = int(rfm_score[2])
    
    if score >= 555:
        return "Champions"
    elif r >= 4 and f >= 4:
        return "Loyal Customers"
    elif r >= 3 and f >= 3:
        return "Potential Loyalists"
    elif r >= 4 and f <= 3:
        return "New Customers"
    elif r >= 2 and r <= 3 and f >= 2:
        return "At Risk"
    elif r <= 2 and f >= 3:
        return "Can't Lose Them"
    elif r <= 3 and f <= 2:
        return "Hibernating"
    elif r <= 2 and f <= 2:
        return "Lost"
    else:
        return "Other"

rfm_df['Segment'] = rfm_df['RFM_Score'].apply(assign_rfm_segment)

print("Segment distribution:")
print(rfm_df['Segment'].value_counts())

Segment distribution:
Segment
Hibernating            1414
At Risk                 706
Loyal Customers         699
Potential Loyalists     590
New Customers           514
Champions               329
Can't Lose Them          85
Name: count, dtype: int64


In [9]:
# Cell 5: Calculate at-risk revenue (your headline metric)
# Define vulnerable segments
at_risk_segments = ['At Risk', 'Hibernating', 'Lost', "Can't Lose Them"]

at_risk_df = rfm_df[rfm_df['Segment'].isin(at_risk_segments)]

total_at_risk_revenue = at_risk_df['monetary'].sum()

print(f"Total customers: {len(rfm_df):,}")
print(f"At-risk customers: {len(at_risk_df):,}")
print(f"Revenue at risk: £{total_at_risk_revenue:,.2f}")

Total customers: 4,337
At-risk customers: 2,205
Revenue at risk: £1,463,243.24


In [10]:
# Cell 6: Save the final segmented table
final_path = "../data_clean/customer_rfm_segmented.csv"
rfm_df.to_csv(final_path, index=False)
print(f"Final RFM segmented table saved to {final_path}")

Final RFM segmented table saved to ../data_clean/customer_rfm_segmented.csv
