In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv("../data/cleaned_retail_data.csv")

In [2]:
df.head()
df.shape

(182064, 31)

In [3]:
df['Date'] = pd.to_datetime(df['Date'])

snapshot_date = df['Date'].max() + pd.Timedelta(days=1)
snapshot_date

Timestamp('2024-03-01 00:00:00')

In [4]:
rfm = df.groupby('Customer_ID').agg({
    'Date': lambda x: (snapshot_date - x.max()).days,
    'Transaction_ID': 'count',
    'Revenue': 'sum'
}).reset_index()

rfm.columns = ['Customer_ID', 'Recency', 'Frequency', 'Monetary']
rfm.head()

Unnamed: 0,Customer_ID,Recency,Frequency,Monetary
0,10000.0,103,4,5007.566359
1,10001.0,105,4,4256.91834
2,10002.0,95,4,3747.140929
3,10003.0,228,1,1890.735873
4,10004.0,31,1,1541.117698


In [5]:
rfm['R_Score'] = pd.qcut(rfm['Recency'], 4, labels=[4,3,2,1])
rfm['F_Score'] = pd.qcut(rfm['Frequency'].rank(method='first'), 4, labels=[1,2,3,4])
rfm['M_Score'] = pd.qcut(rfm['Monetary'], 4, labels=[1,2,3,4])

rfm['RFM_Score'] = (
    rfm['R_Score'].astype(str) +
    rfm['F_Score'].astype(str) +
    rfm['M_Score'].astype(str)
)

In [6]:
def rfm_segment(row):
    if row['R_Score'] == '4' and row['F_Score'] == '4':
        return 'Champions'
    elif row['R_Score'] >= '3' and row['F_Score'] >= '3':
        return 'Loyal Customers'
    elif row['R_Score'] == '1':
        return 'At Risk'
    else:
        return 'Needs Attention'

rfm['RFM_Segment'] = rfm.apply(rfm_segment, axis=1)

TypeError: '>=' not supported between instances of 'int' and 'str'

In [7]:
rfm['R_Score'] = rfm['R_Score'].astype(int)
rfm['F_Score'] = rfm['F_Score'].astype(int)
rfm['M_Score'] = rfm['M_Score'].astype(int)

In [8]:
def rfm_segment(row):
    if row['R_Score'] == 4 and row['F_Score'] == 4:
        return 'Champions'
    elif row['R_Score'] >= 3 and row['F_Score'] >= 3:
        return 'Loyal Customers'
    elif row['R_Score'] == 1:
        return 'At Risk'
    else:
        return 'Needs Attention'

In [9]:
rfm['RFM_Segment'] = rfm.apply(rfm_segment, axis=1)

In [10]:
rfm[['R_Score','F_Score','M_Score','RFM_Segment']].head()
rfm['RFM_Segment'].value_counts()

RFM_Segment
Needs Attention    33298
At Risk            19398
Loyal Customers    17093
Champions           7983
Name: count, dtype: int64

In [11]:
rfm['RFM_Segment'] = np.where(
    (rfm['R_Score'] == 4) & (rfm['F_Score'] == 4),
    'Champions',
    np.where(
        (rfm['R_Score'] >= 3) & (rfm['F_Score'] >= 3),
        'Loyal Customers',
        np.where(
            rfm['R_Score'] == 1,
            'At Risk',
            'Needs Attention'
        )
    )
)

In [15]:
# Fix duplicate bin edges by using rank() before qcut
rfm['R_Score'] = pd.qcut(rfm['Recency'], 4, labels=[4,3,2,1])
rfm['F_Score'] = pd.qcut(rfm['Frequency'].rank(method='first'), 4, labels=[1,2,3,4])
rfm['M_Score'] = pd.qcut(rfm['Monetary'].rank(method='first'), 4, labels=[1,2,3,4])

# Convert to int for cleaner comparisons
rfm[['R_Score','F_Score','M_Score']] = rfm[['R_Score','F_Score','M_Score']].astype(int)

In [16]:
# Apply RFM segmentation function
def rfm_segment(row):
    if row['R_Score'] == 4 and row['F_Score'] == 4:
        return 'Champions'
    elif row['R_Score'] >= 3 and row['F_Score'] >= 3:
        return 'Loyal Customers'
    elif row['R_Score'] == 1:
        return 'At Risk'
    else:
        return 'Needs Attention'

rfm['RFM_Segment'] = rfm.apply(rfm_segment, axis=1)

# Check distribution
print("RFM Segment Distribution:")
print(rfm['RFM_Segment'].value_counts())
print(f"\nTotal Customers: {len(rfm)}")

RFM Segment Distribution:
RFM_Segment
Needs Attention    33298
At Risk            19398
Loyal Customers    17093
Champions           7983
Name: count, dtype: int64

Total Customers: 77772


## Step 5: Compare RFM Segments with Business Segments

Compare behavioral RFM segments with marketing-defined customer segments to validate alignment.

In [17]:
# Merge customer segment from original data
customer_segment = df[['Customer_ID','Customer_Segment']].drop_duplicates()

rfm = rfm.merge(customer_segment, on='Customer_ID', how='left')

# Compare business segments vs RFM segments
comparison = pd.crosstab(rfm['Customer_Segment'], rfm['RFM_Segment'], margins=True)
print("Business Segment vs RFM Segment Comparison:")
print(comparison)

Business Segment vs RFM Segment Comparison:
RFM_Segment       At Risk  Champions  Loyal Customers  Needs Attention     All
Customer_Segment                                                              
New                  7845       6146            11210            15774   40975
Premium              5767       5013             8771            11785   31336
Regular             11798       7381            14556            22355   56090
All                 25410      18540            34537            49914  128401


## Step 6: Identify Churn-Risk Customers

Define churn risk based on recency threshold (90 days = no purchase in 3 months).

In [18]:
# Define churn threshold (90 days)
churn_threshold = 90

rfm['Churn_Risk'] = np.where(
    rfm['Recency'] > churn_threshold,
    'High Risk',
    'Active'
)

# View churn-risk customers
churn_customers = rfm[rfm['Churn_Risk'] == 'High Risk']

print(f"Churn Risk Summary:")
print(rfm['Churn_Risk'].value_counts())
print(f"\nHigh-Risk Customers: {len(churn_customers)}")
print(f"Percentage of Total: {len(churn_customers)/len(rfm)*100:.2f}%")
print("\nSample Churn-Risk Customers:")
churn_customers.head(10)

Churn Risk Summary:
Churn_Risk
Active       65117
High Risk    63284
Name: count, dtype: int64

High-Risk Customers: 63284
Percentage of Total: 49.29%

Sample Churn-Risk Customers:


Unnamed: 0,Customer_ID,Recency,Frequency,Monetary,R_Score,F_Score,M_Score,RFM_Score,RFM_Segment,Customer_Segment,Churn_Risk
0,10000.0,103,4,5007.566359,2,4,4,244,Needs Attention,Regular,High Risk
1,10000.0,103,4,5007.566359,2,4,4,244,Needs Attention,Premium,High Risk
2,10001.0,105,4,4256.91834,2,4,3,243,Needs Attention,Regular,High Risk
3,10002.0,95,4,3747.140929,3,4,3,343,Loyal Customers,Regular,High Risk
4,10002.0,95,4,3747.140929,3,4,3,343,Loyal Customers,Premium,High Risk
5,10003.0,228,1,1890.735873,1,1,2,112,At Risk,Regular,High Risk
7,10005.0,246,1,3073.147975,1,1,3,113,At Risk,Regular,High Risk
11,10009.0,122,2,285.875648,2,2,1,221,Needs Attention,Regular,High Risk
12,10009.0,122,2,285.875648,2,2,1,221,Needs Attention,New,High Risk
15,10011.0,107,2,859.283275,2,2,1,221,Needs Attention,Regular,High Risk


## Step 7: Quantify Revenue at Risk

Calculate total revenue and percentage at risk from churning customers.

In [19]:
# Calculate revenue at risk
revenue_at_risk = churn_customers['Monetary'].sum()
total_revenue = rfm['Monetary'].sum()
revenue_risk_pct = (revenue_at_risk / total_revenue) * 100

print("ðŸ’° REVENUE AT RISK ANALYSIS")
print("="*50)
print(f"Total Revenue: ${total_revenue:,.2f}")
print(f"Revenue at Risk: ${revenue_at_risk:,.2f}")
print(f"Percentage at Risk: {revenue_risk_pct:.2f}%")
print("\nðŸ”¥ Business Impact:")
print(f"   â†’ {revenue_risk_pct:.1f}% of total revenue is at risk due to customer churn")

ðŸ’° REVENUE AT RISK ANALYSIS
Total Revenue: $477,272,528.79
Revenue at Risk: $197,643,180.24
Percentage at Risk: 41.41%

ðŸ”¥ Business Impact:
   â†’ 41.4% of total revenue is at risk due to customer churn


## Step 8: Save Deliverables

Export RFM segments and churn-risk customers for downstream analysis.

In [20]:
import os

# Create outputs folder if it doesn't exist
os.makedirs('../outputs', exist_ok=True)

# Save RFM segments
rfm.to_csv("../outputs/rfm_segments.csv", index=False)
print("âœ… Saved: outputs/rfm_segments.csv")

# Save churn-risk customers
churn_customers.to_csv("../outputs/churn_risk_customers.csv", index=False)
print("âœ… Saved: outputs/churn_risk_customers.csv")

# Create summary for insights
rfm_summary = rfm.groupby('RFM_Segment').agg({
    'Customer_ID': 'count',
    'Recency': 'mean',
    'Frequency': 'mean',
    'Monetary': 'sum'
}).round(2)
rfm_summary.columns = ['Customer_Count', 'Avg_Recency', 'Avg_Frequency', 'Total_Revenue']

rfm_summary.to_csv("../data/rfm_summary.csv")
print("âœ… Saved: data/rfm_summary.csv")

print("\nðŸ“Š RFM Summary by Segment:")
print(rfm_summary)

âœ… Saved: outputs/rfm_segments.csv
âœ… Saved: outputs/churn_risk_customers.csv
âœ… Saved: data/rfm_summary.csv

ðŸ“Š RFM Summary by Segment:
                 Customer_Count  Avg_Recency  Avg_Frequency  Total_Revenue
RFM_Segment                                                               
At Risk                   25410       253.88           1.80   6.301211e+07
Champions                 18540        16.99           4.33   1.088785e+08
Loyal Customers           34537        51.95           3.27   1.543354e+08
Needs Attention           49914       103.50           2.21   1.510465e+08
