In [2]:
import pandas as pd

# Load the segmented RFM table (one row per customer) and parse its date columns.
rfm_df = pd.read_csv(
    "../data_clean/customer_rfm_segmented.csv",
    parse_dates=['first_purchase_date', 'last_purchase_date', 'analysis_date'],
)

# Load the cleaned transactions; they are the only source of Country.
trans_df = pd.read_csv("../data_clean/online_retail_clean.csv")


def _primary_country(countries):
    """Return the most frequent non-null value in `countries`.

    `Series.mode()` ignores nulls and returns its results sorted, so ties
    resolve to the first value in sort order. When the group holds no
    non-null value at all, the placeholder 'Unknown' is returned.
    """
    modes = countries.mode()  # computed once per customer group
    return modes.iloc[0] if not modes.empty else 'Unknown'


# Primary country per customer: the Country value that appears most often
# across that customer's transaction rows.
customer_country = (
    trans_df.groupby('CustomerID')['Country']
    .agg(_primary_country)
    .reset_index()
)

# Left-merge so every RFM row is kept. A customer with no rows in trans_df
# would come out of the merge with a NaN Country and then vanish from any
# later groupby('Country'); label those 'Unknown' as well so the fallback is
# applied consistently.
df_full = (
    rfm_df
    .merge(customer_country, on='CustomerID', how='left')
    .fillna({'Country': 'Unknown'})
)

print(f"Full table shape: {df_full.shape}")
df_full.head()

Full table shape: (4337, 14)


Unnamed: 0,CustomerID,recency_days,frequency,monetary,customer_lifespan_years,first_purchase_date,last_purchase_date,analysis_date,R_quintile,F_quintile,M_quintile,RFM_Score,Segment,Country
0,14646,1,73,280206.02,0.967123,2010-12-20 10:09:00,2011-12-08 12:12:00,2011-12-09 12:50:00,5,5,5,555,Champions,Netherlands
1,18102,0,60,259657.3,1.005479,2010-12-07 16:42:00,2011-12-09 11:50:00,2011-12-09 12:50:00,5,5,5,555,Champions,United Kingdom
2,17450,8,46,187406.07,0.983562,2010-12-07 09:23:00,2011-12-01 13:29:00,2011-12-09 12:50:00,5,5,5,555,Champions,United Kingdom
3,14911,1,201,143825.06,1.019178,2010-12-01 14:05:00,2011-12-08 15:54:00,2011-12-09 12:50:00,5,5,5,555,Champions,EIRE
4,12415,24,21,124914.53,0.857534,2011-01-06 11:12:00,2011-11-15 14:22:00,2011-12-09 12:50:00,4,5,5,455,Loyal Customers,Australia


In [3]:
# --- Headline figures ---------------------------------------------------
# Revenue is the sum of the per-customer `monetary` column; the customer
# count is simply the number of rows in df_full.
total_revenue = df_full['monetary'].sum()
total_customers = len(df_full)

# Percentage of total revenue that comes from the 'Champions' segment.
# Guarded so a non-positive total yields 0 instead of a division problem.
champions = df_full.loc[df_full['Segment'].eq('Champions')]
if total_revenue > 0:
    champions_revenue_share = (champions['monetary'].sum() / total_revenue) * 100
else:
    champions_revenue_share = 0

# Revenue held by customers in the segments this notebook treats as at risk.
at_risk_segments = ['At Risk', 'Hibernating', 'Lost', "Can't Lose Them"]
at_risk = df_full.loc[df_full['Segment'].isin(at_risk_segments)]
at_risk_revenue = at_risk['monetary'].sum()

for summary_line in (
    f"Total customers: {total_customers:,}",
    f"Total revenue: £{total_revenue:,.0f}",
    f"Champions revenue share: {champions_revenue_share:.1f}%",
    f"Revenue at risk: £{at_risk_revenue:,.0f}",
):
    print(summary_line)

# --- Country breakdown --------------------------------------------------
# One groupby object serves both aggregates. With pandas' default settings,
# rows whose Country is missing are left out of the groups.
by_country = df_full.groupby('Country')
country_rev = by_country['monetary'].sum().sort_values(ascending=False)
country_customers = by_country['CustomerID'].nunique()

print("\nTop 10 countries by revenue (£M):")
print(country_rev.head(10).div(1_000_000).round(2))

# Mean `monetary` per customer for every country except the United Kingdom,
# highest first. This is a plain mean with no minimum customer count.
non_uk = df_full.loc[df_full['Country'].ne('United Kingdom')]
avg_value = (
    non_uk.groupby('Country')['monetary']
    .mean()
    .sort_values(ascending=False)
)

print("\nTop non-UK countries by average customer value:")
print(avg_value.head(10).round(0))

Total customers: 4,337
Total revenue: £8,598,418
Champions revenue share: 44.9%
Revenue at risk: £1,463,243

Top 10 countries by revenue (£M):
Country
United Kingdom    7.00
Netherlands       0.29
EIRE              0.27
Germany           0.23
France            0.21
Australia         0.14
Spain             0.06
Switzerland       0.06
Belgium           0.04
Sweden            0.04
Name: monetary, dtype: float64

Top non-UK countries by average customer value:
Country
EIRE           88515.0
Netherlands    31716.0
Singapore      21279.0
Australia      15546.0
Sweden          4797.0
Japan           4677.0
Iceland         4310.0
Norway          3617.0
Switzerland     2821.0
Germany         2435.0
Name: monetary, dtype: float64
