In [None]:
# %% [markdown]
# # Customer Segmentation Project
# ## Notebook 02: RFM Analysis
#
# This notebook calculates RFM (Recency, Frequency, Monetary) metrics for customer segmentation.

In [None]:
# %%
import warnings
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings('ignore')

In [None]:
# Import project modules
import sys
sys.path.append('../src')

from utils import load_data, clean_data, save_plot
from rfm import (
    calculate_rfm,
    score_rfm,
    plot_rfm_distributions,
    segment_rfm_by_score
)

In [None]:
# Notebook-wide display settings: no cap on the number of DataFrame columns
# shown, a 1000-character display width, and the seaborn dark-grid plot style.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)
plt.style.use('seaborn-v0_8-darkgrid')

In [None]:
# %% [markdown]
# ### 1. Load and Prepare Data

In [None]:
# %%
# Load the raw transaction file and run the project's cleaning step on it.
# NOTE(review): the export cell writes under ../data/processed/, while this
# input is read from ../raw_data.csv — confirm the raw file really lives there.
df = load_data('../raw_data.csv')
df_clean = clean_data(df)

# Fail fast with a readable message if the cleaned frame lacks the columns the
# summary cell reads, instead of surfacing a bare KeyError one cell later.
required_columns = {'transaction_date', 'customer_id', 'order_value'}
missing_columns = required_columns - set(df_clean.columns)
assert not missing_columns, (
    f"df_clean is missing expected columns: {sorted(missing_columns)}"
)

In [None]:
# Headline figures for the cleaned transactions: covered date span, distinct
# customers, row count and summed order value.
summary_lines = [
    "=== Transaction Data Summary ===",
    f"Time period: {df_clean['transaction_date'].min().date()} to {df_clean['transaction_date'].max().date()}",
    f"Total customers: {df_clean['customer_id'].nunique()}",
    f"Total transactions: {len(df_clean)}",
    f"Total revenue: ${df_clean['order_value'].sum():,.2f}",
]
print("\n".join(summary_lines))

In [None]:
# %% [markdown]
# ### 2. Calculate RFM Metrics

In [None]:
# %%
# Build the RFM table from the cleaned transactions via the project helper.
rfm_table = calculate_rfm(df_clean)

# The correlation cell selects these three metric columns; check for them here
# so a schema problem is reported where it originates.
missing_metrics = {'recency', 'frequency', 'monetary'} - set(rfm_table.columns)
assert not missing_metrics, (
    f"rfm_table is missing RFM columns: {sorted(missing_metrics)}"
)

In [None]:
print("\n=== RFM Metrics Summary ===")
# Use rich display for the summary table, matching how the other DataFrames in
# this notebook are shown; print() would emit a plain-text rendering only.
display(rfm_table.describe().round(2))

In [None]:
# Preview the first ten rows of the RFM table.
rfm_preview = rfm_table.head(10)
print("\n=== Sample RFM Data ===")
display(rfm_preview)

In [None]:
# %% [markdown]
# ### 3. Analyze RFM Distributions

In [None]:
# %%
# Draw the RFM distributions with the project helper and keep its return value
# in `fig` (presumably a matplotlib Figure — confirm against src/rfm.py).
fig = plot_rfm_distributions(rfm_table)
plt.show()  # display the open figure(s) in this cell's output

In [None]:
# Pairwise correlation between the three RFM metrics (pandas' default method).
rfm_metric_columns = ['recency', 'frequency', 'monetary']
correlation_matrix = rfm_table[rfm_metric_columns].corr()

print("\n=== RFM Correlation Matrix ===")
display(correlation_matrix)

In [None]:
# Annotated heatmap of the RFM correlation matrix, drawn on an explicit
# figure/axes pair, saved through the project helper and then shown.
fig_corr, ax_corr = plt.subplots(figsize=(8, 6))

heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    center=0,  # centre the colour scale on zero correlation
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
sns.heatmap(correlation_matrix, ax=ax_corr, **heatmap_options)

ax_corr.set_title('RFM Metrics Correlation', fontsize=14, fontweight='bold')
fig_corr.tight_layout()
save_plot(fig_corr, 'rfm_correlation.png')
plt.show()

In [None]:
# %% [markdown]
# ### 4. Score RFM Metrics

In [None]:
# %%
# Score the RFM table with the project helper.
rfm_scored = score_rfm(rfm_table)

# The scoring summary reads these three score columns; verify them up front so
# a missing column is reported here rather than as a KeyError in a later cell.
missing_scores = {'R_score', 'F_score', 'M_score'} - set(rfm_scored.columns)
assert not missing_scores, (
    f"rfm_scored is missing score columns: {sorted(missing_scores)}"
)

In [None]:
# Report how many rows fall on each score value, per RFM dimension, with the
# score values in ascending order.
scoring_report = (
    "\n=== RFM Scoring Summary ===",
    "Score ranges: 1 (worst) to 5 (best)",
    "\nScore distributions:",
    f"Recency scores: {rfm_scored['R_score'].value_counts().sort_index().to_dict()}",
    f"Frequency scores: {rfm_scored['F_score'].value_counts().sort_index().to_dict()}",
    f"Monetary scores: {rfm_scored['M_score'].value_counts().sort_index().to_dict()}",
)
print(*scoring_report, sep="\n")

In [None]:
# Preview the first ten rows of the scored RFM table.
scored_preview = rfm_scored.head(10)
print("\n=== Sample Scored RFM Data ===")
display(scored_preview)

In [None]:
# %% [markdown]
# ### 5. Create Business Segments from RFM Scores

In [None]:
# %%
# Derive segments from the scored table with the project helper.
rfm_segmented = segment_rfm_by_score(rfm_scored)

# Later cells count 'rfm_segment' / 'detailed_segment' and aggregate the metric
# columns and 'customer_id' per segment; check the schema once, here.
expected_segment_columns = {
    'customer_id', 'recency', 'frequency', 'monetary',
    'rfm_segment', 'detailed_segment',
}
missing_segment_columns = expected_segment_columns - set(rfm_segmented.columns)
assert not missing_segment_columns, (
    f"rfm_segmented is missing columns: {sorted(missing_segment_columns)}"
)

In [None]:
# Number of rows per RFM segment, most frequent segment first.
segment_counts = rfm_segmented['rfm_segment'].value_counts()
print("\n=== RFM Segment Distribution ===", segment_counts, sep="\n")

In [None]:
# Both panels are built in ONE cell. With the default inline backend a figure
# is rendered and closed at the end of the cell that created it, so adding the
# right-hand subplot from a later cell would land on a brand-new figure and
# the saved PNG would lose the left-hand panel.
fig_segments, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

detailed_counts = rfm_segmented['detailed_segment'].value_counts()
# Percentages in both panels are relative to the rfm_segment total.
total = segment_counts.sum()

distribution_panels = [
    (ax1, segment_counts, 'skyblue', 'Customer Segments (RFM Total)'),
    (ax2, detailed_counts, 'lightgreen', 'Customer Segments (Detailed)'),
]
for panel_ax, counts, bar_color, panel_title in distribution_panels:
    counts.plot(kind='bar', color=bar_color, edgecolor='black', ax=panel_ax)
    panel_ax.set_title(panel_title, fontsize=14, fontweight='bold')
    panel_ax.set_xlabel('Segment')
    panel_ax.set_ylabel('Number of Customers')
    panel_ax.tick_params(axis='x', rotation=45)

    # Label each bar with its count and share, slightly above the bar top.
    for i, v in enumerate(counts):
        panel_ax.text(i, v + 0.5, f'{v}\n({v/total*100:.1f}%)',
                      ha='center', va='bottom', fontsize=9)

fig_segments.tight_layout()
save_plot(fig_segments, 'segment_distribution.png')
plt.show()

In [None]:
# %% [markdown]
# ### 6. Analyze Segment Characteristics

In [None]:
# %%
print("\n=== Segment RFM Characteristics ===")

# Per-segment mean of each RFM metric plus a non-null count of customer_id,
# rounded to two decimals.
per_segment_stats = {metric: 'mean' for metric in ('recency', 'frequency', 'monetary')}
per_segment_stats['customer_id'] = 'count'

grouped_by_segment = rfm_segmented.groupby('rfm_segment')
segment_analysis = grouped_by_segment.agg(per_segment_stats).round(2)

In [None]:
# Rename the count column, add each segment's share of all counted customers
# (in percent, one decimal), and order segments by average spend, highest first.
segment_analysis = (
    segment_analysis
    .rename(columns={'customer_id': 'customer_count'})
    .assign(
        percentage=lambda frame: (
            frame['customer_count'] / frame['customer_count'].sum() * 100
        ).round(1)
    )
    .sort_values('monetary', ascending=False)
)
display(segment_analysis)

In [None]:
# The whole 2x2 figure is created, populated, laid out, saved and shown in ONE
# cell. With the default inline backend the figure is rendered and closed when
# the creating cell finishes; filling the axes from later cells would leave
# the populated figure undisplayed and make plt.tight_layout() act on a new,
# empty current figure.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# (axes, column of segment_analysis, bar colour, x-axis label, panel title)
characteristic_panels = [
    (axes[0, 0], 'recency', 'skyblue',
     'Average Recency (days)', 'Average Recency by Segment'),
    (axes[0, 1], 'frequency', 'lightgreen',
     'Average Frequency', 'Average Frequency by Segment'),
    (axes[1, 0], 'monetary', 'salmon',
     'Average Monetary ($)', 'Average Spend by Segment'),
    (axes[1, 1], 'customer_count', 'gold',
     'Number of Customers', 'Customer Count by Segment'),
]
for panel_ax, column, bar_color, x_label, panel_title in characteristic_panels:
    panel_ax.barh(segment_analysis.index, segment_analysis[column], color=bar_color)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_title(panel_title, fontweight='bold')
    # barh draws the first row at the bottom; flip so it appears at the top.
    panel_ax.invert_yaxis()

fig.tight_layout()
save_plot(fig, 'segment_characteristics.png')
plt.show()

In [None]:
# %% [markdown]
# ### 7. Export RFM Table

In [None]:
# %%
# Write the segmented RFM table (index omitted) to the processed-data folder.
# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists first; otherwise the export fails on a fresh checkout.
rfm_output_path = Path('../data/processed/rfm_table.csv')
rfm_output_path.parent.mkdir(parents=True, exist_ok=True)
rfm_segmented.to_csv(rfm_output_path, index=False)

In [None]:
# Confirm the export and report the table's dimensions.
export_report = [
    "\n=== RFM Table Exported ===",
    "File saved: ../data/processed/rfm_table.csv",
    f"Rows: {rfm_segmented.shape[0]}, Columns: {rfm_segmented.shape[1]}",
]
for report_line in export_report:
    print(report_line)

In [None]:
# Closing overview: column names of the exported table, then its first rows.
final_columns = rfm_segmented.columns.tolist()
print("\n=== Final RFM Table Columns ===")
print(final_columns)

final_preview = rfm_segmented.head()
print("\n=== Final RFM Table Sample ===")
display(final_preview)