In [None]:
# %% [markdown]
# # Customer Segmentation Project
# ## Notebook 04: Behavioral Insights & Recommendations
#
# This notebook analyzes customer segments and provides business recommendations.

# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Import project modules
import sys
sys.path.append('../src')
from utils import save_plot

# Notebook-wide display configuration: show every DataFrame column, widen the
# text output to 1000 characters, and apply the dark-grid seaborn theme.
pd.options.display.max_columns = None
pd.options.display.width = 1000
plt.style.use('seaborn-v0_8-darkgrid')

In [None]:
# %% [markdown]
# ### 1. Load Segmented Customer Data

# %%
# Customer-level segmentation table produced earlier in the project.
customer_segments = pd.read_csv('../data/processed/customer_segments.csv')

print("=== Customer Segments Data Loaded ===")
print(f"Shape: {customer_segments.shape}")
print(f"\nColumns: {customer_segments.columns.tolist()}")
print("\nFirst 5 rows:")
display(customer_segments.head())

# Transaction-level data; parse the date column once so later cells can rely
# on a datetime dtype.
df_raw = pd.read_csv('../raw_data.csv')
df_raw['transaction_date'] = pd.to_datetime(df_raw['transaction_date'])

# Attach each customer's segment labels to every one of their transactions.
# `validate='many_to_one'` makes the merge fail loudly if `customer_segments`
# ever contains a duplicated customer_id, which would otherwise silently
# duplicate transactions and inflate every revenue figure computed below.
df_merged = pd.merge(
    df_raw,
    customer_segments[['customer_id', 'cluster', 'rfm_segment', 'detailed_segment']],
    on='customer_id',
    how='left',
    validate='many_to_one',
)

print(f"\nMerged transaction data shape: {df_merged.shape}")

In [None]:
# %% [markdown]
# ### 2. Analyze Cluster Profiles

# %%
print("=== Detailed Cluster Analysis ===")

# Human-readable label for each numeric cluster id.
cluster_names = {
    0: 'At-Risk Customers',
    1: 'Promising Customers',
    2: 'Loyal Regulars',
    3: 'High-Value Champions',
    4: 'Seasonal Shoppers'
}

customer_segments['cluster_name'] = customer_segments['cluster'].map(cluster_names)

# The transaction-level frame was merged with only the numeric `cluster`
# column, but the behavior and preference cells group it by `cluster_name`.
# Label it here, next to the mapping, so those cells do not raise KeyError.
df_merged['cluster_name'] = df_merged['cluster'].map(cluster_names)

# groupby() drops rows whose key is NaN, so customers with a cluster id that
# is missing from `cluster_names` would vanish from the profile unnoticed.
unmapped_customers = int(customer_segments['cluster_name'].isna().sum())
if unmapped_customers:
    print(f"Warning: {unmapped_customers} customers have a cluster id with no name "
          "and are excluded from the profiles below")

# Named aggregation yields the final flat column names directly.
cluster_profiles = customer_segments.groupby('cluster_name').agg(
    customer_count=('customer_id', 'count'),
    recency_mean=('recency', 'mean'),
    recency_std=('recency', 'std'),
    frequency_mean=('frequency', 'mean'),
    frequency_std=('frequency', 'std'),
    monetary_mean=('monetary', 'mean'),
    monetary_std=('monetary', 'std'),
    avg_R_score=('R_score', 'mean'),
    avg_F_score=('F_score', 'mean'),
    avg_M_score=('M_score', 'mean'),
).round(2)

# Share of all profiled customers in each cluster, placed right after the count.
cluster_profiles.insert(
    1,
    'percentage',
    (
        cluster_profiles['customer_count'] /
        cluster_profiles['customer_count'].sum() * 100
    ).round(1),
)

display(cluster_profiles)

In [None]:
# %% [markdown]
# ### 3. Visualize Cluster Comparison

# %%
# One small radar (polar) chart per cluster on a 2x3 grid, comparing the
# normalized recency / frequency / monetary means.
fig = plt.figure(figsize=(14, 10))

categories = [
    'Recency\n(Lower Better)',
    'Frequency\n(Higher Better)',
    'Monetary\n(Higher Better)',
]
N = len(categories)

# Scale each metric by its largest cluster mean so all axes share [0, 1].
# Recency is inverted so that, like the other two axes, "further out" is better.
normalized_data = cluster_profiles.copy()
for metric in ['recency_mean', 'frequency_mean', 'monetary_mean']:
    scaled = normalized_data[metric] / normalized_data[metric].max()
    normalized_data[metric] = 1 - scaled if metric == 'recency_mean' else scaled

# Evenly spaced spoke angles; the first angle is repeated to close the polygon.
angles = [n / float(N) * 2 * np.pi for n in range(N)]
angles += angles[:1]

for idx, (cluster, row) in enumerate(normalized_data.iterrows()):
    values = [row['recency_mean'], row['frequency_mean'], row['monetary_mean']]
    values += values[:1]  # close the polygon, matching `angles`

    ax = fig.add_subplot(2, 3, idx + 1, polar=True)
    ax.plot(angles, values, linewidth=2, linestyle='solid', label=cluster)
    ax.fill(angles, values, alpha=0.25)

    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(categories)
    ax.set_ylim(0, 1)
    ax.set_title(
        f'{cluster}\n({cluster_profiles.loc[cluster, "percentage"]}% of customers)',
        fontsize=11,
        fontweight='bold'
    )
    ax.grid(True)

plt.tight_layout()
save_plot(fig, 'cluster_radar_chart.png')
plt.show()

In [None]:
# %% [markdown]
# ### 4. Analyze Purchase Behavior by Cluster

# %%
print("=== Purchase Behavior Analysis by Cluster ===")


def _most_common(values):
    """Return the most frequent value in `values`, or 'N/A' if there is none."""
    modes = values.mode()
    return modes.iloc[0] if len(modes) > 0 else 'N/A'


# Transaction-level aggregates per cluster. The aggregation produces six
# columns in the order named below.
cluster_behavior = df_merged.groupby('cluster_name').agg({
    'order_value': ['mean', 'sum', 'count'],
    'quantity': 'mean',
    'product_category': _most_common,
    'channel': _most_common
}).round(2)

cluster_behavior.columns = [
    'avg_order_value', 'total_revenue', 'transaction_count',
    'avg_quantity', 'top_category', 'top_channel'
]

# Per-customer ratios; both frames are indexed by cluster name, so the
# division aligns row by row.
cluster_behavior['revenue_per_customer'] = (
    cluster_behavior['total_revenue'] /
    cluster_profiles['customer_count']
).round(2)

cluster_behavior['transactions_per_customer'] = (
    cluster_behavior['transaction_count'] /
    cluster_profiles['customer_count']
).round(2)

display(cluster_behavior)

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# (axes, column, bar color, x-axis label, title) for each of the four panels.
panels = [
    (axes[0, 0], 'total_revenue', 'skyblue',
     'Total Revenue ($)', 'Revenue Contribution by Cluster'),
    (axes[0, 1], 'avg_order_value', 'lightgreen',
     'Average Order Value ($)', 'Average Order Value by Cluster'),
    (axes[1, 0], 'revenue_per_customer', 'salmon',
     'Revenue per Customer ($)', 'Customer Lifetime Value Indicator'),
    (axes[1, 1], 'transactions_per_customer', 'gold',
     'Transactions per Customer', 'Engagement Level by Cluster'),
]

for panel_ax, column, color, xlabel, title in panels:
    panel_ax.barh(cluster_behavior.index, cluster_behavior[column],
                  color=color, edgecolor='black')
    panel_ax.set_xlabel(xlabel)
    # Every panel uses Axes.set_title; `setTitle` does not exist on Axes.
    panel_ax.set_title(title, fontweight='bold')
    panel_ax.invert_yaxis()  # first cluster at the top

plt.tight_layout()
save_plot(fig, 'cluster_performance_metrics.png')
plt.show()

In [None]:
# %% [markdown]
# ### 5. Product & Channel Preferences

# %%
print("=== Product Category Preferences by Cluster ===")

# Row-normalized crosstab: each cluster's transactions split by product
# category, expressed in percent (every row sums to 100).
category_by_cluster = pd.crosstab(
    df_merged['cluster_name'],
    df_merged['product_category'],
    normalize='index'
) * 100

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    category_by_cluster, annot=True, fmt='.1f', cmap='YlOrBr',
    cbar_kws={'label': 'Percentage (%)'}, ax=ax
)
ax.set_title('Product Category Preferences by Customer Cluster',
             fontsize=14, fontweight='bold')
ax.set_xlabel('Product Category')
ax.set_ylabel('Customer Cluster')
fig.tight_layout()
save_plot(fig, 'product_preferences_heatmap.png')
plt.show()

print("\n=== Channel Preferences by Cluster ===")
channel_by_cluster = pd.crosstab(
    df_merged['cluster_name'],
    df_merged['channel'],
    normalize='index'
) * 100

# Pass `ax` explicitly: without it DataFrame.plot opens its own figure, which
# leaves the 10x6 figure empty and draws the chart at the default size.
fig, ax = plt.subplots(figsize=(10, 6))
channel_by_cluster.plot(kind='bar', stacked=True, colormap='Set2',
                        edgecolor='black', ax=ax)
ax.set_title('Channel Preferences by Customer Cluster', fontsize=14, fontweight='bold')
ax.set_xlabel('Customer Cluster')
ax.set_ylabel('Percentage (%)')
ax.legend(title='Channel', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
save_plot(fig, 'channel_preferences.png')
plt.show()

In [None]:
# %% [markdown]
# ### 6. Business Recommendations by Segment

# %%
# Section banner followed by a blank line (newline written as an escape; a
# quoted string cannot span physical lines).
print("=== SEGMENT-SPECIFIC BUSINESS RECOMMENDATIONS ===\n")

# Marketing playbook per segment, keyed by the segment's display name.
# Every entry carries the same five fields, in this order:
#   characteristics    - one-line description of the segment
#   recommendations    - list of five concrete actions
#   retention_strategy - headline retention goal
#   marketing_budget   - suggested budget share, as display text
#   expected_roi       - expected return, as display text
recommendations = {
    'High-Value Champions': dict(
        characteristics='High recency, frequency, and monetary scores. Most valuable customers.',
        recommendations=[
            'Offer exclusive VIP programs with early access to new products',
            'Provide personalized shopping experiences and dedicated account managers',
            'Implement a loyalty program with tiered rewards',
            'Send personalized thank you notes and gifts on special occasions',
            'Invite to exclusive events and preview sales',
        ],
        retention_strategy='Focus on retention through exceptional service',
        marketing_budget='Allocate 40% of marketing budget',
        expected_roi='High (5-10x return)',
    ),
    'Loyal Regulars': dict(
        characteristics='Moderate to high frequency, consistent purchasers but lower average spend.',
        recommendations=[
            'Encourage higher basket size with volume discounts',
            'Cross-sell complementary products',
            'Offer bundle deals to increase average order value',
            'Implement a points-based loyalty program',
            'Send re-engagement emails for abandoned carts',
        ],
        retention_strategy='Increase customer lifetime value through upsells',
        marketing_budget='Allocate 25% of marketing budget',
        expected_roi='Medium (3-5x return)',
    ),
    'Promising Customers': dict(
        characteristics='Recent purchasers with moderate frequency and spend.',
        recommendations=[
            'Nurture with welcome series and onboarding emails',
            'Offer first-time buyer discounts on next purchase',
            'Send product recommendations based on purchase history',
            'Encourage social media engagement and reviews',
            'Provide educational content about product usage',
        ],
        retention_strategy='Convert to loyal customers through engagement',
        marketing_budget='Allocate 20% of marketing budget',
        expected_roi='Medium-High (4-7x return)',
    ),
    'Seasonal Shoppers': dict(
        characteristics='Purchase primarily during specific seasons or promotions.',
        recommendations=[
            'Create targeted campaigns before peak seasons',
            'Offer seasonal bundles and limited-time promotions',
            'Send reminder emails before expected purchase periods',
            'Implement cart abandonment recovery for seasonal items',
            'Create urgency with countdown timers for seasonal sales',
        ],
        retention_strategy='Extend purchase cycle beyond seasons',
        marketing_budget='Allocate 10% of marketing budget',
        expected_roi='Medium (2-4x return)',
    ),
    'At-Risk Customers': dict(
        characteristics="Low recency (haven't purchased recently), potentially churning.",
        recommendations=[
            'Send win-back campaigns with special offers',
            'Conduct surveys to understand why they stopped purchasing',
            'Offer reactivation discounts',
            'Highlight new products or features they might have missed',
            'Implement a "We miss you" email series',
        ],
        retention_strategy='Reactivate before complete churn',
        marketing_budget='Allocate 5% of marketing budget',
        expected_roi='Low-Medium (1-3x return)',
    ),
}

# Print one report section per segment: a ruled header, the segment's
# characteristics, its numbered actions, then strategy, budget and ROI.
# Line breaks are written as "\n" escapes; a quoted string cannot span
# physical lines.
for segment, info in recommendations.items():
    print(f"\n{'='*60}")
    print(f"SEGMENT: {segment}")
    print(f"{'='*60}")
    print(f"\nCharacteristics: {info['characteristics']}")
    print("\nKey Recommendations:")
    for i, rec in enumerate(info['recommendations'], 1):
        print(f"  {i}. {rec}")
    print(f"\nRetention Strategy: {info['retention_strategy']}")
    print(f"Marketing Budget Allocation: {info['marketing_budget']}")
    print(f"Expected ROI: {info['expected_roi']}")

In [None]:
# %% [markdown]
# ### 7. Action Plan Summary

# %%
print("\n" + "="*70)
print("EXECUTIVE SUMMARY & ACTION PLAN")
print("="*70)

# Priority and focus are keyed by segment NAME. `cluster_profiles` comes from
# a groupby, so its index is sorted alphabetically ('At-Risk Customers' first);
# looking these up by row position would pair At-Risk customers with
# High/Retain and Champions with High/Upsell.
segment_priority = {
    'High-Value Champions': 'High',
    'Loyal Regulars': 'High',
    'Promising Customers': 'Medium',
    'Seasonal Shoppers': 'Medium',
    'At-Risk Customers': 'Low',
}
segment_focus = {
    'High-Value Champions': 'Retain',
    'Loyal Regulars': 'Upsell',
    'Promising Customers': 'Nurture',
    'Seasonal Shoppers': 'Seasonal',
    'At-Risk Customers': 'Reactivate',
}

# One display-formatted summary row per segment.
summary_data = []
for segment in cluster_profiles.index:
    summary_data.append({
        'Segment': segment,
        'Customers': int(cluster_profiles.loc[segment, 'customer_count']),
        '% of Total': f"{cluster_profiles.loc[segment, 'percentage']}%",
        'Avg Recency': f"{cluster_profiles.loc[segment, 'recency_mean']:.0f} days",
        'Avg Frequency': f"{cluster_profiles.loc[segment, 'frequency_mean']:.1f}",
        'Avg Spend': f"${cluster_profiles.loc[segment, 'monetary_mean']:.0f}",
        'Priority': segment_priority.get(segment, 'N/A'),
        'Focus': segment_focus.get(segment, 'N/A'),
    })

summary_df = pd.DataFrame(summary_data)
display(summary_df)

# Bubble chart: x = average frequency, y = average monetary value,
# bubble area proportional to the number of customers in the segment.
fig, ax = plt.subplots(figsize=(12, 8))

ax.scatter(
    cluster_profiles['frequency_mean'],
    cluster_profiles['monetary_mean'],
    s=cluster_profiles['customer_count'] * 10,
    c=range(len(cluster_profiles)),
    cmap='viridis',
    alpha=0.7,
    edgecolors='black',
    linewidth=1.5
)

for i, segment in enumerate(cluster_profiles.index):
    ax.annotate(
        segment,
        (cluster_profiles['frequency_mean'].iloc[i], cluster_profiles['monetary_mean'].iloc[i]),
        xytext=(5, 5),
        textcoords='offset points',
        fontsize=10,
        fontweight='bold'
    )

ax.set_xlabel('Average Frequency (Number of Purchases)', fontsize=12)
ax.set_ylabel('Average Monetary Value ($)', fontsize=12)
ax.set_title('Customer Value Matrix: Frequency vs Monetary Value', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)

# Median lines split the plane into four quadrants.
ax.axhline(y=cluster_profiles['monetary_mean'].median(), color='gray', linestyle='--', alpha=0.5)
ax.axvline(x=cluster_profiles['frequency_mean'].median(), color='gray', linestyle='--', alpha=0.5)

# Quadrant captions are positioned in axes-fraction coordinates so they stay
# inside the plot whatever the data range is; scaling the axis limits by a
# constant misplaces them when a lower limit is zero or negative.
quadrant_labels = [
    (0.85, 0.93, 'High Value\nChampions'),  # high frequency, high spend
    (0.85, 0.07, 'Loyal\nRegulars'),        # high frequency, low spend
    (0.15, 0.93, 'Big\nSpenders'),          # low frequency, high spend
    (0.15, 0.07, 'At Risk\nCustomers'),     # low frequency, low spend
]
for x_frac, y_frac, caption in quadrant_labels:
    ax.text(x_frac, y_frac, caption, transform=ax.transAxes,
            fontsize=11, fontweight='bold', ha='center', va='center')

fig.tight_layout()
save_plot(fig, 'customer_value_matrix.png')
plt.show()

In [None]:
# %% [markdown]
# ### 8. Export Final Analysis

# %%
# Work on a copy so the exported columns do not leak into `customer_segments`.
final_segments = customer_segments.copy()

# Three equal-sized spend tiers (terciles of `monetary`).
final_segments['customer_value_tier'] = pd.qcut(
    final_segments['monetary'],
    q=3,
    labels=['Bronze', 'Silver', 'Gold']
)

final_segments['clv_proxy'] = final_segments['frequency'] * final_segments['monetary']

output_path = '../data/processed/customer_segments_final.csv'
final_segments.to_csv(output_path, index=False)

print("\n=== Final Analysis Complete ===")
print(f"Total customers analyzed: {len(final_segments)}")
print(f"Segments identified: {len(cluster_profiles)}")
print(f"Final data exported to: {output_path}")

# Headline figures reused in the summary below.
champion_revenue = cluster_behavior.loc['High-Value Champions', 'total_revenue']
champion_revenue_per_customer = cluster_behavior.loc['High-Value Champions', 'revenue_per_customer']
promising_count = cluster_profiles.loc['Promising Customers', 'customer_count']

print("\n=== KEY INSIGHTS SUMMARY ===")
print(f"1. High-Value Champions ({cluster_profiles.loc['High-Value Champions', 'percentage']}% of customers)")
print(f"   • Generate ${champion_revenue:,.0f} in revenue")
print(f"   • Account for {champion_revenue / cluster_behavior['total_revenue'].sum() * 100:.1f}% of total revenue")
print(f"   • Average CLV: ${champion_revenue_per_customer:.0f}")

print("\n2. Top Recommendation:")
print("   • Focus retention efforts on High-Value Champions (40% of marketing budget)")
print("   • Implement VIP program with exclusive benefits")
print("   • Expected ROI: 5-10x")

print("\n3. Growth Opportunity:")
print(f"   • Convert {promising_count} Promising Customers")
# Upside estimate: every Promising customer reaches half of a Champion's
# revenue per customer.
print(f"   • Potential revenue increase: ${promising_count * champion_revenue_per_customer * 0.5:,.0f}")