# Customer Segmentation Analysis

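This notebook implements comprehensive customer segmentation using RFM analysis and K-means clustering. The analysis runs on sample data generated by `DataProcessor.generate_sample_data`, not on a production extract.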

## Objectives
- Perform RFM (Recency, Frequency, Monetary) analysis
- Implement K-means clustering for customer segmentation
- Profile and analyze customer segments
- Generate actionable business recommendations

In [None]:
# Import required libraries
import warnings
from datetime import datetime, timedelta
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')

# Import custom modules
import sys
sys.path.append('../src')
from data_processing import DataProcessor
from segmentation import CustomerSegmentation
from visualization import RetailVisualizer

# Set plotting style.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'.
# Fall back to the legacy 'seaborn' name so this cell also runs on older
# Matplotlib releases instead of raising on an unknown style.
plot_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(plot_style)
sns.set_palette("husl")

print("Libraries imported successfully!")

## 1. Data Loading and Preparation

In [None]:
# Load and prepare data
processor = DataProcessor()
# The generated frame is bound to `data`, but clean_data() is called with no
# arguments, so it presumably cleans whatever the processor stored during
# generation — TODO confirm in data_processing.py. Keep the two calls in this order.
# NOTE(review): no random seed is set anywhere in this notebook; if
# generate_sample_data draws random values, results will differ between runs —
# confirm whether DataProcessor seeds its own generator.
data = processor.generate_sample_data(n_customers=1000, n_transactions=5000)
processed_data = processor.clean_data()

# Sanity report: shape of the cleaned frame and the span of 'transaction_date'.
print(f"Data loaded and processed. Shape: {processed_data.shape}")
print(f"Date range: {processed_data['transaction_date'].min()} to {processed_data['transaction_date'].max()}")

## 2. RFM Analysis

In [None]:
# Build the segmenter on top of the cleaned transactions
segmenter = CustomerSegmentation(processed_data)

# Column mapping handed to calculate_rfm as keyword arguments
rfm_column_map = {
    'customer_col': 'customer_id',
    'date_col': 'transaction_date',
    'amount_col': 'total_amount',
}
rfm_data = segmenter.calculate_rfm(**rfm_column_map)

# Report: heading, rule, first ten rows, then the table shape
rfm_preview = rfm_data.head(10)
print("RFM Analysis Results:")
print("=" * 30)
print(rfm_preview)
print(f"\nRFM data shape: {rfm_data.shape}")

In [None]:
# Summary statistics for the three raw RFM metric columns
rfm_metric_columns = ['Recency', 'Frequency', 'Monetary']
print("RFM Metrics Summary:")
print("=" * 25)
print(rfm_data[rfm_metric_columns].describe())

# Value counts of each score column, ordered by score value.
# Each entry pairs the heading to print with the column it describes.
print("\nRFM Score Distribution:")
print("=" * 25)
score_sections = (
    ("R-Score distribution:", 'R_Score'),
    ("\nF-Score distribution:", 'F_Score'),
    ("\nM-Score distribution:", 'M_Score'),
)
for heading, score_column in score_sections:
    print(heading)
    print(rfm_data[score_column].value_counts().sort_index())

In [None]:
# Visualize RFM distributions
# The visualizer is constructed from the cleaned transactions, while the
# plotting call receives the RFM table returned by calculate_rfm.
# No return value is used here, so rfm_visualization presumably renders its
# figures itself — TODO confirm in visualization.py.
visualizer = RetailVisualizer(processed_data)
visualizer.rfm_visualization(rfm_data)

## 3. RFM-Based Customer Segmentation

In [None]:
# Create RFM segments
rfm_segments = segmenter.create_rfm_segments()

print("RFM Segment Distribution:")
print("=" * 30)
segment_counts = rfm_segments['Segment'].value_counts()
segment_percentages = rfm_segments['Segment'].value_counts(normalize=True) * 100

# Both Series are indexed by segment label, so they align row-for-row.
segment_summary = pd.DataFrame({
    'Count': segment_counts,
    'Percentage': segment_percentages.round(1)
})
print(segment_summary)

# Visualize segment distribution on a 2x2 grid.
# Every panel is drawn on an explicit Axes. DataFrame.plot() opens a brand-new
# figure when no `ax` is given (only Series.plot() reuses the current axes),
# which would leave the third panel empty and push the remaining plots onto a
# second figure.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# Panel 1: share of customers per segment
segment_counts.plot(kind='pie', ax=axes[0, 0], autopct='%1.1f%%')
axes[0, 0].set_title('Customer Segment Distribution')
axes[0, 0].set_ylabel('')

# Panel 2: absolute customer count per segment
segment_counts.plot(kind='bar', ax=axes[0, 1], rot=45)
axes[0, 1].set_title('Customer Count by Segment')
axes[0, 1].set_xlabel('Segment')
axes[0, 1].set_ylabel('Number of Customers')

# Panel 3: RFM metrics by segment
segment_metrics = rfm_segments.groupby('Segment')[['Recency', 'Frequency', 'Monetary']].mean()
segment_metrics.plot(kind='bar', ax=axes[1, 0], rot=45)
axes[1, 0].set_title('Average RFM Metrics by Segment')
axes[1, 0].set_xlabel('Segment')
axes[1, 0].set_ylabel('Average Value')
axes[1, 0].legend(title='Metrics')

# Panel 4: revenue contribution by segment, largest first
segment_revenue = rfm_segments.groupby('Segment')['Monetary'].sum().sort_values(ascending=False)
segment_revenue.plot(kind='bar', ax=axes[1, 1], rot=45)
axes[1, 1].set_title('Total Revenue by Segment')
axes[1, 1].set_xlabel('Segment')
axes[1, 1].set_ylabel('Total Revenue ($)')

plt.tight_layout()
plt.show()

## 4. K-Means Clustering

In [None]:
# Find optimal number of clusters
# The call is unpacked into three values: the chosen k, a collection of
# inertias and a collection of silhouette scores (presumably one entry per
# candidate k up to max_clusters — TODO confirm in segmentation.py).
optimal_k, inertias, silhouette_scores = segmenter.find_optimal_clusters(max_clusters=8)

print(f"Optimal number of clusters: {optimal_k}")
# NOTE(review): this prints the maximum silhouette score regardless of how
# optimal_k was selected; confirm that optimal_k is the k achieving this score.
print(f"Best silhouette score: {max(silhouette_scores):.3f}")

In [None]:
# Run K-means with the cluster count selected in the previous step
rfm_clustered, kmeans_model = segmenter.kmeans_segmentation(n_clusters=optimal_k)

print("K-Means Clustering Results:")
print("=" * 30)

# Size of each cluster, absolute and as a percentage, ordered by cluster label
cluster_label_column = rfm_clustered['Cluster']
cluster_counts = cluster_label_column.value_counts().sort_index()
cluster_percentages = cluster_label_column.value_counts(normalize=True).sort_index().mul(100)

# Two-column summary: counts first, rounded percentages second
cluster_summary = cluster_counts.to_frame('Count').assign(
    Percentage=cluster_percentages.round(1)
)
print(cluster_summary)

In [None]:
# Visualize K-means clusters
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('K-Means Clustering Analysis', fontsize=16, fontweight='bold')

# Cluster distribution
cluster_counts.plot(kind='pie', ax=axes[0, 0], autopct='%1.1f%%')
axes[0, 0].set_title('Cluster Distribution')
axes[0, 0].set_ylabel('')

# Cluster characteristics
cluster_metrics = rfm_clustered.groupby('Cluster')[['Recency', 'Frequency', 'Monetary']].mean()
cluster_metrics.plot(kind='bar', ax=axes[0, 1])
axes[0, 1].set_title('Average RFM Metrics by Cluster')
axes[0, 1].set_xlabel('Cluster')
axes[0, 1].set_ylabel('Average Value')
axes[0, 1].legend(title='Metrics')

# 3D scatter plot (Frequency vs Monetary, colored by Recency)
scatter = axes[1, 0].scatter(rfm_clustered['Frequency'], rfm_clustered['Monetary'], 
                           c=rfm_clustered['Cluster'], cmap='viridis', alpha=0.6)
axes[1, 0].set_xlabel('Frequency')
axes[1, 0].set_ylabel('Monetary')
axes[1, 0].set_title('Clusters: Frequency vs Monetary')
plt.colorbar(scatter, ax=axes[1, 0], label='Cluster')

# Revenue contribution by cluster
cluster_revenue = rfm_clustered.groupby('Cluster')['Monetary'].sum().sort_values(ascending=False)
cluster_revenue.plot(kind='bar', ax=axes[1, 1])
axes[1, 1].set_title('Total Revenue by Cluster')
axes[1, 1].set_xlabel('Cluster')
axes[1, 1].set_ylabel('Total Revenue ($)')

plt.tight_layout()
plt.show()

## 5. Segment Analysis and Profiling

In [None]:
# Run the segmenter's combined analysis and print whichever parts it returned
segment_analysis = segmenter.analyze_segments()

# Each entry pairs a heading with the key to look up in the analysis result.
# The heading and rule are always printed; the table only when the key exists.
analysis_sections = (
    ("RFM SEGMENT ANALYSIS:", 'RFM_Segments'),
    ("\nK-MEANS CLUSTER ANALYSIS:", 'K_Means_Clusters'),
)
for heading, result_key in analysis_sections:
    print(heading)
    print("=" * 40)
    if result_key in segment_analysis:
        print(segment_analysis[result_key])

In [None]:
# Print a size/value profile for every cluster, in ascending label order
print("DETAILED CLUSTER PROFILES:")
print("=" * 35)

for cluster_id in sorted(rfm_clustered['Cluster'].unique()):
    cluster_data = rfm_clustered[rfm_clustered['Cluster'] == cluster_id]

    print(f"\n🔸 CLUSTER {cluster_id}:")
    print(f"   Size: {len(cluster_data)} customers ({len(cluster_data)/len(rfm_clustered)*100:.1f}%)")
    print(f"   Avg Recency: {cluster_data['Recency'].mean():.1f} days")
    print(f"   Avg Frequency: {cluster_data['Frequency'].mean():.1f} purchases")
    print(f"   Avg Monetary: ${cluster_data['Monetary'].mean():.2f}")
    print(f"   Total Revenue: ${cluster_data['Monetary'].sum():.2f}")
    print(f"   Revenue Share: {cluster_data['Monetary'].sum()/rfm_clustered['Monetary'].sum()*100:.1f}%")

    # Averages that drive the profile rules, computed once per cluster
    recency_avg = cluster_data['Recency'].mean()
    frequency_avg = cluster_data['Frequency'].mean()
    monetary_avg = cluster_data['Monetary'].mean()

    # Rules are checked top-down; the first match decides the label and
    # anything unmatched falls through to the hibernating/lost profile.
    if recency_avg <= 30 and frequency_avg >= 3 and monetary_avg >= 200:
        profile_line = f"   Profile: 🌟 HIGH-VALUE CHAMPIONS"
    elif recency_avg <= 60 and frequency_avg >= 2:
        profile_line = f"   Profile: 💎 LOYAL CUSTOMERS"
    elif recency_avg <= 90 and frequency_avg < 2:
        profile_line = f"   Profile: 🌱 POTENTIAL LOYALISTS"
    elif recency_avg > 90 and monetary_avg >= 150:
        profile_line = f"   Profile: ⚠️ AT RISK"
    else:
        profile_line = f"   Profile: 😴 HIBERNATING/LOST"
    print(profile_line)

## 6. Interactive Visualizations

In [None]:
# Interactive 3D scatter plot
# Plotly Express maps a numeric colour column to a continuous colour scale.
# Cluster ids are categorical labels, so plot from a copy whose 'Cluster'
# column is cast to strings (rfm_clustered itself is left untouched) and fix
# the legend order to ascending cluster id.
cluster_legend_order = [str(label) for label in sorted(rfm_clustered['Cluster'].unique())]
rfm_plot_data = rfm_clustered.assign(Cluster=rfm_clustered['Cluster'].astype(str))

# NOTE(review): hover_data assumes 'customer_id' is a regular column of
# rfm_clustered rather than its index — confirm against kmeans_segmentation.
fig = px.scatter_3d(rfm_plot_data,
                    x='Recency', y='Frequency', z='Monetary',
                    color='Cluster',
                    category_orders={'Cluster': cluster_legend_order},
                    title='3D Customer Segmentation (RFM)',
                    labels={'Recency': 'Recency (days)',
                            'Frequency': 'Frequency (purchases)',
                            'Monetary': 'Monetary ($)'},
                    hover_data=['customer_id'])

fig.update_layout(scene=dict(
    xaxis_title='Recency (days)',
    yaxis_title='Frequency (purchases)',
    zaxis_title='Monetary ($)'
))

fig.show()

# Interactive segment comparison.
# Only drawn when the clustered frame also carries the RFM 'Segment' labels.
if 'Segment' in rfm_clustered.columns:
    segment_comparison = rfm_clustered.groupby('Segment')[['Recency', 'Frequency', 'Monetary']].mean().reset_index()

    fig2 = px.bar(segment_comparison, x='Segment', y=['Recency', 'Frequency', 'Monetary'],
                  title='RFM Metrics Comparison by Segment',
                  barmode='group')
    fig2.show()

## 7. Business Recommendations

In [None]:
# Generate business recommendations
recommendations = segmenter.get_segment_recommendations()

# Choose the frame that carries the RFM 'Segment' labels. The clustered frame
# is used when it has the column (the interactive-visualization cell treats
# that column as optional); otherwise fall back to rfm_segments, which is the
# frame the segment labels were read from when they were created. This avoids
# a KeyError when kmeans_segmentation does not propagate 'Segment'.
segment_source = rfm_clustered if 'Segment' in rfm_clustered.columns else rfm_segments

print("BUSINESS RECOMMENDATIONS BY SEGMENT:")
print("=" * 50)

for segment, details in recommendations.items():
    # Filter once and reuse the rows for both size and revenue.
    segment_rows = segment_source[segment_source['Segment'] == segment]
    if segment_rows.empty:
        # No customer falls in this segment, so there is nothing to report.
        continue

    segment_size = len(segment_rows)
    # Distinct name on purpose: `segment_revenue` is the per-segment revenue
    # Series built in the RFM segmentation cell and must not be overwritten
    # with a scalar here.
    segment_total_revenue = segment_rows['Monetary'].sum()

    print(f"\n🎯 {segment.upper()}:")
    print(f"   Size: {segment_size} customers")
    print(f"   Revenue: ${segment_total_revenue:.2f}")
    print(f"   Description: {details['description']}")
    print(f"   Strategy: {details['strategy']}")
    print(f"   Actions:")
    for action in details['actions']:
        print(f"     • {action}")

In [None]:
# Print a strategy block for every cluster, in ascending label order
print("\nCLUSTER-SPECIFIC RECOMMENDATIONS:")
print("=" * 45)

# Ordered playbook: the first rule whose test passes supplies the lines to print.
# Each test receives the cluster's average (recency, frequency, monetary).
strategy_playbook = (
    (
        lambda recency, frequency, monetary: recency <= 30 and frequency >= 3 and monetary >= 200,
        (
            "   🌟 HIGH-VALUE CHAMPIONS",
            "   • Provide VIP treatment and exclusive access",
            "   • Request reviews and referrals",
            "   • Offer premium products and services",
            "   • Implement loyalty rewards program",
        ),
    ),
    (
        lambda recency, frequency, monetary: recency <= 60 and frequency >= 2,
        (
            "   💎 LOYAL CUSTOMERS",
            "   • Maintain engagement with regular communication",
            "   • Offer cross-selling opportunities",
            "   • Provide birthday and anniversary discounts",
            "   • Create community engagement programs",
        ),
    ),
    (
        lambda recency, frequency, monetary: recency <= 90 and frequency < 2,
        (
            "   🌱 POTENTIAL LOYALISTS",
            "   • Increase purchase frequency with incentives",
            "   • Provide product education and recommendations",
            "   • Offer free shipping or small discounts",
            "   • Send targeted email campaigns",
        ),
    ),
    (
        lambda recency, frequency, monetary: recency > 90 and monetary >= 150,
        (
            "   ⚠️ AT RISK",
            "   • Immediate re-engagement campaigns",
            "   • Offer significant discounts or promotions",
            "   • Conduct customer satisfaction surveys",
            "   • Personal outreach from customer service",
        ),
    ),
)

# Used when none of the rules above matches
fallback_strategy = (
    "   😴 HIBERNATING/LOST",
    "   • Win-back campaigns with deep discounts",
    "   • Reintroduce brand with new product launches",
    "   • Consider if retention efforts are cost-effective",
    "   • Focus on preventing other customers from reaching this stage",
)

for cluster_id in sorted(rfm_clustered['Cluster'].unique()):
    cluster_data = rfm_clustered[rfm_clustered['Cluster'] == cluster_id]
    avg_recency = cluster_data['Recency'].mean()
    avg_frequency = cluster_data['Frequency'].mean()
    avg_monetary = cluster_data['Monetary'].mean()

    print(f"\n📊 CLUSTER {cluster_id} STRATEGY:")

    strategy_lines = next(
        (
            lines
            for rule_applies, lines in strategy_playbook
            if rule_applies(avg_recency, avg_frequency, avg_monetary)
        ),
        fallback_strategy,
    )
    for strategy_line in strategy_lines:
        print(strategy_line)

## 8. Segment Performance Metrics

In [None]:
# Build one metrics record per cluster, then assemble them into a single table
performance_metrics = []

# Totals over all customers; they are the denominators of the percentage columns
overall_customer_count = len(rfm_clustered)
overall_revenue = rfm_clustered['Monetary'].sum()

for cluster_id in sorted(rfm_clustered['Cluster'].unique()):
    cluster_data = rfm_clustered[rfm_clustered['Cluster'] == cluster_id]
    cluster_size = len(cluster_data)
    cluster_total = cluster_data['Monetary'].sum()

    performance_metrics.append({
        'Cluster': cluster_id,
        'Customer_Count': cluster_size,
        'Customer_Percentage': cluster_size / overall_customer_count * 100,
        'Total_Revenue': cluster_total,
        'Revenue_Percentage': cluster_total / overall_revenue * 100,
        'Avg_Customer_Value': cluster_data['Monetary'].mean(),
        'Avg_Recency': cluster_data['Recency'].mean(),
        'Avg_Frequency': cluster_data['Frequency'].mean(),
        'Revenue_per_Customer': cluster_total / cluster_size
    })

performance_df = pd.DataFrame(performance_metrics).round(2)

print("SEGMENT PERFORMANCE METRICS:")
print("=" * 35)
print(performance_df.to_string(index=False))

# Dashboard: two pies on the top row, two bar charts on the bottom row
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
fig.suptitle('Segment Performance Dashboard', fontsize=16, fontweight='bold')
(customers_ax, revenue_ax), (value_ax, per_customer_ax) = axes

# (axes, column to plot, panel title)
pie_panels = (
    (customers_ax, 'Customer_Count', 'Customer Distribution by Cluster'),
    (revenue_ax, 'Total_Revenue', 'Revenue Distribution by Cluster'),
)
for panel_ax, column, panel_title in pie_panels:
    panel_ax.pie(performance_df[column], labels=performance_df['Cluster'], autopct='%1.1f%%')
    panel_ax.set_title(panel_title)

# (axes, column to plot, panel title, y-axis label)
bar_panels = (
    (value_ax, 'Avg_Customer_Value', 'Average Customer Value by Cluster', 'Average Customer Value ($)'),
    (per_customer_ax, 'Revenue_per_Customer', 'Revenue per Customer by Cluster', 'Revenue per Customer ($)'),
)
for panel_ax, column, panel_title, y_label in bar_panels:
    panel_ax.bar(performance_df['Cluster'], performance_df[column])
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Cluster')
    panel_ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()

## 9. Export Results

In [None]:
# Save segmentation results
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (a no-op when it is already there).
output_dir = Path('../data/processed')
output_dir.mkdir(parents=True, exist_ok=True)

# Each path is defined once and reused for both the write and the report below.
segments_path = output_dir / 'customer_segments.csv'
performance_path = output_dir / 'segment_performance.csv'

rfm_clustered.to_csv(segments_path, index=False)
performance_df.to_csv(performance_path, index=False)

# as_posix() keeps the printed paths in forward-slash form on every platform.
print("Segmentation results saved:")
print(f"• Customer segments: {segments_path.as_posix()}")
print(f"• Performance metrics: {performance_path.as_posix()}")

# Summary statistics
print("\nSEGMENTATION SUMMARY:")
print("=" * 25)
print(f"Total customers analyzed: {len(rfm_clustered):,}")
print(f"Number of segments: {rfm_clustered['Cluster'].nunique()}")
print(f"Total revenue: ${rfm_clustered['Monetary'].sum():,.2f}")
print(f"Average customer value: ${rfm_clustered['Monetary'].mean():.2f}")

# Recompute the silhouette score of the final clustering on scaled RFM features.
# NOTE(review): assumes segmenter.scaler was fitted on exactly these three raw
# columns, in this order, during kmeans_segmentation — confirm in segmentation.py;
# otherwise this score is not comparable to the one reported during k selection.
rfm_features = rfm_clustered[['Recency', 'Frequency', 'Monetary']]
scaled_rfm_features = segmenter.scaler.transform(rfm_features)
final_silhouette = silhouette_score(scaled_rfm_features, rfm_clustered['Cluster'])
print(f"Silhouette score: {final_silhouette:.3f}")

print("\n✅ Customer segmentation analysis completed successfully!")
print("📊 Ready for business strategy implementation and monitoring.")