In [None]:
import polars as pl
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.offline as pyo
import numpy as np
from datetime import datetime, timedelta
import os
from operator import attrgetter

def load_master_data_for_customer_segmentation():
    """Load master data for customer segmentation analysis"""
    print(" Loading master data for customer segmentation analysis...")
    
    try:
        if os.path.exists("data/master_transactions_with_coords.parquet"):
            master_data = pl.read_parquet("data/master_transactions_with_coords.parquet")
        else:
            master_data = pl.read_parquet("data/master_transactions.parquet")
        
        print(f" Master data loaded: {master_data.shape}")
        return master_data
        
    except Exception as e:
        print(f" Error loading master data: {e}")
        return None

def calculate_advanced_rfm_analysis(master_data):
    """Calculate advanced RFM analysis with additional customer metrics"""
    print("Calculating Advanced RFM Analysis...")
    
    # Filter for sales only
    sales_data = master_data.filter(pl.col("Transaction Type") == "Sale")
    
    # Get the latest date in the dataset as reference point
    latest_date = sales_data.select(pl.col("Date").max()).item()
    print(f"Analysis reference date: {latest_date}")
    
    # Calculate comprehensive RFM metrics
    # Calculate comprehensive RFM metrics
    rfm_data = sales_data.group_by("Customer ID").agg([
        # Core RFM metrics
        (pl.lit(latest_date) - pl.col("Date").max()).dt.total_days().alias("Recency_Days"),
        pl.col("Invoice ID").n_unique().alias("Frequency"),
        pl.col("Line_Total_USD").sum().alias("Monetary_Value"),
        
        # Additional customer behavior metrics
        pl.col("Date").min().alias("First_Purchase_Date"),
        pl.col("Date").max().alias("Last_Purchase_Date"),
        pl.col("Quantity").sum().alias("Total_Items_Purchased"),
        pl.col("Product ID").n_unique().alias("Unique_Products_Purchased"),
        pl.col("Category").n_unique().alias("Unique_Categories_Purchased"),
        pl.col("Store ID").n_unique().alias("Stores_Visited"),
        pl.col("Line_Total_USD").mean().alias("Average_Order_Value"),
        (pl.col("Line_Total_USD").sum() / pl.col("Quantity").sum()).alias("Average_Unit_Price"),
        
        # Seasonal behavior (simplified)
        pl.col("Date").dt.quarter().first().alias("First_Quarter"),
        pl.col("Date").dt.weekday().first().alias("First_Weekday"),
        pl.col("Date").dt.month().first().alias("First_Month"),
        
        # Product preferences
        pl.col("Category").first().alias("First_Category"),  # Changed from mode() to first()
        pl.col("Sub Category").first().alias("First_Sub_Category")  # Changed from mode() to first()
    ])

    
    # Convert to pandas for advanced calculations
    rfm_pandas = rfm_data.to_pandas()
    
    # Calculate customer lifetime and additional metrics
    rfm_pandas['Customer_Lifetime_Days'] = (rfm_pandas['Last_Purchase_Date'] - rfm_pandas['First_Purchase_Date']).dt.days
    rfm_pandas['Customer_Lifetime_Days'] = rfm_pandas['Customer_Lifetime_Days'].fillna(1).replace(0, 1)
    
    # Advanced behavioral metrics
    rfm_pandas['Monthly_Purchase_Frequency'] = (rfm_pandas['Frequency'] / rfm_pandas['Customer_Lifetime_Days'] * 30).fillna(1.0)
    rfm_pandas['Items_Per_Transaction'] = rfm_pandas['Total_Items_Purchased'] / rfm_pandas['Frequency']
    rfm_pandas['Categories_Per_Transaction'] = rfm_pandas['Unique_Categories_Purchased'] / rfm_pandas['Frequency']
    rfm_pandas['Store_Loyalty_Score'] = 1 / rfm_pandas['Stores_Visited']  # Higher = more loyal to specific stores
    rfm_pandas['Product_Diversity_Score'] = rfm_pandas['Unique_Products_Purchased'] / rfm_pandas['Frequency']
    
    # Calculate percentile-based RFM scores (1-5 scale)
    rfm_pandas['R_Score'] = pd.qcut(rfm_pandas['Recency_Days'], 5, labels=[5,4,3,2,1], duplicates='drop')  # Lower recency = higher score
    rfm_pandas['F_Score'] = pd.qcut(rfm_pandas['Frequency'].rank(method='first'), 5, labels=[1,2,3,4,5], duplicates='drop')
    rfm_pandas['M_Score'] = pd.qcut(rfm_pandas['Monetary_Value'], 5, labels=[1,2,3,4,5], duplicates='drop')
    
    # Handle any NaN values in scores
    rfm_pandas['R_Score'] = rfm_pandas['R_Score'].fillna(3)
    rfm_pandas['F_Score'] = rfm_pandas['F_Score'].fillna(3)
    rfm_pandas['M_Score'] = rfm_pandas['M_Score'].fillna(3)
    
    # Create RFM segment string
    rfm_pandas['RFM_Score'] = rfm_pandas['R_Score'].astype(str) + rfm_pandas['F_Score'].astype(str) + rfm_pandas['M_Score'].astype(str)
    
    print(f" Advanced RFM analysis completed for {len(rfm_pandas):,} customers")
    
    return rfm_pandas

def create_customer_segments(rfm_data):
    """Create detailed customer segments based on RFM and behavioral data - OPTIMIZED"""
    print(" Creating detailed customer segments...")
    
    # Convert R_Score, F_Score, M_Score to integers for faster operations
    rfm_data['R_Score'] = rfm_data['R_Score'].astype(int)
    rfm_data['F_Score'] = rfm_data['F_Score'].astype(int)
    rfm_data['M_Score'] = rfm_data['M_Score'].astype(int)
    
    # Vectorized RFM segmentation using numpy conditions
    conditions = [
        # Champions: High value, frequent, recent
        (rfm_data['R_Score'] >= 4) & (rfm_data['F_Score'] >= 4) & (rfm_data['M_Score'] >= 4),
        
        # Loyal Customers: High frequency, good monetary
        (rfm_data['F_Score'] >= 4) & (rfm_data['M_Score'] >= 3),
        
        # Potential Loyalists: Recent customers with good frequency
        (rfm_data['R_Score'] >= 3) & (rfm_data['F_Score'] >= 2) & (rfm_data['F_Score'] <= 3),
        
        # New Customers: Recent but low frequency
        (rfm_data['R_Score'] >= 4) & (rfm_data['F_Score'] <= 2),
        
        # Promising: Recent customers with potential
        (rfm_data['R_Score'] >= 3) & (rfm_data['F_Score'] <= 2) & (rfm_data['M_Score'] >= 2),
        
        # Need Attention: Above average recency, frequency & monetary
        (rfm_data['R_Score'] >= 2) & (rfm_data['F_Score'] >= 2) & (rfm_data['M_Score'] >= 2),
        
        # About to Sleep: Below average recency but good frequency
        (rfm_data['R_Score'] <= 2) & (rfm_data['F_Score'] >= 3),
        
        # At Risk: Good customers who haven't purchased recently
        (rfm_data['R_Score'] <= 2) & (rfm_data['F_Score'] >= 2) & (rfm_data['M_Score'] >= 3),
        
        # Cannot Lose Them: High value but low recency and frequency
        (rfm_data['F_Score'] >= 4) & (rfm_data['M_Score'] >= 4),
        
        # Hibernating: Low recency, frequency & monetary
        (rfm_data['R_Score'] <= 2) & (rfm_data['F_Score'] <= 2) & (rfm_data['M_Score'] <= 2)
    ]
    
    choices = [
        "Champions", "Loyal Customers", "Potential Loyalists", "New Customers", 
        "Promising", "Need Attention", "About to Sleep", "At Risk", 
        "Cannot Lose Them", "Hibernating"
    ]
    
   
    rfm_data['Customer_Segment'] = np.select(conditions, choices, default="Lost")
    
    # Vectorized behavioral segmentation
    aov_threshold = rfm_data['Average_Order_Value'].quantile(0.75)
    freq_threshold = rfm_data['Monthly_Purchase_Frequency'].quantile(0.75)
    cat_threshold = rfm_data['Unique_Categories_Purchased'].quantile(0.75)
    loyalty_threshold = rfm_data['Store_Loyalty_Score'].quantile(0.75)
    
    behavioral_conditions = [
        (rfm_data['Average_Order_Value'] >= aov_threshold) & (rfm_data['Monthly_Purchase_Frequency'] >= freq_threshold),
        (rfm_data['Average_Order_Value'] >= aov_threshold) & (rfm_data['Monthly_Purchase_Frequency'] < freq_threshold),
        rfm_data['Monthly_Purchase_Frequency'] >= freq_threshold,
        rfm_data['Unique_Categories_Purchased'] >= cat_threshold,
        rfm_data['Store_Loyalty_Score'] >= loyalty_threshold
    ]
    
    behavioral_choices = [
        "High Value Frequent", "High Value Occasional", "Frequent Shoppers", 
        "Category Explorers", "Store Loyalists"
    ]
    
    rfm_data['Behavioral_Segment'] = np.select(behavioral_conditions, behavioral_choices, default="Regular Customers")
    
    # Vectorized value segmentation
    high_value = rfm_data['Monetary_Value'].quantile(0.8)
    medium_value = rfm_data['Monetary_Value'].quantile(0.5)
    high_freq = rfm_data['Frequency'].quantile(0.7)
    
    value_conditions = [
        (rfm_data['Monetary_Value'] >= high_value) & (rfm_data['Frequency'] >= high_freq),
        (rfm_data['Monetary_Value'] >= high_value) & (rfm_data['Frequency'] < high_freq),
        (rfm_data['Monetary_Value'] >= medium_value) & (rfm_data['Frequency'] >= high_freq),
        (rfm_data['Monetary_Value'] >= medium_value) & (rfm_data['Frequency'] < high_freq),
        (rfm_data['Monetary_Value'] < medium_value) & (rfm_data['Frequency'] >= high_freq)
    ]
    
    value_choices = [
        "VIP Customers", "Big Spenders", "Loyal Regulars", 
        "Medium Value", "Frequent Low Spenders"
    ]
    
    rfm_data['Value_Segment'] = np.select(value_conditions, value_choices, default="Low Value")
    
    print(f" Customer segmentation completed")
    print(f" RFM Segments: {rfm_data['Customer_Segment'].nunique()}")
    print(f" Behavioral Segments: {rfm_data['Behavioral_Segment'].nunique()}")
    print(f" Value Segments: {rfm_data['Value_Segment'].nunique()}")
    
    return rfm_data


def calculate_customer_lifetime_value(rfm_data):
    """Calculate Customer Lifetime Value (CLV) using computational methods"""
    print(" Calculating Customer Lifetime Value...")
    
    # Historical CLV (what customer has already spent)
    rfm_data['Historical_CLV'] = rfm_data['Monetary_Value']
    
    # Predicted CLV based on purchase patterns
    # Method 1: Simple frequency-based prediction
    rfm_data['Avg_Days_Between_Purchases'] = rfm_data['Customer_Lifetime_Days'] / (rfm_data['Frequency'] - 1)
    rfm_data['Avg_Days_Between_Purchases'] = rfm_data['Avg_Days_Between_Purchases'].fillna(rfm_data['Customer_Lifetime_Days'])
    
    # Estimate future purchases in next 12 months
    rfm_data['Estimated_Future_Purchases'] = np.where(
        rfm_data['Avg_Days_Between_Purchases'] > 0,
        365 / rfm_data['Avg_Days_Between_Purchases'],
        rfm_data['Monthly_Purchase_Frequency'] * 12
    )
    
    # Predicted CLV (conservative estimate)
    rfm_data['Predicted_CLV'] = rfm_data['Average_Order_Value'] * rfm_data['Estimated_Future_Purchases']
    
    # Total CLV (Historical + Predicted)
    rfm_data['Total_CLV'] = rfm_data['Historical_CLV'] + rfm_data['Predicted_CLV']
    
    # CLV segments based on total CLV
    rfm_data['CLV_Segment'] = pd.qcut(
        rfm_data['Total_CLV'], 
        q=5, 
        labels=['Very Low CLV', 'Low CLV', 'Medium CLV', 'High CLV', 'Very High CLV'],
        duplicates='drop'
    )
    
    # Customer risk assessment based on recency and frequency trends
    def risk_assessment(row):
        recency = row['Recency_Days']
        avg_gap = row['Avg_Days_Between_Purchases']
        
        if recency > avg_gap * 2:
            return "High Risk"
        elif recency > avg_gap * 1.5:
            return "Medium Risk"
        else:
            return "Low Risk"
    
    rfm_data['Churn_Risk'] = rfm_data.apply(risk_assessment, axis=1)
    
    print(" CLV calculation completed")
    
    return rfm_data

def create_cohort_analysis(master_data):
    """Create customer cohort analysis"""
    print(" Creating cohort analysis...")
    
    # Filter for sales only
    sales_data = master_data.filter(pl.col("Transaction Type") == "Sale")
    
    # Convert to pandas for cohort analysis
    df = sales_data.select([
        "Customer ID", "Date", "Line_Total_USD"
    ]).to_pandas()
    
    # Get customer's first purchase date
    df['Order_Period'] = df['Date'].dt.to_period('M')
    df['Cohort_Group'] = df.groupby('Customer ID')['Date'].transform('min').dt.to_period('M')
    
    # Calculate period number
    df['Period_Number'] = (df['Order_Period'] - df['Cohort_Group']).apply(attrgetter('n'))
    
    # Create cohort table for customer retention
    cohort_data = df.groupby(['Cohort_Group', 'Period_Number'])['Customer ID'].nunique().reset_index()
    cohort_counts = cohort_data.pivot(index='Cohort_Group', columns='Period_Number', values='Customer ID')
    
    # Calculate cohort sizes (first month customers)
    cohort_sizes = df.groupby('Cohort_Group')['Customer ID'].nunique()
    cohort_table = cohort_counts.divide(cohort_sizes, axis=0)
    
    # Revenue cohort analysis
    revenue_cohort_data = df.groupby(['Cohort_Group', 'Period_Number'])['Line_Total_USD'].sum().reset_index()
    revenue_cohort_table = revenue_cohort_data.pivot(index='Cohort_Group', columns='Period_Number', values='Line_Total_USD')
    
    print(" Cohort analysis completed")
    
    return cohort_table, cohort_counts, revenue_cohort_table

def analyze_customer_journey(master_data):
    """Analyze customer journey and purchase patterns"""
    print(" Analyzing customer journey patterns...")
    
    # Filter for sales only
    sales_data = master_data.filter(pl.col("Transaction Type") == "Sale")
    
    # Customer journey metrics
    journey_analysis = sales_data.group_by("Customer ID").agg([
        pl.col("Date").min().alias("First_Purchase"),
        pl.col("Date").max().alias("Last_Purchase"),
        pl.col("Invoice ID").n_unique().alias("Total_Transactions"),
        pl.col("Line_Total_USD").sum().alias("Total_Spent"),
        pl.col("Store ID").n_unique().alias("Stores_Visited"),
        pl.col("Category").n_unique().alias("Categories_Explored"),
        
        # Purchase evolution
        pl.col("Line_Total_USD").first().alias("First_Purchase_Amount"),
        pl.col("Line_Total_USD").last().alias("Last_Purchase_Amount"),
        pl.col("Line_Total_USD").mean().alias("Average_Purchase_Amount"),
        pl.col("Line_Total_USD").max().alias("Highest_Purchase_Amount")
    ]).to_pandas()
    
    # Calculate journey metrics
    journey_analysis['Customer_Lifespan_Days'] = (journey_analysis['Last_Purchase'] - journey_analysis['First_Purchase']).dt.days
    journey_analysis['Purchase_Growth'] = ((journey_analysis['Last_Purchase_Amount'] - journey_analysis['First_Purchase_Amount']) / 
                                         journey_analysis['First_Purchase_Amount'] * 100).fillna(0)
    
    # Journey stages
    def journey_stage(row):
        transactions = row['Total_Transactions']
        lifespan = row['Customer_Lifespan_Days']
        
        if transactions == 1:
            return "One-Time Buyer"
        elif transactions <= 3 and lifespan <= 90:
            return "Early Stage"
        elif transactions <= 5 and lifespan <= 180:
            return "Developing"
        elif transactions <= 10:
            return "Established"
        else:
            return "Mature"
    
    journey_analysis['Journey_Stage'] = journey_analysis.apply(journey_stage, axis=1)
    
    print(" Customer journey analysis completed")
    
    return journey_analysis

def create_customer_segmentation_dashboard(rfm_data, cohort_table, revenue_cohort_table, journey_analysis):
    """Create comprehensive customer segmentation dashboard"""
    print(" Creating Customer Segmentation Dashboard...")
    
    # Create HTML structure
    html_content = """
    <!DOCTYPE html>
    <html>
    <head>
        <title>Advanced Customer Segmentation Analysis Dashboard</title>
        <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
        <style>
            body { font-family: Arial, sans-serif; margin: 20px; background-color: #f5f5f5; }
            .chart-container { background-color: white; margin: 20px 0; padding: 20px; border-radius: 10px; box-shadow: 0 2px 5px rgba(0,0,0,0.1); }
            .chart-title { font-size: 24px; font-weight: bold; text-align: center; margin-bottom: 20px; color: #333; }
            .dashboard-title { font-size: 36px; font-weight: bold; text-align: center; margin-bottom: 30px; color: #2c3e50; }
            .insights-box { background-color: #e8f4f8; padding: 15px; margin: 10px 0; border-radius: 8px; font-size: 14px; }
            .metric-highlight { background-color: #fff3cd; padding: 10px; margin: 5px 0; border-radius: 5px; font-weight: bold; }
            .segment-summary { display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 20px; margin: 20px 0; }
            .segment-card { background-color: #f8f9fa; padding: 15px; border-radius: 8px; border-left: 4px solid #007bff; }
        </style>
    </head>
    <body>
        <div class="dashboard-title"> Advanced Customer Segmentation Analysis</div>
        
        <div class="insights-box">
            <strong> Customer Segmentation Insights:</strong><br>
            • <strong>RFM Analysis</strong>: Segments customers based on Recency, Frequency, and Monetary value<br>
            • <strong>Behavioral Segmentation</strong>: Groups customers by shopping patterns and preferences<br>
            • <strong>Customer Lifetime Value</strong>: Predicts future value and identifies high-value customers<br>
            • <strong>Cohort Analysis</strong>: Tracks customer retention and revenue patterns over time<br>
            • <strong>Journey Analysis</strong>: Maps customer evolution and purchase progression
        </div>
    """
    
    # 1. RFM Analysis 3D Scatter Plot
    fig1 = px.scatter_3d(
        rfm_data,
        x='Recency_Days',
        y='Frequency', 
        z='Monetary_Value',
        color='Customer_Segment',
        size='Total_CLV',
        hover_name='Customer ID',
        hover_data={
            'Average_Order_Value': ':.2f',
            'Total_Items_Purchased': ':,',
            'Stores_Visited': True,
            'CLV_Segment': True
        },
        title=" 3D RFM Analysis: Customer Segmentation",
        labels={
            'Recency_Days': 'Recency (Days)',
            'Frequency': 'Purchase Frequency',
            'Monetary_Value': 'Total Spent (USD)'
        },
        height=700
    )
    fig1.update_layout(template="plotly_white")
    
    # 2. Customer Segment Distribution
    segment_counts = rfm_data['Customer_Segment'].value_counts()
    fig2 = px.pie(
        values=segment_counts.values,
        names=segment_counts.index,
        title=" Customer Segment Distribution",
        height=600
    )
    fig2.update_traces(textposition='inside', textinfo='percent+label')
    fig2.update_layout(template="plotly_white")
    
    # 3. CLV Analysis by Segment
    clv_by_segment = rfm_data.groupby('Customer_Segment').agg({
        'Total_CLV': ['mean', 'sum', 'count'],
        'Historical_CLV': 'mean',
        'Predicted_CLV': 'mean'
    }).round(2)
    
    fig3 = px.bar(
        x=clv_by_segment.index,
        y=clv_by_segment[('Total_CLV', 'mean')],
        title=" Average Customer Lifetime Value by Segment",
        labels={'x': 'Customer Segment', 'y': 'Average CLV (USD)'},
        height=600
    )
    fig3.update_layout(template="plotly_white", xaxis_tickangle=45)
    
    # 4. Behavioral Segmentation
    behavioral_counts = rfm_data['Behavioral_Segment'].value_counts()
    fig4 = px.bar(
        x=behavioral_counts.index,
        y=behavioral_counts.values,
        title=" Behavioral Segmentation Distribution",
        labels={'x': 'Behavioral Segment', 'y': 'Number of Customers'},
        height=600
    )
    fig4.update_layout(template="plotly_white", xaxis_tickangle=45)
    
    # 5. Customer Journey Stage Analysis
    journey_counts = journey_analysis['Journey_Stage'].value_counts()
    fig5 = px.funnel(
        x=journey_counts.values,
        y=journey_counts.index,
        title=" Customer Journey Stage Distribution",
        height=600
    )
    fig5.update_layout(template="plotly_white")
    
    # 6. Cohort Retention Heatmap
    fig6 = px.imshow(
        cohort_table.iloc[:12, :12].values,  # Show first 12 months of first 12 cohorts
        labels=dict(x="Period", y="Cohort Month", color="Retention Rate"),
        x=[f"Month {i}" for i in range(12)],
        y=[str(cohort)[:7] for cohort in cohort_table.index[:12]],
        title=" Customer Retention Cohort Analysis",
        color_continuous_scale="RdYlGn",
        height=600
    )
    fig6.update_layout(template="plotly_white")
    
    # 7. Revenue Cohort Analysis
    fig7 = px.imshow(
        revenue_cohort_table.iloc[:12, :12].fillna(0).values,
        labels=dict(x="Period", y="Cohort Month", color="Revenue (USD)"),
        x=[f"Month {i}" for i in range(12)],
        y=[str(cohort)[:7] for cohort in revenue_cohort_table.index[:12]],
        title=" Revenue Cohort Analysis",
        color_continuous_scale="Blues",
        height=600
    )
    fig7.update_layout(template="plotly_white")
    
    # 8. Customer Value vs Risk Matrix
    fig8 = px.scatter(
        rfm_data,
        x='Total_CLV',
        y='Recency_Days',
        color='Churn_Risk',
        size='Frequency',
        hover_name='Customer ID',
        hover_data={
            'Customer_Segment': True,
            'Average_Order_Value': ':.2f',
            'Monthly_Purchase_Frequency': ':.2f'
        },
        title=" Customer Value vs Churn Risk Matrix",
        labels={
            'Total_CLV': 'Customer Lifetime Value (USD)',
            'Recency_Days': 'Days Since Last Purchase'
        },
        height=700
    )
    fig8.update_layout(template="plotly_white")
    
    # 9. Purchase Behavior Analysis
    fig9 = px.scatter(
        rfm_data,
        x='Average_Order_Value',
        y='Monthly_Purchase_Frequency',
        color='Value_Segment',
        size='Total_Items_Purchased',
        hover_name='Customer ID',
        title=" Purchase Behavior Analysis",
        labels={
            'Average_Order_Value': 'Average Order Value (USD)',
            'Monthly_Purchase_Frequency': 'Monthly Purchase Frequency'
        },
        height=600
    )
    fig9.update_layout(template="plotly_white")
    
    # 10. Customer Loyalty Analysis
    fig10 = px.scatter(
        rfm_data,
        x='Store_Loyalty_Score',
        y='Product_Diversity_Score',
        color='Behavioral_Segment',
        size='Monetary_Value',
        hover_name='Customer ID',
        title=" Customer Loyalty vs Product Diversity",
        labels={
            'Store_Loyalty_Score': 'Store Loyalty Score',
            'Product_Diversity_Score': 'Product Diversity Score'
        },
        height=600
    )
    fig10.update_layout(template="plotly_white")
    
    # Convert all figures to HTML
    figures = [fig1, fig2, fig3, fig4, fig5, fig6, fig7, fig8, fig9, fig10]
    titles = [
        "3D RFM Analysis: Customer Segmentation",
        "Customer Segment Distribution", 
        "Average Customer Lifetime Value by Segment",
        "Behavioral Segmentation Distribution",
        "Customer Journey Stage Distribution",
        "Customer Retention Cohort Analysis",
        "Revenue Cohort Analysis",
        "Customer Value vs Churn Risk Matrix",
        "Purchase Behavior Analysis",
        "Customer Loyalty vs Product Diversity"
    ]
    
    for i, (fig, title) in enumerate(zip(figures, titles)):
        html_content += f"""
        <div class="chart-container">
            <div class="chart-title">{title}</div>
            <div id="chart{i+1}"></div>
        </div>
        """
    
    # Add segment summary cards
    segment_summary = rfm_data.groupby('Customer_Segment').agg({
        'Customer ID': 'count',
        'Total_CLV': 'mean',
        'Average_Order_Value': 'mean',
        'Monthly_Purchase_Frequency': 'mean',
        'Recency_Days': 'mean'
    }).round(2)
    
    html_content += """
        <div class="chart-container">
            <div class="chart-title"> Customer Segment Summary</div>
            <div class="segment-summary">
    """
    
    for segment in segment_summary.index:
        data = segment_summary.loc[segment]
        html_content += f"""
            <div class="segment-card">
                <h3>{segment}</h3>
                <p><strong>Customers:</strong> {data['Customer ID']:,}</p>
                <p><strong>Avg CLV:</strong> ${data['Total_CLV']:,.2f}</p>
                <p><strong>Avg Order Value:</strong> ${data['Average_Order_Value']:,.2f}</p>
                <p><strong>Monthly Frequency:</strong> {data['Monthly_Purchase_Frequency']:.2f}</p>
                <p><strong>Avg Recency:</strong> {data['Recency_Days']:.0f} days</p>
            </div>
        """
    
    html_content += """
            </div>
        </div>
    """
    
    # Add key insights
    total_customers = len(rfm_data)
    total_clv = rfm_data['Total_CLV'].sum()
    avg_clv = rfm_data['Total_CLV'].mean()
    champions = len(rfm_data[rfm_data['Customer_Segment'] == 'Champions'])
    at_risk = len(rfm_data[rfm_data['Customer_Segment'] == 'At Risk'])
    
    html_content += f"""
        <div class="insights-box">
            <strong> Key Customer Insights:</strong><br>
            <div class="metric-highlight"> Total Customers Analyzed: {total_customers:,}</div>
            <div class="metric-highlight"> Total Customer Lifetime Value: ${total_clv:,.0f}</div>
            <div class="metric-highlight"> Average CLV per Customer: ${avg_clv:,.2f}</div>
            <div class="metric-highlight"> Champion Customers: {champions:,} ({champions/total_customers*100:.1f}%)</div>
            <div class="metric-highlight"> At-Risk Customers: {at_risk:,} ({at_risk/total_customers*100:.1f}%)</div>
        </div>
    """
    
    html_content += """
    <script>
    """
    
    # Add JavaScript for each plot
    for i, fig in enumerate(figures):
        plot_json = fig.to_json()
        html_content += f"""
        var plotData{i+1} = {plot_json};
        Plotly.newPlot('chart{i+1}',
        plotData{i+1}.data, plotData{i+1}.layout, {{responsive: true}});
        """
    
    html_content += """
    </script>
    </body>
    </html>
    """
    
    return html_content


In [2]:

def save_customer_segmentation_data(rfm_data, cohort_table, revenue_cohort_table, journey_analysis):
    """Save customer segmentation analysis data files"""
    print(" Saving customer segmentation analysis data files...")
    
    os.makedirs("customer_segmentation/data", exist_ok=True)
    
    # Save main datasets
    rfm_data.to_csv("customer_segmentation/data/rfm_customer_analysis.csv", index=False)
    cohort_table.to_csv("customer_segmentation/data/customer_retention_cohorts.csv")
    revenue_cohort_table.to_csv("customer_segmentation/data/revenue_cohorts.csv")
    journey_analysis.to_csv("customer_segmentation/data/customer_journey_analysis.csv", index=False)
    
    # Create segment-specific files
    for segment in rfm_data['Customer_Segment'].unique():
        segment_data = rfm_data[rfm_data['Customer_Segment'] == segment]
        filename = f"customer_segmentation/data/segment_{segment.lower().replace(' ', '_')}.csv"
        segment_data.to_csv(filename, index=False)
    
    # Create summary reports
    segment_summary = rfm_data.groupby('Customer_Segment').agg({
        'Customer ID': 'count',
        'Total_CLV': ['mean', 'sum', 'std'],
        'Historical_CLV': 'mean',
        'Predicted_CLV': 'mean',
        'Average_Order_Value': 'mean',
        'Monthly_Purchase_Frequency': 'mean',
        'Recency_Days': 'mean',
        'Frequency': 'mean',
        'Monetary_Value': 'mean',
        'Total_Items_Purchased': 'mean',
        'Unique_Products_Purchased': 'mean',
        'Stores_Visited': 'mean'
    }).round(2)
    
    segment_summary.to_csv("customer_segmentation/data/segment_summary_report.csv")
    
    # CLV analysis by segment
    clv_analysis = rfm_data.groupby(['Customer_Segment', 'CLV_Segment']).size().unstack(fill_value=0)
    clv_analysis.to_csv("customer_segmentation/data/clv_by_segment_analysis.csv")
    
    # Risk analysis
    risk_analysis = rfm_data.groupby(['Customer_Segment', 'Churn_Risk']).size().unstack(fill_value=0)
    risk_analysis.to_csv("customer_segmentation/data/churn_risk_by_segment.csv")
    
    print(" Customer segmentation data files saved!")

def generate_customer_insights_report(rfm_data, journey_analysis):
    """Generate detailed customer insights and recommendations"""
    print(" Generating customer insights and recommendations...")
    
    insights = {
        'segment_insights': {},
        'clv_insights': {},
        'risk_insights': {},
        'behavioral_insights': {},
        'recommendations': {}
    }
    
    # Segment-specific insights
    for segment in rfm_data['Customer_Segment'].unique():
        segment_data = rfm_data[rfm_data['Customer_Segment'] == segment]
        
        insights['segment_insights'][segment] = {
            'count': len(segment_data),
            'percentage': len(segment_data) / len(rfm_data) * 100,
            'avg_clv': segment_data['Total_CLV'].mean(),
            'avg_aov': segment_data['Average_Order_Value'].mean(),
            'avg_frequency': segment_data['Monthly_Purchase_Frequency'].mean(),
            'avg_recency': segment_data['Recency_Days'].mean(),
            'revenue_contribution': segment_data['Monetary_Value'].sum() / rfm_data['Monetary_Value'].sum() * 100
        }
    
    # CLV insights
    high_clv_customers = rfm_data[rfm_data['CLV_Segment'].isin(['High CLV', 'Very High CLV'])]
    insights['clv_insights'] = {
        'high_clv_count': len(high_clv_customers),
        'high_clv_percentage': len(high_clv_customers) / len(rfm_data) * 100,
        'high_clv_revenue_share': high_clv_customers['Monetary_Value'].sum() / rfm_data['Monetary_Value'].sum() * 100,
        'avg_clv_all': rfm_data['Total_CLV'].mean(),
        'top_10_percent_clv': rfm_data.nlargest(int(len(rfm_data) * 0.1), 'Total_CLV')['Total_CLV'].mean()
    }
    
    # Risk insights
    at_risk_customers = rfm_data[rfm_data['Churn_Risk'] == 'High Risk']
    insights['risk_insights'] = {
        'high_risk_count': len(at_risk_customers),
        'high_risk_percentage': len(at_risk_customers) / len(rfm_data) * 100,
        'potential_revenue_loss': at_risk_customers['Predicted_CLV'].sum(),
        'avg_recency_at_risk': at_risk_customers['Recency_Days'].mean()
    }
    
    # Behavioral insights
    insights['behavioral_insights'] = {
        'multi_category_shoppers': len(rfm_data[rfm_data['Unique_Categories_Purchased'] > 3]) / len(rfm_data) * 100,
        'store_loyalists': len(rfm_data[rfm_data['Stores_Visited'] == 1]) / len(rfm_data) * 100,
        'frequent_shoppers': len(rfm_data[rfm_data['Monthly_Purchase_Frequency'] > 2]) / len(rfm_data) * 100,
        'high_aov_customers': len(rfm_data[rfm_data['Average_Order_Value'] > rfm_data['Average_Order_Value'].quantile(0.8)]) / len(rfm_data) * 100
    }
    
    # Generate recommendations
    insights['recommendations'] = {
        'champions': "Focus on retention programs, VIP experiences, and referral incentives",
        'loyal_customers': "Reward loyalty with exclusive offers and early access to new products",
        'potential_loyalists': "Nurture with personalized recommendations and loyalty programs",
        'new_customers': "Onboard with welcome series and product education",
        'at_risk': "Immediate re-engagement campaigns with special offers",
        'cannot_lose_them': "Win-back campaigns with premium service recovery",
        'hibernating': "Reactivation campaigns with significant incentives",
        'lost': "Final win-back attempt or remove from active marketing"
    }
    
    return insights

def run_customer_segmentation_analysis():
    """Run comprehensive customer segmentation analysis"""
    print(" Starting Advanced Customer Segmentation Analysis...")
    
    # Load data
    master_data = load_master_data_for_customer_segmentation()
    if master_data is None:
        print(" Failed to load master data!")
        return None
    
    # Calculate RFM analysis
    print("\n Step 1: Calculating RFM Analysis...")
    rfm_data = calculate_advanced_rfm_analysis(master_data)
    
    # Create customer segments
    print("\n Step 2: Creating Customer Segments...")
    rfm_data = create_customer_segments(rfm_data)
    
    # Calculate CLV
    print("\n Step 3: Calculating Customer Lifetime Value...")
    rfm_data = calculate_customer_lifetime_value(rfm_data)
    
    # Cohort analysis
    print("\n Step 4: Performing Cohort Analysis...")
    cohort_table, cohort_counts, revenue_cohort_table = create_cohort_analysis(master_data)
    
    # Customer journey analysis
    print("\n Step 5: Analyzing Customer Journey...")
    journey_analysis = analyze_customer_journey(master_data)
    
    # Generate insights
    print("\n Step 6: Generating Insights and Recommendations...")
    insights = generate_customer_insights_report(rfm_data, journey_analysis)
    
    # Save data files
    print("\n Step 7: Saving Analysis Data...")
    save_customer_segmentation_data(rfm_data, cohort_table, revenue_cohort_table, journey_analysis)
    
    # Create dashboard
    print("\n Step 8: Creating Interactive Dashboard...")
    html_content = create_customer_segmentation_dashboard(rfm_data, cohort_table, revenue_cohort_table, journey_analysis)
    
    # Create directory and save HTML
    os.makedirs("customer_segmentation", exist_ok=True)
    
    with open("customer_segmentation/customer_segmentation_dashboard.html", "w", encoding='utf-8') as f:
        f.write(html_content)
    
    # Print comprehensive summary
    print("\n" + "="*80)
    print(" CUSTOMER SEGMENTATION ANALYSIS COMPLETE!")
    print("="*80)
    
    print(f"\n CUSTOMER OVERVIEW:")
    print(f"    Total Customers Analyzed: {len(rfm_data):,}")
    print(f"    Total Revenue: ${rfm_data['Monetary_Value'].sum():,.2f}")
    print(f"    Average CLV: ${rfm_data['Total_CLV'].mean():,.2f}")
    print(f"    Average Order Value: ${rfm_data['Average_Order_Value'].mean():.2f}")
    
    print(f"\n SEGMENT DISTRIBUTION:")
    segment_dist = rfm_data['Customer_Segment'].value_counts()
    for segment, count in segment_dist.head(5).items():
        percentage = count / len(rfm_data) * 100
        print(f"   • {segment}: {count:,} customers ({percentage:.1f}%)")
    
    print(f"\n HIGH-VALUE CUSTOMERS:")
    champions = rfm_data[rfm_data['Customer_Segment'] == 'Champions']
    print(f"    Champions: {len(champions):,} customers")
    print(f"  Champions Revenue: ${champions['Monetary_Value'].sum():,.2f}")
    print(f"   Champions CLV: ${champions['Total_CLV'].mean():,.2f}")
    
    print(f"\n AT-RISK CUSTOMERS:")
    at_risk = rfm_data[rfm_data['Churn_Risk'] == 'High Risk']
    print(f"    High Risk: {len(at_risk):,} customers ({len(at_risk)/len(rfm_data)*100:.1f}%)")
    print(f"    Potential Revenue Loss: ${at_risk['Predicted_CLV'].sum():,.2f}")
    
    print(f"\n BEHAVIORAL INSIGHTS:")
    print(f"  Multi-Category Shoppers: {insights['behavioral_insights']['multi_category_shoppers']:.1f}%")
    print(f"  Store Loyalists: {insights['behavioral_insights']['store_loyalists']:.1f}%")
    print(f"  Frequent Shoppers: {insights['behavioral_insights']['frequent_shoppers']:.1f}%")
    
    print(f"\n FILES CREATED:")
    print(f"   customer_segmentation/customer_segmentation_dashboard.html")
    print(f"   customer_segmentation/data/ (Multiple CSV files)")
    print(f"  Segment-specific customer lists")
    print(f"   Summary reports and analysis")
    
    print(f"\n TOP RECOMMENDATIONS:")
    print(f"   1. Focus retention efforts on {len(champions):,} Champion customers")
    print(f"   2. Immediate re-engagement for {len(at_risk):,} at-risk customers")
    print(f"   3. Develop loyalty programs for Potential Loyalists")
    print(f"   4. Create win-back campaigns for hibernating customers")
    
    return {
        'rfm_data': rfm_data,
        'cohort_table': cohort_table,
        'revenue_cohort_table': revenue_cohort_table,
        'journey_analysis': journey_analysis,
        'insights': insights
    }

# Run the customer segmentation analysis
print(" Initializing Advanced Customer Segmentation Analysis...")
results = run_customer_segmentation_analysis()

if results:
    print("\n Analysis completed successfully!")
    print("Open the dashboard to explore detailed customer insights and segments")
else:
    print("\n Analysis failed. Please check the data and try again.")


 Initializing Advanced Customer Segmentation Analysis...
 Starting Advanced Customer Segmentation Analysis...
 Loading master data for customer segmentation analysis...
 Master data loaded: (6416827, 45)

 Step 1: Calculating RFM Analysis...
Calculating Advanced RFM Analysis...
Analysis reference date: 2025-03-18
 Advanced RFM analysis completed for 1,283,707 customers

 Step 2: Creating Customer Segments...
 Creating detailed customer segments...
 Customer segmentation completed
 RFM Segments: 10
 Behavioral Segments: 6
 Value Segments: 6

 Step 3: Calculating Customer Lifetime Value...
 Calculating Customer Lifetime Value...
 CLV calculation completed

 Step 4: Performing Cohort Analysis...
 Creating cohort analysis...
 Cohort analysis completed

 Step 5: Analyzing Customer Journey...
 Analyzing customer journey patterns...
 Customer journey analysis completed

 Step 6: Generating Insights and Recommendations...
 Generating customer insights and recommendations...

 Step 7: Saving An





 Customer segmentation data files saved!

 Step 8: Creating Interactive Dashboard...
 Creating Customer Segmentation Dashboard...

 CUSTOMER SEGMENTATION ANALYSIS COMPLETE!

 CUSTOMER OVERVIEW:
    Total Customers Analyzed: 1,283,707
    Total Revenue: $305,884,836.55
    Average CLV: $410.86
    Average Order Value: $50.16

 SEGMENT DISTRIBUTION:
   • Potential Loyalists: 268,853 customers (20.9%)
   • Champions: 250,529 customers (19.5%)
   • Loyal Customers: 241,444 customers (18.8%)
   • Hibernating: 229,119 customers (17.8%)
   • Need Attention: 126,175 customers (9.8%)

 HIGH-VALUE CUSTOMERS:
    Champions: 250,529 customers
  Champions Revenue: $126,578,308.19
   Champions CLV: $696.86

 AT-RISK CUSTOMERS:
    High Risk: 203,138 customers (15.8%)
    Potential Revenue Loss: $123,512,118.84

 BEHAVIORAL INSIGHTS:
  Multi-Category Shoppers: 0.0%
  Store Loyalists: 75.5%
  Frequent Shoppers: 28.3%

 FILES CREATED:
   customer_segmentation/customer_segmentation_dashboard.html
   cus

In [1]:
# Add this line before loading your data
import gc; gc.collect(); print(f"🧹 Memory cleared: {gc.collect()} objects")

🧹 Memory cleared: 0 objects
