# Customer Journey Analysis
### This notebook analyzes customer journeys across different products, visualizing patterns in purchasing behavior, demographics, and product adoption sequences.

## Import and plot-style

In [1]:
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from pathlib import Path
import jax
import jax.numpy as jnp
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import plotly.graph_objects as go
from typing import List, Dict, Tuple

# Set style for better visualizations
plt.style.use("seaborn-v0_8-dark-palette")
sns.set_palette("husl")
%matplotlib inline

## Data Loading

In [2]:
# We'll load all ABT_score files and combine them with appropriate target labels.
def load_abt_files():
    """Load all ABT_score files and combine them with appropriate target labels"""
    abt_files = list(Path('../data').glob('ABT_[Ss]core_*.csv'))
    
    if not abt_files:
        print("No ABT_score_*.csv files found in current directory!")
        print("\nCurrent directory contents:")
        print([f.name for f in Path('../data').glob('*')])
        print("\nPlease ensure your ABT_score_*.csv files are in the data directory.")
        return None
    
    print(f"Found {len(abt_files)} ABT_score files:")
    for f in abt_files:
        print(f"  - {f.name}")
    
    dfs = []
    for file_path in abt_files:
        product = file_path.stem.split('_')[-1]
        try:
            print(f"\nLoading {product} data...")
            df = pd.read_csv(file_path, sep=';')
            print(f"Successfully loaded {len(df)} rows for {product}")
            df['product_type'] = product
            dfs.append(df)
        except Exception as e:
            print(f"Error loading {file_path.name}: {str(e)}")
    
    return pd.concat(dfs, ignore_index=True)

# Load the data
combined_df = load_abt_files()

Found 3 ABT_score files:
  - ABT_score_BoKvar.csv
  - ABT_Score_Example.csv
  - ABT_score_BankBolan.csv

Loading BoKvar data...
Successfully loaded 25405 rows for BoKvar

Loading Example data...
Successfully loaded 0 rows for Example

Loading BankBolan data...
Successfully loaded 102027 rows for BankBolan


## Data Preprocessing

In [3]:
def preprocess_data(df):
    """Clean and preprocess the combined dataset"""
    df = df.copy()
    
    # Convert date columns to datetime
    date_columns = [col for col in df.columns if 'Date' in col or 'date' in col or 
                   col.startswith(('mFirst_', 'mLast_'))]
    for col in date_columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')
    
    # Fill numeric NaNs with 0
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    df[numeric_cols] = df[numeric_cols].fillna(0)
    
    # Convert binary columns to int
    binary_columns = [col for col in df.columns if col.startswith(('Have_', 'Had_', 'Optout_'))]
    for col in binary_columns:
        df[col] = df[col].fillna(0).astype(int)
    
    return df

combined_df = preprocess_data(combined_df)

## Customer Journey Analysis

In [None]:
# Analyze the sequence of products purchased by customers.
def analyze_product_sequence(df):
    """Analyze the sequence of products purchased by customers"""
    if len(df) == 0:
        raise ValueError("Empty dataframe provided!")
        
    product_cols = [col for col in df.columns if col.startswith('mFirst_')]
    if not product_cols:
        raise ValueError("No product columns found!")
    
    timeline_data = []
    customer_journeys = {}
    
    for customer_id in df['sCustomerNaturalKey'].unique():
        customer_data = df[df['sCustomerNaturalKey'] == customer_id]
        
        # Get product acquisition dates
        products = []
        for col in product_cols:
            product = col.replace('mFirst_', '')
            date = customer_data[col].iloc[0]
            if pd.notna(date):
                products.append({
                    'sCustomerNaturalKey': customer_id,
                    'product': product,
                    'acquisition_date': date
                })
        
        # Sort products by date
        products = sorted(products, key=lambda x: x['acquisition_date'])
        timeline_data.extend(products)
        
        # Create journey sequence
        if products:
            journey = ' → '.join([p['product'] for p in products])
            customer_journeys[customer_id] = {
                'sequence': journey,
                'length': len(products),
                'duration_days': (products[-1]['acquisition_date'] - products[0]['acquisition_date']).days,
                'first_product': products[0]['product'],
                'last_product': products[-1]['product']
            }
    
    return pd.DataFrame(timeline_data), pd.DataFrame.from_dict(customer_journeys, orient='index')

# Analyze product sequences
timeline_df, journey_df = analyze_product_sequence(combined_df)

## Visualizations

In [None]:
def plot_customer_journey_sankey(journey_df, max_paths=20, min_customers=5):
    """
    Create an improved Sankey diagram showing actual journey flows
    
    Parameters:
    journey_df: DataFrame with customer journeys
    max_paths: Maximum number of unique paths to show
    min_customers: Minimum number of customers for a path to be included
    """
    # Get sequences with their counts
    sequence_counts = journey_df['sequence'].value_counts()
    sequence_counts = sequence_counts[sequence_counts >= min_customers].head(max_paths)
    
    # Create nodes and links
    nodes = set()
    links = []
    link_values = []
    
    # Add "Start" node
    nodes.add("Start")
    
    for sequence, count in sequence_counts.items():
        products = sequence.split(' → ')
        
        # Add all products to nodes
        nodes.update(products)
        
        # Add link from Start to first product
        links.append(("Start", products[0]))
        link_values.append(count)
        
        # Create links between consecutive products
        for i in range(len(products) - 1):
            links.append((products[i], products[i + 1]))
            link_values.append(count)
    
    # Convert nodes to list and create node indices
    nodes = list(nodes)
    node_indices = {node: i for i, node in enumerate(nodes)}
    
    # Create color scale based on node position
    node_colors = px.colors.sequential.Blues[::int(len(px.colors.sequential.Blues)/len(nodes))]
    
    # Create Sankey diagram
    fig = go.Figure(data=[go.Sankey(
        node=dict(
            pad=20,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=nodes,
            color=node_colors,
            hovertemplate='Node: %{label}<br>Total Flow: %{value}<extra></extra>'
        ),
        link=dict(
            source=[node_indices[link[0]] for link in links],
            target=[node_indices[link[1]] for link in links],
            value=link_values,
            hovertemplate='From: %{source.label}<br>To: %{target.label}<br>Flow: %{value}<extra></extra>'
        )
    )])
    
    fig.update_layout(
        title=dict(
            text="Customer Journey Paths Analysis",
            x=0.5,
            y=0.95,
            font=dict(size=16)
        ),
        font_size=12,
        height=800,
        width=1200,
        showlegend=True
    )
    
    return fig

def create_enhanced_journey_insights(timeline_df, journey_df, combined_df):
    """Create comprehensive journey visualizations"""
    if len(timeline_df) == 0 or len(journey_df) == 0:
        print("Warning: Empty data provided for visualization")
        return
    
    # 1. Journey Length Distribution with statistics
    plt.figure(figsize=(15, 6))
    
    # Create subplot grid
    gs = plt.GridSpec(1, 2, width_ratios=[2, 1])
    
    # Journey length histogram
    ax1 = plt.subplot(gs[0])
    sns.histplot(data=journey_df, x='length', bins=20, ax=ax1)
    ax1.set_title('Distribution of Journey Lengths')
    ax1.set_xlabel('Number of Products')
    ax1.set_ylabel('Number of Customers')
    
    # Add statistics table
    ax2 = plt.subplot(gs[1])
    stats = journey_df['length'].describe()
    ax2.axis('tight')
    ax2.axis('off')
    table_data = [[f"{k}: {v:.2f}"] for k, v in stats.items()]
    ax2.table(cellText=table_data, 
              colLabels=['Journey Length Statistics'],
              cellLoc='left',
              loc='center',
              bbox=[0.1, 0.2, 0.8, 0.6])
    
    plt.tight_layout()
    plt.show()
    
    # 2. Enhanced Product Adoption Timeline
    plt.figure(figsize=(15, 8))
    
    # Create main scatter plot
    sns.scatterplot(data=timeline_df, 
                   x='acquisition_date', 
                   y='product', 
                   alpha=0.6,
                   hue='product',
                   size=timeline_df.groupby('product').transform('count'),
                   sizes=(100, 400))
    
    # Add trend lines
    for product in timeline_df['product'].unique():
        product_data = timeline_df[timeline_df['product'] == product]
        z = np.polyfit(product_data['acquisition_date'].astype(np.int64), 
                      product_data.index, 1)
        p = np.poly1d(z)
        plt.plot(product_data['acquisition_date'], 
                p(product_data['acquisition_date'].astype(np.int64)), 
                '--', alpha=0.5)
    
    plt.title('Product Adoption Timeline with Trends')
    plt.xticks(rotation=45)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()
    
    # 3. Enhanced Product Correlation Heatmap
    plt.figure(figsize=(15, 12))
    
    # Get product correlations
    have_cols = [col for col in combined_df.columns if col.startswith('Have_')]
    corr_matrix = combined_df[have_cols].corr()
    
    # Create mask for upper triangle
    mask = np.triu(np.ones_like(corr_matrix))
    
    # Create heatmap with enhanced styling
    sns.heatmap(corr_matrix, 
                mask=mask,
                annot=True, 
                cmap='RdYlBu',
                center=0,
                fmt='.2f',
                square=True,
                linewidths=0.5,
                cbar_kws={"shrink": .5})
    
    # Clean up labels
    labels = [col.replace('Have_', '') for col in have_cols]
    plt.xticks(np.arange(len(labels)) + 0.5, labels, rotation=45, ha='right')
    plt.yticks(np.arange(len(labels)) + 0.5, labels, rotation=0)
    
    plt.title('Product Adoption Correlation Heatmap')
    plt.tight_layout()
    plt.show()
    
    # 4. Add Journey Flow Analysis
    journey_lengths = journey_df['length'].value_counts().sort_index()
    product_flows = []
    
    for length in journey_lengths.index:
        journeys = journey_df[journey_df['length'] == length]
        flows = journeys['sequence'].value_counts().head(5)
        product_flows.append({
            'length': length,
            'top_flows': flows
        })
    
    print("\nMost Common Journey Flows by Length:")
    for flow in product_flows:
        print(f"\nJourneys with {flow['length']} products ({journey_lengths[flow['length']]} customers):")
        for path, count in flow['top_flows'].items():
            print(f"  {path}: {count} customers")

# Create visualizations
sankey_fig = plot_customer_journey_sankey(journey_df, max_paths=30, min_customers=5)
sankey_fig.show()
create_enhanced_journey_insights(timeline_df, journey_df, combined_df)

In [None]:
# Product Adoption Timeline
def plot_product_adoption_timeline(df):
    """Plot timeline using valid dates only"""
    timeline_data = analyze_product_sequence(df)
    
    # Filter out any remaining invalid dates (shouldn't be any)
    timeline_data = timeline_data[pd.notna(timeline_data['acquisition_date'])]
    
    fig = px.scatter(timeline_data, 
                    x='acquisition_date', 
                    y='product',
                    color='product',
                    title='Product Adoption Timeline')
    fig.update_layout(height=600)
    fig.show()
    
plot_product_adoption_timeline(combined_df)

In [None]:
def analyze_starter_products(journey_df, combined_df):
    """Analyze the most common starter products and their subsequent journeys"""
    
    # Get top 3 starter products
    top_starters = journey_df['first_product'].value_counts().head(3)
    
    # For each top starter, analyze the typical journey
    starter_insights = {}
    for product in top_starters.index:
        # Get customers who started with this product
        starter_journeys = journey_df[journey_df['first_product'] == product]
        
        # Get customer IDs for demographic analysis
        customer_ids = starter_journeys.index
        customer_data = combined_df[combined_df['sCustomerNaturalKey'].isin(customer_ids)]
        
        insights = {
            'total_customers': len(starter_journeys),
            'avg_journey_length': starter_journeys['length'].mean(),
            'avg_journey_duration': starter_journeys['duration_days'].mean(),
            'common_next_products': starter_journeys[starter_journeys['length'] > 1]['sequence'].apply(
                lambda x: x.split(' → ')[1] if ' → ' in x else None
            ).value_counts().head(3),
            'customer_profile': {
                'avg_age': customer_data['Age'].mean(),
                'pct_women': (customer_data['Woman'] == 1).mean() * 100,
                'pct_apartment': (customer_data['Apartment'] == 1).mean() * 100,
                'common_lifestyle': customer_data['LifestyleGroupCode'].mode().iloc[0]
            }
        }
        starter_insights[product] = insights
    
    return starter_insights, top_starters

def visualize_starter_product_journeys(journey_df, top_starters):
    """Create visualizations for top starter product journeys"""
    
    # 1. Sankey diagram for each starter product
    for product in top_starters.index:
        starter_journeys = journey_df[journey_df['first_product'] == product]
        
        # Create Sankey for this starter product
        fig = plot_customer_journey_sankey(starter_journeys, max_paths=5)
        fig.update_layout(title=f"Customer Journeys Starting with {product}")
        fig.show()
    
    # 2. Journey length comparison
    plt.figure(figsize=(12, 6))
    journey_lengths = []
    for product in top_starters.index:
        lengths = journey_df[journey_df['first_product'] == product]['length']
        journey_lengths.append(lengths)
    
    plt.boxplot(journey_lengths, labels=top_starters.index)
    plt.title('Journey Lengths by Starter Product')
    plt.ylabel('Number of Products')
    plt.xticks(rotation=45)
    plt.show()
    
    # 3. Time to second product
    plt.figure(figsize=(12, 6))
    time_to_second = []
    labels = []
    for product in top_starters.index:
        starter_journeys = journey_df[journey_df['first_product'] == product]
        multi_product = starter_journeys[starter_journeys['length'] > 1]
        if len(multi_product) > 0:
            time_to_second.append(multi_product['duration_days'] / multi_product['length'])
            labels.append(product)
    
    plt.boxplot(time_to_second, labels=labels)
    plt.title('Time to Second Product by Starter Product')
    plt.ylabel('Days')
    plt.xticks(rotation=45)
    plt.show()

starter_insights, top_starters = analyze_starter_products(journey_df, combined_df)
print("\nTop Starter Product Insights:")
for product, insights in starter_insights.items():
    print(f"\n{product}:")
    print(f"Total Customers: {insights['total_customers']:,}")
    print(f"Average Journey Length: {insights['avg_journey_length']:.2f} products")
    print(f"Average Journey Duration: {insights['avg_journey_duration']:.1f} days")
    print("\nCommon Next Products:")
    print(insights['common_next_products'])
    print("\nCustomer Profile:")
    for key, value in insights['customer_profile'].items():
        print(f"  {key}: {value}")

# Create visualizations
visualize_starter_product_journeys(journey_df, top_starters)

In [None]:
def analyze_customer_segments(journey_df, combined_df):
    """Analyze customer segments based on their journeys"""
    
    # Create customer segments
    segments = pd.DataFrame()
    segments['journey_length'] = journey_df['length']
    segments['journey_duration'] = journey_df['duration_days']
    segments['first_product'] = journey_df['first_product']
    
    # Add customer demographics
    segments['age'] = combined_df.set_index('sCustomerNaturalKey')['Age']
    segments['is_woman'] = combined_df.set_index('sCustomerNaturalKey')['Woman']
    segments['lifestyle'] = combined_df.set_index('sCustomerNaturalKey')['LifestyleGroupCode']
    
    # Create segment labels
    segments['segment'] = pd.qcut(segments['journey_length'], q=3, labels=['Basic', 'Moderate', 'Extensive'])
    
    # Analyze segments
    segment_insights = {}
    for segment in segments['segment'].unique():
        segment_data = segments[segments['segment'] == segment]
        insights = {
            'size': len(segment_data),
            'avg_products': segment_data['journey_length'].mean(),
            'avg_duration': segment_data['journey_duration'].mean(),
            'common_starter': segment_data['first_product'].mode().iloc[0],
            'avg_age': segment_data['age'].mean(),
            'pct_women': (segment_data['is_woman'] == 1).mean() * 100,
            'common_lifestyle': segment_data['lifestyle'].mode().iloc[0]
        }
        segment_insights[segment] = insights
    
    return segment_insights, segments

# Analyze customer segments
segment_insights, segments = analyze_customer_segments(journey_df, combined_df)
print("\nCustomer Segment Insights:")
for segment, insights in segment_insights.items():
    print(f"\n{segment} Segment:")
    for key, value in insights.items():
        print(f"  {key}: {value}")

## Additional Insights

In [None]:
# Journey Pattern Analysis
def analyze_journey_patterns(journey_df, combined_df):
    """Analyze patterns in customer journeys"""
    patterns = {
        'journey_stats': {
            'total_customers': len(journey_df),
            'avg_products': journey_df['length'].mean(),
            'avg_duration': journey_df['duration_days'].mean(),
            'common_first': journey_df['first_product'].value_counts().head(),
            'common_last': journey_df['last_product'].value_counts().head()
        },
        'journey_segments': {
            'single_product': (journey_df['length'] == 1).mean(),
            'short_journey': ((journey_df['length'] > 1) & (journey_df['length'] <= 3)).mean(),
            'long_journey': (journey_df['length'] > 3).mean()
        }
    }
    
    return patterns

# Analyze patterns
patterns = analyze_journey_patterns(journey_df, combined_df)
print("\nJourney Analysis Results:")
for category, stats in patterns.items():
    print(f"\n{category.replace('_', ' ').title()}:")
    for key, value in stats.items():
        print(f"  {key}: {value}")

In [None]:
# Product combinations analysis
def analyze_product_combinations(df):
    """Analyze which products are commonly held together"""
    have_cols = [col for col in df.columns if col.startswith('Have_')]
    product_combinations = df[have_cols].sum()
    
    # Create correlation matrix
    corr_matrix = df[have_cols].corr()
    
    # Plot correlation heatmap
    plt.figure(figsize=(12, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
    plt.title('Product Combination Correlations')
    plt.xticks(rotation=45)
    plt.yticks(rotation=45)
    plt.tight_layout()
    plt.show()
    
    return product_combinations

print("Product ownership analysis:")
display(analyze_product_combinations(combined_df))

In [None]:
def create_demographic_insights(journey_df, combined_df):
    """Create detailed demographic visualizations for different journey types"""
    
    # 1. Age distribution by journey length
    plt.figure(figsize=(12, 6))
    journey_length_bins = pd.qcut(journey_df['length'], q=3, labels=['Short', 'Medium', 'Long'])
    combined_df['journey_length_category'] = journey_length_bins
    
    sns.boxplot(data=combined_df, x='journey_length_category', y='Age')
    plt.title('Age Distribution by Journey Length')
    plt.show()
    
    # 2. Product preferences by gender
    plt.figure(figsize=(12, 6))
    gender_prefs = pd.DataFrame()
    have_cols = [col for col in combined_df.columns if col.startswith('Have_')]
    
    for col in have_cols:
        gender_prefs[col] = combined_df.groupby('Woman')[col].mean()
    
    gender_prefs = gender_prefs.T
    gender_prefs.columns = ['Men', 'Women']
    gender_prefs.plot(kind='bar')
    plt.title('Product Preferences by Gender')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
    
    # 3. Lifestyle group analysis
    plt.figure(figsize=(12, 6))
    lifestyle_journey = combined_df.groupby('LifestyleGroupCode')['journey_length_category'].value_counts(normalize=True).unstack()
    lifestyle_journey.plot(kind='bar', stacked=True)
    plt.title('Journey Lengths by Lifestyle Group')
    plt.xlabel('Lifestyle Group')
    plt.ylabel('Proportion')
    plt.legend(title='Journey Length')
    plt.tight_layout()
    plt.show()

# Visualize insights
create_demographic_insights(journey_df, combined_df)

In [None]:
def visualize_customer_segments(segments):
    """Create visualizations for customer segments"""
    
    # 1. Segment size comparison
    plt.figure(figsize=(10, 6))
    segments['segment'].value_counts().plot(kind='bar')
    plt.title('Size of Customer Segments')
    plt.xlabel('Segment')
    plt.ylabel('Number of Customers')
    plt.show()
    
    # 2. Age distribution by segment
    plt.figure(figsize=(12, 6))
    sns.boxplot(data=segments, x='segment', y='age')
    plt.title('Age Distribution by Customer Segment')
    plt.show()
    
    # 3. Product mix by segment
    plt.figure(figsize=(12, 6))
    segments.groupby('segment')['first_product'].value_counts(normalize=True).unstack().plot(kind='bar', stacked=True)
    plt.title('First Product Distribution by Segment')
    plt.xlabel('Segment')
    plt.ylabel('Proportion')
    plt.legend(title='First Product', bbox_to_anchor=(1.05, 1))
    plt.tight_layout()
    plt.show()

# Visualize insights
visualize_customer_segments(segments)

In [None]:
def analyze_retention_patterns(journey_df, combined_df):
    """Analyze customer retention patterns"""
    retention_data = pd.DataFrame()
    
    # Time between purchases
    retention_data['avg_purchase_gap'] = journey_df['duration_days'] / (journey_df['length'] - 1)
    
    # Product stickiness
    have_cols = [col for col in combined_df.columns if col.startswith('Have_')]
    had_cols = [col for col in combined_df.columns if col.startswith('Had_')]
    
    for have, had in zip(have_cols, had_cols):
        product = have.replace('Have_', '')
        retention_data[f'{product}_retention'] = combined_df[have] / combined_df[had]
    
    return retention_data

In [None]:
def calculate_engagement_score(journey_df, combined_df):
    """Calculate customer engagement scores based on multiple factors"""
    engagement = pd.DataFrame()
    
    # Product diversity score
    engagement['product_diversity'] = journey_df['length'] / len([col for col in combined_df.columns if col.startswith('Have_')])
    
    # Engagement speed
    engagement['engagement_speed'] = journey_df['length'] / journey_df['duration_days'].clip(1)
    
    # Activity score
    engagement['activity_score'] = combined_df[[col for col in combined_df.columns if col.startswith('nbr_active_agr_')]].sum(axis=1)
    
    # Overall score
    engagement['total_score'] = (engagement['product_diversity'] + 
                               engagement['engagement_speed'] + 
                               engagement['activity_score']).rank(pct=True)
    
    return engagement

In [None]:
def analyze_temporal_patterns(journey_df, timeline_df):
    """Analyze how customer journeys evolve over time"""
    # Seasonal patterns
    timeline_df['month'] = timeline_df['acquisition_date'].dt.month
    timeline_df['year'] = timeline_df['acquisition_date'].dt.year
    
    # Time between products
    journey_df['avg_time_between_products'] = journey_df['duration_days'] / (journey_df['length'] - 1)
    journey_df['avg_time_between_products'] = journey_df['avg_time_between_products'].fillna(0)
    
    # Product velocity
    yearly_patterns = timeline_df.groupby(['year', 'product']).size().unstack(fill_value=0)
    
    return {
        'seasonal_patterns': timeline_df.groupby('month')['product'].value_counts(),
        'yearly_trends': yearly_patterns,
        'avg_acquisition_speed': journey_df['avg_time_between_products'].describe()
    }

In [None]:
def analyze_customer_value(journey_df, combined_df):
    """Analyze relationship between journey patterns and customer value"""
    value_analysis = pd.DataFrame()
    value_analysis['journey_length'] = journey_df['length']
    value_analysis['total_products'] = journey_df['length']
    value_analysis['journey_duration'] = journey_df['duration_days']
    
    # Add product counts
    have_cols = [col for col in combined_df.columns if col.startswith('Have_')]
    value_analysis['total_active_products'] = combined_df[have_cols].sum(axis=1)
    
    # Analyze cross-sell success
    value_analysis['cross_sell_ratio'] = value_analysis['total_active_products'] / value_analysis['journey_length']
    
    return value_analysis

In [None]:
def analyze_customer_lifetime_value(journey_df, combined_df):
    """Analyze customer lifetime value based on product portfolio"""
    value_analysis = pd.DataFrame()
    
    # Portfolio size value
    value_analysis['portfolio_value'] = journey_df['length']
    
    # Product mix value
    have_cols = [col for col in combined_df.columns if col.startswith('Have_')]
    value_analysis['product_mix'] = combined_df[have_cols].sum(axis=1)
    
    # Engagement duration value
    value_analysis['engagement_duration'] = journey_df['duration_days']
    
    return value_analysis

In [None]:
def analyze_product_affinities(combined_df):
    """Analyze detailed product relationships and purchase patterns"""
    have_cols = [col for col in combined_df.columns if col.startswith('Have_')]
    
    # Product co-occurrence
    product_pairs = []
    for i, prod1 in enumerate(have_cols):
        for prod2 in have_cols[i+1:]:
            together = ((combined_df[prod1] == 1) & (combined_df[prod2] == 1)).sum()
            total_prod1 = (combined_df[prod1] == 1).sum()
            total_prod2 = (combined_df[prod2] == 1).sum()
            
            if total_prod1 > 0 and total_prod2 > 0:
                lift = (together / len(combined_df)) / ((total_prod1 / len(combined_df)) * (total_prod2 / len(combined_df)))
                product_pairs.append({
                    'product1': prod1.replace('Have_', ''),
                    'product2': prod2.replace('Have_', ''),
                    'together_count': together,
                    'lift': lift
                })
    
    return pd.DataFrame(product_pairs)

In [None]:
def analyze_lifecycle_stages(journey_df, combined_df):
    """Analyze customer lifecycle stages and transitions"""
    lifecycle_data = pd.DataFrame()
    
    # Define lifecycle stages
    lifecycle_data['stage'] = pd.cut(journey_df['length'], 
                                   bins=[0, 1, 3, 5, float('inf')],
                                   labels=['New', 'Growing', 'Established', 'Mature'])
    
    # Time in each stage
    lifecycle_data['time_in_stage'] = journey_df['duration_days']
    
    # Product adoption rate
    lifecycle_data['adoption_rate'] = journey_df['length'] / journey_df['duration_days']
    lifecycle_data['adoption_rate'] = lifecycle_data['adoption_rate'].fillna(0)
    
    return lifecycle_data

In [None]:
def analyze_churn_risk(journey_df, combined_df):
    """Analyze potential churn indicators in customer journeys"""
    risk_factors = pd.DataFrame()
    
    # Time since last product
    current_date = combined_df['mFirst_BankBolan'].max()  # Use as reference date
    # Using the last product's acquisition date
    for idx in journey_df.index:
        last_product = journey_df.loc[idx, 'last_product']
        last_date = timeline_df[timeline_df['product'] == last_product]['acquisition_date'].max()
        risk_factors.loc[idx, 'days_since_last_product'] = (current_date - last_date).days
    
    # Product discontinuation
    had_cols = [col for col in combined_df.columns if col.startswith('Had_')]
    have_cols = [col.replace('Had_', 'Have_') for col in had_cols]
    
    risk_factors['discontinued_products'] = 0
    for had, have in zip(had_cols, have_cols):
        risk_factors['discontinued_products'] += (combined_df[had] > combined_df[have]).astype(int)
    
    return risk_factors

In [None]:
def create_comprehensive_journey_visualization(journey_df, timeline_df, combined_df):
    """Create comprehensive journey visualizations including all aspects"""
    # First create and display Sankey diagram
    sankey_fig = plot_customer_journey_sankey(journey_df)
    sankey_fig.show()

    # create matplotlib visualizations
    fig = plt.figure(figsize=(20, 16))
    gs = fig.add_gridspec(3, 2)
    
    # 2. Temporal Patterns
    ax2 = fig.add_subplot(gs[1, 0])
    temporal_patterns = analyze_temporal_patterns(journey_df, timeline_df)
    sns.heatmap(temporal_patterns['yearly_trends'], cmap='YlOrRd')
    
    # 3. Customer Value
    ax3 = fig.add_subplot(gs[1, 1])
    value_analysis = analyze_customer_value(journey_df, combined_df)
    sns.scatterplot(data=value_analysis, x='journey_length', y='total_active_products')
    
    # 4. Product Affinities
    ax4 = fig.add_subplot(gs[2, :])
    affinities = analyze_product_affinities(combined_df)
    sns.heatmap(affinities.pivot('product1', 'product2', 'lift'), annot=True)
    
    # 5. Lifecycle Stages
    ax5 = fig.add_subplot(gs[3, 0])
    lifecycle_data = analyze_lifecycle_stages(journey_df, combined_df)
    sns.boxplot(data=lifecycle_data, x='stage', y='adoption_rate')
    
    # 6. Churn Risk
    ax6 = fig.add_subplot(gs[3, 1])
    risk_data = analyze_churn_risk(journey_df, combined_df)
    sns.histplot(data=risk_data, x='days_since_last_product')
    
    plt.tight_layout()
    return fig

create_comprehensive_journey_visualization(journey_df, timeline_df, combined_df)

## Optional: Predictive Modeling