# Customer Journey Analysis
# This notebook analyzes customer journeys across different products, visualizing patterns in purchasing behavior, demographics, and product adoption sequences.

## Imports and plot style

In [2]:
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from pathlib import Path
import jax
import jax.numpy as jnp
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import plotly.graph_objects as go

# Set style for better visualizations.
# Order matters here: the matplotlib style sheet is applied first, and the
# seaborn palette call that follows then overrides the default colour palette.
plt.style.use("seaborn-v0_8-dark-palette")
sns.set_palette("husl")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline


## Data Loading

In [None]:
# We'll load all ABT_score files and combine them, labelling each row with the product its file name refers to.
def load_abt_files(data_dir='.', pattern='ABT_score_*.csv', sep=';'):
    """Load every ABT_score file and stack them into a single DataFrame.

    Each file's rows are tagged with a ``product_type`` column holding the
    text after the last underscore of the file stem
    (``ABT_score_car.csv`` -> ``"car"``).

    Parameters
    ----------
    data_dir : str or Path, default '.'
        Directory that is searched for the input files.
    pattern : str, default 'ABT_score_*.csv'
        Glob pattern, relative to ``data_dir``, selecting the files to load.
    sep : str, default ';'
        Field delimiter passed to ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
        All matching files concatenated row-wise with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no file matches ``pattern`` inside ``data_dir``.
    """
    # Sort the matches: glob order depends on the filesystem, and an unsorted
    # list would make the row order differ from one machine to the next.
    abt_files = sorted(Path(data_dir).glob(pattern))
    if not abt_files:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise ValueError(
            f"No files matching '{pattern}' found in '{Path(data_dir).resolve()}'"
        )

    dfs = []
    for file_path in abt_files:
        product = file_path.stem.split('_')[-1]
        print(f"Loading {product} data...")
        df = pd.read_csv(file_path, sep=sep)
        df['product_type'] = product
        dfs.append(df)

    return pd.concat(dfs, ignore_index=True)

# Read every ABT_score file into one combined frame
combined_df = load_abt_files()

# Report the total row count and how the rows split across products
print(
    "\nDataset Overview:",
    f"Total number of records: {len(combined_df)}",
    "\nProduct distribution:",
    combined_df['product_type'].value_counts(),
    sep="\n",
)


## Data Preprocessing

In [None]:
# Clean the data and prepare it for analysis.
def preprocess_data(df):
    """Clean and preprocess the combined dataset.

    * Columns whose name contains ``'Date'`` or ``'date'`` are parsed with
      ``pd.to_datetime``.
    * Columns starting with ``Have_``, ``Had_`` or ``Optout_`` are cast to
      integers. A column containing missing values is cast to the nullable
      ``Int64`` dtype (missing entries become ``<NA>``) instead of raising.

    Parameters
    ----------
    df : pd.DataFrame
        Raw combined frame. It is not modified.

    Returns
    -------
    pd.DataFrame
        A cleaned copy of ``df``.
    """
    # Work on a copy so the caller's frame stays untouched and re-running the
    # cell always starts from the same input.
    cleaned = df.copy()

    # Convert date columns to datetime
    date_columns = [col for col in cleaned.columns if 'Date' in col or 'date' in col]
    for col in date_columns:
        cleaned[col] = pd.to_datetime(cleaned[col])

    # Convert binary columns to int
    binary_columns = [
        col for col in cleaned.columns
        if col.startswith(('Have_', 'Had_', 'Optout_'))
    ]
    for col in binary_columns:
        values = cleaned[col]
        if values.isna().any():
            # A plain int dtype cannot represent missing values; the nullable
            # integer dtype keeps them as <NA> rather than crashing.
            cleaned[col] = pd.to_numeric(values).astype('Int64')
        else:
            cleaned[col] = values.astype(int)

    return cleaned

# Swap the raw frame for its cleaned counterpart
combined_df = preprocess_data(df=combined_df)

# Show the first five rows of the cleaned frame
print("Sample of preprocessed data:")
display(combined_df.head(5))


## Customer Journey Analysis

In [None]:
# Analyze the sequence of products purchased by customers.
def analyze_product_sequence(df):
    """Build a long-format timeline of product acquisitions.

    Every ``mFirst_<product>`` column of ``df`` contributes one row per
    non-missing value.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``sCustomerNaturalKey``; ``mFirst_*`` columns are optional.

    Returns
    -------
    pd.DataFrame
        Columns ``sCustomerNaturalKey``, ``product`` (the column name with
        ``mFirst_`` removed) and ``acquisition_date`` (the column's value),
        sorted by ``acquisition_date``. Rows keep their index labels from
        ``df``. The frame is empty, with these three columns, when no
        ``mFirst_*`` column holds any value.
    """
    timeline_columns = ['sCustomerNaturalKey', 'product', 'acquisition_date']
    product_cols = [col for col in df.columns if col.startswith('mFirst_')]

    # Collect the per-product pieces and concatenate once; growing a frame
    # with pd.concat inside the loop re-copies all previous rows each time.
    pieces = []
    for col in product_cols:
        has_value = df[col].notna()
        if has_value.any():
            pieces.append(pd.DataFrame({
                'sCustomerNaturalKey': df.loc[has_value, 'sCustomerNaturalKey'],
                'product': col.replace('mFirst_', ''),
                'acquisition_date': df.loc[has_value, col],
            }))

    if not pieces:
        # Nothing acquired: return a well-formed empty timeline instead of
        # failing on a sort key that does not exist.
        return pd.DataFrame(columns=timeline_columns)

    # A stable sort keeps equal dates in column order, so results that depend
    # on "first row per customer" are reproducible.
    return pd.concat(pieces).sort_values('acquisition_date', kind='stable')

# Build the acquisition timeline for the whole dataset
product_timeline = analyze_product_sequence(combined_df)

# The timeline is sorted by acquisition date, so the first row of each
# customer is that customer's earliest product; show the ten most frequent.
print("Most common first products:")
display(
    product_timeline
    .groupby('sCustomerNaturalKey')['product']
    .first()
    .value_counts()
    .head(10)
)

## Visualizations

In [None]:
# Customer Journey Sankey Diagram
def plot_customer_journey_sankey(df, top_n=10):
    """Create a Sankey diagram of the most common customer journeys.

    A journey is the ordered tuple of products a customer acquired. Nodes are
    ``(step, product)`` pairs and each link carries the number of customers
    who moved from one product to the next at that step. Only journeys with
    at least two products are drawn, because a single product has no
    transition to show.

    Parameters
    ----------
    df : pd.DataFrame
        Frame accepted by ``analyze_product_sequence``.
    top_n : int, default 10
        Number of most frequent multi-product journeys to include.

    Returns
    -------
    None
        The figure is shown with ``fig.show()``. If no customer has more than
        one product, a message is printed and nothing is plotted.
    """
    from collections import Counter  # local import keeps the cell self-contained

    product_sequence = analyze_product_sequence(df)

    # Keep one row per (customer, product): df may hold several rows for the
    # same customer, which would otherwise repeat products inside a journey.
    first_acquisitions = product_sequence.drop_duplicates(
        ['sCustomerNaturalKey', 'product']
    )

    # Tuples are hashable and can be counted; the lists used previously made
    # value_counts() fail with "unhashable type: 'list'".
    products_by_customer = first_acquisitions.groupby('sCustomerNaturalKey')['product']
    journey_counts = Counter(
        journey
        for journey in (tuple(products) for _, products in products_by_customer)
        if len(journey) > 1
    )
    if not journey_counts:
        print("No customer has more than one product - nothing to plot.")
        return

    # Assign node ids in order of first appearance and sum up link weights.
    node_ids = {}
    link_values = Counter()
    for journey, n_customers in journey_counts.most_common(top_n):
        for step, (src_product, dst_product) in enumerate(zip(journey, journey[1:])):
            src = node_ids.setdefault((step, src_product), len(node_ids))
            dst = node_ids.setdefault((step + 1, dst_product), len(node_ids))
            link_values[(src, dst)] += n_customers

    fig = go.Figure(data=[go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=[f"{product} (step {step + 1})" for step, product in node_ids],
            color="blue",
        ),
        link=dict(
            source=[src for src, _ in link_values],
            target=[dst for _, dst in link_values],
            value=list(link_values.values()),
        ),
    )])

    fig.update_layout(title_text="Most Common Customer Journey Paths",
                      font_size=10,
                      height=600)
    fig.show()

# Render the journey Sankey diagram for the full dataset
plot_customer_journey_sankey(df=combined_df)


In [None]:
# Demographic Analysis
def plot_demographic_distribution(df):
    """Plot age and gender distribution for different products.

    Draws a box plot of ``Age`` per ``product_type`` next to a stacked bar
    chart of record counts per ``product_type`` split by ``Woman``, then
    displays the mean age per product.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``product_type``, ``Age`` and ``Woman``.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

    # Age distribution
    sns.boxplot(x='product_type', y='Age', data=df, ax=ax1)
    # tick_params rotates the labels without re-setting them, which avoids
    # matplotlib's FixedFormatter/FixedLocator warning.
    ax1.tick_params(axis='x', rotation=45)
    ax1.set_title('Age Distribution by Product')

    # Gender distribution; combinations that never occur count as 0
    gender_dist = df.groupby(['product_type', 'Woman']).size().unstack(fill_value=0)
    gender_dist.plot(kind='bar', stacked=True, ax=ax2)
    ax2.set_title('Gender Distribution by Product')
    ax2.set_ylabel('Number of records')
    # Label each series from its actual column value instead of by position,
    # so the legend stays correct when only one value is present.
    # NOTE(review): assumes Woman is a 0/1 flag with 1 = female - confirm.
    gender_labels = {0: 'Male', 1: 'Female'}
    ax2.legend([gender_labels.get(value, str(value)) for value in gender_dist.columns])

    plt.tight_layout()
    plt.show()

    # Additional demographic insights
    print("\nMean age by product:")
    display(df.groupby('product_type')['Age'].mean().sort_values(ascending=False))

# Draw the age / gender panels for the full dataset
plot_demographic_distribution(df=combined_df)

In [None]:
# Product Adoption Timeline
def _median_days_first_to_second(timeline_data):
    """Return the median gap in days between each customer's first and second product."""
    timeline = timeline_data.reset_index(drop=True)
    dates = timeline['acquisition_date']

    if pd.api.types.is_numeric_dtype(dates):
        # Parsing plain numbers as timestamps would silently yield 1970 dates.
        raise TypeError(
            "acquisition_date is numeric; convert the mFirst_* columns to "
            "dates before computing day gaps"
        )
    if not pd.api.types.is_datetime64_any_dtype(dates):
        # NOTE(review): assumes the mFirst_* values are date-like strings - confirm.
        timeline['acquisition_date'] = pd.to_datetime(dates, errors='coerce')

    ordered = (
        timeline
        .dropna(subset=['acquisition_date'])
        .sort_values(['sCustomerNaturalKey', 'acquisition_date'])
        # One row per (customer, product): repeated customer rows would
        # otherwise produce artificial zero-day gaps.
        .drop_duplicates(['sCustomerNaturalKey', 'product'])
    )
    by_customer = ordered.groupby('sCustomerNaturalKey')['acquisition_date']
    gaps = by_customer.diff()
    # cumcount() == 1 marks each customer's second acquisition, whose diff is
    # exactly the first-to-second gap.
    is_second = by_customer.cumcount() == 1
    return gaps[is_second].dt.days.median()


def plot_product_adoption_timeline(df):
    """Plot timeline of product adoption.

    Shows a scatter plot of acquisition date per product, then displays the
    median number of days between customers' first and second product.
    Customers with a single product do not contribute to the median; the
    result is NaN when no customer has two products.

    Parameters
    ----------
    df : pd.DataFrame
        Frame accepted by ``analyze_product_sequence``.

    Raises
    ------
    TypeError
        If the acquisition dates are numeric rather than date-like.
    """
    timeline_data = analyze_product_sequence(df)

    fig = px.scatter(timeline_data,
                     x='acquisition_date',
                     y='product',
                     color='product',
                     title='Product Adoption Timeline')

    fig.update_layout(height=600)
    fig.show()

    # Additional timeline insights
    print("\nMedian time between first and second product (days):")
    display(_median_days_first_to_second(timeline_data))

# Draw the adoption timeline for the full dataset
plot_product_adoption_timeline(df=combined_df)


## Optional: Predictive Modeling

In [None]:
# We can use PyTorch to build a model predicting future product adoption.
class CustomerJourneyPredictor(torch.nn.Module):
    """Feed-forward network: input_size -> 128 -> 64 -> 1 with a sigmoid output.

    Both hidden layers use ReLU; dropout of 0.3 and 0.2 follows the first and
    second hidden layer respectively.
    """

    def __init__(self, input_size):
        super().__init__()
        nn = torch.nn
        # Build the stack as a flat list so the positional layer indices
        # (and therefore the state_dict keys) are easy to read off.
        stack = [
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 1),
            nn.Sigmoid(),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the sigmoid outputs."""
        outputs = self.layers(x)
        return outputs

def prepare_features(df):
    """Build the model inputs from ``df``.

    Features are all columns starting with ``Have_``, ``Had_`` or
    ``nbr_active_agr_``, standardised with a freshly fitted
    ``StandardScaler``. The target is the ``myTarget`` column.

    Returns
    -------
    tuple of torch.FloatTensor
        ``(features, target)``, where ``target`` is built from the values of
        ``df['myTarget']``.
    """
    prefixes = ('Have_', 'Had_', 'nbr_active_agr_')
    selected = [name for name in df.columns if name.startswith(prefixes)]

    feature_frame = df[selected]
    target_values = df['myTarget'].values

    # Fit and apply the scaler in one step; the fitted scaler is not kept.
    standardized = StandardScaler().fit_transform(feature_frame)

    features_tensor = torch.FloatTensor(standardized)
    target_tensor = torch.FloatTensor(target_values)
    return features_tensor, target_tensor

In [None]:
# Build the feature/target tensors, then size the network from the feature count
X, y = prepare_features(combined_df)
model = CustomerJourneyPredictor(input_size=X.shape[1])
print("Model architecture:", model, sep="\n")

## Additional Insights

In [None]:
# Product combinations analysis
def analyze_product_combinations(df):
    """Summarise ownership of the ``Have_*`` products and plot their correlations.

    Shows a heatmap of the pairwise correlation between all ``Have_*``
    columns and returns the column-wise sum of those columns.
    """
    ownership = df[[name for name in df.columns if name.startswith('Have_')]]

    # Per-product totals (returned to the caller)
    totals = ownership.sum()

    # Pairwise correlation between the ownership flags
    correlations = ownership.corr()

    # Annotated heatmap centred on zero
    plt.figure(figsize=(12, 8))
    sns.heatmap(correlations, annot=True, cmap='coolwarm', center=0)
    plt.title('Product Combination Correlations')
    plt.xticks(rotation=45)
    plt.yticks(rotation=45)
    plt.tight_layout()
    plt.show()

    return totals

# Plot the correlation heatmap and show the per-product ownership totals
print("Product ownership analysis:")
display(analyze_product_combinations(df=combined_df))