# Survey to Operational Metrics - Demo with Synthetic Data

This notebook generates synthetic customer survey and operational data to demonstrate correlation analysis techniques.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# Set random seed for reproducibility: every generator in this notebook draws
# from NumPy's global random state, so the seed must be set before any of them run
np.random.seed(42)

# Display settings
pd.set_option('display.max_columns', None)  # None = do not truncate wide frames
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

## 1. Configuration Parameters

Define configurable parameters for all synthetic data generation.

In [None]:
# Survey configuration: date window, monthly response volume, customer pool size,
# and the per-question score distributions consumed by generate_survey_data
SURVEY_CONFIG = {
    'start_date': '2020-01-01',
    'end_date': '2023-12-31',
    'avg_responses_per_month': 2000,
    'response_std': 200,  # Standard deviation in monthly responses
    'total_customer_pool': 50000,  # Total unique customers
    # Each metric is drawn from normal(mean, std), clipped to [min, max] and
    # rounded to an integer.
    # NOTE(review): overall_satisfaction is on a 1-10 scale while every other
    # metric is on 1-15 — confirm the mixed scales are intended.
    'metrics': {
        'overall_satisfaction': {'min': 1, 'max': 10, 'mean': 7.0, 'std': 2.0},
        'product_quality': {'min': 1, 'max': 15, 'mean': 10.5, 'std': 3.0},
        'customer_service': {'min': 1, 'max': 15, 'mean': 11.0, 'std': 2.8},
        'ease_of_use': {'min': 1, 'max': 15, 'mean': 10.0, 'std': 3.2},
        'value_for_money': {'min': 1, 'max': 15, 'mean': 9.5, 'std': 3.5},
        'likelihood_to_recommend': {'min': 1, 'max': 15, 'mean': 10.2, 'std': 3.3}
    }
}

# Operational metrics configuration: one sub-config per operational metric
OPS_CONFIG = {
    # Bucketed distribution used by generate_num_calls
    'num_calls': {
        'pct_no_calls': 0.65,  # 65% of customers made no calls
        'pct_1_call': 0.20,     # 20% made 1 call
        'pct_2_5_calls': 0.12,  # 12% made 2-5 calls
        # NOTE(review): generate_num_calls never reads pct_6_plus_calls; the 6+
        # bucket is whatever remains after the first three buckets are filled.
        'pct_6_plus_calls': 0.03,  # 3% made 6+ calls
        'max_calls_heavy_users': 25  # Max calls for heavy users
    },
    # The three entries below are consumed by generate_continuous_metric:
    # normal(mean, std) clipped to [min, max]
    'login_frequency': {
        'mean': 15.0,  # Average logins per month
        'std': 10.0,
        'min': 0,
        'max': 90  # ~3 per day max
    },
    'feature_usage_count': {
        'mean': 25.0,  # Average feature interactions per month
        'std': 15.0,
        'min': 0,
        'max': 200
    },
    'session_duration_minutes': {
        'mean': 120.0,  # Total minutes spent per month
        'std': 80.0,
        'min': 0,
        'max': 600  # 10 hours max
    }
}

print("Configuration loaded successfully!")

## 2. Generate Survey Response Data

In [None]:
def generate_survey_data(config):
    """
    Generate synthetic survey response data, one batch of responses per month.

    For every month start between ``config['start_date']`` and
    ``config['end_date']`` the function draws a response count from a normal
    distribution, samples that many distinct customer IDs, gives each response
    a random day inside the month, and draws one integer score per metric.

    Args:
        config: Mapping with the keys ``start_date``, ``end_date``,
            ``avg_responses_per_month``, ``response_std``,
            ``total_customer_pool`` and ``metrics`` (metric name ->
            ``{'min', 'max', 'mean', 'std'}``).

    Returns:
        DataFrame with customer_id, survey_date, and response metrics,
        sorted by survey_date.
    """
    start = pd.to_datetime(config['start_date'])
    end = pd.to_datetime(config['end_date'])
    pool_size = config['total_customer_pool']

    # Candidate customer IDs 1..pool_size, built once instead of once per month
    customer_pool = np.arange(1, pool_size + 1)

    # Generate date range (monthly, month-start timestamps)
    months = pd.date_range(start=start, end=end, freq='MS')

    all_responses = []

    for month in months:
        # Determine number of responses this month
        n_responses = int(np.random.normal(
            config['avg_responses_per_month'],
            config['response_std']
        ))
        n_responses = max(100, n_responses)  # At least 100 responses
        # A customer responds at most once per month (sampling without
        # replacement), so the count can never exceed the pool; without this
        # cap np.random.choice raises ValueError for small pools.
        n_responses = min(n_responses, pool_size)

        # Randomly sample distinct customer IDs
        customer_ids = np.random.choice(
            customer_pool,
            size=n_responses,
            replace=False
        )

        # Generate random dates within the month (day offset 0..days_in_month-1)
        days_in_month = month.days_in_month
        survey_dates = [month + timedelta(days=np.random.randint(0, days_in_month))
                        for _ in range(n_responses)]

        month_data = {'customer_id': customer_ids, 'survey_date': survey_dates}

        # Generate responses for each metric
        for metric_name, metric_config in config['metrics'].items():
            responses = np.random.normal(
                metric_config['mean'],
                metric_config['std'],
                n_responses
            )
            # Clip to valid range and round to whole scores
            responses = np.clip(responses, metric_config['min'], metric_config['max'])
            responses = np.round(responses).astype(int)
            month_data[metric_name] = responses

        all_responses.append(pd.DataFrame(month_data))

    df = pd.concat(all_responses, ignore_index=True)
    df = df.sort_values('survey_date').reset_index(drop=True)

    return df

# Generate the data (draws from the global NumPy random state, so the result
# depends on the seed and on how many random draws have happened before this cell)
survey_df = generate_survey_data(SURVEY_CONFIG)

# Quick sanity summary: row count and the covered date window
print(f"Generated {len(survey_df):,} survey responses")
print(f"Date range: {survey_df['survey_date'].min()} to {survey_df['survey_date'].max()}")
print(f"\nFirst few rows:")
survey_df.head(10)

## 3. Generate Operational Metrics Data

In [None]:
def generate_num_calls(customer_ids, config):
    """
    Draw a call count for each customer from a four-bucket distribution.

    Buckets are laid out in order (0 calls, 1 call, 2-5 calls, 6+ calls) and
    the finished array is shuffled. The first three bucket sizes are the
    configured fractions of the customer count, truncated to whole customers;
    the 6+ bucket receives every remaining slot, so ``pct_6_plus_calls`` is
    not read here.

    Args:
        customer_ids: Sized collection of customers; only its length is used.
        config: Mapping with ``pct_no_calls``, ``pct_1_call``,
            ``pct_2_5_calls`` and ``max_calls_heavy_users``.

    Returns:
        Integer array with one call count per customer.
    """
    total = len(customer_ids)

    zero_count = int(total * config['pct_no_calls'])
    single_count = int(total * config['pct_1_call'])
    mid_count = int(total * config['pct_2_5_calls'])

    # Slice boundaries of the consecutive buckets
    single_start = zero_count
    mid_start = single_start + single_count
    heavy_start = mid_start + mid_count
    heavy_count = total - heavy_start

    # The leading zero_count entries already hold 0 from the allocation
    calls = np.zeros(total, dtype=int)
    calls[single_start:mid_start] = 1
    calls[mid_start:heavy_start] = np.random.randint(2, 6, mid_count)
    calls[heavy_start:] = np.random.randint(
        6, config['max_calls_heavy_users'] + 1, heavy_count
    )

    # Remove the bucket ordering so call counts are unrelated to row position
    np.random.shuffle(calls)
    return calls


def generate_continuous_metric(customer_ids, config):
    """
    Draw one normally distributed value per customer, bounded and rounded.

    Values are sampled with the configured mean and std, limited to the
    configured [min, max] range, and rounded to one decimal place.

    Args:
        customer_ids: Sized collection of customers; only its length is used.
        config: Mapping with ``mean``, ``std``, ``min`` and ``max``.

    Returns:
        Float array with one value per customer.
    """
    samples = np.random.normal(
        loc=config['mean'],
        scale=config['std'],
        size=len(customer_ids),
    )
    bounded = samples.clip(config['min'], config['max'])
    return bounded.round(1)


def generate_operational_data_with_correlations(survey_df, ops_config):
    """
    Build operational metrics for every survey row and derive survey scores
    that depend on them.

    The operational metrics are sampled without looking at the survey scores.
    Three survey columns are then recomputed from their current values:
    - customer_service: lowered by 0.3 per call, plus noise (std 1.5)
    - overall_satisfaction: raised by logins/50 + features/100 + duration/300,
      plus noise (std 1.2)
    - ease_of_use: raised by features/80, plus noise (std 2)
    Each result is clipped to its scale (1-15, or 1-10 for overall
    satisfaction) and rounded to an integer. The strength of the resulting
    correlations is not measured in this function.

    Args:
        survey_df: DataFrame with customer_id, survey_date, customer_service,
            overall_satisfaction and ease_of_use columns.
        ops_config: Mapping with num_calls, login_frequency,
            feature_usage_count and session_duration_minutes sub-configs.

    Returns:
        Tuple of (DataFrame with customer_id, survey_date, and operational
        metrics; dict mapping survey column name -> adjusted integer scores).
    """
    def _as_score(raw, low, high):
        # Bring a float series back onto an integer survey scale
        return np.clip(raw, low, high).round().astype(int)

    row_count = len(survey_df)
    ids = survey_df['customer_id'].values
    dates = survey_df['survey_date'].values

    # Operational metrics come first; the draw order below fixes the random
    # stream, so it must not be rearranged
    num_calls = generate_num_calls(ids, ops_config['num_calls'])
    logins = generate_continuous_metric(ids, ops_config['login_frequency'])
    features = generate_continuous_metric(ids, ops_config['feature_usage_count'])
    duration = generate_continuous_metric(ids, ops_config['session_duration_minutes'])

    # More calls pull the customer-service score down
    service_raw = (
        survey_df['customer_service'].values
        - (num_calls * 0.3)
        + np.random.normal(0, 1.5, row_count)
    )
    service_scores = _as_score(service_raw, 1, 15)

    # Heavier product usage pushes overall satisfaction up
    usage_boost = (logins / 50 + features / 100 + duration / 300)
    satisfaction_raw = (
        survey_df['overall_satisfaction'].values
        + usage_boost
        + np.random.normal(0, 1.2, row_count)
    )
    satisfaction_scores = _as_score(satisfaction_raw, 1, 10)

    # Feature usage nudges the ease-of-use score up
    ease_raw = (
        survey_df['ease_of_use'].values
        + (features / 80)
        + np.random.normal(0, 2, row_count)
    )
    ease_scores = _as_score(ease_raw, 1, 15)

    ops_frame = pd.DataFrame({
        'customer_id': ids,
        'survey_date': dates,
        'num_calls': num_calls,
        'login_frequency': logins,
        'feature_usage_count': features,
        'session_duration_minutes': duration
    })

    # Replacement columns for the caller to write back into the survey frame
    adjusted_scores = {
        'overall_satisfaction': satisfaction_scores,
        'customer_service': service_scores,
        'ease_of_use': ease_scores
    }

    return ops_frame, adjusted_scores


# Generate operational data with correlations
ops_df, survey_updates = generate_operational_data_with_correlations(survey_df, OPS_CONFIG)

# Update survey responses with correlated versions.
# NOTE: this overwrites three survey_df columns in place, and the generator
# above reads those same columns as its starting point, so re-running this cell
# without regenerating survey_df stacks another round of adjustments on top.
for col, values in survey_updates.items():
    survey_df[col] = values

print(f"Generated operational metrics for {len(ops_df):,} customer-survey pairs")
print(f"Added realistic correlations:")
print(f"  - num_calls negatively affects customer_service")
print(f"  - Higher usage (logins, features, duration) positively affects overall_satisfaction")
print(f"  - feature_usage_count positively affects ease_of_use")
print(f"\nOperational metrics summary:")
ops_df.describe()

## 4. Combine Survey and Operational Data

In [None]:
# Merge survey responses with operational metrics.
# ops_df is built row-for-row from survey_df, so each (customer_id, survey_date)
# key is expected to appear exactly once on both sides. validate makes pandas
# raise MergeError instead of silently multiplying rows if that stops holding.
combined_df = survey_df.merge(
    ops_df,
    on=['customer_id', 'survey_date'],
    how='inner',
    validate='one_to_one'
)

# An inner join over identical key sets must keep every survey response
assert len(combined_df) == len(survey_df), "merge dropped or duplicated survey rows"

print(f"Combined dataset shape: {combined_df.shape}")
print(f"\nColumn names:")
print(combined_df.columns.tolist())
print(f"\nFirst few rows:")
combined_df.head()

## 5. Data Exploration

### 5.1 Survey Metrics Distribution

In [None]:
# Survey metrics distribution: one histogram panel per survey question
survey_metrics = ['overall_satisfaction', 'product_quality', 'customer_service',
                  'ease_of_use', 'value_for_money', 'likelihood_to_recommend']

fig, axes = plt.subplots(2, 3, figsize=(15, 8))
axes = axes.flatten()

# Pair each panel with its metric instead of indexing by position
for ax, metric in zip(axes, survey_metrics):
    ax.hist(combined_df[metric], bins=20, edgecolor='black', alpha=0.7)
    ax.set(
        title=metric.replace('_', ' ').title(),
        xlabel='Score',
        ylabel='Frequency',
    )

# Lay out the panels first, then place the overall title just above them
fig.tight_layout()
fig.suptitle('Survey Metrics Distributions', y=1.02, fontsize=16)
plt.show()

### 5.2 Correlation Analysis (Individual Level)

# Define metrics for correlation analysis: all six survey questions followed by
# the four operational metrics (this order is the row/column order of the heatmap)
survey_metrics = ['overall_satisfaction', 'product_quality', 'customer_service', 
                 'ease_of_use', 'value_for_money', 'likelihood_to_recommend']
ops_metrics = ['num_calls', 'login_frequency', 'feature_usage_count', 'session_duration_minutes']
metrics_for_corr = survey_metrics + ops_metrics

# Correlation heatmap (individual level): DataFrame.corr() over one row per
# survey response, i.e. before any monthly averaging
corr_matrix = combined_df[metrics_for_corr].corr()

plt.figure(figsize=(14, 10))
# center=0 puts the midpoint of the diverging colormap at zero correlation
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', 
            center=0, square=True, linewidths=1)
plt.title('Correlation Matrix: Survey and Operational Metrics (Individual Level)', fontsize=14, pad=20)
plt.tight_layout()
plt.show()

In [None]:
# Monthly aggregation: average every metric within each calendar month, then
# correlate the monthly means.
# The metric lists are defined here so that this cell runs on a fresh kernel
# without relying on another cell having populated metrics_for_corr first.
survey_metrics = ['overall_satisfaction', 'product_quality', 'customer_service',
                  'ease_of_use', 'value_for_money', 'likelihood_to_recommend']
ops_metrics = ['num_calls', 'login_frequency', 'feature_usage_count', 'session_duration_minutes']
metrics_for_corr = survey_metrics + ops_metrics

# Calculate monthly aggregates (adds a year_month Period column to combined_df)
combined_df['year_month'] = combined_df['survey_date'].dt.to_period('M')

monthly_agg = combined_df.groupby('year_month')[metrics_for_corr].mean().reset_index()
# Periods are converted to plain strings for display
monthly_agg['year_month'] = monthly_agg['year_month'].astype(str)

print(f"Aggregated to {len(monthly_agg)} months")
print(f"\nMonthly aggregated data (first 5 months):")
print(monthly_agg.head())

# Calculate correlation on monthly aggregates (one observation per month)
monthly_corr_matrix = monthly_agg[metrics_for_corr].corr()

plt.figure(figsize=(14, 10))
sns.heatmap(monthly_corr_matrix, annot=True, fmt='.2f', cmap='coolwarm',
            center=0, square=True, linewidths=1)
plt.title('Correlation Matrix: Monthly Aggregated Metrics', fontsize=14, pad=20)
plt.tight_layout()
plt.show()

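### 5.4 Operational Metrics Distribution

Plot the distribution of each operational metric, followed by the individual-level correlation matrix. The monthly aggregation (monthly means and the correlations between them) is computed in the cell above.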

In [None]:
# Operational metrics distribution: one histogram panel per operational metric
ops_metrics = ['num_calls', 'login_frequency', 'feature_usage_count', 'session_duration_minutes']

fig, axes = plt.subplots(2, 2, figsize=(12, 8))
axes = axes.flatten()

# Pair each panel with its metric instead of indexing by position
for ax, metric in zip(axes, ops_metrics):
    ax.hist(combined_df[metric], bins=30, edgecolor='black', alpha=0.7)
    ax.set(
        title=metric.replace('_', ' ').title(),
        xlabel='Value',
        ylabel='Frequency',
    )

# Lay out the panels first, then place the overall title just above them
fig.tight_layout()
fig.suptitle('Operational Metrics Distributions', y=1.02, fontsize=16)
plt.show()

In [None]:
# Correlation heatmap over all survey and operational metrics, one row per response
metrics_for_corr = survey_metrics + ops_metrics
corr_matrix = combined_df[metrics_for_corr].corr()

# Explicit figure/axes pair so the heatmap and its title target the same axes
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    ax=ax,
)
ax.set_title('Correlation Matrix: Survey and Operational Metrics', fontsize=14, pad=20)
fig.tight_layout()
plt.show()

## 6. Export Data

In [None]:
# Save to CSV for further analysis. The file goes to the current working
# directory and contains every column currently on combined_df (including
# year_month if the monthly aggregation cell has run), without the index.
combined_df.to_csv('synthetic_survey_ops_data.csv', index=False)
print("Data exported to 'synthetic_survey_ops_data.csv'")
# Closing summary of what was written
print(f"\nDataset summary:")
print(f"Total records: {len(combined_df):,}")
print(f"Date range: {combined_df['survey_date'].min()} to {combined_df['survey_date'].max()}")
print(f"Unique customers: {combined_df['customer_id'].nunique():,}")