# Solution: The Hidden Segments

This is the answer key for `drill_08_regression_segments.ipynb`.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Seed NumPy's legacy global RNG so every np.random.* draw in this notebook is
# reproducible under Restart Kernel -> Run All.
np.random.seed(42)

In [None]:
# Build a synthetic customer table whose LTV formula depends on a hidden segment label.
n_samples = 1000

# Segment labels are drawn first, then the three features; the per-row noise is
# drawn last, one value per customer, in row order.
segments = np.random.choice(
    ['Standard', 'Premium', 'Enterprise'],
    n_samples,
    p=[0.7, 0.2, 0.1],
)
tenure_months = np.random.uniform(1, 36, n_samples)
monthly_spend = np.random.uniform(20, 200, n_samples)
orders = np.random.poisson(5, n_samples)

# Per-segment recipe: (intercept, tenure term, spend coefficient, noise std-dev).
# Any label that is neither 'Standard' nor 'Premium' uses the Enterprise recipe.
_ENTERPRISE_RECIPE = (1000, lambda t: 50 * t, 10, 200)
_LTV_RECIPES = {
    'Standard': (50, lambda t: 10 * t, 2, 30),
    'Premium': (200, lambda t: 5 * t**1.5, 3, 50),
}

ltv = np.zeros(n_samples)
for row, (label, tenure, spend) in enumerate(zip(segments, tenure_months, monthly_spend)):
    intercept, tenure_term, spend_coef, noise_sd = _LTV_RECIPES.get(label, _ENTERPRISE_RECIPE)
    # Terms are summed left to right: intercept, tenure, spend, then noise.
    ltv[row] = intercept + tenure_term(tenure) + spend_coef * spend + np.random.normal(0, noise_sd)

# Assemble the modelling frame; 'orders' is generated but does not enter the LTV formula.
df = pd.DataFrame({
    'tenure_months': tenure_months,
    'monthly_spend': monthly_spend,
    'orders': orders,
    'segment': segments,
    'ltv': ltv,
})

In [None]:
# Original (buggy) model: a single linear regression fit on all customers pooled
# together; the 'segment' column is not among the features.
y = df['ltv']
X = df[['tenure_months', 'monthly_spend', 'orders']]

# 70/30 hold-out split with a fixed random_state so the same rows land in each part.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

## SOLUTION: Segment-Specific Models

In [None]:
# SOLUTION: Train separate models per segment
print("=== Segment-Specific Models ===")

# A dedicated model is fitted only when a segment has MORE than this many training rows.
MIN_SEGMENT_TRAIN_ROWS = 10

# Start from all-NaN so a test row that never receives a prediction is detectable.
y_pred_fixed = pd.Series(index=X_test.index, dtype=float)

for seg in ['Standard', 'Premium', 'Enterprise']:
    # Boolean masks over the train/test rows of this segment. They carry the same
    # index (and order) as X_train / X_test, so they align with features and targets.
    train_mask = df.loc[X_train.index, 'segment'] == seg
    test_mask = df.loc[X_test.index, 'segment'] == seg

    if not test_mask.any():
        # Nothing to predict or score; scikit-learn rejects zero-row inputs.
        print(f"  {seg}: no test rows, skipped")
        continue

    if train_mask.sum() > MIN_SEGMENT_TRAIN_ROWS:
        seg_model = LinearRegression()
        seg_model.fit(X_train.loc[train_mask], y_train.loc[train_mask])
        seg_pred = seg_model.predict(X_test.loc[test_mask])
        label = seg
    else:
        # Too few rows to fit a dedicated model. Reuse the pooled single-model
        # predictions for these rows instead of leaving NaN behind, which would
        # make the overall MAE computation fail. y_pred is positional over
        # X_test, hence the conversion of the mask to a plain boolean array.
        seg_pred = y_pred[test_mask.to_numpy()]
        label = f"{seg} (single-model fallback)"

    y_pred_fixed.loc[test_mask] = seg_pred

    mae = mean_absolute_error(y_test.loc[test_mask], seg_pred)
    r2 = r2_score(y_test.loc[test_mask], seg_pred)
    print(f"  {label}: MAE=${mae:.2f}, R²={r2:.3f}")

# Every test row must have a prediction before the overall comparison is meaningful.
assert y_pred_fixed.notna().all(), "Some test rows received no prediction"

print(f"\nOverall (segment models): MAE=${mean_absolute_error(y_test, y_pred_fixed):.2f}")
print(f"Overall (single model):   MAE=${mean_absolute_error(y_test, y_pred):.2f}")

In [None]:
# Verify improvement: compare the test-set MAE of the pooled single model with
# that of the combined segment-specific predictions.
mae_fixed = mean_absolute_error(y_test, y_pred_fixed)
mae_original = mean_absolute_error(y_test, y_pred)

# Fail loudly if the per-segment approach did not actually reduce the error.
assert mae_fixed < mae_original, f"Segment models ({mae_fixed:.1f}) should beat single model ({mae_original:.1f})"

# Relative MAE reduction, as a percentage of the single-model error.
improvement_pct = (mae_original - mae_fixed) / mae_original * 100

print("✓ Segment-specific models improved overall MAE!")
for label, value in (("Original", mae_original), ("Fixed", mae_fixed)):
    print(f"  {label}: ${value:.2f}")
print(f"  Improvement: {improvement_pct:.1f}%")

## Sample Postmortem

### What happened:
- Marketing complained that predictions for Premium and Enterprise customers were inaccurate, despite a good overall R².

### Root cause:
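- Different customer segments have fundamentally different LTV patterns. Standard customers' LTV is linear in tenure and spend; Premium LTV grows super-linearly with tenure (a `tenure_months**1.5` term, so not quadratic and not a straight line); Enterprise LTV is linear but has a much larger baseline, steeper coefficients, and noisier values. A single linear model fit to all three segments can't capture these differences.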

### How to prevent:
- Always analyze performance by relevant business segments before deployment.
- When segments have different patterns, consider segment-specific models or features that capture segment interactions.