# Solution: Debug Drill 04 - The Expensive Threshold

This is the solution notebook for the threshold cost optimization drill.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

# Seed NumPy's global random number generator so that any code relying on
# it produces the same values on every run of this notebook.
np.random.seed(42)

In [None]:
# Load the customer table, derive any missing features, then fit the churn model.
DATA_URL = 'https://raw.githubusercontent.com/189investmentai/ml-foundations-interactive/main/shared/data/'
customers_csv_url = DATA_URL + 'streamcart_customers.csv'
customers = pd.read_csv(customers_csv_url)

# Engineered columns are only computed when the CSV does not already ship them,
# so re-running this cell never overwrites an existing column.
if 'tenure_days' not in customers.columns:
    reference_date = pd.to_datetime('2024-01-01')
    signup_dates = pd.to_datetime(customers['signup_date'])
    customers['tenure_days'] = (reference_date - signup_dates).dt.days
if 'avg_order_value' not in customers.columns:
    # Swap zero order counts for 1 so the division below never divides by zero.
    nonzero_order_counts = customers['orders_total'].replace(0, 1)
    customers['avg_order_value'] = customers['total_spend'] / nonzero_order_counts

# Keep only the candidate features that are actually present in the table.
feature_cols = ['tenure_days', 'orders_total', 'total_spend', 'support_tickets_total', 'avg_order_value']
available_features = [col for col in feature_cols if col in customers.columns]

# Missing feature values are filled with 0 before modelling.
X = customers[available_features].fillna(0)
y = customers['churn_30d']

# 70/30 split, stratified on the target so both parts keep the same churn rate.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Fit the classifier and keep the predicted probability of the positive class.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
probabilities = model.predict_proba(X_test)[:, 1]

In [None]:
# Business costs: dollars charged per false positive and per false negative.
FP_COST = 50
FN_COST = 200

# Original (buggy) threshold: the default 0.5 cut-off on predicted probability.
THRESHOLD = 0.5
meets_threshold = probabilities >= THRESHOLD
predictions = meets_threshold.astype(int)

# Unpack the 2x2 confusion matrix in scikit-learn's row-major order.
cm = confusion_matrix(y_test, predictions)
tn, fp, fn, tp = cm.ravel()

# Total cost of the baseline: every error weighted by its business cost.
total_cost = FP_COST * fp + FN_COST * fn

print(f"Original cost at threshold 0.5: ${total_cost:,}")

In [None]:
# SOLUTION: Find optimal threshold

def calculate_cost(y_true, y_pred, fp_cost, fn_cost):
    """Return the total business cost of a set of binary predictions.

    Args:
        y_true: Array-like of actual labels, encoded as 0 (negative) / 1 (positive).
        y_pred: Array-like of predicted labels, same length and encoding as y_true.
        fp_cost: Cost charged for each false positive.
        fn_cost: Cost charged for each false negative.

    Returns:
        fp * fp_cost + fn * fn_cost, where fp and fn are the false positive
        and false negative counts.

    Raises:
        ValueError: If y_true and y_pred do not have the same shape.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {actual.shape} and {predicted.shape}"
        )

    # Count the two error types directly instead of unpacking a confusion
    # matrix: when only one class is present in both arrays the matrix has a
    # single cell, so a four-way unpack of its flattened form would fail.
    fp = np.count_nonzero((predicted == 1) & (actual == 0))
    fn = np.count_nonzero((predicted == 0) & (actual == 1))
    return fp * fp_cost + fn * fn_cost

thresholds = np.arange(0.05, 0.95, 0.05)


def _score_threshold(cutoff):
    """Build one results row (error counts, cost, precision, recall) for a cut-off."""
    labels_at_cutoff = (probabilities >= cutoff).astype(int)
    _, false_pos, false_neg, _ = confusion_matrix(y_test, labels_at_cutoff).ravel()
    return {
        'threshold': cutoff,
        'fp': false_pos,
        'fn': false_neg,
        'cost': calculate_cost(y_test, labels_at_cutoff, FP_COST, FN_COST),
        # zero_division=0 reports precision as 0 when nothing is predicted positive.
        'precision': precision_score(y_test, labels_at_cutoff, zero_division=0),
        'recall': recall_score(y_test, labels_at_cutoff),
    }


# One row per candidate threshold, in ascending threshold order.
results = [_score_threshold(cutoff) for cutoff in thresholds]

results_df = pd.DataFrame(results)
print(results_df.to_string(index=False))

In [None]:
# Pick the row of the sweep table with the lowest total cost.
optimal_idx = results_df['cost'].idxmin()
optimal_row = results_df.loc[optimal_idx]

# Report the winning threshold together with its cost, precision and recall.
print(
    f"\n=== OPTIMAL THRESHOLD: {optimal_row['threshold']:.2f} ===",
    f"  Cost: ${optimal_row['cost']:,.0f}",
    f"  Precision: {optimal_row['precision']:.1%}",
    f"  Recall: {optimal_row['recall']:.1%}",
    sep="\n",
)

In [None]:
# Compare the baseline (threshold 0.5) cost against the cheapest threshold found.
original_cost, optimal_cost = total_cost, optimal_row['cost']
savings = original_cost - optimal_cost

# Print absolute savings and the reduction relative to the baseline cost.
print(
    f"\n=== SAVINGS ===",
    f"  Original (threshold=0.5): ${original_cost:,.0f}",
    f"  Optimal (threshold={optimal_row['threshold']:.2f}): ${optimal_cost:,.0f}",
    f"  SAVINGS: ${savings:,.0f} ({savings/original_cost*100:.0f}% reduction!)",
    sep="\n",
)

In [None]:
# Visualization: cost curve on the left, precision/recall curves on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

# Left panel: total cost per threshold, with the baseline and optimal thresholds marked.
ax1.plot(results_df['threshold'], results_df['cost'], 'purple', linewidth=2)
ax1.axvline(x=0.5, color='red', linestyle='--', label='Original (0.5)', alpha=0.7)
ax1.axvline(
    x=optimal_row['threshold'],
    color='green',
    linestyle='--',
    label=f'Optimal ({optimal_row["threshold"]:.2f})',
    alpha=0.7,
)
# Highlight the two operating points being compared (baseline first, then optimal).
for marker_x, marker_y, marker_color in (
    (0.5, original_cost, 'red'),
    (optimal_row['threshold'], optimal_cost, 'green'),
):
    ax1.scatter([marker_x], [marker_y], color=marker_color, s=100, zorder=5)
ax1.set(xlabel='Threshold', ylabel='Total Cost ($)', title='Business Cost by Threshold')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Right panel: how precision and recall trade off as the threshold moves.
for metric_column, line_format, legend_label in (
    ('precision', 'b-', 'Precision'),
    ('recall', 'r-', 'Recall'),
):
    ax2.plot(results_df['threshold'], results_df[metric_column], line_format, label=legend_label, linewidth=2)
ax2.axvline(x=0.5, color='gray', linestyle='--', alpha=0.5)
ax2.axvline(x=optimal_row['threshold'], color='green', linestyle='--', alpha=0.7)
ax2.set(xlabel='Threshold', ylabel='Score', title='Precision-Recall Tradeoff')
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Sample Postmortem

### What happened:
- The churn model was deployed with the default threshold of 0.5
- This resulted in high retention campaign costs because we missed many churners (high FN)

### Root cause:
- The default threshold (0.5) doesn't account for asymmetric costs
- Missing a churner ($200) costs 4x more than a wasted offer ($50)
- Optimal threshold is ~0.20, not 0.50

### How to prevent:
- Always calculate optimal threshold based on business costs before deployment
- Use the formula: optimal_threshold ≈ FP_cost / (FP_cost + FN_cost) (here: 50 / (50 + 200) = 0.20)
- Never assume 0.5 is the right threshold