In [1]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# Load the raw Telco churn export and clean it in one chained expression:
# coerce TotalCharges to numeric (unparseable entries become NaN), drop the
# rows where that coercion failed, remove the identifier column, and encode
# the target as 1 (Yes) / 0 (No).
raw_df = pd.read_csv("../data/WA_Fn-UseC_-Telco-Customer-Churn.csv")
df = (
    raw_df
    .assign(TotalCharges=lambda frame: pd.to_numeric(frame['TotalCharges'], errors='coerce'))
    .dropna(subset=['TotalCharges'])
    .drop(columns='customerID')
    .assign(Churn=lambda frame: frame['Churn'].map({'Yes': 1, 'No': 0}))
)

# Separate the binary target from the feature columns.
y = df['Churn']
X = df.drop(columns='Churn')

# Three named numeric columns are standardized; every object-dtype column is
# one-hot encoded.
num_columns = ['tenure', 'MonthlyCharges', 'TotalCharges']
cat_columns = list(X.select_dtypes(include='object').columns)

# Columns that appear in neither list are passed through untouched.
numeric_step = ('num', StandardScaler(), num_columns)
categorical_step = ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), cat_columns)
preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    remainder='passthrough',
)

# 80/20 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the preprocessing on the training rows only, then apply it to the test rows.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print(f"Shape: {X_train_processed.shape}")
print(f"Churn rate: {y_train.mean():.3f}")

Shape: (5625, 30)
Churn rate: 0.266


## Technique 1: Class Weights

In [3]:
# Fit a logistic regression with class_weight='balanced' on the processed
# training data; fit() returns the estimator, so the chain leaves lr_weighted
# holding the fitted model.
lr_weighted = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
).fit(X_train_processed, y_train)

# Positive-class probabilities (column 1) and hard labels for the test set.
lr_prob = lr_weighted.predict_proba(X_test_processed)[:, 1]
lr_pred = lr_weighted.predict(X_test_processed)

# Compute every metric first, then report them together.
lr_accuracy = accuracy_score(y_test, lr_pred)
lr_precision = precision_score(y_test, lr_pred)
lr_recall = recall_score(y_test, lr_pred)
lr_f1 = f1_score(y_test, lr_pred)
lr_auc = roc_auc_score(y_test, lr_prob)

print("=== Logistic Regression (Class Weights) ===")
print(f"Accuracy:  {lr_accuracy:.4f}")
print(f"Precision: {lr_precision:.4f}")
print(f"Recall:    {lr_recall:.4f}")
print(f"F1 Score:  {lr_f1:.4f}")
print(f"AUC-ROC:   {lr_auc:.4f}")

=== Logistic Regression (Class Weights) ===
Accuracy:  0.7257
Precision: 0.4901
Recall:    0.7941
F1 Score:  0.6061
AUC-ROC:   0.8353


In [4]:
# Confusion matrix for the model's default predictions. Row index is the true
# class and column index the predicted class, so unpacking row by row yields
# (TN, FP) then (FN, TP).
cm = confusion_matrix(y_test, lr_pred)
(true_neg, false_pos), (false_neg, true_pos) = cm

print("Confusion Matrix:")
print(f"TN: {true_neg}  FP: {false_pos}")
print(f"FN: {false_neg}  TP: {true_pos}")

Confusion Matrix:
TN: 724  FP: 309
FN: 77  TP: 297


## Technique 2: Threshold Tuning

In [5]:
# Positive-class (churn) probabilities from the class-weighted model.
lr_prob = lr_weighted.predict_proba(X_test_processed)[:, 1]

# Walk the decision threshold downward and report how recall and precision
# respond at each cutoff.
for threshold in (0.5, 0.4, 0.3, 0.2):
    flagged = lr_prob >= threshold
    pred = flagged.astype(int)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    print(f"Threshold {threshold}: Recall={recall:.3f}, Precision={precision:.3f}")

Threshold 0.5: Recall=0.794, Precision=0.490
Threshold 0.4: Recall=0.872, Precision=0.452
Threshold 0.3: Recall=0.930, Precision=0.415
Threshold 0.2: Recall=0.960, Precision=0.386


In [6]:
# Assumed per-error business costs in USD. They are named once here so the
# arithmetic and the printed labels below can never disagree.
COST_PER_MISSED_CHURNER = 500  # false negative: lost customer
COST_PER_FALSE_ALARM = 50      # false positive: wasted retention offer

threshold = 0.3
pred = (lr_prob >= threshold).astype(int)

# labels=[0, 1] pins the matrix to 2x2 in (0, 1) order, so the [1, 0] / [0, 1]
# lookups stay valid even if one class is absent from both y_test and pred.
cm = confusion_matrix(y_test, pred, labels=[0, 1])
fn = cm[1, 0]  # Missed churners
fp = cm[0, 1]  # False alarms

cost_fn = fn * COST_PER_MISSED_CHURNER
cost_fp = fp * COST_PER_FALSE_ALARM

print(f"False Negatives: {fn} × ${COST_PER_MISSED_CHURNER} = ${cost_fn:,}")
print(f"False Positives: {fp} × ${COST_PER_FALSE_ALARM} = ${cost_fp:,}")
print(f"Total Cost: ${cost_fn + cost_fp:,}")

False Negatives: 26 × $500 = $13,000
False Positives: 491 × $50 = $24,550
Total Cost: $37,550


In [7]:
# Assumed per-error business costs in USD (false negative = lost customer,
# false positive = wasted retention offer).
COST_PER_MISSED_CHURNER = 500
COST_PER_FALSE_ALARM = 50

# NOTE(review): the threshold is selected on the same test set that is used to
# report the final metrics, so the minimum cost printed here is optimistic.
# Prefer choosing it on a validation split or on cross-validated predictions.

# Build the grid from integers and divide once: a float step in np.arange
# accumulates rounding error (e.g. 0.30000000000000004), which can disagree
# with the literal thresholds such as 0.3 used in neighbouring cells.
candidate_thresholds = np.arange(10, 70, 5) / 100  # 0.10, 0.15, ..., 0.65

best_cost = float('inf')
best_threshold = None  # None (not 0) so "nothing evaluated" is unmistakable

for t in candidate_thresholds:
    pred = (lr_prob >= t).astype(int)
    # labels=[0, 1] guarantees a 2x2 matrix in (0, 1) order at every threshold.
    cm = confusion_matrix(y_test, pred, labels=[0, 1])
    fn = cm[1, 0]
    fp = cm[0, 1]
    cost = fn * COST_PER_MISSED_CHURNER + fp * COST_PER_FALSE_ALARM

    # Strict '<' keeps the lowest threshold when two thresholds tie on cost.
    if cost < best_cost:
        best_cost = cost
        best_threshold = t

    print(f"Threshold {t:.2f}: FN={fn}, FP={fp}, Cost=${cost:,}")

print(f"\nOptimal Threshold: {best_threshold:.2f}")
print(f"Minimum Cost: ${best_cost:,}")

Threshold 0.10: FN=6, FP=700, Cost=$38,000
Threshold 0.15: FN=10, FP=626, Cost=$36,300
Threshold 0.20: FN=15, FP=571, Cost=$36,050
Threshold 0.25: FN=24, FP=529, Cost=$38,450
Threshold 0.30: FN=26, FP=491, Cost=$37,550
Threshold 0.35: FN=36, FP=448, Cost=$40,400
Threshold 0.40: FN=48, FP=395, Cost=$43,750
Threshold 0.45: FN=63, FP=349, Cost=$48,950
Threshold 0.50: FN=77, FP=309, Cost=$53,950
Threshold 0.55: FN=89, FP=268, Cost=$57,900
Threshold 0.60: FN=105, FP=226, Cost=$63,800
Threshold 0.65: FN=120, FP=188, Cost=$69,400

Optimal Threshold: 0.20
Minimum Cost: $36,050


In [8]:
# Use the cost-optimal threshold found by the search cell rather than a
# hand-copied literal, so this report (and its header) cannot go stale if the
# data, the model, or the cost assumptions change.
threshold = best_threshold
pred = (lr_prob >= threshold).astype(int)

print(f"=== Final Model (Class Weights + Threshold {threshold:.2f}) ===")
print(f"Accuracy:  {accuracy_score(y_test, pred):.4f}")
print(f"Precision: {precision_score(y_test, pred):.4f}")
print(f"Recall:    {recall_score(y_test, pred):.4f}")
print(f"F1 Score:  {f1_score(y_test, pred):.4f}")
# AUC is computed from the probabilities, so it does not depend on the threshold.
print(f"AUC-ROC:   {roc_auc_score(y_test, lr_prob):.4f}")

=== Final Model (Class Weights + Threshold 0.20) ===
Accuracy:  0.5835
Precision: 0.3860
Recall:    0.9599
F1 Score:  0.5506
AUC-ROC:   0.8353


## Class Imbalance Results

### Baseline vs Optimized

| Metric | Baseline | Optimized |
|--------|----------|-----------|
| Recall | 57.2% | 96.0% |
| Missed Churners | 160 | 15 |
| Business Cost | $85,750 | $36,050 |

### Techniques Used
1. **Class Weights**: `class_weight='balanced'` weights each class inversely to its frequency, so errors on the minority (churn) class are penalized more heavily
2. **Threshold Tuning**: Lowered from 0.50 to 0.20
3. **Cost-Sensitive Optimization**: FN=$500, FP=$50

### Key Insight
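Accuracy dropped from 80% to 58%, but business cost dropped by $49,700 (from $85,750 to $36,050).
Optimizing for the right metric matters more than chasing accuracy.

**Caveat:** the 0.20 threshold was selected on the same test set used to report these numbers, so the optimized recall and cost are optimistic. Confirm them on a held-out validation split before relying on them.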

In [10]:
# Persist everything needed to reproduce the final predictions: the fitted
# estimator, the fitted preprocessor, and the tuned decision threshold.
MODEL_DIR = Path('../models')
# Create the target directory up front; joblib.dump does not create parents.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

joblib.dump(lr_weighted, MODEL_DIR / 'model.pkl')
joblib.dump(preprocessor, MODEL_DIR / 'preprocessor.pkl')
# The estimator's own .predict() takes no threshold argument, so the tuned
# cutoff is stored next to the model; consumers should apply it to
# predict_proba(...)[:, 1] instead of calling .predict().
joblib.dump(float(threshold), MODEL_DIR / 'threshold.pkl')

print("Model and preprocessor saved!")

Model and preprocessor saved!
