# Model Training - Telco Customer Churn

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

### Load Data

In [None]:
# Load the pre-split datasets (per the original note: 70% train, 20% test,
# 10% validation). The files are read in train / test / validation order.
train_data, test_data, val_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/day_3/telco-customer-churn/train.csv',
        '../../data/day_3/telco-customer-churn/test.csv',
        '../../data/day_3/telco-customer-churn/validation.csv',
    )
)

# Report the (rows, columns) shape of every split.
for split_label, split_frame in (
    ("Train", train_data),
    ("Test", test_data),
    ("Validation", val_data),
):
    print(f"{split_label}: {split_frame.shape}")

In [None]:
# Preview the first five rows of the training split.
train_data.head(n=5)

In [None]:
# Quick look at how the churn target is distributed in the training split.
churn_train = train_data['Churn']
print("Churn Verteilung (Train):")
print(churn_train.value_counts())

# NOTE(review): the mean is only a churn rate if 'Churn' is numeric 0/1 --
# assumed here, confirm against the CSV.
churn_rate_pct = churn_train.mean() * 100
print(f"\nChurn Rate: {churn_rate_pct:.1f}%")

### Data Preparation

In [None]:
# Remove the columns that must not be used as model inputs:
# - Customer ID is only an identifier
# - Churn is the target itself
# - Churn Category / Churn Reason are only known after the customer has left
#   (data leakage); Customer Status is dropped alongside them, presumably for
#   the same reason -- TODO confirm
# - the location columns are too granular

drop_cols = [
    'Customer ID', 'Churn', 'Churn Category', 'Churn Reason', 'Customer Status',
    'City', 'State', 'Country', 'Zip Code', 'Lat Long', 'Latitude', 'Longitude'
]

# Keep only the names that actually exist in the training frame.
drop_cols = [c for c in drop_cols if c in train_data.columns]

# The list above was validated against the training frame only, so
# errors='ignore' keeps the drop from raising a KeyError when the test or
# validation file happens to lack one of these columns.
X_train = train_data.drop(columns=drop_cols, errors='ignore')
y_train = train_data['Churn']

X_test = test_data.drop(columns=drop_cols, errors='ignore')
y_test = test_data['Churn']

X_val = val_data.drop(columns=drop_cols, errors='ignore')
y_val = val_data['Churn']

print(f"Features: {X_train.shape[1]}")
print(f"Train samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")
print(f"Val samples: {len(X_val)}")

In [None]:
# Collect the names of all object-dtype feature columns; these are treated as
# the categorical variables in the encoding step.
object_features = X_train.select_dtypes(include=['object'])
cat_cols = list(object_features.columns)
print(f"Kategorische Spalten: {len(cat_cols)}")
print(cat_cols)

In [None]:
# One-hot encode the categorical variables. The training split defines the
# feature space: drop_first removes one reference level per categorical column.
X_train_encoded = pd.get_dummies(X_train, columns=cat_cols, drop_first=True)

# Test and validation are encoded WITHOUT drop_first and then projected onto
# the training columns. Applying drop_first to each split independently would
# drop whichever level sorts first *in that split*; if the training reference
# level is absent there, a different level would be dropped, zero-filled below,
# and its rows silently encoded as the reference category.
#
# reindex() does the whole alignment in one step:
# - columns missing from the split are added and filled with 0
# - columns unknown to the training split (unseen categories, and the training
#   reference level) are discarded
# - the column order matches the training frame
X_test_encoded = pd.get_dummies(X_test, columns=cat_cols).reindex(
    columns=X_train_encoded.columns, fill_value=0
)
X_val_encoded = pd.get_dummies(X_val, columns=cat_cols).reindex(
    columns=X_train_encoded.columns, fill_value=0
)

print(f"Nach Encoding: {X_train_encoded.shape[1]} Features")

In [None]:
# Check for missing values. The training split is reported per column; test
# and validation are checked as well, because NaNs that exist only there would
# otherwise reach the models un-imputed.
missing = X_train_encoded.isnull().sum()
print(f"Missing values: {missing.sum()}")

if missing.sum() > 0:
    print(missing[missing > 0])

missing_other = int(
    X_test_encoded.isnull().sum().sum() + X_val_encoded.isnull().sum().sum()
)
if missing_other > 0:
    print(f"Missing values (test + validation): {missing_other}")

if missing.sum() > 0 or missing_other > 0:
    # Impute with the per-column medians of the training split. They are
    # computed once, before any filling, so all three splits receive exactly
    # the same statistics and nothing is learned from test/validation data.
    train_medians = X_train_encoded.median()
    X_train_encoded = X_train_encoded.fillna(train_medians)
    X_test_encoded = X_test_encoded.fillna(train_medians)
    X_val_encoded = X_val_encoded.fillna(train_medians)

In [None]:
# Standardise the features (important for logistic regression, SVM and KNN).
# The scaler is fitted on the training split only and then applied unchanged
# to the training, test and validation splits.
scaler = StandardScaler().fit(X_train_encoded)
X_train_scaled, X_test_scaled, X_val_scaled = (
    scaler.transform(split_features)
    for split_features in (X_train_encoded, X_test_encoded, X_val_encoded)
)

print("Skalierung done")

### Model Training

In [None]:
# Candidate classifiers to compare against each other.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'SVM': SVC(probability=True, random_state=42)
}

# One dict per model: display name, validation metrics and the fitted estimator.
results = []

for name, model in models.items():
    print(f"Training {name}...", end=" ")

    # Random forest does not need scaling, so it is fitted on the encoded
    # (unscaled) features; every other model uses the standardised matrices.
    uses_unscaled = name == 'Random Forest'
    X_fit = X_train_encoded if uses_unscaled else X_train_scaled
    X_validate = X_val_encoded if uses_unscaled else X_val_scaled

    model.fit(X_fit, y_train)
    y_pred_val = model.predict(X_validate)
    # Column 1 of predict_proba is taken as the score for the second class.
    y_prob_val = model.predict_proba(X_validate)[:, 1]

    # Metrics on the validation set.
    acc = accuracy_score(y_val, y_pred_val)
    auc = roc_auc_score(y_val, y_prob_val)

    entry = {
        'Model': name,
        'Accuracy': acc,
        'ROC-AUC': auc,
        'model_obj': model
    }
    results.append(entry)

    print(f"Acc: {acc:.3f}, AUC: {auc:.3f}")

In [None]:
# Compare the validation results: keep only the metric columns and list the
# models from highest to lowest ROC-AUC.
results_df = (
    pd.DataFrame(results)
    .loc[:, ['Model', 'Accuracy', 'ROC-AUC']]
    .sort_values('ROC-AUC', ascending=False)
)
results_df

In [None]:
# Pick the entry with the highest validation ROC-AUC as the best model.
best = max(results, key=lambda entry: entry['ROC-AUC'])
best_model, best_name = best['model_obj'], best['Model']
print(f"Bestes Modell: {best_name}")

### Evaluation on Test Set

In [None]:
# Final evaluation on the test set (not used for fitting or model selection
# in the cells above). The feature matrix must match the one the model was
# trained on: unscaled for the random forest, standardised otherwise.
X_test_best = X_test_encoded if best_name == 'Random Forest' else X_test_scaled

y_pred_test = best_model.predict(X_test_best)
y_prob_test = best_model.predict_proba(X_test_best)[:, 1]

test_acc = accuracy_score(y_test, y_pred_test)
test_auc = roc_auc_score(y_test, y_prob_test)

print(f"=== Test Set Ergebnisse ({best_name}) ===")
print(f"Accuracy: {test_acc:.3f}")
print(f"ROC-AUC: {test_auc:.3f}")

In [None]:
# Confusion matrix of the selected model on the test set.
cm = confusion_matrix(y_test, y_pred_test)
class_labels = ['No Churn', 'Churn']

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title(f'Confusion Matrix - {best_name}')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

# Unpack the 2x2 matrix row by row: rows are actual classes, columns are
# predicted classes. This assumes both classes occur, otherwise it raises.
(tn, fp), (fn, tp) = cm
print(f"True Negatives: {tn}")
print(f"True Positives: {tp}")
print(f"False Negatives: {fn} (Churner nicht erkannt!)")
print(f"False Positives: {fp}")

In [None]:
# ROC curves of all trained models on the test set, drawn into one figure.
fig, ax = plt.subplots(figsize=(8, 6))

for entry in results:
    curve_label = entry['Model']
    # Random forest was trained on unscaled features, the others on scaled ones.
    X_curve = X_test_encoded if curve_label == 'Random Forest' else X_test_scaled
    test_scores = entry['model_obj'].predict_proba(X_curve)[:, 1]

    fpr, tpr, _ = roc_curve(y_test, test_scores)
    curve_auc = roc_auc_score(y_test, test_scores)
    ax.plot(fpr, tpr, label=f'{curve_label} (AUC={curve_auc:.3f})')

# Diagonal reference line, labelled as the random baseline.
ax.plot([0, 1], [0, 1], 'k--', label='Random')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve Vergleich')
ax.legend()
ax.grid(alpha=0.3)
plt.show()

In [None]:
# Classification report for the selected model's test-set predictions.
report_text = classification_report(
    y_test,
    y_pred_test,
    target_names=['No Churn', 'Churn'],
)
print(report_text)

### Feature Importance

In [None]:
# Feature importances from the random forest (interesting to look at even when
# it is not the best model). The forest was fitted on X_train_encoded, so its
# importances line up with that frame's columns.
rf_model = next(r['model_obj'] for r in results if r['Model'] == 'Random Forest')

importance_table = pd.DataFrame({
    'Feature': X_train_encoded.columns,
    'Importance': rf_model.feature_importances_
})
feat_imp = importance_table.sort_values('Importance', ascending=False)

# Plot the 15 most important features as a horizontal bar chart.
top_features = feat_imp.head(15)
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=top_features, x='Importance', y='Feature', ax=ax)
ax.set_title('Top 15 Feature Importance')
fig.tight_layout()
plt.show()

In [None]:
# Print the ten most important features as a plain table without the index.
print("Top 10 wichtigste Features:")
top10_table = feat_imp.head(10).to_string(index=False)
print(top10_table)

### Summary

In [None]:
# Final summary: selected model, its test metrics, and how many churners were
# caught, using the confusion-matrix counts computed earlier.
separator = "="*50
print(separator)
print("ZUSAMMENFASSUNG")
print(separator)
print(f"\nBestes Modell: {best_name}")
print(f"Test Accuracy: {test_acc*100:.1f}%")
print(f"Test ROC-AUC: {test_auc:.3f}")

# Recall = detected churners / all actual churners (tp + fn).
churners_total = tp + fn
recall_pct = tp / churners_total * 100
print(f"\nChurner erkannt: {tp} von {churners_total} ({recall_pct:.1f}% Recall)")
print(f"False Alarms: {fp}")