# Model Training - Telco Customer Churn

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

### Daten laden

In [2]:
# Load the pre-split datasets (train / test / validation).
# Note: the row counts recorded for this cell (4225 / 1409 / 1409) correspond to
# roughly a 60% / 20% / 20% split.
data_dir = '../../data/day_3/telco-customer-churn'
train_data, test_data, val_data = (
    pd.read_csv(f"{data_dir}/{split_name}.csv")
    for split_name in ('train', 'test', 'validation')
)

for label, frame in (("Train", train_data), ("Test", test_data), ("Validation", val_data)):
    print(f"{label}: {frame.shape}")

Train: (4225, 52)
Test: (1409, 52)
Validation: (1409, 52)


In [3]:
# Preview the first five rows of the training split.
train_data.head(n=5)

Unnamed: 0,Age,Avg Monthly GB Download,Avg Monthly Long Distance Charges,Churn Category,Churn Reason,Churn Score,City,CLTV,Contract,Country,...,Tenure in Months,Total Charges,Total Extra Data Charges,Total Long Distance Charges,Total Refunds,Total Revenue,Under 30,Unlimited Data,Zip Code,Churn
0,72,4,19.44,,,51,San Mateo,4849,Two Year,United States,...,25,2191.15,0,486.0,0.0,2677.15,0,1,94403,0
1,27,59,45.62,,,27,Sutter Creek,3715,Month-to-Month,United States,...,35,3418.2,0,1596.7,0.0,5014.9,1,1,95685,0
2,59,0,16.07,,,59,Santa Cruz,5092,Month-to-Month,United States,...,46,851.2,0,739.22,0.0,1590.42,0,0,95064,0
3,25,27,0.0,,,49,Brea,2068,One Year,United States,...,27,1246.4,30,0.0,0.0,1276.4,1,0,92823,0
4,31,21,17.22,Dissatisfaction,Network reliability,88,San Jose,4026,One Year,United States,...,58,3563.8,0,998.76,0.0,4562.56,0,1,95117,1


In [4]:
# Quick look at how the target is distributed in the training split.
churn_train = train_data['Churn']

print("Churn Verteilung (Train):")
print(churn_train.value_counts())
print(f"\nChurn Rate: {churn_train.mean()*100:.1f}%")

Churn Verteilung (Train):
Churn
0    3104
1    1121
Name: count, dtype: int64

Churn Rate: 26.5%


### Data Preparation

In [5]:
# Remove columns that should not be used as features:
# - Customer ID is only an identifier
# - Churn is the target and is kept separately as y
# - Churn Category / Churn Reason are only known after the customer has left (data leakage)
# - Churn Score / Customer Status are dropped as well
#   (presumably also derived from the churn outcome — TODO confirm)
# - location columns are too granular
candidate_drop_cols = [
    'Customer ID', 'Churn', 'Churn Category', 'Churn Reason','Churn Score', 'Customer Status',
    'City', 'State', 'Country', 'Zip Code', 'Lat Long', 'Latitude', 'Longitude'
]

# Keep only the candidates that actually exist in the training frame.
available_cols = set(train_data.columns)
drop_cols = [col for col in candidate_drop_cols if col in available_cols]

# Feature matrices: every split loses the same columns.
X_train, X_test, X_val = (
    frame.drop(columns=drop_cols) for frame in (train_data, test_data, val_data)
)
# Targets: the 'Churn' column of each split.
y_train, y_test, y_val = (
    frame['Churn'] for frame in (train_data, test_data, val_data)
)

print(f"Features: {X_train.shape[1]}")
for label, features in (("Train", X_train), ("Test", X_test), ("Val", X_val)):
    print(f"{label} samples: {len(features)}")

Features: 39
Train samples: 4225
Test samples: 1409
Val samples: 1409


In [6]:
# Collect the names of all object-dtype (i.e. categorical) feature columns.
object_features = X_train.select_dtypes(include='object')
cat_cols = list(object_features.columns)

print(f"Kategorische Spalten: {len(cat_cols)}")
print(cat_cols)

Kategorische Spalten: 6
['Contract', 'Gender', 'Internet Type', 'Offer', 'Payment Method', 'Quarter']


In [7]:
# One-hot encode the categorical variables.
#
# The category levels are learned from the training split only and then applied
# to every split. Encoding each split independently with drop_first=True is not
# safe: the dropped baseline level depends on which categories happen to occur
# in that split, so a split that lacks the training baseline would drop a
# different level and mis-encode its rows. Pinning the levels guarantees that
# all splits share the same baseline, the same columns and the same column order.
train_levels = {
    col: X_train[col].astype('category').cat.categories for col in cat_cols
}


def one_hot_with_train_levels(features):
    """One-hot encode `cat_cols` of `features` using the training category levels.

    Values that do not occur in the training split become missing in the
    categorical column and therefore get all-zero dummy columns, i.e. they are
    encoded like the dropped baseline level.

    Returns a new DataFrame; `features` itself is not modified.
    """
    typed = features.copy()
    for col, levels in train_levels.items():
        typed[col] = pd.Categorical(typed[col], categories=levels)
    return pd.get_dummies(typed, columns=cat_cols, drop_first=True)


X_train_encoded = one_hot_with_train_levels(X_train)
X_test_encoded = one_hot_with_train_levels(X_test)
X_val_encoded = one_hot_with_train_levels(X_val)

# Sanity check: identical columns in identical order across all splits.
assert X_test_encoded.columns.equals(X_train_encoded.columns)
assert X_val_encoded.columns.equals(X_train_encoded.columns)

print(f"Nach Encoding: {X_train_encoded.shape[1]} Features")

Nach Encoding: 44 Features


In [8]:
# Check for missing values in all three splits, not only in the training split:
# NaNs that occur only in test/validation would otherwise go unnoticed here and
# reach the scaler and the models unimputed.
missing = X_train_encoded.isnull().sum()
print(f"Missing values: {missing.sum()}")
if missing.sum() > 0:
    print(missing[missing > 0])

missing_test = int(X_test_encoded.isnull().sum().sum())
missing_val = int(X_val_encoded.isnull().sum().sum())
print(f"Missing values (Test): {missing_test}")
print(f"Missing values (Validation): {missing_val}")

if missing.sum() > 0 or missing_test > 0 or missing_val > 0:
    # Impute every split with the medians of the training split, computed once
    # so that no statistics from test/validation are used.
    train_medians = X_train_encoded.median()
    X_train_encoded = X_train_encoded.fillna(train_medians)
    X_test_encoded = X_test_encoded.fillna(train_medians)
    X_val_encoded = X_val_encoded.fillna(train_medians)

Missing values: 0


In [9]:
# Standardize the features (important for logistic regression, SVM and KNN).
# The scaler is fitted on the training split only and then applied to all splits.
scaler = StandardScaler()
scaler.fit(X_train_encoded)

X_train_scaled, X_test_scaled, X_val_scaled = (
    scaler.transform(encoded)
    for encoded in (X_train_encoded, X_test_encoded, X_val_encoded)
)

print("Skalierung done")

Skalierung done


### Model Training

In [10]:
# Try out several classifiers and compare them on the validation split.
# NOTE(review): the validation scores recorded for this cell are very high
# (AUC around 0.99) — worth verifying that no feature derived from the churn
# outcome is still among the inputs. TODO confirm.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'SVM': SVC(probability=True, random_state=42)
}

results = []

for name, model in models.items():
    print(f"Training {name}...", end=" ")

    # Random Forest does not need scaling, so it gets the unscaled encoded
    # frames; every other model is trained and evaluated on the scaled arrays.
    uses_unscaled = name == 'Random Forest'
    X_fit = X_train_encoded if uses_unscaled else X_train_scaled
    X_eval = X_val_encoded if uses_unscaled else X_val_scaled

    model.fit(X_fit, y_train)
    y_pred_val = model.predict(X_eval)
    # Column 1 of predict_proba is the probability of the positive class.
    y_prob_val = model.predict_proba(X_eval)[:, 1]

    # Metrics on the validation set.
    acc = accuracy_score(y_val, y_pred_val)
    auc = roc_auc_score(y_val, y_prob_val)

    entry = {
        'Model': name,
        'Accuracy': acc,
        'ROC-AUC': auc,
        'model_obj': model
    }
    results.append(entry)

    print(f"Acc: {acc:.3f}, AUC: {auc:.3f}")

Training Logistic Regression... Acc: 0.961, AUC: 0.993
Training Random Forest... Acc: 0.964, AUC: 0.989
Training Gradient Boosting... Acc: 0.965, AUC: 0.993
Training KNN... Acc: 0.884, AUC: 0.921
Training SVM... Acc: 0.964, AUC: 0.991


In [11]:
# Compare the results: one row per model, best validation ROC-AUC first.
# The fitted model objects are left out of the table.
results_df = (
    pd.DataFrame(results)
    .loc[:, ['Model', 'Accuracy', 'ROC-AUC']]
    .sort_values(by='ROC-AUC', ascending=False)
)
results_df

Unnamed: 0,Model,Accuracy,ROC-AUC
2,Gradient Boosting,0.965224,0.993472
0,Logistic Regression,0.960965,0.99264
4,SVM,0.963804,0.991443
1,Random Forest,0.963804,0.989367
3,KNN,0.884315,0.920676


In [12]:
# Pick the entry with the highest validation ROC-AUC
# (on ties, the first such entry in `results` wins).
best = max(results, key=lambda entry: entry['ROC-AUC'])
best_name, best_model = best['Model'], best['model_obj']
print(f"Bestes Modell: {best_name}")

Bestes Modell: Gradient Boosting
