In [1]:
# ===============================
# Telco Customer Churn ML Pipeline
# ===============================

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import joblib

# -------------------------------
# 1. Load Dataset
# -------------------------------
# Read the raw churn table from the working directory.
df = pd.read_csv("Telco-Customer-Churn.csv")

# Drop customerID (not useful for prediction). Assigning the result back
# instead of using inplace=True keeps the step explicit.
df = df.drop(columns="customerID")

# Convert TotalCharges to numeric; errors="coerce" turns every entry that
# cannot be parsed as a number into NaN instead of raising.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

# Handle missing values: fill the NaNs with the column median.
# The result is assigned back to the column rather than calling
# `df["TotalCharges"].fillna(..., inplace=True)`: that form operates on an
# intermediate object, triggers a pandas FutureWarning, and stops updating
# `df` at all from pandas 3.0 onwards.
# NOTE(review): the median is taken over the full dataset, i.e. before the
# train/test split performed later in this cell.
df["TotalCharges"] = df["TotalCharges"].fillna(df["TotalCharges"].median())

# Target variable: encode "Yes"/"No" as 1/0. Any other label maps to NaN.
df["Churn"] = df["Churn"].map({"Yes": 1, "No": 0})

# -------------------------------
# 2. Feature / Target Split
# -------------------------------
# The label column is pulled out as the target; everything else is a predictor.
y = df["Churn"]
X = df.drop(columns=["Churn"])

# Group predictor columns by dtype so each group can be routed to its own
# transformer: object-typed columns are treated as categorical, 64-bit
# integer/float columns as numeric.
categorical_features = X.select_dtypes(include=["object"]).columns
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns

# -------------------------------
# 3. Preprocessing Pipeline
# -------------------------------
# Numeric columns are standardised.
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

# Categorical columns are one-hot encoded. handle_unknown="ignore" lets the
# encoder accept categories at transform time that were absent during fit
# instead of raising an error.
one_hot_encoder = OneHotEncoder(handle_unknown="ignore")
categorical_transformer = Pipeline(steps=[("encoder", one_hot_encoder)])

# Each entry is (name, transformer, columns): the named transformer is applied
# to the listed columns only.
column_routes = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# -------------------------------
# 4. Models
# -------------------------------
# Both estimators use a fixed random_state so repeated runs are comparable.
rf = RandomForestClassifier(random_state=42)
log_reg = LogisticRegression(max_iter=1000, random_state=42)


# -------------------------------
# 5. Full Pipelines
# -------------------------------
def _with_preprocessing(estimator):
    """Return a Pipeline that runs `preprocessor` and then `estimator`.

    The step names "preprocessor" and "classifier" are what the
    hyperparameter grids address through the "classifier__" prefix.
    """
    return Pipeline(steps=[("preprocessor", preprocessor), ("classifier", estimator)])


pipe_lr = _with_preprocessing(log_reg)
pipe_rf = _with_preprocessing(rf)

# -------------------------------
# 6. Hyperparameter Grids
# -------------------------------
# Keys use the "<step name>__<parameter>" form, so "classifier__C" targets the
# C parameter of the pipeline step named "classifier".

# Logistic regression: only C is swept; penalty and solver are pinned to a
# single value each.
lr_c_candidates = [0.01, 0.1, 1, 10]
param_grid_lr = {
    "classifier__C": lr_c_candidates,
    "classifier__penalty": ["l2"],
    "classifier__solver": ["lbfgs"],
}

# Random forest: 2 x 3 x 2 = 12 combinations. A max_depth of None places no
# limit on tree depth.
rf_tree_counts = [100, 200]
rf_depth_limits = [None, 10, 20]
rf_split_minimums = [2, 5]
param_grid_rf = {
    "classifier__n_estimators": rf_tree_counts,
    "classifier__max_depth": rf_depth_limits,
    "classifier__min_samples_split": rf_split_minimums,
}

# -------------------------------
# 7. Train/Test Split
# -------------------------------
# Hold out 20% of the rows for testing. stratify=y keeps the class proportions
# of the target the same in both partitions; random_state fixes the shuffle.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# -------------------------------
# 8. GridSearchCV
# -------------------------------
# Settings shared by both searches: 5-fold cross-validation, candidates ranked
# by accuracy, and n_jobs=-1 to use all available processors.
search_options = {"cv": 5, "scoring": "accuracy", "n_jobs": -1}

grid_lr = GridSearchCV(pipe_lr, param_grid_lr, **search_options)
grid_rf = GridSearchCV(pipe_rf, param_grid_rf, **search_options)

# Fit models: logistic regression first, then random forest. Only the
# training partition is used.
for search in (grid_lr, grid_rf):
    search.fit(X_train, y_train)

# -------------------------------
# 9. Evaluation
# -------------------------------
# Take the best pipeline found by each search.
best_lr = grid_lr.best_estimator_
best_rf = grid_rf.best_estimator_

# Predict on the held-out test partition.
y_pred_lr = best_lr.predict(X_test)
y_pred_rf = best_rf.predict(X_test)

# Print the accuracy and the per-class report for each model in turn.
reports = (
    ("Logistic Regression Accuracy:", y_pred_lr),
    ("Random Forest Accuracy:", y_pred_rf),
)
for heading, predictions in reports:
    print(heading, accuracy_score(y_test, predictions))
    print(classification_report(y_test, predictions))

# -------------------------------
# 10. Save Best Model Pipeline
# -------------------------------
# Choose by the cross-validated score from the grid search, not by the test
# set results. The random forest is kept only if it is strictly better, so a
# tie goes to logistic regression.
if grid_rf.best_score_ > grid_lr.best_score_:
    best_model = best_rf
else:
    best_model = best_lr

# Persist the whole fitted pipeline (preprocessing + classifier) in one file.
joblib.dump(best_model, "telco_churn_pipeline.pkl")

print("Pipeline saved as telco_churn_pipeline.pkl")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["TotalCharges"].fillna(df["TotalCharges"].median(), inplace=True)


Logistic Regression Accuracy: 0.8055358410220014
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1035
           1       0.66      0.56      0.60       374

    accuracy                           0.81      1409
   macro avg       0.75      0.73      0.74      1409
weighted avg       0.80      0.81      0.80      1409

Random Forest Accuracy: 0.7998580553584103
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      1035
           1       0.66      0.52      0.58       374

    accuracy                           0.80      1409
   macro avg       0.75      0.71      0.72      1409
weighted avg       0.79      0.80      0.79      1409

Pipeline saved as telco_churn_pipeline.pkl
