In [2]:
import pandas as pd

# Location of the Telco customer-churn CSV (raw file in IBM's GitHub repository).
url = "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv"

# Fetch the file over HTTP and parse it into a DataFrame.
df = pd.read_csv(filepath_or_buffer=url)

# Report (rows, columns); the trailing expression renders the first five rows
# as the cell's rich output.
print("df shape:", df.shape)
df.head(n=5)

df shape: (7043, 21)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [7]:
# --- Target -----------------------------------------------------------------
# Encode the label as 1 ("Yes") / 0 ("No"); any other value maps to NaN.
y = df["Churn"].map(
    {
        "Yes": 1,
        "No": 0,
    }
)

# --- Features ---------------------------------------------------------------
# Everything except the label and the customer identifier
# (errors="ignore" tolerates either column being absent from the drop list).
X_df = df.drop(columns=["Churn", "customerID"], errors="ignore")

# TotalCharges is parsed to a numeric dtype; entries that cannot be parsed
# become NaN instead of raising.
if "TotalCharges" in X_df.columns:
    X_df = X_df.assign(
        TotalCharges=pd.to_numeric(X_df["TotalCharges"], errors="coerce")
    )

# --- Remove incomplete rows -------------------------------------------------
# Features and label are glued together first so that dropping a row removes
# it from both, keeping them aligned on the same index.
full_df = pd.concat([X_df, y], axis=1).dropna()

# --- Separate label and features again --------------------------------------
X_df = full_df.drop(columns=["Churn"])
y = full_df["Churn"]

print("Final shapes:", X_df.shape, y.shape)

Final shapes: (7032, 19) (7032,)


In [8]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

# Train a churn classifier: split the prepared data, build a preprocessing +
# logistic-regression pipeline, fit it on the training split and persist the
# fitted pipeline to "churn_pipeline.joblib".

# 1) Ensure X_df is a DataFrame and y exists
# Fail fast with a readable message when the preparation cell has not been run
# in this kernel, rather than hitting a NameError further down.
assert "X_df" in globals(), "X_df not found. Run Step 2 first."
assert "y" in globals(), "y not found. Run Step 2 first."
# The column selection in step 3 relies on DataFrame.select_dtypes, so verify
# the type as well; a stale non-DataFrame X_df would otherwise fail there with
# an unrelated-looking AttributeError.
assert isinstance(X_df, pd.DataFrame), (
    "X_df must be a pandas DataFrame. Run Step 2 first."
)

# 2) Split
# 80/20 split with a fixed seed for reproducibility; stratify=y preserves the
# class proportions of y in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X_df, y, test_size=0.2, random_state=42, stratify=y
)

# 3) Identify columns
# Columns are routed by dtype: numeric dtypes go to the numeric branch,
# everything else is treated as categorical.
num_cols = X_train.select_dtypes(include=["number"]).columns.tolist()
cat_cols = X_train.select_dtypes(exclude=["number"]).columns.tolist()

print("Numeric cols:", len(num_cols))
print("Categorical cols:", len(cat_cols))

# 4) Preprocess pipelines
# Numeric branch: fill gaps with the column median, then standardize.
numeric_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown="ignore" keeps transform from raising on categories
# that were not seen during fit.
categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Apply each branch to its own column list. Every column of X_train is in
# exactly one of the two lists, so none falls through to the transformer's
# remainder handling.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, num_cols),
        ("cat", categorical_pipeline, cat_cols),
    ]
)

# 5) Model
# max_iter is raised to 3000 iterations (presumably to avoid convergence
# warnings - confirm if tuning).
model = LogisticRegression(max_iter=3000)

# 6) Full pipeline
# Preprocessing and model are bundled so the saved artifact accepts raw
# feature rows with the same columns as X_train.
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model)
])

# 7) Fit + save
# Only the training split is used for fitting; X_test / y_test stay untouched.
pipeline.fit(X_train, y_train)

# Writes the fitted pipeline to the current working directory.
joblib.dump(pipeline, "churn_pipeline.joblib")
print("✅ Saved churn_pipeline.joblib")

Numeric cols: 4
Categorical cols: 15
✅ Saved churn_pipeline.joblib
