In [6]:
!pip install -q pandas scikit-learn joblib

# ------------------------------
# Step 1: Import Libraries
# ------------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

# ------------------------------
# Step 2: Load Dataset (Working URL)
# ------------------------------
# Public copy of the Telco customer-churn CSV, read directly over HTTPS.
url = "https://raw.githubusercontent.com/YuehHanChen/Telco_Customer_Churn_Analysis/master/WA_Fn-UseC_-Telco-Customer-Churn.csv"
data = pd.read_csv(filepath_or_buffer=url)

# Quick sanity check: (rows, columns) followed by the first five rows.
print("Dataset Shape:", data.shape)
print(data.head(5))

# ------------------------------
# Step 3: Preprocessing
# ------------------------------
# Target variable: encode the "Churn" label as 1 ("Yes") / 0 ("No").
y = data["Churn"].map({"Yes": 1, "No": 0})

# Features: every column except the row identifier and the target itself.
X = data.drop(columns=["customerID", "Churn"])

# Convert "TotalCharges" to numeric; entries that cannot be parsed (the
# original note mentions missing/space values) are coerced to NaN.
X["TotalCharges"] = pd.to_numeric(X["TotalCharges"], errors="coerce")

# Replace the NaNs with the column median. The result is assigned back to the
# column instead of calling fillna(..., inplace=True) on X["TotalCharges"]:
# that chained in-place call works on an intermediate object, emits a pandas
# FutureWarning, and no longer updates X from pandas 3.0 onwards.
# NOTE(review): the median is computed over the full dataset, i.e. before the
# train/test split performed later in the script.
X["TotalCharges"] = X["TotalCharges"].fillna(X["TotalCharges"].median())

# Split the feature columns by dtype: object columns are treated as
# categorical, int64/float64 columns as numeric.
categorical_cols = list(X.select_dtypes(include=["object"]).columns)
numeric_cols = list(X.select_dtypes(include=["int64", "float64"]).columns)

print("Categorical:", categorical_cols)
print("Numerical:", numeric_cols)

# Column-wise preprocessing: standardize the numeric features and one-hot
# encode the categorical ones. handle_unknown="ignore" keeps transform from
# raising on categories that were not present when the encoder was fitted.
numeric_step = ("num", StandardScaler(), numeric_cols)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# ------------------------------
# Step 4: Build Pipeline
# ------------------------------
# Chain preprocessing and the estimator so both are fitted together.
# The logistic regression is only the initial "classifier" step; the grid
# search below substitutes its own candidates for that step.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(max_iter=1000)),
    ]
)

# ------------------------------
# Step 5: Train-Test Split
# ------------------------------
# Hold out 20% of the rows for testing, stratified on y, with a fixed seed so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# ------------------------------
# Step 6: Hyperparameter Tuning with GridSearchCV
# ------------------------------
# Candidate 1: logistic regression, searched over three values of C.
logreg_grid = {
    "classifier": [LogisticRegression(max_iter=1000)],
    "classifier__C": [0.1, 1, 10],
}
# Candidate 2: random forest, searched over tree count and maximum depth.
forest_grid = {
    "classifier": [RandomForestClassifier(random_state=42)],
    "classifier__n_estimators": [50, 100],
    "classifier__max_depth": [5, 10, None],
}
param_grid = [logreg_grid, forest_grid]

# 3-fold cross-validation scored on accuracy; n_jobs=-1 runs the fits in
# parallel and verbose=2 logs progress per fit.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

print("✅ Best Parameters:", grid_search.best_params_)
print("✅ Best CV Accuracy:", grid_search.best_score_)

# ------------------------------
# Step 7: Evaluate on Test Set
# ------------------------------
# Predict the held-out rows with the fitted search object, then report the
# overall accuracy and the per-class precision/recall/F1 table.
y_pred = grid_search.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
print("\n✅ Test Accuracy:", test_accuracy)
print("\n✅ Classification Report:\n", test_report)

# ------------------------------
# Step 8: Export Final Pipeline
# ------------------------------
# Persist the best pipeline found by the search (preprocessing + classifier)
# to a single file in the working directory.
best_pipeline = grid_search.best_estimator_
joblib.dump(best_pipeline, filename="customer_churn_pipeline.pkl")
print("✅ Pipeline saved as customer_churn_pipeline.pkl")

# ------------------------------
# Step 9: Download the pipeline file (optional in Colab)
# ------------------------------
# The browser-download helper only exists inside Google Colab. Everywhere
# else the saved file is already on the local disk, so skip this optional
# step instead of crashing the script on the import.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download("customer_churn_pipeline.pkl")
else:
    print("Not running in Google Colab; skipping download of customer_churn_pipeline.pkl")


Dataset Shape: (7043, 21)
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingM

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X["TotalCharges"].fillna(X["TotalCharges"].median(), inplace=True)


✅ Best Parameters: {'classifier': LogisticRegression(max_iter=1000), 'classifier__C': 0.1}
✅ Best CV Accuracy: 0.8047568335108272

✅ Test Accuracy: 0.7998580553584103

✅ Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.89      0.87      1035
           1       0.65      0.55      0.59       374

    accuracy                           0.80      1409
   macro avg       0.75      0.72      0.73      1409
weighted avg       0.79      0.80      0.79      1409

✅ Pipeline saved as customer_churn_pipeline.pkl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>