In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
import joblib

In [2]:
df = pd.read_csv("telco_churn.csv")
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# Drop customerID (non-informative)
df = df.drop('customerID', axis=1)

# Target variable (Churn): convert 'Yes'/'No' to 1/0
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

# Handle missing or whitespace values if any
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)

# Identify columns
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
num_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
num_cols.remove('Churn')

print("Categorical:", cat_cols)
print("Numerical:", num_cols)


Categorical: ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
Numerical: ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)


In [4]:
X = df.drop('Churn', axis=1)
y = df['Churn']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [5]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ]
)


In [6]:
logreg_pipe = Pipeline([
    ('pre', preprocessor),
    ('clf', LogisticRegression(max_iter=500, solver='liblinear'))
])


In [7]:
rf_pipe = Pipeline([
    ('pre', preprocessor),
    ('clf', RandomForestClassifier(random_state=42))
])


In [8]:
rf_pipe = Pipeline([
    ('pre', preprocessor),
    ('clf', RandomForestClassifier(random_state=42))
])


In [9]:
logreg_params = {
    'clf__C': [0.01, 0.1, 1, 10]
}

logreg_grid = GridSearchCV(
    logreg_pipe, logreg_params, cv=3, scoring='f1', n_jobs=-1
)
logreg_grid.fit(X_train, y_train)

print("Best Logistic Regression Params:", logreg_grid.best_params_)


Best Logistic Regression Params: {'clf__C': 10}


In [10]:
rf_params = {
    'clf__n_estimators': [50, 100],
    'clf__max_depth': [5, 10, None]
}

rf_grid = GridSearchCV(
    rf_pipe, rf_params, cv=3, scoring='f1', n_jobs=-1
)
rf_grid.fit(X_train, y_train)

print("Best Random Forest Params:", rf_grid.best_params_)


Best Random Forest Params: {'clf__max_depth': 10, 'clf__n_estimators': 100}


In [11]:
for name, model in [("Logistic Regression", logreg_grid), ("Random Forest", rf_grid)]:
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(f"Model: {name}")
    print(f"Accuracy: {acc:.4f} | F1 Score: {f1:.4f}")
    print(classification_report(y_test, y_pred))
    print("-" * 40)


Model: Logistic Regression
Accuracy: 0.8048 | F1 Score: 0.6032
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1035
           1       0.66      0.56      0.60       374

    accuracy                           0.80      1409
   macro avg       0.75      0.73      0.74      1409
weighted avg       0.80      0.80      0.80      1409

----------------------------------------
Model: Random Forest
Accuracy: 0.8027 | F1 Score: 0.5875
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      1035
           1       0.66      0.53      0.59       374

    accuracy                           0.80      1409
   macro avg       0.75      0.72      0.73      1409
weighted avg       0.79      0.80      0.80      1409

----------------------------------------


In [12]:
# Save best pipeline (e.g., Random Forest)
joblib.dump(rf_grid.best_estimator_, "churn_rf_pipeline.joblib")
# Save Logistic Regression if desired
joblib.dump(logreg_grid.best_estimator_, "churn_logreg_pipeline.joblib")


['churn_logreg_pipeline.joblib']

In [13]:
# Load
model_loaded = joblib.load("churn_rf_pipeline.joblib")
# Predict
sample = X_test.iloc[[0]]
pred = model_loaded.predict(sample)
print("Prediction (0=No churn, 1=Churn):", pred)


Prediction (0=No churn, 1=Churn): [0]
