In [18]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
import joblib
import os

In [19]:
# Step 2: Load dataset (Update path if needed)
file_path = 'WA_Fn-UseC_-Telco-Customer-Churn.csv'  # Make sure it's in the same directory
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File '{file_path}' not found!")

data = pd.read_csv(file_path)

In [20]:
# Step 3: Clean data
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
data.drop('customerID', axis=1, inplace=True)
data['Churn'] = data['Churn'].map({'Yes': 1, 'No': 0})

In [21]:
# Step 4: Feature Engineering
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
categorical_features = [col for col in data.columns if col not in numerical_features + ['Churn']]

X = data.drop('Churn', axis=1)
y = data['Churn']

In [22]:
# Step 5: Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [23]:
# Step 6: Pipelines for preprocessing
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, numerical_features),
    ('cat', cat_pipeline, categorical_features)
])

In [24]:
# Step 7: Full pipeline with a placeholder model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

In [25]:
# Step 8: Grid search parameters
param_grid = [
    {
        'classifier': [LogisticRegression(max_iter=1000, class_weight='balanced')],
        'classifier__C': [0.1, 1, 10],
        'classifier__penalty': ['l2']
    },
    {
        'classifier': [RandomForestClassifier(class_weight='balanced', random_state=42)],
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [10, 20, None],
        'classifier__min_samples_split': [2, 5]
    }
]

In [26]:
# Step 9: Train using GridSearchCV
grid_search = GridSearchCV(
    pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1, verbose=1
)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best F1-Score on CV:", grid_search.best_score_)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
Best Parameters: {'classifier': RandomForestClassifier(class_weight='balanced', random_state=42), 'classifier__max_depth': 10, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}
Best F1-Score on CV: 0.6364589941138907


In [27]:
# Step 10: Evaluate on test data
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

print("\n📊 Test Set Classification Report:")
print(classification_report(y_test, y_pred))
print("AUC-ROC Score:", roc_auc_score(y_test, y_pred_proba))


📊 Test Set Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.77      0.83      1035
           1       0.54      0.74      0.62       374

    accuracy                           0.76      1409
   macro avg       0.72      0.76      0.73      1409
weighted avg       0.80      0.76      0.77      1409

AUC-ROC Score: 0.8395864010953524


In [28]:
# Step 11: Save the model
joblib.dump(best_model, 'churn_pipeline.pkl')
print("\nModel saved as 'churn_pipeline.pkl'")


Model saved as 'churn_pipeline.pkl'


In [29]:
# Step 12: Load and predict a sample
loaded_model = joblib.load('churn_pipeline.pkl')
sample = X_test.iloc[[0]]
sample_prediction = loaded_model.predict(sample)[0]
print("\nSample Prediction:", "Churn" if sample_prediction else "No Churn")


Sample Prediction: No Churn
