In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,accuracy_score,roc_auc_score
import joblib


In [None]:
# Load the Telco customer-churn CSV directly from the IBM sample repository
# (requires network access) and take a first look at it.
url = "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv"
df = pd.read_csv(url)
print("🔥 Dataset Shape:", df.shape)
print("\n📊 First 5 Rows:")
print(df.head())

print("\n🧾 Data Info (dtypes, missing values):")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()

In [None]:
# Clean the loaded frame. The steps rebind `df` instead of mutating it in
# place and are guarded so that running this cell a second time is harmless
# (previously a re-run raised KeyError on 'customerID' and turned every
# already-encoded Churn value into NaN).
df = (
    df.assign(TotalCharges=pd.to_numeric(df['TotalCharges'], errors="coerce"))  # unparseable entries become NaN
      .dropna()                                     # drop every row that has any missing value
      .drop(columns='customerID', errors='ignore')  # no-op if the column is already gone
)
if not pd.api.types.is_numeric_dtype(df['Churn']):
    # Encode the target as 1 ('Yes') / 0 ('No'); any other label would map to NaN.
    df = df.assign(Churn=df['Churn'].map({'Yes': 1, 'No': 0}))
# Fail loudly here rather than later inside the stratified split.
assert df['Churn'].notna().all(), "unexpected label in 'Churn' (expected only 'Yes'/'No')"
print(df.shape)
print(df.head(3))

In [None]:
# Separate the target column from the predictors.
y = df['Churn']
x = df.drop(columns='Churn')

# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions close to equal in both partitions; the fixed seed makes the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)
print(x_train.shape, x_test.shape)

# Class balance of the training target, rendered as percentages.
train_class_share = y_train.value_counts(normalize=True)
print(train_class_share.map("{:.1%}".format))

In [None]:
# Partition the predictor columns by dtype so each group can be given its
# own preprocessing step.
numeric_dtypes = ['int64', 'float64']
numerical_features = list(x.select_dtypes(include=numeric_dtypes).columns)
categorical_features = list(x.select_dtypes(include=['object']).columns)

# Show both lists: numeric first, then categorical.
for feature_group in (numerical_features, categorical_features):
    print(feature_group)

In [None]:
# Column-wise preprocessing used by the model pipelines:
#   * columns in numerical_features are standardised,
#   * columns in categorical_features are one-hot encoded,
#   * any column in neither list is passed through unchanged.
# handle_unknown='ignore' makes the encoder emit all zeros for a category it
# did not see during fit instead of raising ValueError, so a level that is
# absent from a CV training fold, or that first appears in new data scored by
# the saved pipeline, no longer crashes transform/predict.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    remainder='passthrough',
)

In [None]:
# Baseline model: the shared preprocessing followed by logistic regression.
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000)),
])

# Training
model_pipeline.fit(x_train, y_train)

# Evaluate on the held-out split.
y_pred_lr = model_pipeline.predict(x_test)
# The header previously read "Random forest performance", mislabelling this report.
print("Logistic Regression performance")
print(classification_report(y_test, y_pred_lr))

In [None]:
# Second candidate: the same preprocessing feeding a random forest.
rf_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)

# Fit on the training split, then predict the held-out split.
rf_pipeline.fit(x_train, y_train)
y_pred_rf = rf_pipeline.predict(x_test)

print("Random forest performance")
print(classification_report(y_test, y_pred_rf))

In [None]:
# For each model: hard class predictions (used for accuracy) and the
# predicted probability of the positive class (used for ROC AUC).

# Logistic Regression
y_pred_lr = model_pipeline.predict(x_test)
y_pred_proba_lr = model_pipeline.predict_proba(x_test)[:, 1]
# Random Forest
y_pred_rf = rf_pipeline.predict(x_test)
y_pred_proba_rf = rf_pipeline.predict_proba(x_test)[:, 1]

# AUC scores. ROC AUC must be computed from scores/probabilities; the random
# forest AUC was previously computed from the 0/1 labels, which collapses the
# ROC curve to a single threshold and made it incomparable with auc_lr.
auc_lr = roc_auc_score(y_test, y_pred_proba_lr)
auc_rf = roc_auc_score(y_test, y_pred_proba_rf)

print("📊 Model Comparison")
print(f"Logistic Regression AUC: {auc_lr:.4f}, Accuracy: {accuracy_score(y_test, y_pred_lr):.4f}")
print(f"Random Forest AUC:      {auc_rf:.4f}, Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")

In [None]:
# Candidate hyper-parameters for the 'classifier' step of rf_pipeline
# (2 * 3 * 2 * 2 = 24 combinations).
param_grid = dict(
    classifier__n_estimators=[100, 200],
    classifier__max_depth=[10, 15, None],
    classifier__min_samples_split=[2, 5],
    classifier__class_weight=['balanced', None],
)

# Exhaustive search over the grid: 5-fold cross-validation, ranked by ROC AUC,
# using all available cores and printing progress.
grid = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid.fit(x_train, y_train)

print("Best Parameters:", grid.best_params_)
print("Best Cross-Validation AUC:", grid.best_score_.round(4))

In [None]:
from sklearn.metrics import classification_report, roc_auc_score

# Score the best pipeline found by the grid search on the held-out split.
best_model = grid.best_estimator_
y_pred_best = best_model.predict(x_test)
y_pred_proba_best = best_model.predict_proba(x_test)[:, 1]  # positive-class probability

print("Final Model - Classification Report:")
print(classification_report(y_test, y_pred_best))

final_auc = roc_auc_score(y_test, y_pred_proba_best)
print(f"Final Test AUC: {final_auc:.4f}")

In [None]:
import joblib
# Persist the best pipeline from the grid search (preprocessing + classifier
# in one object) to the working directory. The file is a pickle: reload it
# with joblib.load, and only from a trusted source.
joblib.dump(grid.best_estimator_, 'churn_prediction_pipeline.pkl')
print("💾 Model saved!")