<a href="https://colab.research.google.com/github/FarazTheAnalyst/Data-Scientist-Portfolio/blob/main/TelcoCustomerChurnPredition/Churn_Model_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
#library importing

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
import joblib
import matplotlib.pyplot as plt
import seaborn as sns


Data Processing and Model Training

In [16]:
# Load data (original from Kaggle: https://www.kaggle.com/datasets/blastchar/telco-customer-churn)
# Colab-only step: the recorded output of this cell shows
# WA_Fn-UseC_-Telco-Customer-Churn.csv being saved into the session.
# NOTE(review): `uploaded` is not referenced by any later visible cell; the load
# cell reads the file by a hard-coded path instead.

from google.colab import files
uploaded = files.upload()

Saving WA_Fn-UseC_-Telco-Customer-Churn.csv to WA_Fn-UseC_-Telco-Customer-Churn.csv


In [17]:
# NOTE(review): the recorded output of this cell is a MessageError (credential
# propagation was unsuccessful), and no visible cell reads from /content/drive;
# the CSV is loaded from /content instead. Confirm whether this mount is still
# needed, because a failing cell stops "Run All".
from google.colab import drive
drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

In [None]:
# Read the Telco churn CSV from the Colab working directory and preview it
raw_csv_path = "/content/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(raw_csv_path)
df.head(10)

# Data *Cleaning*

In [None]:
# Inspect the column dtypes of the raw frame before any cleaning is applied
df.dtypes

In [None]:
def clean_churn_data(df):
  """Return a cleaned copy of the raw Telco churn frame.

  Steps applied:
    * ``TotalCharges`` is coerced to numeric (unparseable values such as blank
      strings become NaN) and NaNs are filled with the column median.
    * ``customerID`` is dropped when present, since it is an identifier and
      not a feature.
    * ``Churn`` is mapped to 1 for "Yes" (or an already-encoded 1) and 0
      otherwise.

  The input frame is left untouched and the function is idempotent, so the
  cell can be re-run without raising or corrupting the target column.

  Args:
    df: DataFrame with at least ``TotalCharges`` and ``Churn`` columns.

  Returns:
    A new DataFrame with the cleaning steps applied.
  """
  cleaned = df.copy()

  # Convert TotalCharges to numeric, to handle empty strings
  cleaned["TotalCharges"] = pd.to_numeric(cleaned["TotalCharges"], errors="coerce")

  # fill missing values
  cleaned["TotalCharges"] = cleaned["TotalCharges"].fillna(cleaned["TotalCharges"].median())

  # drop customerID as it is not a feature; errors="ignore" keeps re-runs safe
  cleaned = cleaned.drop(columns=["customerID"], errors="ignore")

  # convert Churn to binary; an already-encoded 1 passes through so that a
  # second call does not silently turn every positive into 0
  cleaned["Churn"] = cleaned["Churn"].apply(lambda x: 1 if (x == "Yes" or x == 1) else 0)
  return cleaned




In [None]:
# Run the cleaning step; df_clean is the input to the feature-engineering stage
df_clean = clean_churn_data(df)

# Feature *Engineering*

In [None]:
def engineer_feature(df):
  """Return a copy of ``df`` with three engineered columns added.

  Added columns:
    * ``TenureGroup``: ``tenure`` bucketed into "0-1yr", "1-2yr", "2-4yr",
      "4-6yr" and "6+yr" (right-closed bins at 12, 24, 48 and 72).
    * ``MonthlyChargeGroup``: ``MonthlyCharges`` bucketed into "low",
      "medium", "high" and "very high" (bin edges 0, 35, 70, 90).
    * ``chargeRatio``: ``TotalCharges / MonthlyCharges``, set to 0 wherever
      the division is undefined.

  The input frame is not modified.

  Args:
    df: DataFrame with ``tenure``, ``MonthlyCharges`` and ``TotalCharges``.

  Returns:
    A new DataFrame containing the original columns plus the three above.
  """
  featured = df.copy()

  # create tenure groups; the -1 lower edge keeps tenure == 0 in the first bin
  featured["TenureGroup"] = pd.cut(
      featured["tenure"],
      bins=[-1, 12, 24, 48, 72, np.inf],
      labels=["0-1yr", "1-2yr", "2-4yr", "4-6yr", "6+yr"],
  )

  # create a monthly charge group; include_lowest keeps a charge of exactly 0
  featured["MonthlyChargeGroup"] = pd.cut(
      featured["MonthlyCharges"],
      bins=[0, 35, 70, 90, np.inf],
      labels=["low", "medium", "high", "very high"],
      include_lowest=True,
  )

  # create total charge to monthly charge ratio
  monthly = featured["MonthlyCharges"]
  ratio = (featured["TotalCharges"] / monthly).replace([np.inf, -np.inf], 0)
  # x/0 gives +/-inf but 0/0 gives NaN, which the inf-only replace misses;
  # zero every row whose denominator is 0 so no NaN reaches the models.
  featured["chargeRatio"] = ratio.where(monthly != 0, 0)

  return featured

# Add the engineered columns; clean_df is the frame X and y are built from
clean_df = engineer_feature(df_clean)
# Shown as the cell's last expression so the new columns can be inspected
clean_df

Separate Features and *Target*

In [None]:
# Separate features and target: Churn is the label, every other column is a feature

y = clean_df["Churn"]
X = clean_df.drop(columns=["Churn"])

In [None]:
# Record the feature names and split them by dtype for the preprocessing steps
feature_cols = list(X.columns)
categorical_cols = list(X.select_dtypes(include=["object", "category"]).columns)
numerical_cols = list(X.select_dtypes(include=["int64", "float64"]).columns)

*Preprocessing*

In [None]:
# Encode categorical variables: one LabelEncoder per column, kept by column
# name so the same mapping can be saved and reused later
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, encoder in label_encoders.items():
    X[col] = encoder.fit_transform(X[col].astype(str))

In [None]:
# Standardise the numerical columns; the fitted scaler is kept for saving later
scaler = StandardScaler().fit(X[numerical_cols])
X[numerical_cols] = scaler.transform(X[numerical_cols])

In [None]:
# Check class imbalance: map each target value to the number of rows carrying it
class_labels, class_counts = np.unique(y, return_counts=True)
churn_train_dict_value_count = dict(zip(class_labels, class_counts))
churn_train_dict_value_count

In [None]:
# Handle class imbalance with SMOTE
# NOTE(review): this oversamples the full X / y. If X_res / y_res are then split
# into train and test sets, the held-out set will contain synthetic rows and
# rows that influenced synthetic training samples, which makes the reported
# metrics optimistic. Oversampling should be applied to the training portion only.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

Split Data

In [None]:
# Hold out the test set from the ORIGINAL rows first (stratified so both splits
# keep the real churn rate), then oversample only the training portion.
# Splitting an already-oversampled frame would place synthetic rows in the test
# set and leak test information into training, inflating every metric.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2, stratify=y
)
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train_raw, y_train_raw)

Model Comparison Function

In [None]:
def compare_models():
  """Fit three baseline classifiers and tabulate their hold-out metrics.

  Reads the notebook-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``.
  Each model is fitted on the training split and scored on the test split.
  For the "XGBoost" entry, a confusion-matrix heatmap and a top-10
  feature-importance bar chart are also saved as PNG files in the working
  directory.

  Returns:
    pd.DataFrame: one row per model with accuracy, per-class
    precision / recall / F1 and ROC AUC.
  """
  models = {
      "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
      "Random Forest": RandomForestClassifier(random_state=42),
      "XGBoost": XGBClassifier(random_state=42, eval_metric="logloss")
  }

  results = []

  for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # probability of the positive class (column 1), used for the ranking metric
    y_prob = model.predict_proba(X_test)[:, 1]

    # Calculate metrics
    report = classification_report(y_test, y_pred, output_dict=True)
    # ROC AUC must be computed from scores, not from thresholded 0/1 labels;
    # with hard labels it collapses to a single operating point.
    auc = roc_auc_score(y_test, y_prob)

    results.append({
        "Model": name,
        "Accuracy": report["accuracy"],
        "Precision_0": report["0"]["precision"],
        "Recall_0": report["0"]["recall"],
        "F1_0": report["0"]["f1-score"],
        "Precision_1": report["1"]["precision"],
        "Recall_1": report["1"]["recall"],
        "F1_1": report["1"]["f1-score"],
        "AUC": auc

    })

    # Save confusion matrix for the XGBoost model
    if name == "XGBoost":
      plt.figure(figsize=(8, 6))
      cm = confusion_matrix(y_test, y_pred)
      sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
      plt.title("Confusion Matrix - XGBoost")
      plt.xlabel("Predicted")
      plt.ylabel("Actual")
      plt.tight_layout()
      plt.savefig("confusion_matrix_xgboost.png")

      # Plot feature importance
      if hasattr(model, "feature_importances_"):
        plt.figure(figsize=(10, 8))
        # take names from the frame the model was fitted on so names and
        # importances are guaranteed to line up
        feature_importance = pd.DataFrame({
            "feature": X_train.columns,
            "importances": model.feature_importances_
        }).sort_values("importances", ascending=False).head(10)

        sns.barplot(x="importances", y="feature", data=feature_importance)
        plt.title("Top 10 Important Features - XGBoost")
        plt.tight_layout()
        plt.savefig("feature_importances_xgboost.png")

  return pd.DataFrame(results)


In [None]:
# Compare models and print the metrics table without the index column
results_df = compare_models()
print("Model Comparision Results:", results_df.to_string(index=False), sep="\n")

In [None]:
# Hyper parameter tuning for best model
print("\nPerforming hyperparameter tuning for XGBoost...")
param_grid = {
    "max_depth": [3, 5, 7],
    # XGBClassifier's parameter is lowercase `learning_rate`; with a capital "L"
    # the learning rate was never tuned and the grid was 3x larger for nothing.
    "learning_rate": [0.01, 0.1, 0.2],
    "n_estimators": [100, 200, 300],
    "subsample": [0.8, 0.9, 1.0]
}

xgb = XGBClassifier(random_state=42, eval_metric='logloss')
# 3-fold CV over the full grid, selecting on ROC AUC and using all cores
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid,
                          cv=3, scoring='roc_auc', n_jobs=-1)

grid_search.fit(X_train, y_train)


print(f"Best Prammeter: {grid_search.best_params_}")
print(f"Best Score: {grid_search.best_score_}")

In [None]:
# Save the best model
# GridSearchCV (refit=True by default) has already refitted best_estimator_ on
# the whole training set it was given, so no second fit is needed here.
best_model = grid_search.best_estimator_

# Save model and preprocessing objects so inference can reproduce the same
# encoding, scaling and column order used in training
joblib.dump(best_model, "churn_predictor_model.pkl")
joblib.dump(label_encoders, "label_encoders.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(feature_cols, "feature_cols.pkl")
joblib.dump(categorical_cols, "categorical_cols.pkl")
joblib.dump(numerical_cols, "numerical_cols.pkl")

# the message now names the file that is actually written above
print("\nModel training complete. best model saved as 'churn_predictor_model.pkl'")