#**Churn Prediction using Machine Learning**

**Customer Churn Prediction**

This project builds and compares machine learning models (XGBoost, Random Forest, Logistic Regression)  
to predict customer churn using structured data.  

**Goal:** Identify which customers are likely to leave, enabling proactive retention strategies.

**Tech Stack:** Python, scikit-learn, XGBoost, pandas, matplotlib, seaborn.  
**Key Techniques:** Imbalance handling, RandomizedSearchCV optimization, ROC-AUC evaluation.

##Setup

In [5]:
#Requirements
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from xgboost import XGBClassifier
from scipy.stats import uniform, randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [6]:
#Dataset setup
df = pd.read_csv('Customer_Churn_Processed.csv')
X = df.drop('ChurnStatus', axis=1)
y = df['ChurnStatus']
N_SAMPLES = len(df)

print(f"Total Samples: {N_SAMPLES}")
print(f"Churn (1) count: {sum(y == 1)}")
print(f"No Churn (0) count: {sum(y == 0)}")
print("-" * 50)

# --- 2. DATA SPLIT ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Training set size: {len(X_train)} | Test set size: {len(X_test)}")
print("-" * 50)

# --- 3. IMBALANCE HANDLING: CALCULATE SCALE_POS_WEIGHT ---
# XGBoost handles imbalance by weighting the minority class during training.
# scale_pos_weight = (Count of Negative Samples) / (Count of Positive Samples)
neg_count = sum(y_train == 0)
pos_count = sum(y_train == 1)
scale_pos_weight_ratio = neg_count / pos_count

print(f"Calculated scale_pos_weight: {scale_pos_weight_ratio:.2f}")
print("This tells model to pay {:.2f} times more attention to churn cases.".format(scale_pos_weight_ratio))
print("-" * 50)

results = {'XGBoost':[], 'Random Forest':[], 'Logistic Regression':[]}


Total Samples: 6812
Churn (1) count: 1352
No Churn (0) count: 5460
--------------------------------------------------
Training set size: 5449 | Test set size: 1363
--------------------------------------------------
Calculated scale_pos_weight: 4.04
This tells model to pay 4.04 times more attention to churn cases.
--------------------------------------------------


##XGBoost

In [7]:
#Initialize XGBoost Classifier with imbalance handling and reproducibility settings
xgb = XGBClassifier(
    random_state=42,
    # Use the calculated ratio to manually balance the classes
    scale_pos_weight=scale_pos_weight_ratio,
    # Standard settings for classification
    use_label_encoder=False,
    eval_metric='logloss'
)

# --- 5. HYPERPARAMETER SEARCH SPACE (DISTRIBUTIONS FOR RANDOMIZED SEARCH) ---
# Define broad distributions for the most impactful XGBoost hyperparameters
param_distributions = {
    # Number of boosting rounds/trees (integer distribution)
    'n_estimators': randint(100, 600),
    # Maximum depth of a tree (integer distribution)
    'max_depth': randint(3, 10),
    # Step size shrinkage used in updates to prevent overfitting (log-uniform distribution)
    'learning_rate': uniform(0.01, 0.3),
    # L2 regularization term on weights (useful for weak signals)
    'reg_lambda': uniform(0.5, 2),
    # Minimum loss reduction required to make a further partition (controls pruning)
    'gamma': uniform(0, 0.5),
    # Subsample ratio of the training instance (fraction)
    'subsample': uniform(0.6, 0.4),
    # Subsample ratio of columns when constructing each tree
    'colsample_bytree': uniform(0.6, 0.4)
}

# --- 6. RANDOMIZED SEARCH CV SETUP AND EXECUTION ---
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_distributions,
    n_iter=50,             # Number of different combinations to try
    cv=5,                  # 5-fold cross-validation
    scoring='roc_auc',     # Optimize for the robust ROC AUC metric
    verbose=1,
    random_state=42,
    n_jobs=-1              # Use all available CPU cores
)

print("Starting Randomized Search for XGBoost (Optimizing for ROC AUC)...")
random_search.fit(X_train, y_train)
print("Randomized Search Complete.")
print("-" * 50)


# --- 7. FINAL MODEL EVALUATION ---
best_xgb = random_search.best_estimator_

# A. Print Best Parameters and ROC AUC Score
print("Best Hyperparameters Found:")
print(random_search.best_params_)

print("\nBest CV ROC AUC Score:", random_search.best_score_)
print("-" * 50)


# B. Evaluate on the Test Set
print("Evaluating Final XGBoost Model on Test Set...")

# Get probability predictions (needed for ROC AUC)
y_proba = best_xgb.predict_proba(X_test)[:, 1]
test_roc_auc = roc_auc_score(y_test, y_proba)
print(f"Test Set ROC AUC: {test_roc_auc:.4f}")
results['XGBoost'] = float(test_roc_auc)

# Get hard predictions (needed for Classification Report)
y_pred = best_xgb.predict(X_test)
print("\nClassification Report on Test Set:")
print(classification_report(y_test, y_pred))


# C. Confusion Matrix (Visualizing true positives/negatives)
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print("              Predicted No Churn | Predicted Churn")
print(f"Actual No Churn:      {cm[0, 0]}          |      {cm[0, 1]}")
print(f"Actual Churn:         {cm[1, 0]}           |      {cm[1, 1]}")

Starting Randomized Search for XGBoost (Optimizing for ROC AUC)...
Fitting 5 folds for each of 50 candidates, totalling 250 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Randomized Search Complete.
--------------------------------------------------
Best Hyperparameters Found:
{'colsample_bytree': np.float64(0.6943939678995823), 'gamma': np.float64(0.12803416138066198), 'learning_rate': np.float64(0.022130076861529402), 'max_depth': 9, 'n_estimators': 114, 'reg_lambda': np.float64(0.7217816416236627), 'subsample': np.float64(0.7757346007463081)}

Best CV ROC AUC Score: 0.9874894549487367
--------------------------------------------------
Evaluating Final XGBoost Model on Test Set...
Test Set ROC AUC: 0.9964

Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1092
           1       0.97      0.96      0.97       271

    accuracy                           0.99      1363
   macro avg       0.98      0.98      0.98      1363
weighted avg       0.99      0.99      0.99      1363

Confusion Matrix:
              Predicted No Churn | Predicted Churn
Actual No Churn:      

##Random Forests

In [8]:
#Initialize Random Forest Classifier with class weighting for imbalance
rf = RandomForestClassifier(
    random_state=42,
    class_weight='balanced',  # Automatically handle class imbalance
    n_jobs=-1                 # Use all CPU cores
)

# --- 5. HYPERPARAMETER SEARCH SPACE (DISTRIBUTIONS FOR RANDOMIZED SEARCH) ---
# Define broad distributions for key Random Forest hyperparameters
param_distributions = {
    # Number of trees in the forest
    'n_estimators': randint(100, 600),
    # Maximum depth of each tree
    'max_depth': randint(3, 20),
    # Minimum number of samples required to split an internal node
    'min_samples_split': randint(2, 20),
    # Minimum number of samples required to be at a leaf node
    'min_samples_leaf': randint(1, 10),
    # Number of features to consider when looking for the best split
    'max_features': ['sqrt', 'log2', None],
    # Whether bootstrap samples are used when building trees
    'bootstrap': [True, False]
}

# --- 6. RANDOMIZED SEARCH CV SETUP AND EXECUTION ---
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_distributions,
    n_iter=50,             # Number of different combinations to try
    cv=5,                  # 5-fold cross-validation
    scoring='roc_auc',     # Optimize for the robust ROC AUC metric
    verbose=1,
    random_state=42,
    n_jobs=-1              # Use all available CPU cores
)

print("Starting Randomized Search for Random Forest (Optimizing for ROC AUC)...")
random_search.fit(X_train, y_train)
print("Randomized Search Complete.")
print("-" * 50)


# --- 7. FINAL MODEL EVALUATION ---
best_rf = random_search.best_estimator_

# A. Print Best Parameters and ROC AUC Score
print("Best Hyperparameters Found:")
print(random_search.best_params_)

print("\nBest CV ROC AUC Score:", random_search.best_score_)
print("-" * 50)


# B. Evaluate on the Test Set
print("Evaluating Final Random Forest Model on Test Set...")

# Get probability predictions (needed for ROC AUC)
y_proba = best_rf.predict_proba(X_test)[:, 1]
test_roc_auc = roc_auc_score(y_test, y_proba)
print(f"Test Set ROC AUC: {test_roc_auc:.4f}")
results['Random Forest'] = float(test_roc_auc)

# Get hard predictions (needed for Classification Report)
y_pred = best_rf.predict(X_test)
print("\nClassification Report on Test Set:")
print(classification_report(y_test, y_pred))


# C. Confusion Matrix (Visualizing true positives/negatives)
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print("              Predicted No Churn | Predicted Churn")
print(f"Actual No Churn:      {cm[0, 0]}          |      {cm[0, 1]}")
print(f"Actual Churn:         {cm[1, 0]}           |      {cm[1, 1]}")


Starting Randomized Search for Random Forest (Optimizing for ROC AUC)...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Randomized Search Complete.
--------------------------------------------------
Best Hyperparameters Found:
{'bootstrap': False, 'max_depth': 14, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 11, 'n_estimators': 575}

Best CV ROC AUC Score: 0.9943363790651544
--------------------------------------------------
Evaluating Final Random Forest Model on Test Set...
Test Set ROC AUC: 0.9961

Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1092
           1       1.00      0.97      0.98       271

    accuracy                           0.99      1363
   macro avg       0.99      0.98      0.99      1363
weighted avg       0.99      0.99      0.99      1363

Confusion Matrix:
              Predicted No Churn | Predicted Churn
Actual No Churn:      10

##Logistic Regression

In [9]:
#Initialize Logistic Regression with class weighting for imbalance
log_reg = LogisticRegression(
    random_state=42,
    class_weight='balanced',   # Automatically adjust weights inversely to class frequencies
    solver='saga',             # Supports L1/L2 regularization and elastic net
    max_iter=5000,             # Ensure convergence for larger datasets
    n_jobs=-1
)

# --- 5. HYPERPARAMETER SEARCH SPACE (DISTRIBUTIONS FOR RANDOMIZED SEARCH) ---
# Define broad distributions for Logistic Regression hyperparameters
param_distributions = {
    # Regularization strength (inverse of regularization coefficient)
    # Smaller C = stronger regularization
    'C': uniform(0.001, 10),
    # Type of penalty (L1 = Lasso, L2 = Ridge)
    'penalty': ['l1', 'l2', 'elasticnet'],
    # Elastic net mixing parameter (only used if penalty='elasticnet')
    'l1_ratio': uniform(0, 1)
}

# --- 6. RANDOMIZED SEARCH CV SETUP AND EXECUTION ---
random_search = RandomizedSearchCV(
    estimator=log_reg,
    param_distributions=param_distributions,
    n_iter=50,             # Number of different combinations to try
    cv=5,                  # 5-fold cross-validation
    scoring='roc_auc',     # Optimize for ROC AUC
    verbose=1,
    random_state=42,
    n_jobs=-1
)

print("Starting Randomized Search for Logistic Regression (Optimizing for ROC AUC)...")
random_search.fit(X_train, y_train)
print("Randomized Search Complete.")
print("-" * 50)


# --- 7. FINAL MODEL EVALUATION ---
best_log_reg = random_search.best_estimator_

# A. Print Best Parameters and ROC AUC Score
print("Best Hyperparameters Found:")
print(random_search.best_params_)

print("\nBest CV ROC AUC Score:", random_search.best_score_)
print("-" * 50)


# B. Evaluate on the Test Set
print("Evaluating Final Logistic Regression Model on Test Set...")

# Get probability predictions (needed for ROC AUC)
y_proba = best_log_reg.predict_proba(X_test)[:, 1]
test_roc_auc = roc_auc_score(y_test, y_proba)
print(f"Test Set ROC AUC: {test_roc_auc:.4f}")
results['Logistic Regression'] = float(test_roc_auc)

# Get hard predictions (needed for Classification Report)
y_pred = best_log_reg.predict(X_test)
print("\nClassification Report on Test Set:")
print(classification_report(y_test, y_pred))


# C. Confusion Matrix (Visualizing true positives/negatives)
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print("              Predicted No Churn | Predicted Churn")
print(f"Actual No Churn:      {cm[0, 0]}          |      {cm[0, 1]}")
print(f"Actual Churn:         {cm[1, 0]}           |      {cm[1, 1]}")

Starting Randomized Search for Logistic Regression (Optimizing for ROC AUC)...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Randomized Search Complete.
--------------------------------------------------
Best Hyperparameters Found:
{'C': np.float64(1.988156815341724), 'l1_ratio': np.float64(0.005522117123602399), 'penalty': 'elasticnet'}

Best CV ROC AUC Score: 0.6059807156117595
--------------------------------------------------
Evaluating Final Logistic Regression Model on Test Set...
Test Set ROC AUC: 0.5899

Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.83      0.60      0.70      1092
           1       0.24      0.51      0.32       271

    accuracy                           0.58      1363
   macro avg       0.53      0.55      0.51      1363
weighted avg       0.71      0.58      0.62      1363

Confusion Matrix:
              Predicted No Churn | Predicted Churn
Actual No Churn:      654          

##Evaluation

In [10]:
for model, roc_auc in results.items():
    print(f"{model} ROC AUC: {roc_auc:.4f}")


XGBoost ROC AUC: 0.9964
Random Forest ROC AUC: 0.9961
Logistic Regression ROC AUC: 0.5899
