# Customer Churn Prediction Analysis

This notebook analyzes customer churn data and builds predictive models to identify customers likely to churn.

## 1. Import Libraries

In [None]:
# Commented out IPython magic to ensure Python compatibility.
import pickle

import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import chi2
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from xgboost import XGBClassifier
# %matplotlib inline

# Show up to 100 columns when a DataFrame is displayed, so wide frames are not truncated.
pd.options.display.max_columns = 100

## 2. Load and Explore Data

In [None]:
# Read the churn CSV (path is relative to the notebook's working directory).
dataset = pd.read_csv(filepath_or_buffer='Churn_Modelling.csv')

# Preview the top of the table; five rows matches the printed label.
print("First 5 rows:")
dataset.head(n=5)

In [None]:
# Preview the bottom of the table (last five rows).
print("Last 5 rows:")
dataset.iloc[-5:]

In [None]:
# Column names, non-null counts and dtypes; info() writes straight to stdout.
print("Dataset Info:")
dataset.info()

In [None]:
# Count null entries per column.
missing_per_column = dataset.isna().sum()
print("Missing Values:")
print(missing_per_column)

## 3. Data Visualization

In [None]:
# Bar chart of how many rows fall in each class of the target column 'Exited'.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=dataset, x='Exited', ax=ax)
ax.set_title('Distribution of Churn (Exited)')
plt.show()

In [None]:
# Drop the row index, customer id and surname columns before modelling.
# Note: `dataset` is rebound here, so re-running this cell without reloading
# the CSV raises KeyError because the columns are already gone.
dataset = dataset.drop(columns=['RowNumber', 'CustomerId', 'Surname'])
print("Dataset after removing unnecessary columns:")
dataset.head()

### Identify Data Types

In [None]:
# Summarise the distinct values of every column: columns with at most 12
# distinct values are printed in full, the rest only report their count.
for column in dataset.columns:
    # Drop missing entries rather than filling them with the string '0':
    # a string placeholder in a numeric column makes np.unique fail when it
    # tries to sort mixed str/number values, and in string columns it would
    # be counted as a fake extra category.
    unique_values = np.unique(dataset[column].dropna())
    nr_values = len(unique_values)
    if nr_values <= 12:
        print(f'Number of values in {column}: {nr_values} -> {unique_values}')
    else:
        print(f'Number of values in {column}: {nr_values}')

### Plot Categorical Data

In [None]:
# One count plot per feature, with bars split by the 'Exited' label.
categorical_features = ['Geography', 'Tenure', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'Age']

for feature in categorical_features:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(data=dataset, x=feature, hue='Exited', ax=ax)
    ax.set_title(f'Churn Distribution by {feature}')
    # Tilt the tick labels so features with many levels stay legible.
    plt.xticks(rotation=45)
    plt.show()

### Plot Numerical Data

In [None]:
# Horizontal box plot per numeric column to eyeball spread and outliers.
numerical_vars = ['CreditScore', 'EstimatedSalary', 'Balance']

for var in numerical_vars:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.boxplot(data=dataset, x=var, ax=ax)
    ax.set_title(f'Box Plot of {var} (Outlier Detection)')
    plt.show()

## 4. Data Preprocessing

In [None]:
# Label encode Gender: replace the text labels with integer codes.
le = LabelEncoder()
dataset['Gender'] = le.fit_transform(dataset['Gender'])

# Report the mapping the encoder actually learned instead of a hard-coded
# message, so the printout cannot disagree with the data. For the labels
# 'Female'/'Male' this prints exactly "0 = Female, 1 = Male".
gender_mapping = ", ".join(f"{code} = {label}" for code, label in enumerate(le.classes_))
print(f"Gender encoding: {gender_mapping}")
print(dataset[['Gender']].head())

In [None]:
# Expand 'Geography' into indicator columns; drop_first=True omits the first
# level so the remaining indicators are not perfectly collinear.
dataset_encoded = pd.get_dummies(data=dataset, columns=['Geography'], drop_first=True)
print("Dataset after one-hot encoding:")
dataset_encoded.head()

## 5. Feature Selection

In [None]:
# Hold out a test set before any feature scoring so selection only sees
# training rows (avoids leakage).
y = dataset_encoded['Exited']
X = dataset_encoded.drop(columns='Exited')

# 80/20 split, stratified on the target, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

In [None]:
# Chi-squared feature scoring on the training split only.

# Apply the plot style BEFORE the figure is created. Calling set_style after
# drawing only affects later figures, so the first run and a re-run of this
# cell would otherwise look different.
sns.set_style('whitegrid')

# chi2 rejects negative inputs, so score on absolute values.
X_train_abs = X_train.abs()

# k='all' keeps every feature; the selector is used only for its scores.
sf = SelectKBest(score_func=chi2, k='all')
sf_fit = sf.fit(X_train_abs, y_train)

# Create dataframe with feature scores
feature_scores_df = pd.DataFrame({
    'feature': X_train.columns,
    'scores': sf_fit.scores_
})

# Sort by score descending and keep at most the 20 best
feature_scores_df = feature_scores_df.sort_values(by='scores', ascending=False).head(20)

# Plot feature scores
plt.figure(figsize=(12, 8))
sns.barplot(x='scores', y='feature', data=feature_scores_df, color='blue')
plt.ylabel('Feature', fontsize=14)
plt.xlabel('Chi-squared Score', fontsize=14)
# Title reports how many features are actually shown (fewer than 20 when the
# frame has fewer columns).
plt.title(f'Top {len(feature_scores_df)} Features by Chi-Squared Score', fontsize=16)
plt.tight_layout()
plt.show()

In [None]:
# Fit a 1000-tree forest on the training split and rank features by importance.
rf = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1)
rf.fit(X_train, y_train)

importances = rf.feature_importances_
# argsort is ascending; reverse it so the most important feature comes first.
indices = np.argsort(importances)[::-1]

print("Feature Importance Ranking:")
for rank, col_idx in enumerate(indices, start=1):
    # rank right-aligned in 2 chars, name left-aligned in 30, score with 6 decimals
    print(f"{rank:2d}) {X_train.columns[col_idx]!s:<30} {importances[col_idx]:f}")

In [None]:
# Keep the features whose forest importance is at or above the median.
# prefit=True reuses the already-fitted `rf` instead of refitting it.
sfm = SelectFromModel(rf, threshold='median', prefit=True)

print(f'Number of features before selection: {X_train.shape[1]}')
n_features = sfm.transform(X_train).shape[1]
print(f'Number of features after selection: {n_features}')

# The boolean support mask picks out the surviving column names.
selected_vars = X_train.columns[sfm.get_support()].tolist()
print("Selected features:", selected_vars)

## 6. Create Feature-Engineered Dataset

In [None]:
# Restrict the encoded frame to the selected features plus the target column.
keep_columns = selected_vars + ['Exited']
datasetFe = dataset_encoded.loc[:, keep_columns]
print("Feature-engineered dataset:")
datasetFe.head()

In [None]:
# Summarise the distinct values of each selected column: columns with at most
# 12 distinct values are listed in full, the rest only report their count.
print("\nUnique values in selected features:")
for column in datasetFe.columns:
    # Drop missing entries rather than filling them with the string '0':
    # a string placeholder in a numeric column makes np.unique fail when it
    # tries to sort mixed str/number values.
    unique_values = np.unique(datasetFe[column].dropna())
    nr_values = len(unique_values)
    if nr_values <= 12:
        print(f'{column}: {nr_values} values -> {unique_values}')
    else:
        print(f'{column}: {nr_values} values')

## 7. Handle Class Imbalance with SMOTE

In [None]:
# Rebuild the feature matrix and target from the reduced dataset.
y = datasetFe['Exited']
X = datasetFe.drop(columns='Exited')

# Same test fraction, stratification and seed as the earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit the scaler on training rows only, then apply it to both splits so no
# test-set statistics leak into the transform.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_sc = scaler.transform(X_test)

# Oversample the scaled training data only; the test set is left untouched.
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train_scaled, y_train)

print("Class distribution before SMOTE:")
print(y_train.value_counts())
print("\nClass distribution after SMOTE:")
print(pd.Series(y_train_bal).value_counts())

In [None]:
# Create balanced DataFrame for visualization.
# X_train_bal comes from the StandardScaler output, so these values are
# standardized, not in the original units.
balanced_df = pd.DataFrame(X_train_bal, columns=X_train.columns)
balanced_df['Exited'] = y_train_bal

# Count plots need discrete levels. The balanced data is standardized and
# SMOTE synthesises new rows between existing ones, so plotting it directly
# puts one bar on every distinct float. Work on a copy mapped back to the
# original units; `balanced_df` itself is left unchanged.
balanced_plot_df = pd.DataFrame(
    scaler.inverse_transform(X_train_bal), columns=X_train.columns
)
balanced_plot_df['Exited'] = y_train_bal

# Plot categorical features after balancing
categorical_features = [col for col in ['Tenure', 'IsActiveMember'] if col in balanced_df.columns]

for feature in categorical_features:
    # Snap synthetic in-between values to the nearest integer level
    # (display only; the training data is not modified).
    balanced_plot_df[feature] = balanced_plot_df[feature].round().astype(int)

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.countplot(x=feature, data=balanced_plot_df, hue='Exited', ax=ax)
    ax.set_title(f"Class Distribution by {feature} (After SMOTE)")
    plt.show()

In [None]:
# Per-class density curves for whichever numeric features survived selection.
# balanced_df holds the scaled training data, so the x-axis is in standardized units.
numeric_candidates = ['Age', 'CreditScore', 'Balance', 'EstimatedSalary']
numerical_features = [name for name in numeric_candidates if name in balanced_df.columns]

for feature in numerical_features:
    fig, ax = plt.subplots(figsize=(8, 6))
    # common_norm=False normalises each class separately.
    sns.kdeplot(
        data=balanced_df,
        x=feature,
        hue='Exited',
        fill=True,
        common_norm=False,
        ax=ax,
    )
    ax.set_title(f"Distribution of {feature} by Class (After SMOTE)")
    plt.show()

## 8. Model Training and Evaluation

### 8.1 Random Forest Classifier

In [None]:
# Random Forest: train on the balanced training data, evaluate on the held-out test set.
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train_bal, y_train_bal)
y_pred_rf = model_rf.predict(X_test_sc)
# Probability of the positive class (second column of predict_proba).
# ROC AUC must be computed from scores: with hard 0/1 labels the curve
# collapses to a single operating point and the value is not a real AUC.
y_proba_rf = model_rf.predict_proba(X_test_sc)[:, 1]

# Confusion Matrix, normalised per true class (each row sums to 1)
cm_rf = confusion_matrix(y_test, y_pred_rf, normalize='true')
disp_rf = ConfusionMatrixDisplay(confusion_matrix=cm_rf, display_labels=model_rf.classes_)

# Pass the axes explicitly: without `ax`, plot() opens its own figure and a
# preceding plt.figure() would be left behind as a blank extra figure.
fig, ax = plt.subplots(figsize=(8, 6))
disp_rf.plot(ax=ax, cmap='Blues', values_format=".2f")
ax.set_title("Random Forest - Confusion Matrix (Percentages)")
plt.show()

# Metrics
print("Random Forest Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_proba_rf):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf, target_names=["No Churn", "Churn"]))

### 8.2 Logistic Regression

In [None]:
# Logistic Regression: train on the balanced training data, evaluate on the held-out test set.
# LogisticRegression is imported in the notebook's import cell.
model_lr = LogisticRegression(max_iter=5000, random_state=42)
model_lr.fit(X_train_bal, y_train_bal)
y_pred_lr = model_lr.predict(X_test_sc)
# Probability of the positive class (second column of predict_proba).
# ROC AUC must be computed from scores: with hard 0/1 labels the curve
# collapses to a single operating point and the value is not a real AUC.
y_proba_lr = model_lr.predict_proba(X_test_sc)[:, 1]

# Confusion Matrix, normalised per true class (each row sums to 1)
cm_lr = confusion_matrix(y_test, y_pred_lr, normalize='true')
disp_lr = ConfusionMatrixDisplay(confusion_matrix=cm_lr, display_labels=model_lr.classes_)

# Pass the axes explicitly: without `ax`, plot() opens its own figure and a
# preceding plt.figure() would be left behind as a blank extra figure.
fig, ax = plt.subplots(figsize=(8, 6))
disp_lr.plot(ax=ax, cmap='Blues', values_format=".2f")
ax.set_title("Logistic Regression - Confusion Matrix (Percentages)")
plt.show()

# Metrics
print("Logistic Regression Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_lr):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_proba_lr):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_lr, target_names=["No Churn", "Churn"]))

### 8.3 XGBoost Classifier

In [None]:
# XGBoost: train on the balanced training data, evaluate on the held-out test set.
# XGBClassifier is imported in the notebook's import cell with the other dependencies.
model_xgb = XGBClassifier(
    eval_metric='logloss',
    random_state=42
)

model_xgb.fit(X_train_bal, y_train_bal)
y_pred_xgb = model_xgb.predict(X_test_sc)
# Probability of the positive class (second column of predict_proba).
# ROC AUC must be computed from scores: with hard 0/1 labels the curve
# collapses to a single operating point and the value is not a real AUC.
y_proba_xgb = model_xgb.predict_proba(X_test_sc)[:, 1]

# Confusion Matrix, normalised per true class (each row sums to 1)
cm_xgb = confusion_matrix(y_test, y_pred_xgb, normalize="true")
disp_xgb = ConfusionMatrixDisplay(confusion_matrix=cm_xgb, display_labels=model_xgb.classes_)

# Pass the axes explicitly: without `ax`, plot() opens its own figure and a
# preceding plt.figure() would be left behind as a blank extra figure.
fig, ax = plt.subplots(figsize=(8, 6))
disp_xgb.plot(ax=ax, cmap="Blues")
ax.set_title("XGBoost - Confusion Matrix")
plt.show()

# Metrics
print("XGBoost Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_xgb):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_proba_xgb):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_xgb, target_names=["No Churn", "Churn"]))

## 9. Model Comparison

In [None]:
# Compare the three fitted models on the held-out test set.
models = ['Random Forest', 'Logistic Regression', 'XGBoost']
fitted_models = [model_rf, model_lr, model_xgb]
test_predictions = [y_pred_rf, y_pred_lr, y_pred_xgb]

# Accuracy uses the hard class predictions.
accuracies = [accuracy_score(y_test, y_pred) for y_pred in test_predictions]

# ROC AUC is a ranking metric, so it needs the positive-class probability
# (second column of predict_proba), not the thresholded 0/1 labels.
roc_aucs = [
    roc_auc_score(y_test, fitted.predict_proba(X_test_sc)[:, 1])
    for fitted in fitted_models
]

comparison_df = pd.DataFrame({
    'Model': models,
    'Accuracy': accuracies,
    'ROC AUC': roc_aucs
})

print("Model Performance Comparison:")
print(comparison_df.round(4))

# Plot comparison: one panel per metric, side by side
fig, (ax_acc, ax_auc) = plt.subplots(1, 2, figsize=(12, 5))

sns.barplot(x='Model', y='Accuracy', data=comparison_df, ax=ax_acc)
ax_acc.set_title('Model Accuracy Comparison')
ax_acc.tick_params(axis='x', labelrotation=45)

sns.barplot(x='Model', y='ROC AUC', data=comparison_df, ax=ax_auc)
ax_auc.set_title('Model ROC AUC Comparison')
ax_auc.tick_params(axis='x', labelrotation=45)

fig.tight_layout()
plt.show()

## 10. Save Results and Model

In [None]:
# Choose the best model (based on your evaluation)
best_model = model_rf  # Change this to your preferred model

# Scale the full feature matrix once and reuse it for both outputs.
X_all_scaled = scaler.transform(X)

# Adding predictions back to the original dataset.
# Note: X covers every row, including the rows the model was trained on, so
# these predictions are partly in-sample and not a performance estimate.
dataset_encoded['Exited Prediction'] = best_model.predict(X_all_scaled)
dataset_encoded['Exited Prediction Probability'] = best_model.predict_proba(X_all_scaled)[:, 1]

# Export the data with predictions
dataset_encoded.to_csv("bank_churn_data_with_predictions.csv", index=False)
print("Dataset with predictions saved as 'bank_churn_data_with_predictions.csv'")

# Create feature importance dataframe.
# Tree ensembles expose feature_importances_; linear models expose coef_
# instead, so fall back to coefficient magnitude. This keeps the cell working
# when best_model is switched to the logistic regression.
if hasattr(best_model, 'feature_importances_'):
    importance_values = best_model.feature_importances_
elif hasattr(best_model, 'coef_'):
    importance_values = np.abs(np.ravel(best_model.coef_))
else:
    raise AttributeError(
        f"{type(best_model).__name__} exposes neither feature_importances_ nor coef_"
    )

feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': importance_values
}).sort_values('importance', ascending=False)

feature_importance.to_csv("feature_importance.csv", index=False)
print("Feature importance saved as 'feature_importance.csv'")

# Save the best model
with open('best_churn_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)
print("Best model saved as 'best_churn_model.pkl'")

# Save the scaler (required to preprocess new data the same way before predicting)
with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)
print("Scaler saved as 'scaler.pkl'")

print("\nAll operations completed successfully!")

## 11. Summary

In [None]:
# Final summary
# Score whichever model is bound to best_model, rather than always reporting
# the Random Forest predictions.
best_pred = best_model.predict(X_test_sc)
# ROC AUC needs the positive-class probability, not the hard labels.
best_proba = best_model.predict_proba(X_test_sc)[:, 1]

# The save step appends prediction columns to dataset_encoded; leave them out
# so the reported shape describes the input data. errors='ignore' keeps this
# working if those columns are absent.
prediction_columns = ['Exited Prediction', 'Exited Prediction Probability']
original_shape = dataset_encoded.drop(columns=prediction_columns, errors='ignore').shape

print("=== ANALYSIS SUMMARY ===\n")
print(f"Original dataset shape: {original_shape}")
print(f"Selected features: {len(selected_vars)}")
print(f"Best model: {type(best_model).__name__}")
print(f"Best model accuracy: {accuracy_score(y_test, best_pred):.4f}")
print(f"Best model ROC AUC: {roc_auc_score(y_test, best_proba):.4f}")

print("\nTop 5 most important features:")
print(feature_importance.head())

print("\nFiles generated:")
print("1. bank_churn_data_with_predictions.csv - Dataset with predictions")
print("2. feature_importance.csv - Feature importance rankings")
print("3. best_churn_model.pkl - Trained model for future predictions")
print("4. scaler.pkl - Scaler for preprocessing new data")