# CUSTOMER CHURN PREDICTION
## To Do:
### Develop a model to predict customer churn for a subscription-based service or business. Use historical customer data, including features like usage behavior and customer demographics, and try algorithms like Logistic Regression, Random Forests, or Gradient Boosting to predict churn.

In [1]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [2]:
# Location of the Churn_Modelling.csv dataset.
# Set the CHURN_DATA_PATH environment variable to point at your own copy of the
# file; when it is unset, the original hard-coded local path is used as before.
DATA_PATH = os.environ.get(
    "CHURN_DATA_PATH",
    "C:/Users/CHARAN/Downloads/Bank Customer Churn Prediction Dataset/Churn_Modelling.csv",
)

# Load the raw customer table into a DataFrame
data = pd.read_csv(DATA_PATH)

In [3]:
# Separate the label from the predictors
target_name = 'Exited'
y = data[target_name]  # Target variable: the 'Exited' column
X = data.drop(columns=[target_name])  # Features: every column except the target

In [4]:
# Column groups used by the preprocessing step:
# categorical columns are one-hot encoded, numerical columns are imputed and scaled
categorical_cols = [
    'Geography',
    'Gender',
]
numerical_cols = [
    'CreditScore',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'HasCrCard',
    'IsActiveMember',
    'EstimatedSalary',
]

In [5]:
# Numeric features: fill missing values with the column median, then standardize
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

In [6]:
# Categorical features: fill missing values with the most frequent category,
# then one-hot encode (categories unseen during fit are ignored at transform time)
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [7]:
# Route each column group through its own transformer in one preprocessing step.
# Columns of X that appear in neither list are dropped (ColumnTransformer's
# default remainder='drop').
column_routes = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [8]:
# Candidate classifiers, keyed by the label used when reporting their results.
# All are seeded with random_state=42 so repeated runs give the same models.
models = {}
models['Logistic Regression'] = LogisticRegression(max_iter=1000, random_state=42)
models['Random Forest'] = RandomForestClassifier(random_state=42)
models['Gradient Boosting'] = GradientBoostingClassifier(random_state=42)

In [9]:
# Hold out 20% of the rows once, before the loop. The split is seeded, so a
# single split yields exactly the same partition as re-splitting for every
# model, and X_train / X_test / y_train / y_test stay defined for later cells
# even if `models` were empty.
# NOTE(review): the split is not stratified on y - consider stratify=y if the
# class balance of the hold-out set matters.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train each candidate model behind the shared preprocessor and report its metrics
for name, model in models.items():
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', model)])

    # Fit preprocessing + classifier on the training rows only
    clf.fit(X_train, y_train)

    # Hard labels feed the report and confusion matrix;
    # positive-class probabilities feed the AUC
    y_pred = clf.predict(X_test)
    y_proba = clf.predict_proba(X_test)[:, 1]

    # Evaluate the model on the held-out rows
    print(f"Model: {name}")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print(f"AUC-ROC: {roc_auc_score(y_test, y_proba):.4f}")
    print("="*60)

Model: Logistic Regression
              precision    recall  f1-score   support

           0       0.83      0.96      0.89      1607
           1       0.55      0.20      0.29       393

    accuracy                           0.81      2000
   macro avg       0.69      0.58      0.59      2000
weighted avg       0.78      0.81      0.77      2000

Confusion Matrix:
 [[1543   64]
 [ 314   79]]
AUC-ROC: 0.7789
Model: Random Forest
              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.75      0.46      0.57       393

    accuracy                           0.86      2000
   macro avg       0.81      0.71      0.75      2000
weighted avg       0.85      0.86      0.85      2000

Confusion Matrix:
 [[1545   62]
 [ 211  182]]
AUC-ROC: 0.8563
Model: Gradient Boosting
              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.74      0.47      0

In [10]:
# Hyperparameter tuning for the best model (e.g., Random Forest)
# Grid keys use the '<step>__<param>' form to reach the pipeline's 'classifier' step.
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
}

# Fresh Random Forest behind the shared preprocessor
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Exhaustive search with 5-fold cross-validation, ranked by ROC AUC
grid_search = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print("Best parameters found:", grid_search.best_params_)
print(f"Best AUC-ROC: {grid_search.best_score_:.4f}")

Best parameters found: {'classifier__max_depth': 10, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 200}
Best AUC-ROC: 0.8583


In [11]:
# Evaluate the grid search's best estimator on the held-out test rows
best_model = grid_search.best_estimator_
y_proba_best = best_model.predict_proba(X_test)[:, 1]  # positive-class probability
y_pred_best = best_model.predict(X_test)

final_report = classification_report(y_test, y_pred_best)
final_confusion = confusion_matrix(y_test, y_pred_best)
final_auc = roc_auc_score(y_test, y_proba_best)

print("Final Model Evaluation:")
print(final_report)
print("Confusion Matrix:\n", final_confusion)
print(f"AUC-ROC: {final_auc:.4f}")

Final Model Evaluation:
              precision    recall  f1-score   support

           0       0.88      0.97      0.92      1607
           1       0.76      0.44      0.56       393

    accuracy                           0.86      2000
   macro avg       0.82      0.70      0.74      2000
weighted avg       0.85      0.86      0.85      2000

Confusion Matrix:
 [[1554   53]
 [ 221  172]]
AUC-ROC: 0.8678


In [13]:
# Hold out 20% of the rows once, before the loop. The split is seeded, so a
# single split yields exactly the same partition as re-splitting for every
# model, and the train/test names stay defined for the code that follows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Re-train every candidate model and save its evaluation to model_results.txt
# (mode "w" overwrites any previous results file)
with open("model_results.txt", "w") as f:
    for name, model in models.items():
        clf = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', model)])

        # Fit preprocessing + classifier on the training rows only
        clf.fit(X_train, y_train)

        # Hard labels feed the report and confusion matrix;
        # positive-class probabilities feed the AUC
        y_pred = clf.predict(X_test)
        y_proba = clf.predict_proba(X_test)[:, 1]

        # Write this model's section, ending with a separator line
        f.write(f"Model: {name}\n")
        f.write(classification_report(y_test, y_pred))
        f.write("Confusion Matrix:\n")
        f.write(f"{confusion_matrix(y_test, y_pred)}\n")
        f.write(f"AUC-ROC: {roc_auc_score(y_test, y_proba):.4f}\n")
        f.write("="*60 + "\n")

# Hyperparameter tuning for the best model (e.g., Random Forest)
# NOTE(review): this repeats the grid search from the earlier tuning cell with
# the same grid and seed - confirm whether the re-run is intentional.
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
}

# Fresh Random Forest behind the shared preprocessor
tuning_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Exhaustive search with 5-fold cross-validation, ranked by ROC AUC
grid_search = GridSearchCV(
    estimator=tuning_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Score the best estimator on the held-out test rows
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
y_proba_best = best_model.predict_proba(X_test)[:, 1]

# Save the final evaluation: build every line first, then append them in one go
final_lines = [
    "Best parameters found:\n",
    f"{grid_search.best_params_}\n",
    f"Best AUC-ROC: {grid_search.best_score_:.4f}\n\n",
    "Final Model Evaluation:\n",
    classification_report(y_test, y_pred_best),
    "Confusion Matrix:\n",
    f"{confusion_matrix(y_test, y_pred_best)}\n",
    f"AUC-ROC: {roc_auc_score(y_test, y_proba_best):.4f}\n",
]
with open("model_results.txt", "a") as f:
    f.writelines(final_lines)