<a href="https://colab.research.google.com/github/Addychauhan/Data-Analysis/blob/main/bank_customer_churn_prediction_using_machine_learning1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Importing the required Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

# 2. Loading the Dataset

In [None]:
# Path of the churn CSV inside the Colab runtime
csv_path = "/content/Churn Modeling.csv"
data = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows
data.head(n=5)

In [None]:
# Preview the last five rows
data.tail(n=5)

In [None]:
# Number of missing entries per column
data.isna().sum()

In [None]:
# Number of rows that exactly repeat an earlier row
data.duplicated(keep='first').sum()

# 3. Data Cleaning

In [None]:
# Drop non-informative identifier columns.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: re-running it after the columns are already gone
# no longer raises KeyError.
data = data.drop(columns=['RowNumber', 'CustomerId', 'Surname'], errors='ignore')

# 4. Exploratory Data Analysis

**Target Distribution**

In [None]:

# Count of customers for each value of the Exited target
ax = sns.countplot(data=data, x='Exited')
ax.set_title("Churn Distribution")
plt.show()

**Categorical vs Target**

In [None]:
# Customer counts per Geography, split by the Exited target
ax = sns.countplot(data=data, x='Geography', hue='Exited')
ax.set_title("Geography vs Churn")
plt.show()

**Gender vs Churn**

In [None]:
# Customer counts per Gender, split by the Exited target
ax = sns.countplot(data=data, x='Gender', hue='Exited')
ax.set_title("Gender vs Churn")
plt.show()

**Numerical Distributions**

In [None]:
# Histograms (30 bins each) of the main continuous features
hist_cols = ['CreditScore', 'Age', 'Balance', 'EstimatedSalary']
data[hist_cols].hist(bins=30, figsize=(10, 6))
plt.show()

**Numerical vs Churn**

Using boxplots, I compared the distributions of Age, Balance, and CreditScore for churned and retained customers. Churned customers show a noticeably higher median age, indicating that age is an important churn driver.

In [None]:
# One boxplot per feature, grouped by the Exited target
for feature in ('Age', 'Balance', 'CreditScore'):
    ax = sns.boxplot(data=data, x='Exited', y=feature)
    ax.set_title(f"{feature} vs Churn")
    plt.show()

**Outlier Detection**

In [None]:
# Numeric columns inspected for outliers
num_cols = [
    'CreditScore',
    'Age',
    'Tenure',
    'Balance',
    'EstimatedSalary',
]

In [None]:
# Side-by-side boxplots of the numeric columns on a single axes
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=data[num_cols], ax=ax)
plt.xticks(rotation=45)
ax.set_title("Outlier Detection using Boxplot")
plt.show()

**Correlation Heatmap**

In [None]:
# Pairwise correlations between the int64/float64 columns
numeric_data = data.select_dtypes(include=['int64', 'float64'])
corr_matrix = numeric_data.corr()

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, cmap='RdBu', center=0, ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

# 5. Encoding Categorical Variables
Converts categorical columns into numerical

Example:

Geography → Geography_Germany, Geography_Spain

drop_first=True avoids dummy variable trap

In [None]:
# One-hot encode the categorical columns as 0/1 integers, dropping the first
# level of each so the dummies are not perfectly collinear
categorical_cols = ['Geography', 'Gender']
data = pd.get_dummies(data, columns=categorical_cols, drop_first=True, dtype=int)

In [None]:
# Display the frame after one-hot encoding
data

# 6. Feature–Target Split

In [None]:
# Target is the Exited column; every remaining column is a feature
y = data['Exited']
X = data.drop(columns='Exited')

# 7. Feature Scaling
Feature scaling is essential for ML algorithms that calculate distances between data points. If the data is not scaled, features with a large value range start to dominate the distance calculation.
ML models that require feature scaling include Logistic Regression, KNN, Neural Networks, SVM, and Linear Regression. ML models that do not require feature scaling are mostly non-linear algorithms such as Decision Tree, AdaBoost, Random Forest, and Naive Bayes.

In [None]:
 from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

num_cols = [
    'CreditScore','Age','Tenure','Balance',
    'NumOfProducts','EstimatedSalary'
]


X[num_cols] = scaler.fit_transform(X[num_cols])

# 8. Train–Test Split

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the churn class
# distribution the same in the train and test sets.
split_params = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_params)

# 9. Build Models

**Logistic Regression Model**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Logistic regression with balanced class weights; fit on the training split,
# then predict churn labels for the test split
lr = LogisticRegression(max_iter=1000, class_weight='balanced')
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

In [None]:
# Test-set accuracy of the logistic regression model
accuracy_score(y_true=y_test, y_pred=y_pred_lr)

In [None]:
# ROC AUC measures how well the model ranks churners above non-churners, so it
# must be computed from the predicted probability of the positive class.
# Passing hard 0/1 predictions evaluates a single threshold only.
roc_auc_score(y_test, lr.predict_proba(X_test)[:, 1])

In [None]:
# Precision, recall and F1-score per class for logistic regression
print(classification_report(y_true=y_test, y_pred=y_pred_lr))

**Decision Tree Model**
Even if the data is not scaled, a Decision Tree model can handle it, because tree splits do not depend on feature scale.

In [None]:
# Decision tree with balanced class weights and a fixed seed
dt = DecisionTreeClassifier(random_state=42, class_weight='balanced')
dt.fit(X_train, y_train)
# Churn labels predicted for the test split
y_pred_dt = dt.predict(X_test)

In [None]:
# Test-set accuracy of the decision tree
accuracy_score(y_true=y_test, y_pred=y_pred_dt)

In [None]:
# Compute ROC AUC from the predicted probability of the positive class rather
# than from hard 0/1 labels, which evaluate a single threshold only.
roc_auc_score(y_test, dt.predict_proba(X_test)[:, 1])

In [None]:
# Precision, recall and F1-score per class for the decision tree
print(classification_report(y_true=y_test, y_pred=y_pred_dt))

**Random Forest Model**

In [None]:
# Random forest of 200 trees with balanced class weights and a fixed seed
rf_params = {
    'n_estimators': 200,
    'class_weight': 'balanced',
    'random_state': 42,
}
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train, y_train)
# Churn labels predicted for the test split
y_pred_rf = rf.predict(X_test)

In [None]:
# Display the random forest's predicted labels for the test split
y_pred_rf

In [None]:
# from sklearn.model_selection import cross_val_score

# scores = cross_val_score(rf, X, y, cv=5, scoring='recall')
# scores.mean()

In [None]:
# from sklearn.pipeline import Pipeline

# pipeline = Pipeline([
#     ('scaler', StandardScaler()),
#     ('model', RandomForestClassifier())
# ])
# pipeline.fit(X_train, y_train)

In [None]:
# Test-set accuracy of the random forest
accuracy_score(y_true=y_test, y_pred=y_pred_rf)

In [None]:
# Compute ROC AUC from the predicted probability of the positive class rather
# than from hard 0/1 labels, which evaluate a single threshold only.
roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])

In [None]:
# roc_auc_score(y_test, y_prob)

In [None]:
# Precision, recall and F1-score per class for the random forest
print(classification_report(y_true=y_test, y_pred=y_pred_rf))

**K-Nearest Neighbors (KNN) Model**

In [None]:
# k-nearest-neighbours classifier using the 5 closest training points
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
# Churn labels predicted for the test split
y_pred_knn = knn.predict(X_test)

In [None]:
# Test-set accuracy of the KNN model
accuracy_score(y_true=y_test, y_pred=y_pred_knn)

In [None]:
# Compute ROC AUC from the predicted probability of the positive class rather
# than from hard 0/1 labels, which evaluate a single threshold only.
roc_auc_score(y_test, knn.predict_proba(X_test)[:, 1])

In [None]:
# Precision, recall and F1-score per class for KNN
print(classification_report(y_true=y_test, y_pred=y_pred_knn))

# 10. Model Evaluation

In [None]:
# def evaluate_model(name, y_test, y_pred):
#     print(f"\n{name}")
#     print("-"*50)
#     print("Accuracy:", accuracy_score(y_test, y_pred))
#     print(classification_report(y_test, y_pred))

In [None]:
# evaluate_model("Logistic Regression", y_test, y_pred_lr)
# evaluate_model("Decision Tree", y_test, y_pred_dt)
# evaluate_model("Random Forest", y_test, y_pred_rf)
# evaluate_model("KNN", y_test, y_pred_knn)

In [None]:
# Confusion matrix of the logistic regression predictions (rows = actual)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_lr)
cm

In [None]:
# Test-set predictions of each classifier, keyed by the title used on its plot
models = {
    "Logistic Regression": y_pred_lr,
    "Decision Tree": y_pred_dt,
    "Random Forest": y_pred_rf,
    "KNN": y_pred_knn
}

# Draw one annotated confusion-matrix heatmap per classifier
for name, pred in models.items():
    cm = confusion_matrix(y_test, pred)
    fig, ax = plt.subplots()
    sns.heatmap(cm, annot=True, fmt='d', ax=ax)
    ax.set_title(name)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# Probability of the positive class from the random forest, scored with ROC AUC
y_prob = rf.predict_proba(X_test)[:, 1]
roc_auc_score(y_true=y_test, y_score=y_prob)

In [None]:
# Per-feature importance scores exposed by the fitted random forest
importances = rf.feature_importances_

In [None]:
# 12. FEATURE IMPORTANCE (RANDOM FOREST)

# Pair each feature name with its importance, then rank from highest to lowest
importance_unsorted = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf.feature_importances_
})
importance = importance_unsorted.sort_values(by='Importance', ascending=False)

importance.head(n=10)

In [None]:
# Table of training-feature names alongside their importance scores
importances_df = pd.DataFrame(
    data={'Feature': X_train.columns, 'Importance': importances}
)

In [None]:
# Rank the features from most to least important
importances_df = importances_df.sort_values(by='Importance', ascending=False)

In [None]:
# Ten highest-ranked features
importances_df.head(n=10)

In [None]:
# 13. Feature Importance Plot

# Horizontal bars for the ten highest-ranked features
top_features = importance.head(10)
ax = sns.barplot(data=top_features, x='Importance', y='Feature')
ax.set_title("Top 10 Important Features")
plt.show()

In [None]:
#Saving Model
import joblib

# Column order of the training frame, saved next to the model
feature_order = list(X_train.columns)

# Persist the random forest, the fitted scaler and the feature order
joblib.dump(rf, filename='churn_model.pkl')
joblib.dump(scaler, filename='scaler.pkl')
joblib.dump(feature_order, filename='feature_order.pkl')

In [None]:
#Loading Model
# Keep the scaler and the classifier under separate names. Both were previously
# assigned to `model`, so the loaded scaler was overwritten on the next line and
# could never be used.
loaded_scaler = joblib.load('scaler.pkl')
model = joblib.load('churn_model.pkl')

In [None]:
# Feature values of one example customer, keyed by encoded column name
new_customer_record = {
    'CreditScore': 600,
    'Age': 45,
    'Tenure': 3,
    'Balance': 120000,
    'NumOfProducts': 2,
    'HasCrCard': 1,
    'IsActiveMember': 0,
    'EstimatedSalary': 50000,
    'Geography_Germany': 1,
    'Geography_Spain': 0,
    'Gender_Male': 1,
}

# Single-row frame in the layout expected by the prediction calls
new_customer = pd.DataFrame([new_customer_record])


In [None]:
# Building pipeline

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Continuous features that are standardised inside the pipeline
num_cols = ['CreditScore', 'Age', 'Tenure', 'Balance','NumOfProducts','EstimatedSalary']
# Binary / one-hot columns that are passed through unchanged
cat_cols = ['HasCrCard', 'IsActiveMember',
            'Geography_Germany', 'Geography_Spain', 'Gender_Male']

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', 'passthrough', cat_cols)
])

# Use the same Random Forest configuration as the `rf` model evaluated earlier
# (200 trees, balanced class weights) and fix the seed. Otherwise the saved
# pipeline is a different, non-reproducible model from the one that was assessed.
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('model', RandomForestClassifier(
        n_estimators=200,
        class_weight='balanced',
        random_state=42
    ))
])


In [None]:
#Splitting
x_train_pipeline, x_test_pipeline, y_train_pipeline, y_test_pipeline = train_test_split(X, y,
    test_size=0.2,
    random_state=42, stratify=y)

In [None]:
pipeline.fit(x_train_pipeline, y_train_pipeline)

In [None]:
pipeline.predict(x_test_pipeline)

In [None]:
joblib.dump(pipeline, "churn_pipeline.pkl")

In [None]:
pipeline = joblib.load("churn_pipeline.pkl")
pipeline.predict(new_customer)

In [None]:
import joblib
load_model=joblib.load('churn_model.pkl')

In [None]:
load_model.score(x_test_pipeline, y_test_pipeline)

In [None]:
# Predict the class of the example customer with the fitted pipeline
prediction = pipeline.predict(X=new_customer)
print(f"The prediction for the new customer is: {prediction[0]}")

In [None]:
# Second example customer: an active female member with Geography_Germany and
# Geography_Spain both 0
new_customer1_record = {
    'CreditScore': 650,
    'Age': 30,
    'Tenure': 7,
    'Balance': 60000,
    'NumOfProducts': 1,
    'HasCrCard': 1,
    'IsActiveMember': 1,
    'EstimatedSalary': 70000,
    'Geography_Germany': 0,
    'Geography_Spain': 0,
    'Gender_Male': 0,
}
new_customer1 = pd.DataFrame([new_customer1_record])

# Predict this customer's class with the fitted pipeline
prediction_new_customer1 = pipeline.predict(X=new_customer1)
print(f"The prediction for new_customer1 is: {prediction_new_customer1[0]}")

In [None]:
# Predict again for new_customer1, storing the result under `prediction`
prediction = pipeline.predict(X=new_customer1)
print(f"The prediction for the new customer is: {prediction[0]}")

I built an end-to-end churn prediction system including EDA, encoding, scaling, and multiple models. Random Forest performed best. I used stratified splitting, feature scaling, saved the model and scaler, and handled real-time predictions correctly
