In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Loading the data
df = pd.read_csv("C:/Users/npanousieris/OneDrive - Department of Education/Power BI alex/Churn Prediction Bank Yanis/churn raw data.csv")

# Preserve customer ID for later
customer_id = df['CustomerId']

# Drop unnecessary columns
df2 = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'])

# Apply one-hot encoding and convert to integer
df_model = pd.get_dummies(df2, columns=['Geography', 'Gender', 'HasCrCard', 'IsActiveMember'])
df_model = df_model.astype(int)

# Separate features (X) and target (y)
X = df_model.drop('Exited', axis=1)
y = df_model['Exited']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=17, test_size=0.2)

# Train RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

# Predictions on test data
y_pred = rf.predict(X_test)

# Model evaluation
print("Model Score:", rf.score(X_test, y_test))
print(classification_report(y_test, y_pred))

# Feature importance analysis
features = pd.DataFrame(rf.feature_importances_, index=X.columns, columns=["Importance"])
print(features.sort_values(by="Importance", ascending=False))

# Add Churn Prediction and Probability to Churn columns
df['Churn Prediction'] = rf.predict(X)  # Add binary churn prediction (0 or 1)
df['Probability to Churn'] = rf.predict_proba(X)[:, 1]  # Add probability for class 1 (churn)

# Restore Customer ID for clarity
df_model['CustomerId'] = customer_id

df.head()


Model Score: 0.857
              precision    recall  f1-score   support

           0       0.87      0.97      0.91      1586
           1       0.77      0.44      0.56       414

    accuracy                           0.86      2000
   macro avg       0.82      0.70      0.74      2000
weighted avg       0.85      0.86      0.84      2000

                   Importance
Age                  0.244359
EstimatedSalary      0.143909
CreditScore          0.143273
Balance              0.136548
NumOfProducts        0.124661
Tenure               0.086579
IsActiveMember_0     0.021753
IsActiveMember_1     0.021256
Geography_Germany    0.019069
Geography_France     0.010992
HasCrCard_1          0.009977
HasCrCard_0          0.009897
Gender_Male          0.009668
Geography_Spain      0.009166
Gender_Female        0.008894


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Churn Prediction,Probability to Churn
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1,0,0.23
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,0,0.15
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1,1,0.99
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0,0,0.08
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0,0,0.01


In [10]:
df.to_csv('Bank Churn Prediction Output.csv')

In [11]:
# Add Churn Prediction and Probability to Churn columns to df_Model
df_model['Churn Prediction'] = rf.predict(X)  # Add binary churn prediction (0 or 1)
df_model['Probability to Churn'] = rf.predict_proba(X)[:, 1]  # Add probability for class 1 (churn)
df_model.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male,HasCrCard_0,HasCrCard_1,IsActiveMember_0,IsActiveMember_1,CustomerId,Churn Prediction,Probability to Churn
0,619,42,2,0,1,101348,1,1,0,0,1,0,0,1,0,1,15634602,0,0.23
1,608,41,1,83807,1,112542,0,0,0,1,1,0,1,0,0,1,15647311,0,0.15
2,502,42,8,159660,3,113931,1,1,0,0,1,0,0,1,1,0,15619304,1,0.99
3,699,39,1,0,2,93826,0,1,0,0,1,0,1,0,1,0,15701354,0,0.08
4,850,43,2,125510,1,79084,0,0,0,1,1,0,0,1,0,1,15737888,0,0.01


In [12]:
df_model.to_csv('Bank Churn Prediction Output df_model.csv')

In [15]:
df.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited', 'Churn Prediction',
       'Probability to Churn'],
      dtype='object')

In [16]:
df_model.columns

Index(['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
       'EstimatedSalary', 'Exited', 'Geography_France', 'Geography_Germany',
       'Geography_Spain', 'Gender_Female', 'Gender_Male', 'HasCrCard_0',
       'HasCrCard_1', 'IsActiveMember_0', 'IsActiveMember_1', 'CustomerId',
       'Churn Prediction', 'Probability to Churn'],
      dtype='object')

In [17]:
# Testing the validity of our model

import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix

# Assume df_model is already pre-processed and includes 'Exited' as the target variable

# Features and target selection
X = df_model.drop(columns=['Exited', 'Churn Prediction', 'Probability to Churn', 'CustomerId'])
y = df_model['Exited']

# Splitting the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the random forest classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)[:, 1]

# Evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Print metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.8700
Precision: 0.7756
Recall: 0.4675
F1 Score: 0.5833
ROC-AUC: 0.8508

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.97      0.92      2416
           1       0.78      0.47      0.58       584

    accuracy                           0.87      3000
   macro avg       0.83      0.72      0.75      3000
weighted avg       0.86      0.87      0.86      3000


Confusion Matrix:
[[2337   79]
 [ 311  273]]


In [None]:
#Interpretation of Metrics

#The results provided represent the evaluation of your random forest model for predicting churn. Each metric evaluates a different aspect of the model's performance:



#Overall Metrics:

#1. Accuracy: 0.8700 (87%)


#What it means:
#The percentage of total predictions (churn and no churn) that are correct.
#Formula: (True Positives+True Negatives)/Total Samples(\text{True Positives} + \text{True Negatives}) / \text{Total Samples}(True Positives+True Negatives)/Total Samples
#Significance: While it gives an overview, accuracy alone can be misleading if the dataset is imbalanced (e.g., significantly more "non-churn" cases than "churn").

#2. Precision: 0.7756 (77.56%)


#What it means:
#Out of all the customers predicted to churn (positive class, 1), what proportion actually churned.
#Formula: True Positives/(True Positives+False Positives)\text{True Positives} / (\text{True Positives} + \text{False Positives})True Positives/(True Positives+False Positives)
#Significance: High precision means the model makes fewer false positive churn predictions (it avoids labeling non-churners as churners).

#3. Recall: 0.4675 (46.75%)


#What it means:
#Out of all the customers who actually churned, how many did the model correctly identify?
#Formula: True Positives/(True Positives+False Negatives)\text{True Positives} / (\text{True Positives} + \text{False Negatives})True Positives/(True Positives+False Negatives)
#Significance: Low recall means many actual churners are being missed (false negatives).

#4. F1 Score: 0.5833 (58.33%)


#What it means:
#The harmonic mean of precision and recall. It balances these two metrics into one score.
#Formula: (2×Precision×Recall)/(Precision+Recall)(2 \times \text{Precision} \times \text{Recall}) / (\text{Precision} + \text{Recall})(2×Precision×Recall)/(Precision+Recall)
#Significance: F1 is a good indicator when you want a balance between precision and recall.

#5. ROC-AUC: 0.8508 (85.08%)


#What it means:
#Measures how well the model distinguishes between classes across all probability thresholds.
#Higher values indicate better performance. 1.01.01.0 means perfect distinction, 0.50.50.5 is random guessing.
#Significance: High ROC-AUC suggests the model has strong discriminatory power.



#Classification Report:

#This provides a detailed breakdown for each class (0 = no churn, 1 = churn):

#1. Class 0 (Non-churners):


#Precision = 0.88: When the model predicts "no churn," it is correct 88% of the time.
#Recall = 0.97: It correctly identifies 97% of non-churners.
#F1-score = 0.92: Strong performance for the majority class.

#2. Class 1 (Churners):


#Precision = 0.78: When the model predicts "churn," it is correct 77.56% of the time.
#Recall = 0.47: It correctly identifies only 46.75% of actual churners.
#F1-score = 0.58: Indicates room for improvement, especially in recall.

#3. Support:


#The number of actual occurrences in each class:
#Non-churners (Class 0): 2416 samples
#Churners (Class 1): 584 samples



#Confusion Matrix:

#[233779311273]\begin{bmatrix} 2337 & 79 \\ 311 & 273 \end{bmatrix}[2337311​79273​]

#Each value represents:

#True Negatives (TN = 2337): Non-churners correctly classified as non-churners.
#False Positives (FP = 79): Non-churners misclassified as churners.
#False Negatives (FN = 311): Churners misclassified as non-churners.
#True Positives (TP = 273): Churners correctly classified as churners.



#Key Takeaways:

#1. Strengths:


#The model performs well in predicting the majority class (non-churners).
#ROC-AUC suggests good discriminatory power overall.

#2. Weaknesses:


#Recall for churners is low, meaning the model fails to identify many actual churners (false negatives = 311).
#Imbalanced classes may be affecting performance.

#3. Possible Next Steps:


#Focus on improving recall for the minority class (churners) by:
#Trying techniques like oversampling the minority class or undersampling the majority class.
#Using a different threshold for classifying churn based on predicted probabilities.
#Experimenting with model parameters (e.g., increasing n_estimators in random forest) or using alternative models (e.g., gradient boosting).
#Optimize for business impact (e.g., reducing false negatives could be more important for churn management).


In [20]:
# Test only to workout optimal estimators 

# from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Trying different values for n_estimators
estimators = [10, 50, 100, 200, 500]
scores = []

for n in estimators:
    model = RandomForestClassifier(n_estimators=n, random_state=42)
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc').mean()
    scores.append(score)
    print(f"n_estimators: {n}, ROC-AUC: {score:.4f}")


n_estimators: 10, ROC-AUC: 0.8075
n_estimators: 50, ROC-AUC: 0.8440
n_estimators: 100, ROC-AUC: 0.8475
n_estimators: 200, ROC-AUC: 0.8499
n_estimators: 500, ROC-AUC: 0.8498


In [24]:
# Loading unseen data
df_unseen = pd.read_csv("C:/Users/npanousieris/OneDrive - Department of Education/Power BI alex/Churn Prediction Bank Yanis/new unseen data.csv")

# Preserve customer ID for later
customer_id = df_unseen['CustomerId']

# Drop unnecessary columns
df_unseen2 = df_unseen.drop(columns=['RowNumber', 'CustomerId', 'Surname','Exited'])

# Apply one-hot encoding and convert to integer
df_model_unseen = pd.get_dummies(df_unseen2, columns=['Geography', 'Gender', 'HasCrCard', 'IsActiveMember'])
df_model_unseen = df_model_unseen.astype(int)

# Add Churn Prediction and Probability to Churn columns
df_unseen['Churn Prediction'] = rf.predict(df_model_unseen)  # Add binary churn prediction (0 or 1)
df_unseen['Probability to Churn'] = rf.predict_proba(df_model_unseen)[:, 1]  # Add probability for class 1 (churn)

# Restore Customer ID for clarity
df_model_unseen['CustomerId'] = customer_id

df_unseen.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Churn Prediction,Probability to Churn
0,1,15634638,AP Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1,0,0.23
1,2,15647347,AP Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,0,0.15
2,3,15619340,AP Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1,1,0.99
3,4,15701390,AP Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0,0,0.08
4,5,15737924,AP Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0,0,0.01


In [28]:
df_unseen.to_csv('Bank Churn Prediction UnseenData Output.csv')