In [37]:

!apt install git

# Change to your working directory.
%cd /content

# Clone the shared GitHub repository.
!git clone https://github.com/Adamsomondi/BANK-CHURN-PREDICTION.git

# Move into the repo.
%cd BANK-CHURN-PREDICTION

# Checks the contents of the repository.
!ls -la

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git is already the newest version (1:2.34.1-1ubuntu1.12).
0 upgraded, 0 newly installed, 0 to remove and 30 not upgraded.
/content
fatal: destination path 'BANK-CHURN-PREDICTION' already exists and is not an empty directory.
/content/BANK-CHURN-PREDICTION
total 536
drwxr-xr-x 3 root root   4096 Apr 10 15:24 .
drwxr-xr-x 1 root root   4096 Apr 10 15:33 ..
-rw-r--r-- 1 root root 508305 Apr 10 15:04 DataAnalysis.ipynb
drwxr-xr-x 8 root root   4096 Apr 10 15:34 .git
-rw-r--r-- 1 root root     22 Apr 10 15:04 .gitignore
-rw------- 1 root root  13828 Apr 10 15:33 Modelling.ipynb
-rw-r--r-- 1 root root    523 Apr 10 15:04 README.md


**DATA MODELLING**

In [38]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Load the dataset
cleanedfile_id = "1-nx-ESuJAJxHoDfZaZgk56AuwVsY7SDD"
cleanedfile_url = f"https://drive.google.com/uc?id={cleanedfile_id}"
df = pd.read_csv(cleanedfile_url)

# Make sure the target column is binary
if df['Exited'].dtype not in [int, 'category']:
    df['Exited'] = (df['Exited'] > 0).astype(int)

# Separate features and target
X = df.drop(columns=['Exited'])
y = df['Exited']

# Scale features (important for Logistic Regression!)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Set up hyperparameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],         # Regularization strength (smaller = stronger regularization)
    'solver': ['liblinear', 'lbfgs'],    # Solvers the method the robot uses to learn
    'penalty': ['l2']                    # is like a rule that helps the robot avoid overfitting
}
# Initialize the model
log_reg = LogisticRegression(random_state=45, max_iter=1000)

# Perform grid search
grid = GridSearchCV(log_reg, param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

# Best model
best_model = grid.best_estimator_
print("Best Parameters:", grid.best_params_)

# Make predictions with the best model
y_pred = best_model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Best Parameters: {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy: 0.999

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1607
           1       1.00      1.00      1.00       393

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000


Confusion Matrix:
 [[1606    1]
 [   1  392]]


In [39]:
from sklearn.ensemble import RandomForestClassifier

# Set up hyperparameter grid for Random Forest
param_grid_rf = {
    'n_estimators': [50, 100, 150],   # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],   # Maximum depth of each tree
    'min_samples_split': [2, 5, 10]    # Minimum number of samples to split an internal node
}

# Initialize Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=45)

# Perform GridSearchCV for hyperparameter tuning
grid_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, scoring='accuracy')
grid_rf.fit(X_train, y_train)

# Best model after tuning
best_rf_model = grid_rf.best_estimator_
print("Best Random Forest Parameters:", grid_rf.best_params_)

# Make predictions with the best model
y_pred_rf = best_rf_model.predict(X_test)

# Evaluate the Random Forest model
print("Random Forest - Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nClassification Report (Random Forest):\n", classification_report(y_test, y_pred_rf))
print("\nConfusion Matrix (Random Forest):\n", confusion_matrix(y_test, y_pred_rf))

Best Random Forest Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
Random Forest - Accuracy: 0.999

Classification Report (Random Forest):
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1607
           1       1.00      1.00      1.00       393

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000


Confusion Matrix (Random Forest):
 [[1606    1]
 [   1  392]]


In [40]:
# Comparing accuracy scores
logreg_accuracy = accuracy_score(y_test, y_pred)
rf_accuracy = accuracy_score(y_test, y_pred)

print("Logistic Regression Accuracy:", logreg_accuracy)
print("Random Forest Accuracy:", rf_accuracy)

# In this case, we can use the model with the higher accuracy (or other metrics like F1-score) as the best model
if logreg_accuracy > rf_accuracy:
    print("Logistic Regression is the best model based on accuracy.")
else:
    print("Random Forest is the best model based on accuracy.")


Logistic Regression Accuracy: 0.999
Random Forest Accuracy: 0.999
Random Forest is the best model based on accuracy.


In [41]:
# Comparison rationale
print("\nRationale for Model Selection:")

# Compare accuracies
print(f"\nLogistic Regression Accuracy: {logreg_accuracy}")
print(f"Random Forest Accuracy: {rf_accuracy}")

# Compare based on F1-score (harmonic mean of precision and recall)
logreg_f1_score = classification_report(y_test, y_pred, output_dict=True)['weighted avg']['f1-score']
rf_f1_score = classification_report(y_test, y_pred_rf, output_dict=True)['weighted avg']['f1-score']

print(f"Logistic Regression F1-Score: {logreg_f1_score}")
print(f"Random Forest F1-Score: {rf_f1_score}")

# Rationalize the model choice
if rf_accuracy > logreg_accuracy and rf_f1_score > logreg_f1_score:
    print("\nRandom Forest is the better model based on accuracy and F1-score.")
elif logreg_accuracy > rf_accuracy and logreg_f1_score > rf_f1_score:
    print("\nLogistic Regression is the better model based on accuracy and F1-score.")
else:
    print("\nBoth models perform similarly. Further analysis, such as cross-validation, could be considered.")


Rationale for Model Selection:

Logistic Regression Accuracy: 0.999
Random Forest Accuracy: 0.999
Logistic Regression F1-Score: 0.999
Random Forest F1-Score: 0.999

Both models perform similarly. Further analysis, such as cross-validation, could be considered.


In [42]:
# Import necessary libraries
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Load the dataset
cleanedfile_id = "1-nx-ESuJAJxHoDfZaZgk56AuwVsY7SDD"
cleanedfile_url = f"https://drive.google.com/uc?id={cleanedfile_id}"
df = pd.read_csv(cleanedfile_url)

# Preprocess the target column to be binary (0 or 1)
if df['Exited'].dtype not in [int, 'category']:
    df['Exited'] = (df['Exited'] > 0).astype(int)

# Separate features and target
X = df.drop(columns=['Exited'])
y = df['Exited']

# Scale features (important for Logistic Regression)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Initialize models
log_reg = LogisticRegression(random_state=42, max_iter=1000)
rf_classifier = RandomForestClassifier(random_state=45)

# Perform cross-validation for Logistic Regression
log_reg_cv_scores = cross_val_score(log_reg, X_scaled, y, cv=5, scoring='accuracy')

# Perform cross-validation for Random Forest
rf_cv_scores = cross_val_score(rf_classifier, X_scaled, y, cv=5, scoring='accuracy')

# Print cross-validation results for Logistic Regression
print("Logistic Regression Cross-Validation Results:")
print(f"Cross-validation scores (Accuracy) for each fold: {log_reg_cv_scores}")
print(f"Mean accuracy: {log_reg_cv_scores.mean()}")
print(f"Standard deviation: {log_reg_cv_scores.std()}")

# Print cross-validation results for Random Forest
print("\nRandom Forest Cross-Validation Results:")
print(f"Cross-validation scores (Accuracy) for each fold: {rf_cv_scores}")
print(f"Mean accuracy: {rf_cv_scores.mean()}")
print(f"Standard deviation: {rf_cv_scores.std()}")

# Evaluate models using the mean accuracy score from cross-validation
print("\nModel Evaluation Based on Cross-Validation Results:")
if log_reg_cv_scores.mean() > rf_cv_scores.mean():
    print("\nLogistic Regression is the better model based on mean cross-validation accuracy.")
else:
    print("\nRandom Forest is the better model based on mean cross-validation accuracy.")

Logistic Regression Cross-Validation Results:
Cross-validation scores (Accuracy) for each fold: [0.995 1.    1.    1.    0.998]
Mean accuracy: 0.9986
Standard deviation: 0.0019595917942265445

Random Forest Cross-Validation Results:
Cross-validation scores (Accuracy) for each fold: [0.995 1.    1.    1.    0.998]
Mean accuracy: 0.9986
Standard deviation: 0.0019595917942265445

Model Evaluation Based on Cross-Validation Results:

Random Forest is the better model based on mean cross-validation accuracy.


In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

!ls -la "/content/drive/My Drive/Colab Notebooks"
!find "/content/drive/My Drive" -name "Modelling.ipynb"
!cp "/content/drive/My Drive/Colab Notebooks/Modelling.ipynb" /content/BANK-CHURN-PREDICTION/

!ls -la "/content/BANK-CHURN-PREDICTION/"

In [46]:
# Git commands to push the changes adding your changes
!git config --global user.name "Adamsomondi" #replace your own github name.
!git config --global user.email "mustafajohnson123@gmail.com" #replace your own github gmail.
!git add .

 # Commit your changes
!git commit -m "Data Modelling and Evaluation" #write a message "finished data Cleaning/added eda analysis"


On branch N-Modelling
nothing to commit, working tree clean
