##Binary Classification Project model

In [137]:
#imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV




In [138]:
df = pd.read_csv('bank.csv')
df.Gender.value_counts()

Unnamed: 0_level_0,count
Gender,Unnamed: 1_level_1
Male,5457
Female,4543


In [139]:
df['Exited'].value_counts() #might be unbalanced, will use some other imports to fix this

Unnamed: 0_level_0,count
Exited,Unnamed: 1_level_1
0,7963
1,2037


dropping columns

In [140]:
df = df.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)

create dummy variables

In [141]:
df = pd.get_dummies(df, columns=['Geography', 'Gender'])

Model creation

In [142]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix


In [143]:
#create train and test data
X = df.drop('Exited', axis=1)
y = df['Exited']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [144]:
#resample
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [157]:
#random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix


param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

#initialize Random Forest and GridSearchCV
grid_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=3, scoring='f1', n_jobs=-1)
grid_rf.fit(X_resampled, y_resampled)

#best model
best_rf = grid_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test)

#evaluate random forest
print("Best Random Forest Params:", grid_rf.best_params_)
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))



Best Random Forest Params: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}

Confusion Matrix:
 [[1484  123]
 [ 168  225]]

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.92      0.91      1607
           1       0.65      0.57      0.61       393

    accuracy                           0.85      2000
   macro avg       0.77      0.75      0.76      2000
weighted avg       0.85      0.85      0.85      2000



In [158]:
#rfc accuracy
accuracy = accuracy_score(y_test, y_pred_rf)
print("Accuracy:", accuracy)

Accuracy: 0.8545


In [151]:
#xgboost
param_grid_gb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

#initialize gradient boosting and GridSearchCV
grid_gb = GridSearchCV(GradientBoostingClassifier(random_state=42), param_grid_gb, cv=3, scoring='f1', n_jobs=-1)
grid_gb.fit(X_resampled, y_resampled)

#best model
best_gb = grid_gb.best_estimator_
y_pred_gb = best_gb.predict(X_test)

#evaluate Gradient Boosting
print("Best Gradient Boosting Params:", grid_gb.best_params_)
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_gb))
print("\nClassification Report:\n", classification_report(y_test, y_pred_gb))




Best Gradient Boosting Params: {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 300}

Confusion Matrix:
 [[1469  138]
 [ 175  218]]

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.91      0.90      1607
           1       0.61      0.55      0.58       393

    accuracy                           0.84      2000
   macro avg       0.75      0.73      0.74      2000
weighted avg       0.84      0.84      0.84      2000



In [156]:
#xgb accuracy
accuracy = accuracy_score(y_test, y_pred_gb)
print("Accuracy:", accuracy)

Accuracy: 0.8435


In [153]:
#logistic regression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

pipeline_lr = Pipeline([
    ('scaler', StandardScaler()),  # Scale features
    ('lr', LogisticRegression(random_state=42))  # Logistic Regression
])

# Define hyperparameter grid
param_grid_lr = {
    'lr__C': [0.01, 0.1, 1, 10],
    'lr__penalty': ['l1', 'l2'],  # Lasso and Ridge regularization
    'lr__solver': ['liblinear', 'saga']  # Solvers for l1 and l2
}

# Initialize GridSearchCV
grid_lr = GridSearchCV(pipeline_lr, param_grid_lr, cv=3, scoring='f1', n_jobs=-1)
grid_lr.fit(X_resampled, y_resampled)

# Best model
best_lr = grid_lr.best_estimator_
y_pred_lr = best_lr.predict(X_test)

# Evaluate Logistic Regression
print("Best Logistic Regression Params:", grid_lr.best_params_)
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))
print("\nClassification Report:\n", classification_report(y_test, y_pred_lr))


Best Logistic Regression Params: {'lr__C': 0.01, 'lr__penalty': 'l2', 'lr__solver': 'saga'}

Confusion Matrix:
 [[1425  182]
 [ 218  175]]

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.89      0.88      1607
           1       0.49      0.45      0.47       393

    accuracy                           0.80      2000
   macro avg       0.68      0.67      0.67      2000
weighted avg       0.79      0.80      0.80      2000



In [155]:
#logistic accuracy
accuracy = accuracy_score(y_test, y_pred_lr)
print("Accuracy:", accuracy)

Accuracy: 0.8
