In [252]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from imblearn.over_sampling import SMOTE

In [260]:
data = pd.read_csv('bank_data.csv')

data.info()

# convert to binary values of housing, loan, default
data['housing'] = data['housing'].map({'yes' : 1, 'no' : 0})
data['default'] = data['default'].map({'yes': 1, 'no' :0})
data['loan'] = data['loan'].map({'yes' :1, 'no' :0})


# encode martial type
data['marital_single'] = data['marital'].apply(lambda x:1 if 'single' in x else 0)
data['marital_married'] = data['marital'].apply(lambda x:1 if 'married' in x else 0)
data['marital_divorced'] = data['marital'].apply(lambda x:1 if 'divorced' in x else 0)

# encode education type
data['education_primary'] = data['education'].apply(lambda x:1 if 'primary' in x else 0 )
data['education_secondary'] = data['education'].apply(lambda x:1 if 'secondary' in x else 0 )
data['education_tertiary'] = data['education'].apply(lambda x:1 if 'tertiary' in x else 0 )
data['education_others'] = data['education'].apply(lambda x:1 if 'others' in x else 0 )

# encode contact type
data['contact_cellular'] = data['contact'].apply(lambda x:1 if 'cellular' in x else 0)
data['contact_telephone'] = data['contact'].apply(lambda x:1 if 'telephone' in x else 0)
data['contact_unknown'] = data['contact'].apply(lambda x:1 if 'unknown' in x else 0)

# one hot encode of job type
def job_type(input):
    if input == 'self-employed' or input == 'entrepreneur' or input == 'unemployed' or input == 'housemaid' or input == 'student' or input == 'unknown' :
        return 'other_job'
    else:
        return input
data['job'] = data['job'].apply(job_type)

data['blue-collar'] = data['job'].apply(lambda x:1 if 'blue-collar' in x else 0)
data['management'] = data['job'].apply(lambda x:1 if 'management' in x else 0)
data['technician'] = data['job'].apply(lambda x:1 if 'technician' in x else 0)
data['admin'] = data['job'].apply(lambda x:1 if 'admin' in x else 0)
data['services'] = data['job'].apply(lambda x:1 if 'services' in x else 0)
data['retired'] = data['job'].apply(lambda x:1 if 'retired' in x else 0)
data['other_job'] = data['job'].apply(lambda x:1 if 'other_job' in x else 0)

# one hot encoding of poutcome
data['poutcome_unknown'] = data['poutcome'].apply(lambda x:1 if 'unknown' in x else 0)
data['poutcome_failure'] = data['poutcome'].apply(lambda x:1 if 'failure' in x else 0)
data['poutcome_other'] = data['poutcome'].apply(lambda x:1 if 'other' in x else 0)
data['poutcome_success'] = data['poutcome'].apply(lambda x:1 if 'success' in x else 0)

# encoding of y values
data['y'] = data['y'].map({'yes' :1, 'no' :0})

# one hot encoding of months
data['jan_month'] = data['month'].apply(lambda x:1 if 'jan' in x else 0)
data['feb_month'] = data['month'].apply(lambda x:1 if 'feb' in x else 0)
data['mar_month'] = data['month'].apply(lambda x:1 if 'mar' in x else 0)
data['apr_month'] = data['month'].apply(lambda x:1 if 'apr' in x else 0)
data['may_month'] = data['month'].apply(lambda x:1 if 'may' in x else 0)
data['jun_month'] = data['month'].apply(lambda x:1 if 'jun' in x else 0)
data['jul_month'] = data['month'].apply(lambda x:1 if 'jul' in x else 0)
data['aug_month'] = data['month'].apply(lambda x:1 if 'aug' in x else 0)
data['sep_month'] = data['month'].apply(lambda x:1 if 'sep' in x else 0)
data['oct_month'] = data['month'].apply(lambda x:1 if 'oct' in x else 0)
data['nov_month'] = data['month'].apply(lambda x:1 if 'nov' in x else 0)
data['dec_month'] = data['month'].apply(lambda x:1 if 'dec' in x else 0)


# remove columns
data = data.drop(columns=['education', 'marital', 'contact', 'job', 'month', 'poutcome'])






<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [273]:
# split the dataset into features and target
X = data.drop(columns=['y'])
y = data['y']

# Select best features using SelectKBest
feature_selection = SelectKBest(score_func=mutual_info_classif, k=30)
X_selected = feature_selection.fit_transform(X, y)

# split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.25, random_state=40 )

# apply the SMOTE to balance  the target variable
smote = SMOTE(random_state=40)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Random Forest model
rf = RandomForestClassifier(random_state=42)

# Hyperparameter grid
param_grid = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Grid Search
grid_search = GridSearchCV(rf, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_balanced, y_train_balanced)

# Best parameters and evaluation
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Evaluate the model on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("\nClassification Report on Test Set:\n", classification_report(y_test, y_pred))


Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best Accuracy: 0.9184187691896942

Classification Report on Test Set:
               precision    recall  f1-score   support

           0       0.93      0.93      0.93      9958
           1       0.51      0.51      0.51      1345

    accuracy                           0.88     11303
   macro avg       0.72      0.72      0.72     11303
weighted avg       0.88      0.88      0.88     11303



In [264]:
X = data.drop(columns=['y'])
y = data['y']

# select best features
feature_selection = SelectKBest(score_func = mutual_info_classif, k = 30)
X_selected = feature_selection.fit_transform(X, y)

# split data set
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.25, random_state=40)

# apply the smote to baance target variable
smote = SMOTE(random_state=40)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_balanced)
X_test_scaled = scaler.fit_transform(X_test)

# Build the neural network
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1 if len(np.unique(y)) == 2 else len(np.unique(y)), activation='sigmoid' if len(np.unique(y)) == 2 else 'softmax')
])

# Compile the model
model.compile(optimizer='adam', 
              loss='binary_crossentropy' if len(np.unique(y)) == 2 else 'sparse_categorical_crossentropy', 
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train_scaled, y_train_balanced, epochs=20, batch_size=32, validation_data=(X_test_scaled, y_test))

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test_scaled, y_test, verbose=0)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

# Predict on test set
y_pred_prob = model.predict(X_test_scaled)
y_pred = (y_pred_prob > 0.5).astype("int32")

# Classification report
print("\nClassification Report on Test Set:")
print(classification_report(y_test, y_pred))


Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1873/1873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 760us/step - accuracy: 0.8728 - loss: 0.3012 - val_accuracy: 0.6575 - val_loss: 0.7804
Epoch 2/20
[1m1873/1873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 697us/step - accuracy: 0.9088 - loss: 0.2166 - val_accuracy: 0.6525 - val_loss: 0.7854
Epoch 3/20
[1m1873/1873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 699us/step - accuracy: 0.9106 - loss: 0.2076 - val_accuracy: 0.6502 - val_loss: 0.8547
Epoch 4/20
[1m1873/1873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 675us/step - accuracy: 0.9110 - loss: 0.2077 - val_accuracy: 0.6359 - val_loss: 0.8990
Epoch 5/20
[1m1873/1873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 684us/step - accuracy: 0.9138 - loss: 0.2029 - val_accuracy: 0.6305 - val_loss: 0.8740
Epoch 6/20
[1m1873/1873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 682us/step - accuracy: 0.9125 - loss: 0.2029 - val_accuracy: 0.6335 - val_loss: 0.8927
Epoch 7/20
[1m

In [281]:
# Calculate metrics for Random Forest
rf_metrics = {
    'accuracy': accuracy_score(y_test, y_pred_rf),
    'precision': precision_score(y_test, y_pred_rf),
    'recall': recall_score(y_test, y_pred_rf),
    'f1': f1_score(y_test, y_pred_rf),
    'auc_roc': roc_auc_score(y_test, y_pred_rf)
}

# Calculate metrics for Neural Network
nn_metrics = {
    'accuracy': accuracy_score(y_test, y_pred_nn),
    'precision': precision_score(y_test, y_pred_nn),
    'recall': recall_score(y_test, y_pred_nn),
    'f1': f1_score(y_test, y_pred_nn),
    'auc_roc': roc_auc_score(y_test, y_pred_prob_nn)
}

# Compare Metrics in a DataFrame 
comparison_df = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC-ROC'],
    'Random Forest': [rf_metrics['accuracy'], rf_metrics['precision'], 
                      rf_metrics['recall'], rf_metrics['f1'], rf_metrics['auc_roc']],
    'Neural Network': [nn_metrics['accuracy'], nn_metrics['precision'], 
                       nn_metrics['recall'], nn_metrics['f1'], nn_metrics['auc_roc']]
})

# Print the comparison DataFrame without the index
print("\nModel Comparison:")
print(comparison_df.to_string(index=False))

# Determine the best model based on AUC-ROC 
if rf_metrics['auc_roc'] > nn_metrics['auc_roc']:
    print("\nRandom Forest performed better based on AUC-ROC score.")
else:
    print("\nNeural Network performed better based on AUC-ROC score.")



Model Comparison:
   Metric  Random Forest  Neural Network
 Accuracy       0.884190        0.661063
Precision       0.513514        0.246223
   Recall       0.508550        0.896654
 F1-Score       0.511020        0.386353
  AUC-ROC       0.721738        0.815858

Neural Network performed better based on AUC-ROC score.
