In [252]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from imblearn.over_sampling import SMOTE

In [229]:
data = pd.read_csv('bank_data.csv')

# convert to binary values of housing, loan, default
data['housing'] = data['housing'].map({'yes' : 1, 'no' : 0})
data['default'] = data['default'].map({'yes': 1, 'no' :0})
data['loan'] = data['loan'].map({'yes' :1, 'no' :0})


# encode martial type
data['marital_single'] = data['marital'].apply(lambda x:1 if 'single' in x else 0)
data['marital_married'] = data['marital'].apply(lambda x:1 if 'married' in x else 0)
data['marital_divorced'] = data['marital'].apply(lambda x:1 if 'divorced' in x else 0)

# encode education type
data['education_primary'] = data['education'].apply(lambda x:1 if 'primary' in x else 0 )
data['education_secondary'] = data['education'].apply(lambda x:1 if 'secondary' in x else 0 )
data['education_tertiary'] = data['education'].apply(lambda x:1 if 'tertiary' in x else 0 )
data['education_others'] = data['education'].apply(lambda x:1 if 'others' in x else 0 )

# encode contact type
data['contact_cellular'] = data['contact'].apply(lambda x:1 if 'cellular' in x else 0)
data['contact_telephone'] = data['contact'].apply(lambda x:1 if 'telephone' in x else 0)
data['contact_unknown'] = data['contact'].apply(lambda x:1 if 'unknown' in x else 0)

# one hot encode of job type
def job_type(input):
    if input == 'self-employed' or input == 'entrepreneur' or input == 'unemployed' or input == 'housemaid' or input == 'student' or input == 'unknown' :
        return 'other_job'
    else:
        return input
data['job'] = data['job'].apply(job_type)

data['blue-collar'] = data['job'].apply(lambda x:1 if 'blue-collar' in x else 0)
data['management'] = data['job'].apply(lambda x:1 if 'management' in x else 0)
data['technician'] = data['job'].apply(lambda x:1 if 'technician' in x else 0)
data['admin'] = data['job'].apply(lambda x:1 if 'admin' in x else 0)
data['services'] = data['job'].apply(lambda x:1 if 'services' in x else 0)
data['retired'] = data['job'].apply(lambda x:1 if 'retired' in x else 0)
data['other_job'] = data['job'].apply(lambda x:1 if 'other_job' in x else 0)

# one hot encoding of poutcome
data['poutcome_unknown'] = data['poutcome'].apply(lambda x:1 if 'unknown' in x else 0)
data['poutcome_failure'] = data['poutcome'].apply(lambda x:1 if 'failure' in x else 0)
data['poutcome_other'] = data['poutcome'].apply(lambda x:1 if 'other' in x else 0)
data['poutcome_success'] = data['poutcome'].apply(lambda x:1 if 'success' in x else 0)

# one hot encoding of y values
data['y'] = data['y'].map({'yes' :1, 'no' :0})

# one hot encoding of months
data['jan_month'] = data['month'].apply(lambda x:1 if 'jan' in x else 0)
data['feb_month'] = data['month'].apply(lambda x:1 if 'feb' in x else 0)
data['mar_month'] = data['month'].apply(lambda x:1 if 'mar' in x else 0)
data['apr_month'] = data['month'].apply(lambda x:1 if 'apr' in x else 0)
data['may_month'] = data['month'].apply(lambda x:1 if 'may' in x else 0)
data['jun_month'] = data['month'].apply(lambda x:1 if 'jun' in x else 0)
data['jul_month'] = data['month'].apply(lambda x:1 if 'jul' in x else 0)
data['aug_month'] = data['month'].apply(lambda x:1 if 'aug' in x else 0)
data['sep_month'] = data['month'].apply(lambda x:1 if 'sep' in x else 0)
data['oct_month'] = data['month'].apply(lambda x:1 if 'oct' in x else 0)
data['nov_month'] = data['month'].apply(lambda x:1 if 'nov' in x else 0)
data['dec_month'] = data['month'].apply(lambda x:1 if 'dec' in x else 0)


# remove columns
data = data.drop(columns=['education', 'marital', 'contact', 'job', 'month', 'poutcome'])

# one hot encoding of data set
data = pd.get_dummies(data)




In [253]:
# split the dataset into features and target
X = data.drop(columns=['y'])
y = data['y']

# split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=40 )

# apply the SMOTE to balance  the target variable
smote = SMOTE(random_state=40)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Random Forest model
rf = RandomForestClassifier(random_state=42)

# Hyperparameter grid
param_grid = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Grid Search
grid_search = GridSearchCV(rf, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_balanced, y_train_balanced)

# Best parameters and evaluation
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Evaluate the model on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)


Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best Accuracy: 0.9244259778400746


In [254]:
X = data.drop(columns=['y'])
y = data['y']

# select best features
feature_selection = SelectKBest(score_func = mutual_info_classif, k = 30)
X_selected = feature_selection.fit_transform(X, y)

# split data set
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.25, random_state=40)

# apply the smote to baance target variable
smote = SMOTE(random_state=40)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_balanced)
X_test_scaled = scaler.fit_transform(X_test)

# Build the neural network
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1 if len(np.unique(y)) == 2 else len(np.unique(y)), activation='sigmoid' if len(np.unique(y)) == 2 else 'softmax')
])

# Compile the model
model.compile(optimizer='adam', 
              loss='binary_crossentropy' if len(np.unique(y)) == 2 else 'sparse_categorical_crossentropy', 
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train_scaled, y_train_balanced, epochs=20, batch_size=32, validation_data=(X_test_scaled, y_test))

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test_scaled, y_test, verbose=0)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")


Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1873/1873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 801us/step - accuracy: 0.8798 - loss: 0.2894 - val_accuracy: 0.6272 - val_loss: 0.9125
Epoch 2/20
[1m1873/1873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 706us/step - accuracy: 0.9123 - loss: 0.2044 - val_accuracy: 0.6919 - val_loss: 0.7193
Epoch 3/20
[1m1873/1873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 682us/step - accuracy: 0.9151 - loss: 0.1970 - val_accuracy: 0.6634 - val_loss: 0.8398
Epoch 4/20
[1m1873/1873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 646us/step - accuracy: 0.9189 - loss: 0.1908 - val_accuracy: 0.6704 - val_loss: 0.7706
Epoch 5/20
[1m1873/1873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 750us/step - accuracy: 0.9208 - loss: 0.1852 - val_accuracy: 0.6475 - val_loss: 0.9023
Epoch 6/20
[1m1873/1873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 615us/step - accuracy: 0.9184 - loss: 0.1900 - val_accuracy: 0.6265 - val_loss: 1.0147
Epoch 7/20
[1m

In [247]:
y_pred = model.predict(X_test)

# Print the predictions
print("Predicted y values:")
print(y_pred)

[1m354/354[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 388us/step
Predicted y values:
[[1.        ]
 [1.        ]
 [1.        ]
 ...
 [0.20046031]
 [0.9922421 ]
 [1.        ]]


In [248]:
y_test.head()

42663    0
812      0
11294    0
45117    0
19520    0
Name: y, dtype: int64

In [249]:
y.value_counts()

y
0    39922
1     5289
Name: count, dtype: int64