In [14]:
!wget -q https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip
!unzip -o -q bank.zip


In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score


In [16]:
# The extracted CSV is usually named "bank-ful.csv"
df = pd.read_csv("bank-full.csv", sep=";")


In [17]:
#  EXPLORE THE DATA (Optional Quick Glance)
print("Data shape:", df.shape)
print("Columns:", df.columns.tolist())
print("\nSample rows:")
print(df.head(3))
print("\nData Info:")
df.info()


Data shape: (45211, 17)
Columns: ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']

Sample rows:
   age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   

   contact  day month  duration  campaign  pdays  previous poutcome   y  
0  unknown    5   may       261         1     -1         0  unknown  no  
1  unknown    5   may       151         1     -1         0  unknown  no  
2  unknown    5   may        76         1     -1         0  unknown  no  

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     -------------- 

In [18]:
# Identify categorical vs numerical columns
categorical_cols = [col for col in df.columns if df[col].dtype == 'object' and col != 'y']
numerical_cols = [col for col in df.columns if df[col].dtype in ['int64', 'float64']]


In [19]:

print("\nCategorical columns:", categorical_cols)
print("Numerical columns:", numerical_cols)

# Convert target column ('y') into binary (0/1)
#      'yes' -> 1, 'no' -> 0
df['y'] = df['y'].map({'yes': 1, 'no': 0})

#Encode categorical features using get_dummies (one-hot encoding)
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

#Separate features (X) and target (y)
X = df_encoded.drop('y', axis=1)
y = df_encoded['y']

# (Optional) Scale numerical features
#     Usually helps for Neural Networks
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)



Categorical columns: ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
Numerical columns: ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']


In [23]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

print("\nTrain set shape:", X_train.shape)
print("Test set shape:", X_test.shape)


Train set shape: (36168, 42)
Test set shape: (9043, 42)


In [24]:

# MODEL 1 - RANDOM FOREST CLASSIFIER
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42
)
rf_model.fit(X_train, y_train)
rf_train_predictions = rf_model.predict(X_train)
rf_predictions = rf_model.predict(X_test)

# Evaluate Random Forest
rf_accuracy = accuracy_score(y_test, rf_predictions)
print("\n=========================")
print("RANDOM FOREST RESULTS")
print("=========================")
print("Test accuracy:", rf_accuracy)
print("Train accuracy", accuracy_score(y_train, rf_train_predictions))
print("Confusion Matrix:\n", confusion_matrix(y_test, rf_predictions))
print("Classification Report:\n", classification_report(y_test, rf_predictions))



RANDOM FOREST RESULTS
Test accuracy: 0.9051199823067566
Train accuracy 1.0
Confusion Matrix:
 [[7774  211]
 [ 647  411]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.97      0.95      7985
           1       0.66      0.39      0.49      1058

    accuracy                           0.91      9043
   macro avg       0.79      0.68      0.72      9043
weighted avg       0.89      0.91      0.89      9043



In [25]:

# MODEL 2 - NEURAL NETWORK (MLPClassifier)

# A simple feed-forward MLP with one hidden layer
mlp_model = MLPClassifier(
    hidden_layer_sizes=(64,),
    max_iter=300,
    activation='relu',
    solver='adam',
    random_state=42
)
mlp_model.fit(X_train, y_train)

mlp_predictions = mlp_model.predict(X_test)

# Evaluate MLP
mlp_accuracy = accuracy_score(y_test, mlp_predictions)
print("\n=========================")
print("NEURAL NETWORK (MLP) RESULTS")
print("=========================")
print("Accuracy:", mlp_accuracy)
print("Confusion Matrix:\n", confusion_matrix(y_test, mlp_predictions))
print("Classification Report:\n", classification_report(y_test, mlp_predictions))



NEURAL NETWORK (MLP) RESULTS
Accuracy: 0.8991485126617274
Confusion Matrix:
 [[7636  349]
 [ 563  495]]
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.96      0.94      7985
           1       0.59      0.47      0.52      1058

    accuracy                           0.90      9043
   macro avg       0.76      0.71      0.73      9043
weighted avg       0.89      0.90      0.89      9043



In [26]:
# COMPARE RESULTS
# -------------------------------------------------------
print("\n================================")
print("COMPARISON OF MODEL ACCURACIES")
print("================================")
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")
print(f"Neural Network Accuracy: {mlp_accuracy:.4f}")


COMPARISON OF MODEL ACCURACIES
Random Forest Accuracy: 0.9051
Neural Network Accuracy: 0.8991
