#### COGS 118 Project - Bank Marketing

In [5]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler


In [6]:
Bank_Marketing = pd.read_csv('bank-full.csv', sep=";")

### Data Exploration

In [7]:
Bank_Marketing

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [8]:
# Check Null Values
print(Bank_Marketing.isnull().sum())

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


In [9]:
# Check data types
print(Bank_Marketing.dtypes)

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object


## Convert our Output Variable to Binary

In [10]:
# Convert target variable to binary
Bank_Marketing['y'] = Bank_Marketing['y'].map({'yes': 1, 'no': 0})

In [11]:
Bank_Marketing['y']

0        0
1        0
2        0
3        0
4        0
        ..
45206    1
45207    1
45208    1
45209    0
45210    0
Name: y, Length: 45211, dtype: int64

## Select Relavant columns

In [12]:
Bank_Marketing.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [13]:
columns = ['age','housing','education','balance','loan','duration','y']
Bank = Bank_Marketing[columns]

In [14]:
Bank['housing'] = Bank['housing'].map({'yes': 1, 'no': 0, 'unknown': np.nan})
Bank['loan'] = Bank['loan'].map({'yes': 1, 'no': 0, 'unknown': np.nan})


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Bank['housing'] = Bank['housing'].map({'yes': 1, 'no': 0, 'unknown': np.nan})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Bank['loan'] = Bank['loan'].map({'yes': 1, 'no': 0, 'unknown': np.nan})


In [15]:
Bank = Bank.replace('unknown',np.nan)
Bank = Bank.replace('primary',0)
Bank = Bank.replace('secondary',0.5)
Bank = Bank.replace('tertiary',1)


  Bank = Bank.replace('tertiary',1)


In [16]:
from sklearn.preprocessing import StandardScaler

# Columns to scale
columns_to_scale = ['age', 'balance', 'duration']

# Initialize scaler
scaler = StandardScaler()

# Scale selected columns
Bank[columns_to_scale] = scaler.fit_transform(Bank[columns_to_scale])


In [17]:
Bank

Unnamed: 0,age,housing,education,balance,loan,duration,y
0,1.606965,1.0,1.0,0.256419,0.0,0.011016,0
1,0.288529,1.0,0.5,-0.437895,0.0,-0.416127,0
2,-0.747384,1.0,0.5,-0.446762,1.0,-0.707361,0
3,0.571051,1.0,,0.047205,0.0,-0.645231,0
4,-0.747384,0.0,,-0.447091,0.0,-0.233620,0
...,...,...,...,...,...,...,...
45206,0.947747,0.0,1.0,-0.176460,0.0,2.791329,1
45207,2.831227,0.0,0.0,0.120447,0.0,0.768224,1
45208,2.925401,0.0,0.5,1.429593,0.0,3.373797,1
45209,1.512791,0.0,0.5,-0.228024,0.0,0.970146,0


In [18]:
rows_b = Bank.shape[0]
Bank = Bank.dropna()
rows_a = Bank.shape[0]
print(rows_b,rows_a)

45211 43354


In [19]:
Bank.value_counts('y')

y
0    38317
1     5037
Name: count, dtype: int64

In [20]:
columns = 0.2 * rows_a
trial = Bank.sample(n=10000, random_state=42)
trial.value_counts('y')

y
0    8832
1    1168
Name: count, dtype: int64

## Random Forest

In [21]:
!pip install imbalanced-learn




In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
X = Bank.drop("y", axis=1)  # Features
y = Bank["y"]               # Target

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Oversample the minority class
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Train Random Forest Classifier
classifier = RandomForestClassifier(n_estimators=300, class_weight='balanced', random_state=42)
classifier.fit(X_resampled, y_resampled)

# Make predictions
y_pred = classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:")
print(report)




Accuracy: 83.61%
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.87      0.90      7638
           1       0.38      0.57      0.45      1033

    accuracy                           0.84      8671
   macro avg       0.66      0.72      0.68      8671
weighted avg       0.87      0.84      0.85      8671



## SVM

In [23]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

df = trial

# Separate features and target
X = df.drop("y", axis=1)  # Features
y = df["y"]               # Target

# Perform 80-20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train SVM classifier
classifier = svm.SVC(kernel='linear', class_weight='balanced', random_state=42)
classifier.fit(X_train, y_train)

# Make predictions
y_pred = classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:")
print(report)


Accuracy: 79.10%
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.80      0.87      1748
           1       0.35      0.75      0.47       252

    accuracy                           0.79      2000
   macro avg       0.65      0.77      0.67      2000
weighted avg       0.88      0.79      0.82      2000



## ANN

In [29]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

y_test = torch.tensor(y_test.to_numpy(), dtype=torch.long)

# Scale data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert data to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)

# Define the ANN
class ANN(nn.Module):
    def __init__(self):
        super(ANN, self).__init__()
        self.fc1 = nn.Linear(X_train.shape[1], 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 2)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

model = ANN()

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
for epoch in range(100):  # Adjust the number of epochs as needed
    optimizer.zero_grad()
    outputs = model(X_train)
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch + 1}/100], Loss: {loss.item():.4f}")

# Evaluate the model
with torch.no_grad():
    y_pred = model(X_test).argmax(dim=1)
    accuracy = (y_pred == y_test).float().mean()
    print(f"Accuracy: {accuracy:.4f}")


  y_train = torch.tensor(y_train, dtype=torch.long)
  y_test = torch.tensor(y_test, dtype=torch.long)


Epoch [10/100], Loss: 0.5147
Epoch [20/100], Loss: 0.4287
Epoch [30/100], Loss: 0.3685
Epoch [40/100], Loss: 0.3262
Epoch [50/100], Loss: 0.2992
Epoch [60/100], Loss: 0.2851
Epoch [70/100], Loss: 0.2776
Epoch [80/100], Loss: 0.2727
Epoch [90/100], Loss: 0.2688
Epoch [100/100], Loss: 0.2656
Accuracy: 0.8775


In [26]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

# Scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create and train the MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42)
mlp.fit(X_train, y_train)

# Evaluate the model
y_pred = mlp.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.90      0.97      0.93      1748
           1       0.49      0.23      0.31       252

    accuracy                           0.87      2000
   macro avg       0.69      0.60      0.62      2000
weighted avg       0.85      0.87      0.85      2000



## XG Boost

In [30]:
!pip install xgboost




In [34]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Set the scale_pos_weight to handle class imbalance
scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1]) * 0.8

# Initialize and train the model
xgb = XGBClassifier(
    scale_pos_weight=scale_pos_weight,
    n_estimators=300,        # Number of trees
    max_depth=6,             # Depth of each tree
    learning_rate=0.1,       # Step size shrinkage
    random_state=42,
    use_label_encoder=False, # Suppress warnings for label encoding
    eval_metric='logloss'    # Evaluation metric
)
xgb.fit(X_train, y_train)

# Make predictions
y_pred = xgb.predict(X_test)
y_proba = xgb.predict_proba(X_test)[:, 1]  # For ROC-AUC

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# ROC-AUC score
roc_auc = roc_auc_score(y_test, y_proba)
print(f"ROC-AUC Score: {roc_auc:.2f}")



Parameters: { "use_label_encoder" } are not used.



Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.87      0.90      1748
           1       0.38      0.54      0.45       252

    accuracy                           0.83      2000
   macro avg       0.66      0.71      0.67      2000
weighted avg       0.86      0.83      0.84      2000

ROC-AUC Score: 0.83
