In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve
from ucimlrepo import fetch_ucirepo 


In [None]:
# Download the dataset registered under id 222 via ucimlrepo.
bank_marketing = fetch_ucirepo(id=222)

# Take independent copies of the feature and target tables so later edits
# never write back into the fetched object.
bank_tables = bank_marketing.data
x_bank, y_bank = bank_tables.features.copy(), bank_tables.targets.copy()

In [None]:
# Put features and target side by side in one frame so the cleaning and
# encoding steps below handle them together.
bank_data = pd.concat((x_bank, y_bank), axis="columns")

Filling Missing Values

In [None]:
# Impute missing entries with the column mode for every categorical column that
# is encoded later in this notebook (one-hot via get_dummies, or label encoded).
# A NaN left in a one-hot column becomes an all-zero row, which with
# drop_first=True is indistinguishable from the dropped baseline category.
# The target 'y' is deliberately excluded: labels should never be imputed.
# NOTE(review): the original list covered only job/contact/poutcome; whether the
# other columns (e.g. education) actually contain NaN depends on the dataset
# release - confirm with bank_data.isna().sum().
# NOTE(review): the mode is computed on the full data set, i.e. before the
# train/test split, so test rows influence the imputed value.
categorical_columns = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'poutcome',
]
for column in categorical_columns:
    if not bank_data[column].isna().any():
        continue  # nothing to impute in this column
    modes = bank_data[column].mode()
    if modes.empty:
        # Column is entirely missing: there is no mode to fill with.
        continue
    bank_data[column] = bank_data[column].fillna(modes.iloc[0])

In [None]:
# One-hot encode the multi-valued categorical columns. drop_first=True removes
# the first level of each column so the dummies are not perfectly collinear.
one_hot_columns = ['job', 'marital', 'education', 'month', 'poutcome']
bank_data = pd.get_dummies(
    bank_data,
    columns=one_hot_columns,
    drop_first=True,
)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Columns that are integer-encoded in place; this includes the target 'y'.
binary_columns = ['default', 'housing', 'loan', 'contact', 'y']
label = LabelEncoder()
# The encoder is refit on each column in turn, so after this statement `label`
# holds the mapping of the last column processed ('y').
bank_data[binary_columns] = bank_data[binary_columns].apply(label.fit_transform)

In [None]:
# Separate the encoded target from the feature matrix.
y_bank = bank_data['y']
X_bank = bank_data.drop(columns='y')

In [None]:
# Hold out 20% of the rows for final testing. The split is stratified on the
# target so both partitions keep the same class proportions; the integer `cv`
# used further down already stratifies the cross-validation folds, so this
# keeps the hold-out set consistent with them.
# NOTE(review): the target is presumably imbalanced (the notebook imports
# SMOTE/RandomUnderSampler and scores with f1_macro) - confirm with
# y_bank.value_counts().
X_bank_train, X_bank_test, y_bank_train, y_bank_test = train_test_split(
    X_bank,
    y_bank,
    test_size=0.20,
    random_state=43,
    stratify=y_bank,
)

In [None]:
# Learn the scaling statistics from the training rows only, then apply that
# same transformation to both partitions so no test information leaks in.
scaler = StandardScaler().fit(X_bank_train)

X_bank_train = scaler.transform(X_bank_train)
X_bank_test = scaler.transform(X_bank_test)

In [None]:

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score

In [None]:
# Disabled experiment: SMOTE oversampling of the training split. The code is
# kept inside a bare string literal, so it is never executed; the cell merely
# evaluates to that string.
# NOTE(review): if this is re-enabled as written, the resampled data would be
# passed to the cross-validation calls below, so synthetic samples would
# presumably end up inside validation folds - consider resampling inside a
# pipeline instead.
'''

sampling = SMOTE(random_state=43)
X_bank_train, y_bank_train = sampling.fit_resample(X_bank_train, y_bank_train)'''

In [None]:
def plot_learning_curve(estimator, X, y, train_sizes, cv=5, scoring='f1_macro'):
    """Plot training and validation learning curves for ``estimator``.

    Scores are computed with scikit-learn's ``learning_curve`` (run with
    ``n_jobs=-1``), averaged along axis 1 of the returned score arrays, and
    drawn as lines with a shaded band of one standard deviation either side.

    Args:
        estimator: Estimator passed through to ``learning_curve``; its class
            name appears in the figure title.
        X: Feature data passed through to ``learning_curve``.
        y: Target data passed through to ``learning_curve``.
        train_sizes: Training-set sizes to evaluate, passed through unchanged.
        cv: Cross-validation setting passed through unchanged.
        scoring: Scoring name passed through; also used as the y-axis label.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    sizes, fold_train, fold_val = learning_curve(
        estimator, X, y,
        train_sizes=train_sizes, cv=cv, scoring=scoring, n_jobs=-1,
    )

    # (legend label, mean score, standard deviation) for each curve.
    curves = [
        ('Training score', fold_train.mean(axis=1), fold_train.std(axis=1)),
        ('Validation score', fold_val.mean(axis=1), fold_val.std(axis=1)),
    ]

    plt.figure(figsize=(8, 6))
    # Draw both lines first and both bands afterwards.
    for curve_label, centre, _ in curves:
        plt.plot(sizes, centre, label=curve_label)
    for _, centre, spread in curves:
        plt.fill_between(sizes, centre - spread, centre + spread, alpha=0.1)

    plt.xlabel('Training examples')
    plt.ylabel(scoring)
    plt.title(f'Learning Curve ({type(estimator).__name__})')
    plt.legend(loc='best')
    plt.grid(True, linestyle='--', linewidth=0.5)
    plt.show()

NN for Bank

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import validation_curve
from sklearn.neural_network import MLPClassifier

# Hyperparameter grids for the one-at-a-time MLP validation curves. Each key is
# an MLPClassifier parameter name; while one parameter is swept, the others
# keep the estimator's defaults.
param_ranges = {
    "hidden_layer_sizes": [50, 100, 150, 200],
    "alpha": [0.0001, 0.001, 0.01, 0.1],
    "learning_rate_init": [0.001, 0.01, 0.1],
    "activation": ['identity', 'logistic', 'tanh', 'relu'],
    "max_iter": [500, 1000, 1500, 2000]
}

# These grids are spaced by powers of ten. On a linear axis the small values
# collapse onto each other, so they are drawn on a log x-axis, matching the
# semilogx plots used for C and gamma in the SVM section.
log_scaled_params = {"alpha", "learning_rate_init"}

timing_results = {}
scores_results = {}

# One panel per hyperparameter. squeeze=False keeps `axes` a 2-D array for any
# number of grids, so flatten() always works.
fig, axes = plt.subplots(1, len(param_ranges), figsize=(20, 6), squeeze=False)
axes = axes.flatten()

for idx, (param_name, param_range) in enumerate(param_ranges.items()):
    # perf_counter is monotonic, so the measured duration cannot be distorted
    # by system clock adjustments.
    start_time = time.perf_counter()

    # hidden_layer_sizes expects a tuple per candidate (one hidden layer here);
    # the plain integers are kept for the x-axis.
    if param_name == "hidden_layer_sizes":
        actual_param_range = [(size,) for size in param_range]
    else:
        actual_param_range = param_range

    train_scores, test_scores = validation_curve(
        MLPClassifier(random_state=43), X_bank_train, y_bank_train,
        param_name=param_name, param_range=actual_param_range, cv=5,
        scoring="f1_macro", n_jobs=-1
    )

    duration = time.perf_counter() - start_time
    timing_results[param_name] = duration
    scores_results[param_name] = (train_scores, test_scores)

    # Aggregate along axis 1 of the score arrays: one mean/std per candidate.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    ax = axes[idx]
    if param_name in log_scaled_params:
        ax.set_xscale("log")
    ax.plot(param_range, train_scores_mean, label="Training score", color="darkorange")
    ax.fill_between(param_range, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.2, color="darkorange")
    ax.plot(param_range, test_scores_mean, label="Cross-validation score", color="navy")
    ax.fill_between(param_range, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.2, color="navy")

    ax.set_title(f"Validation Curve - {param_name}")
    ax.set_xlabel(param_name)
    ax.set_ylabel("F1 Score")
    ax.legend(loc="best")
    ax.grid(True, linestyle='--', linewidth=0.5)

plt.tight_layout()
plt.show()

print("Timing for each hyperparameter tuning:")
for param_name, duration in timing_results.items():
    print(f"{param_name}: {duration:.2f} seconds")

In [None]:
# MLP configuration used for the bank data: a single hidden layer of 100 units.
nn_bank = MLPClassifier(
    hidden_layer_sizes=(100,),
    activation='tanh',
    alpha=0.001,
    learning_rate_init=0.001,
    max_iter=1000,
    random_state=43,
)

# 5-fold cross-validated macro-F1 on the training split only.
nn_scores = cross_val_score(
    nn_bank,
    X_bank_train,
    y_bank_train,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1,
)
print(f'Neural Network Cross-Validation F1 Score (bank): {np.mean(nn_scores)}')

In [None]:
# Learning curve for the MLP at ten evenly spaced fractions (10% to 100%) of
# the training data.
plot_learning_curve(
    estimator=nn_bank,
    X=X_bank_train,
    y=y_bank_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
)

KNN Bank

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import validation_curve
import numpy as np
import matplotlib.pyplot as plt
import time

# Candidate values for each KNN hyperparameter that is swept.
param_range_n_neighbors = range(1, 21)
param_range_weights = ['uniform', 'distance']
param_range_metric = ['euclidean', 'manhattan', 'minkowski']

hyperparameters = ['n_neighbors', 'weights', 'metric']
param_ranges = [param_range_n_neighbors, param_range_weights, param_range_metric]

# Per-hyperparameter results, keyed by parameter name.
train_scores_dict = {}
test_scores_dict = {}
timing_results = {}

# Stage 1: run every sweep and record scores plus wall-clock time.
for hyperparam, param_range in zip(hyperparameters, param_ranges):
    start_time = time.time()

    # The n_neighbors sweep starts from the default estimator; the two
    # categorical sweeps hold n_neighbors fixed at 10.
    classifier = (
        KNeighborsClassifier()
        if hyperparam not in ('weights', 'metric')
        else KNeighborsClassifier(n_neighbors=10)
    )

    train_scores, test_scores = validation_curve(
        classifier,
        X_bank_train,
        y_bank_train,
        param_name=hyperparam,
        param_range=param_range,
        cv=5,
        scoring="f1_macro",
        n_jobs=-1,
    )

    timing_results[hyperparam] = time.time() - start_time
    train_scores_dict[hyperparam] = train_scores
    test_scores_dict[hyperparam] = test_scores

# Stage 2: one panel per hyperparameter, mean score with a +/- 1 std band.
fig, axes = plt.subplots(1, 3, figsize=(24, 6))
axes = axes.flatten()

for ax, hyperparam, param_range in zip(axes, hyperparameters, param_ranges):
    curve_specs = (
        (train_scores_dict[hyperparam], "Training score", "darkorange"),
        (test_scores_dict[hyperparam], "Cross-validation score", "navy"),
    )
    for fold_scores, curve_label, colour in curve_specs:
        centre = np.mean(fold_scores, axis=1)
        spread = np.std(fold_scores, axis=1)
        ax.plot(param_range, centre, label=curve_label, color=colour, lw=2)
        ax.fill_between(param_range, centre - spread, centre + spread, alpha=0.2, color=colour)

    ax.set_title(f"Validation Curve with KNN - {hyperparam}")
    ax.set_xlabel(hyperparam)
    ax.set_ylabel("F1 Score")
    ax.legend(loc="best")
    ax.grid(True, linestyle='--', linewidth=0.5)

plt.tight_layout()
plt.show()

print("Timing for each hyperparameter tuning:")
for hyperparam, duration in timing_results.items():
    print(f"{hyperparam}: {duration:.2f} seconds")


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# KNN configuration used for the bank data.
knn_bank = KNeighborsClassifier(
    n_neighbors=5,
    weights='uniform',
    metric='manhattan',
)

# 5-fold cross-validated macro-F1 on the training split only.
knn_scores = cross_val_score(
    knn_bank,
    X_bank_train,
    y_bank_train,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1,
)
print(f'KNN Cross-Validation F1 Score (bank): {np.mean(knn_scores)}')

In [None]:
# Learning curve for KNN at ten evenly spaced fractions (10% to 100%) of the
# training data.
plot_learning_curve(
    estimator=knn_bank,
    X=X_bank_train,
    y=y_bank_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
)

SVM Bank

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import validation_curve
import numpy as np
import matplotlib.pyplot as plt

# Candidate values for each SVC hyperparameter sweep.
param_range_C = [0.001, 0.01, 0.1, 1, 10, 100]
param_range_gamma = [0.001, 0.01, 0.1, 1, 10, 100]
param_range_kernel = ['linear', 'rbf', 'poly', 'sigmoid']

# Cross-validation settings shared by all three sweeps.
sweep_options = dict(cv=5, scoring="f1_macro", n_jobs=-1)

# Sweep C with an RBF kernel.
train_scores_C, test_scores_C = validation_curve(
    SVC(kernel='rbf'), X_bank_train, y_bank_train,
    param_name="C", param_range=param_range_C, **sweep_options)

# Sweep gamma with an RBF kernel and C fixed at 1.
train_scores_gamma, test_scores_gamma = validation_curve(
    SVC(kernel='rbf', C=1), X_bank_train, y_bank_train,
    param_name="gamma", param_range=param_range_gamma, **sweep_options)

# Sweep the kernel type with C fixed at 1.
train_scores_kernel, test_scores_kernel = validation_curve(
    SVC(C=1), X_bank_train, y_bank_train,
    param_name="kernel", param_range=param_range_kernel, **sweep_options)

fig, axes = plt.subplots(1, 3, figsize=(20, 6))
hyperparameters = ['C', 'gamma', 'kernel']
param_ranges = [param_range_C, param_range_gamma, param_range_kernel]
train_scores_list = [train_scores_C, train_scores_gamma, train_scores_kernel]
test_scores_list = [test_scores_C, test_scores_gamma, test_scores_kernel]

lw = 2
panels = zip(hyperparameters, param_ranges, train_scores_list, test_scores_list, axes)
for hyperparam, param_range, train_scores, test_scores, ax in panels:
    ax.set_title(f"Validation Curve with SVM - {hyperparam}")
    ax.set_xlabel(hyperparam)
    ax.set_ylabel("F1 Score")
    ax.set_ylim(0.0, 1.1)

    # The numeric grids span several orders of magnitude, so they use a log
    # x-axis; the kernel names stay on a plain categorical axis.
    if hyperparam in ('C', 'gamma'):
        ax.set_xscale('log')

    curve_specs = (
        (train_scores, "Training score", "darkorange"),
        (test_scores, "Cross-validation score", "navy"),
    )
    for fold_scores, curve_label, colour in curve_specs:
        centre = np.mean(fold_scores, axis=1)
        spread = np.std(fold_scores, axis=1)
        ax.plot(param_range, centre, label=curve_label, color=colour, lw=lw)
        ax.fill_between(param_range, centre - spread, centre + spread, alpha=0.2, color=colour, lw=lw)

    ax.legend(loc="best")
    ax.grid(True, linestyle='--', linewidth=0.5)

plt.tight_layout()
plt.show()

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# RBF-kernel SVM configuration used for the bank data.
svm_rbf_bank = SVC(kernel='rbf', C=0.1, gamma=0.01, random_state=43)

# 5-fold cross-validated macro-F1 on the training split only. The folds are
# evaluated in parallel (n_jobs=-1), as in the NN and KNN scoring cells; this
# changes only the wall-clock time, not the scores.
svm_scores = cross_val_score(
    svm_rbf_bank,
    X_bank_train,
    y_bank_train,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1,
)
print(f'SVM RBF Kernel Cross-Validation F1 Score (bank): {np.mean(svm_scores)}')

In [None]:
# Learning curve for the RBF SVM at ten evenly spaced fractions (10% to 100%)
# of the training data; cv is left at the helper's default.
plot_learning_curve(
    estimator=svm_rbf_bank,
    X=X_bank_train,
    y=y_bank_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
)