Fetching Data

In [32]:
import pandas as pd
import numpy as np


X = pd.read_csv("bank-additional.csv", delimiter=';')
X.head()
X.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4119 entries, 0 to 4118
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             4119 non-null   int64  
 1   job             4119 non-null   object 
 2   marital         4119 non-null   object 
 3   education       4119 non-null   object 
 4   default         4119 non-null   object 
 5   housing         4119 non-null   object 
 6   loan            4119 non-null   object 
 7   contact         4119 non-null   object 
 8   month           4119 non-null   object 
 9   day_of_week     4119 non-null   object 
 10  duration        4119 non-null   int64  
 11  campaign        4119 non-null   int64  
 12  pdays           4119 non-null   int64  
 13  previous        4119 non-null   int64  
 14  poutcome        4119 non-null   object 
 15  emp.var.rate    4119 non-null   float64
 16  cons.price.idx  4119 non-null   float64
 17  cons.conf.idx   4119 non-null   f

Data Cleaning

In [2]:

# UCIMLRepo says to remove this, since the outcome of this data is not known before the call, so its useless for prediction
X = X.drop('duration', axis=1)
#Feature Engineering to remove the 999 values in pdays, and create a new binary class instead( "contacted")
X['contacted'] = X['pdays'].apply(lambda x: 0 if x == 999 else 1)
X=X.drop('pdays',axis=1)

Preprocessing Steps

In [31]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

# Segregating the target variable 'y' and features
target_var = X[['y']]
features = X.drop(columns=['y'])

# Partitioning data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target_var, test_size=0.1, random_state=42)

# Distinguishing between numeric and categorical columns
numeric_cols = features.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = features.select_dtypes(include=['object']).columns
print(numeric_cols)
print(categorical_cols)

print("List of Numeric Columns:", numeric_cols)
print("List of Categorical Columns:", categorical_cols)

# Setting up preprocessing for numeric data
numeric_prep = Pipeline([
    ('median_imputer', SimpleImputer(strategy='median')),
    ('standard_scaler', StandardScaler())])

# Setting up preprocessing for categorical data
categorical_prep = Pipeline([
    ('fill_missing', SimpleImputer(strategy='constant', fill_value='missing')),
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))])

# Combining preprocessing steps
data_preprocessor = ColumnTransformer([
    ('numeric', numeric_prep, numeric_cols),
    ('categorical', categorical_prep, categorical_cols)])
print("\nData Preprocessor Pipeline:")
print(data_preprocessor)



Index(['age', 'campaign', 'previous', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'contacted'],
      dtype='object')
Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'day_of_week', 'poutcome'],
      dtype='object')
List of Numeric Columns: Index(['age', 'campaign', 'previous', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'contacted'],
      dtype='object')
List of Categorical Columns: Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'day_of_week', 'poutcome'],
      dtype='object')

Data Preprocessor Pipeline:
ColumnTransformer(transformers=[('numeric',
                                 Pipeline(steps=[('median_imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('standard_scaler',
                                         

Model Selection and Evaluation

In [27]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score
from sklearn.preprocessing import LabelBinarizer

# Defining a list of classifier names and corresponding classifier objects
classifier_names = ["Logistic Regression", "Gradient Boosting", "AdaBoost", "SVM"]
classifiers_list = [
    LogisticRegression(random_state=42),
    GradientBoostingClassifier(random_state=42),
    AdaBoostClassifier(random_state=42),
    SVC(probability=True, random_state=42)
]

# Iterating over each classifier for training and evaluation
for name, clf in zip(classifier_names, classifiers_list):
    # Building a pipeline with preprocessing and classification
    pipeline = Pipeline([('data_prep', data_preprocessor),
                         ('clf', clf)])
    # Fitting the model on training data
    pipeline.fit(X_train, y_train.values.ravel())

    # Making predictions and evaluating the model
    predicted_probs = pipeline.predict_proba(X_test)[:, 1]
    predictions = pipeline.predict(X_test)

    # Converting string labels to binary for ROC AUC and Precision Score computation
    label_binarizer = LabelBinarizer()
    binary_test_labels = label_binarizer.fit_transform(y_test)

    # Displaying the classification report
    print(f"Classification Report for {name}:")
    print(classification_report(y_test, predictions))

    # Calculating and displaying ROC AUC and Precision Scores
    roc_score = roc_auc_score(binary_test_labels, predicted_probs)
    print(f"ROC AUC Score for {name}: {roc_score}")
    precision_score = average_precision_score(binary_test_labels, predicted_probs)
    print(f"Precision Score for {name}: {precision_score}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Classification Report for Logistic Regression:
              precision    recall  f1-score   support

          no       0.91      0.99      0.95       370
         yes       0.62      0.19      0.29        42

    accuracy                           0.91       412
   macro avg       0.77      0.59      0.62       412
weighted avg       0.88      0.91      0.88       412

ROC AUC Score for Logistic Regression: 0.7138996138996139
Precision Score for Logistic Regression: 0.3713615242754769
Classification Report for Gradient Boosting:
              precision    recall  f1-score   support

          no       0.92      0.99      0.95       370
         yes       0.64      0.21      0.32        42

    accuracy                           0.91       412
   macro avg       0.78      0.60      0.64       412
weighted avg       0.89      0.91      0.89       412

ROC AUC Score for Gradient Boosting: 0.7212033462033463
Precision Score for Gradient Boosting: 0.3799621620537865
Classification Report 

Hyperparameter Tuning (for AdaBoost)

In [29]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
import numpy as np

# Defining the model pipeline
model_pipeline = Pipeline([
    ('data_preprocessing', preprocessor),
    ('ada_classifier', AdaBoostClassifier(random_state=42))
])

# Parameter grid for RandomizedSearchCV
param_distributions = {
    'ada_classifier__n_estimators': np.arange(50, 301, 50),
    'ada_classifier__learning_rate': np.logspace(-2, 0, 5)
}

# Using RandomizedSearchCV for hyperparameter tuning
random_search = RandomizedSearchCV(model_pipeline, param_distributions, n_iter=10, cv=5, scoring='roc_auc', n_jobs=-1, random_state=42)
random_search.fit(X_train, y_train)

# Extracting the best parameters and the best model
best_parameters = random_search.best_params_
print("Best Hyperparameters:", best_parameters)

best_model = random_search.best_estimator_

# Making predictions with the best model
y_predictions = best_model.predict(X_test)
y_predicted_probs = best_model.predict_proba(X_test)[:, 1]

# Binarizing the y_test for ROC AUC and Average Precision calculations
label_binarizer = LabelBinarizer()
binary_test_labels = label_binarizer.fit_transform(y_test)

# Generating a confusion matrix and accuracy score
conf_matrix = confusion_matrix(y_test, y_predictions)
accuracy = accuracy_score(y_test, y_predictions)
print("Confusion Matrix:\n", conf_matrix)
print("Accuracy Score:", accuracy)

# Calculating and displaying ROC AUC and Average Precision Scores
roc_auc_score = roc_auc_score(binary_test_labels, y_predicted_probs)
average_precision_score = average_precision_score(binary_test_labels, y_predicted_probs)
print(f"ROC AUC Score for Best Model: {roc_auc_score}")
print(f"Average Precision Score for Best Model: {average_precision_score}")


  y = column_or_1d(y, warn=True)


Best Hyperparameters: {'ada_classifier__n_estimators': 50, 'ada_classifier__learning_rate': 0.1}
Confusion Matrix:
 [[367   3]
 [ 36   6]]
Accuracy Score: 0.9053398058252428
ROC AUC Score for Best Model: 0.739060489060489
Average Precision Score for Best Model: 0.413016186326996
