Steps for Data Preprocessing and Model Building

1. **Read CSV Data**
   - Load the dataset from a CSV file.

2. **Split Train and Test**
   - Split the dataset into training and testing sets.

3. **Identify Numeric and Categorical Features**
   - Separate the features into numeric and categorical types.

4. **Binary Encode Selected Features**
   - Choose specific features (from a provided list) to apply binary encoding.

5. **Handle Missing Values**
   - Use `SimpleImputer` with the "most frequent" strategy to impute missing values for the columns in the provided list.

6. **Correlation Analysis**
   - Perform correlation analysis on a given list of numeric columns.

7. **Drop Features**
   - Remove features based on the results of the correlation analysis.

8. **Encode Categorical Features**
   - Encode the categorical features using a provided feature-to-encoding-type mapping.

9. **Normalize Numerical Features**
   - Normalize numerical features according to the given mapping of features and normalization techniques.

10. **Apply PCA**
    - Perform Principal Component Analysis (PCA) for dimensionality reduction.

11. **SVM and Logistic Regression Classifiers**
    - Train both Support Vector Machine (SVM) and Logistic Regression models.

12. **Hyperparameter Tuning**
    - Use Grid Search and Random Search to optimize hyperparameters for both models.

13. **Select Best Model**
    - Select the best model based on validation results.

14. **Evaluate on Train Data**
    - Evaluate the selected model's performance on the training dataset.


# Data Import and Preprocessing

In [675]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
# Step 1: Load the dataset
data = pd.read_csv("bank-additional/bank-additional-full.csv",sep=';')  # Replace with your dataset path
# Separate features and target
X = data.drop('y', axis=1)
y = data['y']

# Step 2: Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Identify categorical and numerical columns
categorical_cols = list(X.select_dtypes(include=['object']).columns)
numerical_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)

# Mapping recommended normalization techniques to features
normalization_map = {
    'age': 'Standardization (Z-Score)',
    'duration': 'Log Transformation',
    'campaign': 'Log Transformation',
    'pdays': 'Log Transformation',
    'previous': 'Log Transformation',
    'emp.var.rate': 'Standardization (Z-Score)',
    'cons.price.idx': 'Standardization (Z-Score)',
    'cons.conf.idx': 'Standardization (Z-Score)',
}
class PreprocessTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, binary_columns, columns_with_unknowns, education_order, one_hot_cols, numeric_cols, threshold=0.85,impute=True, impute_strategy='most_frequent'):
        self.binary_columns = binary_columns
        self.columns_with_unknowns = columns_with_unknowns

        self.one_hot_cols = one_hot_cols
        self.numeric_cols = numeric_cols
        self.threshold = threshold
        self.impute = impute
        self.impute_strategy = impute_strategy
        
        if not self.impute:
            self.education_order = education_order + ['unknown']
        else:
            self.education_order = education_order
            self.simple_imputer = SimpleImputer(strategy=self.impute_strategy) # Impute missing education values
        
        self.education_encoder = OrdinalEncoder(categories=[education_order])
        self.features_to_drop_ = []  # For correlation reduction

    def fit(self, X, y=None):
        X = X.copy()

        if self.impute:
            # Replace 'unknown' with NaN
            for col in self.columns_with_unknowns:
                X[col] = X[col].replace('unknown', np.nan)

            # Fit the imputer for columns with unknowns
            self.simple_imputer.fit(X[self.columns_with_unknowns])
            X[self.columns_with_unknowns] = self.simple_imputer.transform(X[self.columns_with_unknowns])

        # Fit the correlation reducer
        correlation_matrix = X[self.numeric_cols].corr()
        high_corr_pairs = correlation_matrix.where(
            (correlation_matrix.abs() > self.threshold) & (correlation_matrix != 1.0)
        )
        for column in high_corr_pairs.columns:
            highly_corr = high_corr_pairs[column].dropna().index.tolist()
            if highly_corr:
                if column not in self.features_to_drop_:
                    self.features_to_drop_.extend(highly_corr)
        self.features_to_drop_ = list(set(self.features_to_drop_))

        # Fit the education encoder (after imputing missing values)
        self.education_encoder.fit(X[['education']])

        return self

    def transform(self, X):
        X = X.copy()

        if self.impute:
            # Replace 'unknown' with NaN
            for col in self.columns_with_unknowns:
                X[col] = X[col].replace('unknown', np.nan)

        # Binary Encoding
        for column in self.binary_columns:
            X[column] = X[column].apply(lambda x: 1 if x == "yes" else 0)

        # Process 'pdays' and create 'previously_contacted'
        X['previously_contacted'] = (X['pdays'] != 999).astype(int)
        # X['pdays'] = X['pdays'].replace(999, 0)

        # Reduce Correlation
        X = X.drop(columns=self.features_to_drop_, errors='ignore')

        # Impute missing values
        X[self.columns_with_unknowns] = self.simple_imputer.transform(X[self.columns_with_unknowns])

        # Apply ordinal encoding to 'education'
        X['education'] = self.education_encoder.transform(X[['education']]).flatten()

        # One-Hot Encoding for categorical columns
        X = pd.get_dummies(X, columns=self.one_hot_cols, drop_first=True, dtype=int)

        return X


class NormalizationTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, normalization_map):
        self.normalization_map = normalization_map

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        for feature, technique in self.normalization_map.items():
            if technique == 'Standardization (Z-Score)':
                X[feature] = StandardScaler().fit_transform(X[[feature]])
            elif technique == 'Log Transformation':
                X[feature] = np.log1p(X[feature])
                X[feature] = StandardScaler().fit_transform(X[[feature]])

        return X


In [676]:
# Initialize with required column information
combined_transformer = PreprocessTransformer(
    binary_columns=['default', 'housing', 'loan'],
    columns_with_unknowns=['job', 'marital', 'education', 'contact', 'poutcome'],
    education_order=['illiterate', 'basic.4y', 'basic.6y', 'basic.9y', 'high.school', 
                     'professional.course', 'university.degree'],
    one_hot_cols=['job', 'marital', 'contact', 'poutcome','month','day_of_week'],
    numeric_cols=['age', 'duration', 'campaign', 'pdays', 'previous', 
                  'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'nr.employed', 'euribor3m','pdays'],
    threshold=0.85,  # Correlation threshold
    impute=True,
    impute_strategy='most_frequent'
)

# Initialize Normalization Transformer
normalization_transformer = NormalizationTransformer(normalization_map)
preprocessor = Pipeline(steps=[
    ('combined_transformer', combined_transformer),
    ('normalization_transformer', normalization_transformer)
]
)

# Feature engineering pipeline
feature_engineering = Pipeline(steps=[
    ('pca', PCA(n_components=0.95)),  # Retain 90% variance
])

# Combine preprocessing and feature engineering
full_preprocessor = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_engineering', feature_engineering)
])

X_transformed = full_preprocessor.fit_transform(X_train)

X_transformed = pd.DataFrame(X_transformed)
X_transformed.shape
## Preprocessing completed

(32950, 19)

# Model selection

In [677]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from scipy.stats import uniform

scoring_metric = 'f1'

# Step 6: Define preprocessing and classifier pipelines
svm_pipeline = Pipeline(steps=[
    ('preprocessing', full_preprocessor),
    ('classifier', SVC(random_state=42, class_weight='balanced'))
])

logreg_pipeline = Pipeline(steps=[
    ('preprocessing', full_preprocessor),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000))
])

# Step 7: Define hyperparameter grids for SVM and Logistic Regression
# Define hyperparameter grids for RandomizedSearchCV
svm_param_grid = {
    'classifier__C': uniform(0.1, 10),
    'classifier__kernel': ['linear', 'rbf'],
    'classifier__gamma': ['scale', 'auto']
}

logreg_param_grid = {
    'classifier__C': uniform(0.1, 10),
    'classifier__penalty': ['l2'],
    'classifier__solver': ['lbfgs', 'saga']
}

# Step 8: Use RandomizedSearchCV for hyperparameter tuning
random_search_svm = RandomizedSearchCV(
    estimator=svm_pipeline,
    param_distributions=svm_param_grid,
    scoring=scoring_metric,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    random_state=42
)

random_search_logreg = RandomizedSearchCV(
    estimator=logreg_pipeline,
    param_distributions=logreg_param_grid,
    scoring=scoring_metric,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    random_state=42
)


In [678]:
# binary encoding the target variable
y_train = y_train.apply(lambda x: 1 if x == "yes" else 0)

y_test = y_test.apply(lambda x: 1 if x == "yes" else 0)

In [679]:
# Step 9: Fit both models
print("Fitting SVM...")
random_search_svm.fit(X_train, y_train)
print("Fitting Logistic Regression...")
random_search_logreg.fit(X_train, y_train)
# Best models and hyperparameters
print("\nBest SVM parameters:", random_search_svm.best_params_)
print(f"Best SVM cross-validation {scoring_metric}:", random_search_svm.best_score_)

print("\nBest Logistic Regression parameters:", random_search_logreg.best_params_)
print(f"Best Logistic Regression cross-validation {scoring_metric}:", random_search_logreg.best_score_)

Fitting SVM...
Fitting Logistic Regression...

Best SVM parameters: {'classifier__C': np.float64(8.424426408004217), 'classifier__gamma': 'auto', 'classifier__kernel': 'rbf'}
Best SVM cross-validation f1: 0.5859884898483682

Best Logistic Regression parameters: {'classifier__C': np.float64(3.4370861113902182), 'classifier__penalty': 'l2', 'classifier__solver': 'saga'}
Best Logistic Regression cross-validation f1: 0.4953207646190755


### Selecting the best model

In [680]:
# Step 10: Compare models on the test set
svm_best_model = random_search_svm.best_estimator_
logreg_best_model = random_search_logreg.best_estimator_


### Train set evaluation

In [681]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Training Data Evaluation
print("*"*50, "SVM", "*"*50)
y_train_pred = svm_best_model.predict(X_train)
print("Training Data Evaluation:")
print("Accuracy:", accuracy_score(y_train, y_train_pred))
print("Classification Report:")
print(classification_report(y_train, y_train_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_train, y_train_pred))

# Test Data Evaluation for logistic regrission
print("*"*50, "Logistic Regression", "*"*50)
y_train_pred = logreg_best_model.predict(X_train)
print("Training Data Evaluation:")
print("Accuracy:", accuracy_score(y_train, y_train_pred))
print("Classification Report:")
print(classification_report(y_train, y_train_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_train, y_train_pred))



************************************************** SVM **************************************************
Training Data Evaluation:
Accuracy: 0.8739301972685888
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.86      0.92     29238
           1       0.47      0.96      0.63      3712

    accuracy                           0.87     32950
   macro avg       0.73      0.91      0.78     32950
weighted avg       0.94      0.87      0.89     32950

Confusion Matrix:
[[25224  4014]
 [  140  3572]]
************************************************** Logistic Regression **************************************************
Training Data Evaluation:
Accuracy: 0.9081638846737481
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95     29238
           1       0.65      0.40      0.50      3712

    accuracy                           0.91     32950
   macro avg       0.7

### Test set evaluation

In [682]:
# Predictions
y_test_pred_svm = svm_best_model.predict(X_test)
y_test_pred_logreg = logreg_best_model.predict(X_test)

# Evaluation
print("\nSVM Test Set Evaluation:")
print(classification_report(y_test, y_test_pred_svm))

print("\nLogistic Regression Test Set Evaluation:")
print(classification_report(y_test, y_test_pred_logreg))



SVM Test Set Evaluation:
              precision    recall  f1-score   support

           0       0.99      0.85      0.91      7310
           1       0.44      0.91      0.59       928

    accuracy                           0.86      8238
   macro avg       0.71      0.88      0.75      8238
weighted avg       0.92      0.86      0.88      8238


Logistic Regression Test Set Evaluation:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      7310
           1       0.68      0.41      0.51       928

    accuracy                           0.91      8238
   macro avg       0.80      0.69      0.73      8238
weighted avg       0.90      0.91      0.90      8238



In [683]:
# Step 11: Choose the best model based on test set accuracy
svm_test_accuracy = random_search_svm.score(X_test, y_test)
logreg_test_accuracy = random_search_logreg.score(X_test, y_test)

print(f"\nTest Set {scoring_metric}:")
print("SVM:", svm_test_accuracy)
print("Logistic Regression:", logreg_test_accuracy)

if svm_test_accuracy > logreg_test_accuracy:
    print("\nSVM is the best model on the test set.")
    best_model = svm_best_model
else:
    print("\nLogistic Regression is the best model on the test set.")
    best_model = logreg_best_model



Test Set f1:
SVM: 0.5901754385964912
Logistic Regression: 0.5147058823529411

SVM is the best model on the test set.


## Random forest

In [684]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

rf_pipeline = Pipeline(steps=[
    ('preprocessing', full_preprocessor),  # Use the same preprocessing pipeline
    ('classifier', RandomForestClassifier(random_state=42, class_weight='balanced', n_estimators=100))
])

rf_param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# Random Forest
rf_random_search = RandomizedSearchCV(
    estimator=rf_pipeline,
    param_distributions=rf_param_grid,
    scoring='f1',
    cv=5,
    n_iter=20,
    verbose=2,
    random_state=42,
    n_jobs=-1
)
rf_random_search.fit(X_train, y_train)


# Best Random Forest
rf_best = rf_random_search.best_estimator_
rf_train_preds = rf_best.predict(X_train)
rf_test_preds = rf_best.predict(X_test)

print("Random Forest Training Data Evaluation:")
print("Accuracy:", accuracy_score(y_train, rf_train_preds))
print("Classification Report:")
print(classification_report(y_train, rf_train_preds))
print("Confusion Matrix:")
print(confusion_matrix(y_train, rf_train_preds))

print("\nRandom Forest Test Data Evaluation:")
print("Accuracy:", accuracy_score(y_test, rf_test_preds))
print("Classification Report:")
print(classification_report(y_test, rf_test_preds))
print("Confusion Matrix:")
print(confusion_matrix(y_test, rf_test_preds))

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END classifier__max_depth=10, classifier__min_samples_leaf=1, classifier__min_samples_split=5, classifier__n_estimators=50; total time=   3.7s
[CV] END classifier__max_depth=10, classifier__min_samples_leaf=1, classifier__min_samples_split=5, classifier__n_estimators=50; total time=   3.8s
[CV] END classifier__max_depth=10, classifier__min_samples_leaf=1, classifier__min_samples_split=5, classifier__n_estimators=50; total time=   3.8s
[CV] END classifier__max_depth=10, classifier__min_samples_leaf=1, classifier__min_samples_split=5, classifier__n_estimators=50; total time=   3.8s
[CV] END classifier__max_depth=10, classifier__min_samples_leaf=1, classifier__min_samples_split=5, classifier__n_estimators=50; total time=   3.8s
[CV] END classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=50; total time=   4.7s
[CV] END classifier__max_depth=None, classifier_

## Ensemble

In [685]:
svm_prob_pipeline = Pipeline(steps=[
    ('preprocessing', full_preprocessor),
    ('classifier', SVC(random_state=42, class_weight='balanced', probability=True))
])
from sklearn.ensemble import VotingClassifier

# Combine all four pipelines in a VotingClassifier
voting_ensemble = VotingClassifier(
    estimators=[
        ('svm', svm_prob_pipeline),
        ('logreg', logreg_pipeline),
        ('random_forest', rf_pipeline)
    ],
    voting='soft',  # Use soft voting to combine probabilities
    n_jobs=-1
)

In [686]:
voting_ensemble.fit(X_train, y_train)

In [687]:
# Evaluate on training data
y_train_pred = voting_ensemble.predict(X_train)
print("Training Data Evaluation:")
print("Accuracy:", accuracy_score(y_train, y_train_pred))
print("Classification Report:\n", classification_report(y_train, y_train_pred))
print("Confusion Matrix:\n", confusion_matrix(y_train, y_train_pred))

# Evaluate on test data
y_test_pred = voting_ensemble.predict(X_test)
print("\nTest Data Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print("Classification Report:\n", classification_report(y_test, y_test_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))

Training Data Evaluation:
Accuracy: 0.9526858877086495
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.97     29238
           1       0.90      0.65      0.76      3712

    accuracy                           0.95     32950
   macro avg       0.93      0.82      0.87     32950
weighted avg       0.95      0.95      0.95     32950

Confusion Matrix:
 [[28964   274]
 [ 1285  2427]]

Test Data Evaluation:
Accuracy: 0.9141781985918912
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.98      0.95      7310
           1       0.69      0.43      0.53       928

    accuracy                           0.91      8238
   macro avg       0.81      0.70      0.74      8238
weighted avg       0.90      0.91      0.91      8238

Confusion Matrix:
 [[7134  176]
 [ 531  397]]


# Using SMOTE to tackle undersampling

In [688]:
random_search_svm.best_params_

{'classifier__C': np.float64(8.424426408004217),
 'classifier__gamma': 'auto',
 'classifier__kernel': 'rbf'}

In [689]:
random_search_logreg.best_params_

{'classifier__C': np.float64(3.4370861113902182),
 'classifier__penalty': 'l2',
 'classifier__solver': 'saga'}

In [690]:
rf_random_search.best_params_

{'classifier__n_estimators': 100,
 'classifier__min_samples_split': 5,
 'classifier__min_samples_leaf': 2,
 'classifier__max_depth': 10}

In [691]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

# Step 1: Combine preprocessing and feature engineering
resampling_pipeline = ImbPipeline(steps=[
    # Add combined transformer directly instead of as a nested pipeline
    ('combined_transformer', combined_transformer),
    ('normalization_transformer', normalization_transformer),
    ('pca', PCA(n_components=0.95)),  # PCA for feature engineering
    ('smote', SMOTE(random_state=42, sampling_strategy='auto')) , # Apply SMOTE after preprocessing
    # ('classifier', SVC(random_state=42, kernel='rbf'))  # Replace with desired classifier
    # ('classifier', SVC(
    #     C=np.float64(8.424426408004217),
    #     kernel='rbf',
    #     gamma='auto',
    #     random_state=42
    # )),
    # ('classifier', LogisticRegression(
    #     C=np.float64(7.180725777960454),
    #     penalty='l2',
    #     solver='saga',
    # )),
    ('classifier', RandomForestClassifier(
        n_estimators=100,
        min_samples_split=2,
        min_samples_leaf=4,
        max_depth=20,
        random_state=42
    )),
])


# Fit the pipeline with resampling on training data
resampling_pipeline.fit(X_train, y_train)

# Evaluate the resampled model on training and test data
y_train_pred_resampled = resampling_pipeline.predict(X_train)
y_test_pred_resampled = resampling_pipeline.predict(X_test)

# Training performance
print("Training Data Evaluation:")
train_accuracy_resampled = accuracy_score(y_train, y_train_pred_resampled)
train_report_resampled = classification_report(y_train, y_train_pred_resampled)
train_conf_matrix_resampled = confusion_matrix(y_train, y_train_pred_resampled)

# Test performance
print("\nTest Data Evaluation:")
test_accuracy_resampled = accuracy_score(y_test, y_test_pred_resampled)
test_report_resampled = classification_report(y_test, y_test_pred_resampled)
test_conf_matrix_resampled = confusion_matrix(y_test, y_test_pred_resampled)

print(train_accuracy_resampled, train_report_resampled, train_conf_matrix_resampled,
 test_accuracy_resampled, test_report_resampled, test_conf_matrix_resampled)


Training Data Evaluation:

Test Data Evaluation:
0.9637329286798179               precision    recall  f1-score   support

           0       1.00      0.96      0.98     29238
           1       0.76      0.99      0.86      3712

    accuracy                           0.96     32950
   macro avg       0.88      0.98      0.92     32950
weighted avg       0.97      0.96      0.97     32950
 [[28065  1173]
 [   22  3690]] 0.8897790725904345               precision    recall  f1-score   support

           0       0.98      0.90      0.94      7310
           1       0.51      0.83      0.63       928

    accuracy                           0.89      8238
   macro avg       0.74      0.86      0.78      8238
weighted avg       0.92      0.89      0.90      8238
 [[6563  747]
 [ 161  767]]
