In [17]:
# Download the dataset
!wget -q https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip

# Unzip the dataset
!unzip -o -q bank.zip


In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    accuracy_score,
    recall_score,
    precision_score,
    f1_score,
    roc_auc_score
)

In [19]:
# Install and import imblearn for SMOTE
!pip install -q imblearn
from imblearn.over_sampling import SMOTE

In [20]:
# Import TensorFlow and Keras for Neural Network
!pip install -q tensorflow
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# For handling warnings
import warnings
warnings.filterwarnings('ignore')

In [21]:
# Load and Preprocess the Data

# Read the bank-full.csv
data = pd.read_csv("bank-full.csv", sep=';')

# Display the first few rows
print("First 5 rows of the dataset:")
print(data.head())

# Display dataset information
print("\nDataset Information:")
print(data.info())

# Display class distribution before preprocessing
print("\nClass Distribution Before Preprocessing:")
print(data['y'].value_counts())


First 5 rows of the dataset:
   age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   
3   47   blue-collar  married    unknown      no     1506     yes   no   
4   33       unknown   single    unknown      no        1      no   no   

   contact  day month  duration  campaign  pdays  previous poutcome   y  
0  unknown    5   may       261         1     -1         0  unknown  no  
1  unknown    5   may       151         1     -1         0  unknown  no  
2  unknown    5   may        76         1     -1         0  unknown  no  
3  unknown    5   may        92         1     -1         0  unknown  no  
4  unknown    5   may       198         1     -1         0  unknown  no  

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entr

In [22]:
# Drop 'duration' to Avoid Data Leakage

# In realistic scenarios, 'duration' is unknown at the time of prediction.
if 'duration' in data.columns:
    data = data.drop(['duration'], axis=1)
    print("\n'Duration' column dropped to prevent data leakage.")
else:
    print("\n'Duration' column not found in the dataset.")


'Duration' column dropped to prevent data leakage.


In [23]:
# Handle Missing Values
# Check for missing values
missing_values = data.isnull().sum()
print("\nMissing Values in Each Column:")
print(missing_values)

# If missing values exist, handle them
if missing_values.sum() > 0:
    # For numerical columns, fill with median
    numerical_cols = data.select_dtypes(include=[np.number]).columns
    data[numerical_cols] = data[numerical_cols].fillna(data[numerical_cols].median())

    # For categorical columns, fill with mode
    categorical_cols = data.select_dtypes(include=['object']).columns
    data[categorical_cols] = data[categorical_cols].fillna(data[categorical_cols].mode().iloc[0])
    print("\nMissing values have been handled.")
else:
    print("\nNo missing values found.")



Missing Values in Each Column:
age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

No missing values found.


In [24]:
# Outlier Detection and Handling for 'previous' Column
# Detect and cap outliers in the 'previous' column using the IQR method
if 'previous' in data.columns:
    Q1 = data['previous'].quantile(0.25)
    Q3 = data['previous'].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Capping the outliers
    data['previous'] = np.where(data['previous'] < lower_bound, lower_bound, data['previous'])
    data['previous'] = np.where(data['previous'] > upper_bound, upper_bound, data['previous'])
    print("\nOutliers in 'previous' column have been capped.")
else:
    print("\n'previous' column not found in the dataset.")


Outliers in 'previous' column have been capped.


In [25]:
# Handle 'pdays' Column
# 'pdays' - number of days that passed after the client was last contacted from a previous campaign
# -1 indicates that the client was not previously contacted

# Create a new binary feature indicating if the client was previously contacted
data['previously_contacted'] = np.where(data['pdays'] == -1, 0, 1)

# Replace -1 in 'pdays' with 0 for scaling purposes
data['pdays'] = data['pdays'].replace(-1, 0)

print("\n'previously_contacted' feature created and 'pdays' values adjusted.")



'previously_contacted' feature created and 'pdays' values adjusted.


In [26]:
# Feature Engineering on Numeric Columns
# Define numeric columns
numeric_cols = ['age', 'pdays', 'previous', 'balance', 'campaign', 'day']

# Create new ratio-based features, handling potential division by zero
data['age_balance_ratio'] = data['age'] / (data['balance'] + 1e-6)  # Adding a small constant to avoid division by zero
data['campaign_balance_ratio'] = data['campaign'] / (data['balance'] + 1e-6)
data['previous_campaign_ratio'] = data['previous'] / (data['campaign'] + 1e-6)

print("\nFeature engineering on numeric columns completed.")


Feature engineering on numeric columns completed.


In [28]:
# Encode Categorical Variables
# Identify categorical columns (excluding the target variable 'y')
categorical_cols = data.drop('y', axis=1).select_dtypes(include=['object']).columns
print("\nCategorical Columns:", categorical_cols.tolist())

# One-Hot Encoding for categorical variables
X = pd.get_dummies(data.drop('y', axis=1), columns=categorical_cols, drop_first=True)
print("\nShape of feature matrix after one-hot encoding:", X.shape)

# Convert target variable to binary
y = data['y'].map({'yes': 1, 'no': 0})

# Display class distribution before SMOTE
print("\nClass Distribution Before SMOTE:")
print(y.value_counts())


Categorical Columns: ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']

Shape of feature matrix after one-hot encoding: (45211, 45)

Class Distribution Before SMOTE:
y
0    39922
1     5289
Name: count, dtype: int64


In [29]:
# Apply SMOTE for Oversampling Before Scaling
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

print("\nClass Distribution After SMOTE:")
print(pd.Series(y_resampled).value_counts())


Class Distribution After SMOTE:
y
0    39922
1    39922
Name: count, dtype: int64


In [30]:
# Scale Only Numerical Features
# Define all numeric columns including the newly created features
scaled_numeric_cols = [
    'age', 'pdays', 'previous', 'balance', 'campaign', 'day',
    'age_balance_ratio', 'campaign_balance_ratio', 'previous_campaign_ratio'
]

# Check if all scaled_numeric_cols are in X_resampled
missing_scaled_numeric = set(scaled_numeric_cols) - set(X_resampled.columns)
if missing_scaled_numeric:
    print(f"\nWarning: The following numerical columns are missing and will be skipped for scaling: {missing_scaled_numeric}")
    scaled_numeric_cols = list(set(scaled_numeric_cols) - missing_scaled_numeric)
else:
    print("\nAll numeric columns identified for scaling.")



All numeric columns identified for scaling.


In [31]:
# Extract numerical columns for scaling
X_numerical = X_resampled[scaled_numeric_cols]

# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the numerical features
X_numerical_scaled = scaler.fit_transform(X_numerical)

# Create a DataFrame for scaled numerical features
X_numerical_scaled_df = pd.DataFrame(
    X_numerical_scaled,
    columns=scaled_numeric_cols,
    index=X_resampled.index
)

# Drop original numerical columns from X_resampled
X_resampled = X_resampled.drop(columns=scaled_numeric_cols)

# Concatenate scaled numerical features with the rest of the data
X_resampled = pd.concat([X_resampled, X_numerical_scaled_df], axis=1)

print("\nShape of feature matrix after scaling numerical features:", X_resampled.shape)



Shape of feature matrix after scaling numerical features: (79844, 45)


In [32]:
# Split the Data into Train and Test Sets
X_final = X_resampled
y_final = y_resampled

X_train, X_test, y_train, y_test = train_test_split(
    X_final, y_final,
    test_size=0.2,  # 20% for testing
    random_state=42,
    stratify=y_final
)

print("\nTraining Set Class Distribution:")
print(pd.Series(y_train).value_counts())
print("\nTesting Set Class Distribution:")
print(pd.Series(y_test).value_counts())


Training Set Class Distribution:
y
1    31938
0    31937
Name: count, dtype: int64

Testing Set Class Distribution:
y
0    7985
1    7984
Name: count, dtype: int64


In [34]:
# Hyperparameter Tuning for Random Forest using RandomizedSearchCV
# Define the parameter grid for RandomizedSearchCV
param_dist = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 6],
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False]
}

# Initialize the Random Forest classifier
rf = RandomForestClassifier(random_state=42)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=10,  # Number of parameter settings that are sampled
    scoring='roc_auc',
    cv=3,  # 3-fold cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1  # Use all available cores
)

# Fit RandomizedSearchCV to the training data
print("\nStarting RandomizedSearchCV for Hyperparameter Tuning...")
random_search.fit(X_train, y_train)

# Retrieve the best parameters
best_params_rf = random_search.best_params_
print("\nBest Parameters Found for Random Forest:")
print(best_params_rf)


Starting RandomizedSearchCV for Hyperparameter Tuning...
Fitting 3 folds for each of 10 candidates, totalling 30 fits

Best Parameters Found for Random Forest:
{'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 30, 'bootstrap': True}
