## Load the Libraries

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import RFE
from imblearn.combine import SMOTEENN
from sklearn.ensemble import AdaBoostClassifier

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/predict-the-success-of-bank-telemarketing/sample_submission.csv
/kaggle/input/predict-the-success-of-bank-telemarketing/train.csv
/kaggle/input/predict-the-success-of-bank-telemarketing/test.csv


## Load the Datasets

In [2]:
# Load the datasets
# Both CSVs are read from the competition's read-only Kaggle input directory with pandas defaults.
train = pd.read_csv('/kaggle/input/predict-the-success-of-bank-telemarketing/train.csv')
test = pd.read_csv('/kaggle/input/predict-the-success-of-bank-telemarketing/test.csv')

### Unique Values

In [None]:
# Print the number of distinct (non-null) values in every column of the training set.
for feature, n_unique in train.nunique().items():
    print(feature, ":", n_unique)

### Missing Values

In [4]:
# Recode placeholder values in the training set as NaN so they are counted as missing:
# the string "unknown" in three categorical columns and -1 in 'pdays'.
unknown_coded_cols = ['education', 'contact', 'poutcome']
train[unknown_coded_cols] = train[unknown_coded_cols].replace("unknown", np.nan)
train['pdays'] = train['pdays'].replace(-1, np.nan)

In [None]:
# Percentage of missing (NaN) values in each column of the training set
train.isna().sum() * 100 / len(train)

In [6]:
# Recode placeholder values in the test set as NaN, exactly as done for the training set:
# the string "unknown" in three categorical columns and -1 in 'pdays'.
unknown_coded_cols = ['education', 'contact', 'poutcome']
test[unknown_coded_cols] = test[unknown_coded_cols].replace("unknown", np.nan)
test['pdays'] = test['pdays'].replace(-1, np.nan)

In [None]:
# Percentage of missing (NaN) values in each column of the test set
test.isna().sum() * 100 / len(test)

In [8]:
# The largest balance observed in the training data sets the scale of the wealth bands.
max_balance = train['balance'].max()

# Cut points at one third and two thirds of that maximum.
# NOTE(review): both cut points scale with the single largest balance, so one extreme
# value moves every band boundary - confirm this is intended.
one_third_of_max = max_balance / 3
threshold1, threshold2 = one_third_of_max, 2 * one_third_of_max

# Function to categorize balance
def categorize_balance(balance, low=None, high=None):
    """Map an account balance to a coarse wealth band.

    Parameters
    ----------
    balance : number
        The balance to classify. Missing values (NaN / None) are passed through
        as NaN instead of being labelled, because every comparison against NaN is
        False and the value would otherwise fall through to the last band.
    low, high : number, optional
        Lower and upper cut points between the non-negative bands. When omitted,
        the module-level ``threshold1`` and ``threshold2`` are used, so
        ``Series.apply(categorize_balance)`` behaves as before.

    Returns
    -------
    str or float
        'In Debt' if ``balance < 0``; 'Poor' if ``0 <= balance < low``;
        'Middle Class' if ``low <= balance < high``; 'Rich' otherwise;
        ``np.nan`` if ``balance`` is missing.
    """
    if pd.isna(balance):
        return np.nan

    # Fall back to the notebook-level cut points only when none were supplied.
    if low is None:
        low = threshold1
    if high is None:
        high = threshold2

    if balance < 0:
        return 'In Debt'
    if balance < low:
        return 'Poor'
    if balance < high:
        return 'Middle Class'
    return 'Rich'

In [9]:
# Parse 'last contact date' once and derive calendar features from its datetime accessor.
contact_dt = pd.to_datetime(train['last contact date']).dt

train['last_contact_year'] = contact_dt.year
train['last_contact_month'] = contact_dt.month
train['last_contact_day'] = contact_dt.day
train['day_of_week'] = contact_dt.dayofweek
# dayofweek is 0 for Monday ... 6 for Sunday, so values >= 5 are Saturday/Sunday.
train['is_weekend'] = (train['day_of_week'] >= 5).astype('int64')
train['quarter'] = contact_dt.quarter

# Bucket the balance into the wealth bands defined by categorize_balance.
train['balance_category'] = train['balance'].apply(categorize_balance)

# Drop 'last contact date' as it's no longer needed, and 'poutcome' and 'pdays'
# as they have more than 75% missing data.
train = train.drop(columns=['last contact date', 'poutcome', 'pdays'])

In [10]:
# Apply the same feature engineering to the test dataset as was applied to the training set.
contact_dt = pd.to_datetime(test['last contact date']).dt

test['last_contact_year'] = contact_dt.year
test['last_contact_month'] = contact_dt.month
test['last_contact_day'] = contact_dt.day
test['day_of_week'] = contact_dt.dayofweek
# dayofweek is 0 for Monday ... 6 for Sunday, so values >= 5 are Saturday/Sunday.
test['is_weekend'] = (test['day_of_week'] >= 5).astype('int64')
test['quarter'] = contact_dt.quarter

# Bucket the balance into the wealth bands defined by categorize_balance.
test['balance_category'] = test['balance'].apply(categorize_balance)

# Drop the 'last contact date', 'poutcome' and 'pdays' columns from test.
test = test.drop(columns=['last contact date', 'poutcome', 'pdays'])

## EDA

In [None]:
# Preview the first rows of the training frame after feature engineering
train.head()

In [None]:
# Report (rows, columns) for the training frame and then the test frame.
for frame in (train, test):
    print(frame.shape)

In [None]:
# Column dtypes and non-null counts of the training frame
train.info()

In [None]:
# Summary statistics for the numeric columns
train.describe()

In [None]:
# Summary statistics (count, unique, top, freq) for the object-dtype columns
train.describe(include = ['object'])

In [None]:
# Class balance of the label column
train['target'].value_counts()

In [None]:
# How many contacts fall on a weekend (1) versus a weekday (0)
train['is_weekend'].value_counts()

In [None]:
# Number of contacts per calendar quarter
train['quarter'].value_counts()

In [None]:
# Histogram of customer age in the training set, drawn on an explicitly created axes.
fig, ax = plt.subplots()
sns.histplot(data=train, x="age", ax=ax)
ax.set_xlabel('Age')
ax.set_ylabel('Distribution')
ax.set_title('Distribution of Age')

In [None]:
# Histogram of the month of last contact in the training set, drawn on an explicitly created axes.
fig, ax = plt.subplots()
sns.histplot(data=train, x="last_contact_month", ax=ax)
ax.set_xlabel('Month')
ax.set_ylabel('Distribution')
ax.set_title('Distribution of Last Contact Month')

In [None]:
# Histogram of the engineered balance category, split by the target label.
ax = sns.histplot(train, x="balance_category", hue="target")
# The x-axis label was the truncated string 'Social ' (with a trailing space);
# it now matches the plot title.
ax.set_xlabel('Social Class')
ax.set_ylabel('Distribution')
ax.set_title('Distribution of Social Class')

# Inferences
The DataSet is **Highly Imbalanced**.

- Customers are contacted on weekdays.
- Banks contact their customers mostly in 2nd Quarter and 3rd Quarter.
- Most of the calls are done in May.
- The age distribution is right-skewed; most customers are aged 30-40.
- Customers who have credit in default are few in number but have a high chance of subscribing to a term deposit.
- The mean number of previous contacts is about 11.8 with a standard deviation of 44.14, which shows that most people received very few contacts while a small number received significantly more (up to 275).
- Almost all middle-class and rich customers have subscribed, while customers in debt are not subscribing to a term deposit.
- The columns duration, balance, campaign and previous are positively correlated with each other.

## Preprocessing

In [22]:
# Binary-encode the label column: 'yes' -> 1, 'no' -> 0.
target_mapping = {'yes': 1, 'no': 0}
y = train['target'].map(target_mapping)

# Every remaining column is kept as a candidate feature.
X = train.drop(columns='target')

Split the data into **Training & Validation**

In [23]:
# X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.80, random_state=42)

In [24]:
# Columns to apply OrdinalEncoder
categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'last_contact_year',
                    'last_contact_month', 'last_contact_day', 'day_of_week', 'is_weekend', 'quarter', 'balance_category']

# Column-wise preprocessing; columns not listed in any transformer are dropped
# (ColumnTransformer's default remainder), and the output is a pandas DataFrame.
preprocessor = ColumnTransformer(transformers=[
    # Standardize the continuous features
    ('num_standard', Pipeline([
        ('scaler', StandardScaler())
    ]), ['age', 'duration', 'balance']),

    # Rescale the count features to the [0, 1] range
    ('num_counts', Pipeline([
        ('scaler', MinMaxScaler())
    ]), ['campaign', 'previous']),

    # OrdinalEncoder for categorical features
    ('cat', Pipeline([
        # Categories that appear only at transform time (e.g. in the test set) are encoded
        # as NaN instead of raising, so the imputer below fills them like any other missing value.
        ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)),
        ('imputer', IterativeImputer(initial_strategy="constant"))
    ]), categorical_cols)
]).set_output(transform='pandas')

In [25]:
# X_train_processed = preprocessor.fit_transform(X_train)
# X_val_processed = preprocessor.transform(X_val)

In [26]:
# X_train_processed.corr()

In [27]:
# # Apply SMOTE to balance the class distribution
# smote = SMOTEENN(random_state=42)
# X_train_smote, y_train_smote = smote.fit_resample(X_train_processed, y_train)

In [28]:
# # Define the RandomForestClassifier with class_weight='balanced'
# rf_balanced = RandomForestClassifier(
#     n_estimators=200,         # Number of trees
#     class_weight='balanced',  # Handle class imbalance
#     random_state=42,          # Reproducibility
#     max_depth=20,             # Limit the depth of each tree (regularization)
# )

# # Perform Recursive Feature Elimination (RFE)
# rfe = RFE(estimator=rf_balanced, n_features_to_select=15)

# # Fit and transform the training data
# X_train_rfe = rfe.fit_transform(X_train_smote, y_train_smote)

# # Transform the validation data
# X_val_rfe = rfe.transform(X_val_processed)

# # Wrap Random Forest with AdaBoost
# ada_rf = AdaBoostClassifier(
#     base_estimator=rf_balanced,  # Use the Random Forest model
#     n_estimators=50,            # Number of boosting stages
#     learning_rate=0.1,          # Controls the contribution of each model
#     random_state=42
# )

# # Fit the model with the reduced features and data subset
# ada_rf.fit(X_train_rfe, y_train_smote)
# rf_predictions = ada_rf.predict(X_val_rfe)

In [29]:
# # Define Logistic Regression with a solver that supports both 'l1' and 'l2' penalties
# lr = LogisticRegression(max_iter=2000, class_weight='balanced', random_state=42, solver='saga', penalty='l1', C=0.01)

# # Perform Recursive Feature Elimination (RFE)
# rfe = RFE(estimator=lr, n_features_to_select=15)

# # Fit and transform the training data using RFE
# X_train_rfe = rfe.fit_transform(X_train_smote, y_train_smote)

# # Transform the validation data
# X_val_rfe = rfe.transform(X_val_processed)

# # # Fit the GridSearchCV on the RFE-transformed training data
# # # Define the parameter grid for Logistic Regression
# # lr_param_grid = {
# #     'penalty': ['l1', 'l2'],              # Regularization types
# #     'C': [0.01, 0.1, 1, 10],             # Inverse regularization strength
# #     'solver': ['saga'],                  # Saga supports both l1 and l2
# #     'class_weight': ['balanced']         # Handle class imbalance
# # }
# # # Perform GridSearchCV for Logistic Regression
# # lr_grid_search = GridSearchCV(estimator=lr, param_grid=lr_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
# lr.fit(X_train_rfe, y_train_smote)

# # Make predictions on the validation data
# lr_predictions = lr.predict(X_val_rfe)

In [30]:
# # Define the KNeighborsClassifier
# knn = KNeighborsClassifier(n_neighbors=3,weights='distance',metric='manhattan')

# # Perform Recursive Feature Elimination (RFE)
# # Note: KNN doesn't have built-in feature importance, so we use a Logistic Regression estimator for RFE
# lr_for_rfe = LogisticRegression(max_iter=2000, class_weight='balanced', random_state=42, solver='saga')
# rfe = RFE(estimator=lr_for_rfe, n_features_to_select=15)

# # Fit and transform the training data using RFE
# X_train_rfe = rfe.fit_transform(X_train_smote, y_train_smote)

# # Transform the validation data
# X_val_rfe = rfe.transform(X_val_processed)

# # # Fit the GridSearchCV on the RFE-transformed training data
# # # Define the parameter grid for KNeighborsClassifier
# # knn_param_grid = {
# #     'n_neighbors': [3, 5, 7, 9],             # Number of neighbors to consider
# #     'weights': ['uniform', 'distance'],      # Weight function
# #     'metric': ['euclidean', 'manhattan']     # Distance metrics
# # }
# # knn_grid_search = GridSearchCV(estimator=knn, param_grid=knn_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
# knn.fit(X_train_rfe, y_train_smote)

# # Make predictions on the validation data
# knn_predictions = knn.predict(X_val_rfe)

In [31]:
# # Function to plot a confusion matrix
# def plot_confusion_matrix(cm, title):
#     plt.figure(figsize=(6, 4))
#     sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])
#     plt.xlabel('Predicted Label')
#     plt.ylabel('True Label')
#     plt.title(title)
#     plt.show()

# # Confusion Matrix and Classification Report for Random Forest
# rf_cm = confusion_matrix(y_val, rf_predictions)
# plot_confusion_matrix(rf_cm, "Confusion Matrix - Random Forest")
# print("Classification Report - Random Forest:")
# print(classification_report(y_val, rf_predictions, target_names=['No', 'Yes']))

# # Confusion Matrix and Classification Report for Logistic Regression
# lr_cm = confusion_matrix(y_val, lr_predictions)
# plot_confusion_matrix(lr_cm, "Confusion Matrix - Logistic Regression")
# print("Classification Report - Logistic Regression:")
# print(classification_report(y_val, lr_predictions, target_names=['No', 'Yes']))

# # Confusion Matrix and Classification Report for KNN
# knn_cm = confusion_matrix(y_val, knn_predictions)
# plot_confusion_matrix(knn_cm, "Confusion Matrix - KNN")
# print("Classification Report - KNN:")
# print(classification_report(y_val, knn_predictions, target_names=['No', 'Yes']))

# Inferences

1) Overall Performance Across Models:
    - Random Forest achieves the highest accuracy (85%) among the three models.
    - Logistic Regression has a slightly lower accuracy (79%), while KNN achieves 81%.
    - In terms of weighted average F1-score, Random Forest is the best (0.87), followed by Logistic Regression (0.81) and KNN (0.82).

2) Class-Specific Performance:
    - Class 0 ("No"): All three models perform well for the majority class (No), with Random Forest achieving the best precision (96%) and F1-score (0.91).
    - Class 1 ("Yes"): Random Forest performs best for the minority class (Yes), with a recall of 79% and F1-score of 0.62.

3) Class Imbalance Effects:
    - All three models favor the majority class (No), as seen from their higher precision, recall, and F1-scores for this class.
    - Random Forest appears to handle class imbalance better than the others, as it maintains a higher recall (about 80%) for the minority class.

## Model Development

In [32]:
# Fit the preprocessor on the full set of labelled training features, then apply the
# already-fitted transformer to the test set so both share the same encodings and scaling.
X_train_processed = preprocessor.fit_transform(X)
X_test_processed = preprocessor.transform(test)

In [33]:
# Resample the training data with SMOTEENN to balance the class distribution.
# SMOTEENN (imblearn.combine) combines SMOTE over-sampling with Edited Nearest Neighbours
# cleaning, so the resampled set differs from what plain SMOTE would produce.
smote = SMOTEENN(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_processed, y)

### Random Forest Classifier

In [34]:
# Define the RandomForestClassifier with class_weight='balanced'
rf_balanced = RandomForestClassifier(
    n_estimators=200,         # Number of trees
    class_weight='balanced',  # Handle class imbalance
    random_state=42,          # Reproducibility
    max_depth=15,             # Limit the depth of each tree (regularization)
)

# Perform Recursive Feature Elimination (RFE), keeping the 15 features the forest ranks highest
rfe = RFE(estimator=rf_balanced, n_features_to_select=15)

# Fit the selector on the resampled training data and reduce it to the selected features
X_train_rfe = rfe.fit_transform(X_train_smote, y_train_smote)

# Apply the same feature selection to the test data
X_test_rfe = rfe.transform(X_test_processed)

# Wrap Random Forest with AdaBoost
# `estimator` replaces `base_estimator`, which was deprecated in scikit-learn 1.2 and removed
# in 1.4. The notebook already requires >= 1.2 for `set_output`, so `estimator` is always available.
ada_rf = AdaBoostClassifier(
    estimator=rf_balanced,      # Use the Random Forest model
    n_estimators=250,           # Number of boosting stages
    learning_rate=0.1,          # Controls the contribution of each model
    random_state=42
)

In [35]:
# Fit the boosted model on the resampled, RFE-reduced training data,
# then predict 0/1 labels for the RFE-reduced test set.
ada_rf.fit(X_train_rfe, y_train_smote)
test_predictions = ada_rf.predict(X_test_rfe)



### Logistic Regression

In [36]:
# # Define Logistic Regression with a solver that supports both 'l1' and 'l2' penalties
# lr = LogisticRegression(max_iter=2000, class_weight='balanced', random_state=42, solver='saga', penalty='l1', C=0.01)


# # Perform Recursive Feature Elimination (RFE)
# rfe = RFE(estimator=lr, n_features_to_select=15)

# # Fit and transform the training data using RFE
# X_train_rfe = rfe.fit_transform(X_train_smote, y_train_smote)

# # Transform the validation data
# X_test_rfe = rfe.transform(X_test_processed)

# # Fit the GridSearchCV on the RFE-transformed training data
# lr.fit(X_train_rfe, y_train_smote)

# # Make predictions on the validation data
# test_predictions = lr.predict(X_test_rfe)

### K Nearest Neighbours

In [37]:
# # Define the KNeighborsClassifier
# knn = KNeighborsClassifier(n_neighbors=3,weights='distance',metric='manhattan')

# # Perform Recursive Feature Elimination (RFE)
# # Note: KNN doesn't have built-in feature importance, so we use a Logistic Regression estimator for RFE
# lr_for_rfe = LogisticRegression(max_iter=2000, class_weight='balanced', random_state=42, solver='saga')
# rfe = RFE(estimator=lr_for_rfe, n_features_to_select=15)

# # Fit and transform the training data using RFE
# X_train_rfe = rfe.fit_transform(X_train_smote, y_train_smote)

# # Transform the validation data
# X_test_rfe = rfe.transform(X_test_processed)

# # Fit the GridSearchCV on the RFE-transformed training data
# knn.fit(X_train_rfe, y_train_smote)

# # Make predictions on the validation data
# test_predictions = knn.predict(X_test_rfe)

# Submission

In [38]:
# Translate the 0/1 predictions back to the original 'no'/'yes' labels.
inverse_target_mapping = {1: 'yes', 0: 'no'}
test_predictions_mapped = pd.Series(test_predictions).map(inverse_target_mapping)

# Assemble the two-column submission and write it to the working directory.
# NOTE(review): 'id' is the row index of `test`, not a column read from the file -
# confirm this matches the ids expected in sample_submission.csv.
submission_columns = {'id': test.index, 'target': test_predictions_mapped}
submission_df = pd.DataFrame(submission_columns)
submission_df.to_csv('submission.csv', index=False)