In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

| Variable           | Description                                         | Purpose                                               | Impact                                                                                                        | Data Type   |
|--------------------|-----------------------------------------------------|-------------------------------------------------------|---------------------------------------------------------------------------------------------------------------|-------------|
| last contact date  | Date of the last contact                            | Measures recentness of engagement                     | Recency of contact may influence the client's likelihood to respond positively                                | Date        |
| age                | Age of the client                                   | Indicates life stage and financial maturity           | Older clients may have higher savings and investment potential                                               | Numeric     |
| job                | Type of job                                         | Reflects socioeconomic background                     | Certain jobs may correlate with higher financial stability, affecting deposit likelihood                     | Categorical |
| marital            | Marital status                                      | Indicates family structure and financial priorities   | Married individuals may be more inclined toward savings and long-term deposits                               | Categorical |
| education          | Level of education                                  | Represents knowledge level and career potential       | Higher education may correlate with increased financial literacy, leading to more interest in deposits       | Categorical |
| default            | Has credit in default?                              | Measures past financial risk                          | Clients with credit in default may be less likely to make new financial commitments                          | Binary      |
| balance            | Average yearly balance, in euros                    | Indicates financial standing and liquidity            | Higher balances indicate a client’s potential ability to make deposits                                       | Numeric     |
| housing            | Has housing loan?                                   | Measures existing financial obligations               | Clients with housing loans might have limited capacity for new financial commitments                         | Binary      |
| loan               | Has personal loan?                                  | Measures existing personal debt                       | Clients with personal loans may have restricted funds for deposits                                           | Binary      |
| contact            | Type of contact communication                       | Reflects ease and mode of contact                     | Clients contacted via cellular may have more accessibility for follow-ups                                    | Categorical |
| duration           | Last contact duration, in seconds                   | Measures engagement quality                           | Longer durations can indicate meaningful interactions, possibly increasing deposit likelihood                | Numeric     |
| campaign           | Number of contacts during this campaign             | Measures campaign intensity                           | Higher contact frequency may improve familiarity, potentially increasing response rates                       | Numeric     |
| pdays              | Days since last contact in a previous campaign      | Indicates past campaign engagement timing             | Longer gaps may reduce familiarity, impacting client receptiveness                                          | Numeric     |
| previous           | Number of contacts before this campaign             | Tracks historical engagement with the client          | More prior engagements may build client relationship, increasing the probability of a positive response      | Numeric     |
| poutcome           | Outcome of previous marketing campaign              | Measures past campaign success                        | Success in prior campaigns suggests receptiveness, making these clients better prospects                     | Categorical |
| target             | Has the client subscribed to a term deposit?        | Represents the marketing campaign outcome             | Success metric, indicating campaign effectiveness and areas for improvement                                  | Binary      |

**EDA (Exploratory Data Analysis)**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

In [None]:
# Load the competition's training, test and sample-submission CSV files.
train = pd.read_csv('/kaggle/input/predict-the-success-of-bank-telemarketing/train.csv')
test = pd.read_csv('/kaggle/input/predict-the-success-of-bank-telemarketing/test.csv')
# NOTE: despite its name, submission_path holds the loaded DataFrame, not a file path.
submission_path = pd.read_csv('/kaggle/input/predict-the-success-of-bank-telemarketing/sample_submission.csv')

In [None]:
# Show the column labels of the training frame and how many there are.
column_labels = train.columns
print(column_labels)
print("Number of variables:", len(column_labels))

In [None]:
# Print the per-column dtype and non-null count summary of the training frame.
train.info()

In [None]:
# Display summary statistics of the training frame.
train.describe()

In [None]:
# Percentage of rows falling in each target class.
target_share_pct = train['target'].value_counts(normalize=True) * 100
print(target_share_pct)

# Bar chart of the class counts to make the imbalance visible.
ax = sns.countplot(x='target', data=train)
ax.set_title('Distribution of target')
plt.show()

In [None]:
# Count the missing values in each column of the training frame.
train.isnull().sum()

In [None]:
# Draw missingno's nullity matrix for the training frame to see where
# missing values occur across rows and columns.
msno.matrix(train)
plt.show()

In [None]:
# Distinct values present in the three client flag columns and in the label.
print("Default variables:", train['default'].unique())
print("Housing variables:", train['housing'].unique())
print("Loan variables:", train['loan'].unique())
print("Target variables:", train['target'].unique())

**Preprocessing**

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OrdinalEncoder, LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
import pandas as pd

In [None]:
def convert_to_datetime_features(df):
    """Replace the 'last contact date' column with year, month and day columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'last contact date' column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` without 'last contact date' and with
        'last_contact_year', 'last_contact_month' and 'last_contact_day'
        appended. Values that cannot be parsed as dates become NaN in all
        three new columns (``errors='coerce'``). The input is not modified.
    """
    df = df.copy()
    # Parse the column a single time and derive all three parts from it.
    contact_date = pd.to_datetime(df['last contact date'], errors='coerce')
    df['last_contact_year'] = contact_date.dt.year
    df['last_contact_month'] = contact_date.dt.month
    df['last_contact_day'] = contact_date.dt.day
    return df.drop(columns=['last contact date'])


# Single-step pipeline that applies convert_to_datetime_features to the
# columns it is given. validate=False is passed so the function receives
# its input without FunctionTransformer's input validation.
datetime_pipeline = Pipeline([
    ('convert_date', FunctionTransformer(convert_to_datetime_features, validate=False))
])


In [None]:
# Explicit category orders handed to the OrdinalEncoder: the position of a
# value in its list becomes its integer code.
education_order = ['primary', 'secondary', 'tertiary']
# NOTE(review): job is encoded by list position; the ordering below is the one
# chosen here and implies a ranking between job types — confirm it is intended.
job_order = [
    'blue-collar', 'management', 'technician', 'admin.', 'services',
    'self-employed', 'retired', 'entrepreneur', 'unemployed', 'housemaid', 'student'
]

# Fill missing values with the most frequent category, then ordinal-encode.
# The `categories` list is positional: first entry for the first input column,
# second entry for the second input column.
impute_and_encode_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal_encoder', OrdinalEncoder(categories=[education_order, job_order]))
])

In [None]:
def replace_other_with_nan(df):
    """Return a copy of ``df`` in which 'poutcome' values equal to 'other' are NaN.

    The input frame is left untouched; every other column is carried over as is.
    """
    cleaned_poutcome = df['poutcome'].replace('other', np.nan)
    return df.assign(poutcome=cleaned_poutcome)


# Three steps: turn 'other' in poutcome into NaN, fill every missing value
# with the column's most frequent category, then one-hot encode while
# dropping the first category of each feature (dense output).
impute_and_onehot_pipeline = Pipeline([
    ('replace_other', FunctionTransformer(replace_other_with_nan, validate=False)),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', sparse_output=False))
])

In [None]:
def create_marital_columns(df):
    """Expand 'marital' into three 0/1 indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'marital' column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` without 'marital' and with integer columns
        'married', 'single' and 'divorced' appended (1 where 'marital'
        equals that status, 0 otherwise). The input is not modified.
    """
    df = df.copy()
    for status in ('married', 'single', 'divorced'):
        # Vectorised comparison; missing or unlisted statuses compare False -> 0.
        df[status] = (df['marital'] == status).astype(int)
    return df.drop(columns=['marital'])

# 'marital' is expanded into three 0/1 indicator columns by
# create_marital_columns. The indicators are already numeric, so no further
# encoder is applied: OneHotEncoder(drop='first') on a 0/1 column yields the
# same values again, and it would silently drop an indicator that is constant
# in the fitted data, changing the number of output columns.
marital_pipeline = Pipeline([
    ('create_columns', FunctionTransformer(create_marital_columns, validate=False)),
])

In [None]:
# Column groups; each list names the input columns routed to one transformer
# of the ColumnTransformer.
datetime_columns = ['last contact date']                # split into year / month / day
ordinal_encode_categorical_columns = ['education', 'job']  # imputed, then ordinal-encoded
standard_cols = ['age']                                 # StandardScaler
robust_cols = ['balance','pdays']                       # RobustScaler
minmax_cols = ['duration', 'campaign', 'previous']      # MinMaxScaler
binary_columns = ['default', 'housing', 'loan']         # one-hot with first category dropped
na_columns = ['contact', 'poutcome']                    # imputed, then one-hot encoded
marital_columns = ['marital']                           # expanded into indicator columns

**StandardScaler:** Best for features that are approximately normally distributed, as it centers the data by subtracting the mean and then scales it to unit variance.

**MinMaxScaler:** Useful for features with a positive range or bounded scale, as it normalizes values to a specific range (typically 0–1), preserving feature structure.

**RobustScaler:** Ideal for features with potential outliers, as it scales values using the median and interquartile range (IQR) rather than mean and standard deviation, reducing the influence of outliers.

In [None]:
# Route each column group to its transformer. The output columns follow the
# order of the entries below, which the hard-coded column names used later
# to rebuild the DataFrames rely on.
preprocessor = ColumnTransformer(
    transformers=[
        ('convert_date', datetime_pipeline, datetime_columns),
        ('na_encode_pipeline', impute_and_encode_pipeline, ordinal_encode_categorical_columns),
        ('standard', StandardScaler(), standard_cols),
        ('robust', RobustScaler(), robust_cols),
        ('minmax', MinMaxScaler(), minmax_cols),
        # Named 'Label' but this is a one-hot encoder with the first category dropped.
        ('Label', OneHotEncoder(drop='first', sparse_output=False), binary_columns),
        ('na_onehot', impute_and_onehot_pipeline, na_columns),
        ('marital', marital_pipeline, marital_columns)
    ],
    # Columns not listed in any group are appended unchanged.
    remainder='passthrough'
)

In [None]:
# Separate the label from the predictors, learn the preprocessing on the
# training predictors only, then apply it to both data sets.
X = train.drop(columns='target')
y = train['target']
preprocessor.fit(X)
# NOTE: `train` and `test` are rebound here from the raw DataFrames to the
# transformed outputs, so this cell cannot be re-run without reloading the data.
train, test = preprocessor.transform(X), preprocessor.transform(test)

In [None]:
# Column labels for the preprocessor output, grouped in transformer order.
# NOTE(review): assumes every one-hot encoded feature (default, housing, loan,
# contact, poutcome) yields exactly one output column — verify on the fitted encoders.
date_part_names = ['last_contact_year', 'last_contact_month', 'last_contact_day']
ordinal_names = ['education', 'job']
scaled_names = ['age', 'balance', 'pdays', 'duration', 'campaign', 'previous']
flag_names = ['default', 'housing', 'loan']
imputed_onehot_names = ['contact', 'poutcome']
marital_names = ['married', 'single', 'divorced']
transformed_columns = (
    date_part_names + ordinal_names + scaled_names
    + flag_names + imputed_onehot_names + marital_names
)

# Wrap the transformed arrays back into labelled DataFrames.
train = pd.DataFrame(train, columns=transformed_columns)
test = pd.DataFrame(test, columns=transformed_columns)

train.head()


In [None]:
# Pairwise correlations between the transformed features, drawn as an
# annotated heatmap on an explicitly created 12x10 figure.
correlation_matrix = train.corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5, ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. The split is stratified on the
# label so that both parts keep the same yes/no ratio; with an imbalanced
# target an unstratified split can leave the validation set with a different
# class mix and make the macro-F1 estimates noisier.
X_train, X_val, y_train, y_val = train_test_split(
    train, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline


# Baseline: standardise the features, then fit a class-weighted logistic regression.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('logistic', LogisticRegression(class_weight='balanced', max_iter=500)),
])
pipeline.fit(X_train, y_train)

# Macro-averaged F1 on the hold-out split.
y_pred = pipeline.predict(X_val)
logistic_f1 = f1_score(y_val, y_pred, average='macro')
print(f"F1 Score: {logistic_f1}")


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees and a fixed seed, scored by macro F1 on the hold-out split.
random_forest = RandomForestClassifier(random_state=42, n_estimators=100)
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_val)
forest_f1 = f1_score(y_val, y_pred, average='macro')
print(f"F1 Score: {forest_f1}")

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV

# Encode the string labels as integers: 'yes' -> 1, 'no' -> 0.
label_to_int = {'yes': 1, 'no': 0}
y_train_encoded = y_train.map(label_to_int)
y_val_encoded = y_val.map(label_to_int)

# Weight for the positive class = (#negatives / #positives) in the training
# labels; falls back to 1 when there are no positives to avoid dividing by zero.
n_positive = int((y_train_encoded == 1).sum())
n_negative = int((y_train_encoded == 0).sum())
scale_pos_weight = n_negative / n_positive if n_positive > 0 else 1

# XGBoost with the class weight above, scored by macro F1 on the hold-out split.
xgb_model = XGBClassifier(scale_pos_weight=scale_pos_weight, random_state=42)
xgb_model.fit(X_train, y_train_encoded)
y_pred = xgb_model.predict(X_val)
xgb_f1 = f1_score(y_val_encoded, y_pred, average='macro')
print(f"F1 Score: {xgb_f1}")

In [None]:
# Hyper-parameter grid searched exhaustively for the XGBoost model
# (2 * 3 * 3 * 2 = 36 combinations, each evaluated with 3-fold CV).
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 1.0],
)

grid_search = GridSearchCV(xgb_model, param_grid, scoring='f1_macro', cv=3, verbose=2)
grid_search.fit(X_train, y_train_encoded)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best F1 Score from cross-validation: {grid_search.best_score_}")

In [None]:
from sklearn.model_selection import cross_val_score

# Re-score the best grid-search configuration with 5-fold cross-validation (macro F1).
xgb_model_final = grid_search.best_estimator_
cv_f1_scores = cross_val_score(
    estimator=xgb_model_final,
    X=X_train,
    y=y_train_encoded,
    scoring='f1_macro',
    cv=5,
)
print(f"Cross-validated F1 scores: {cv_f1_scores}")
print(f"Mean F1 score: {cv_f1_scores.mean()}")

In [None]:
from sklearn.decomposition import PCA

# Measure how the cross-validated macro F1 of the tuned XGBoost model changes
# with the number of principal components kept.
# PCA sits inside the cross-validated pipeline so it is re-fitted on each
# training fold only; fitting it once on all of X_train before calling
# cross_val_score would let validation-fold rows influence the projection.
component_counts = range(1, X_train.shape[1] + 1)
scores = []
for n in component_counts:
    pca_xgb = Pipeline([
        ('pca', PCA(n_components=n, random_state=42)),  # seed fixed for repeatable runs
        ('xgb', xgb_model_final),
    ])
    score = cross_val_score(pca_xgb, X_train, y_train_encoded, cv=5, scoring='f1_macro')
    scores.append(score.mean())


plt.plot(component_counts, scores, marker='o')
plt.xlabel('Number of Components')
plt.ylabel('Cross-validated F1 Score')
plt.title('Cross-validated F1 Score vs. Number of PCA Components')
plt.show()

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif


# Measure how the cross-validated macro F1 of the tuned XGBoost model changes
# with the number of features kept by a univariate ANOVA F-test.
# The selector sits inside the cross-validated pipeline so it is re-fitted on
# each training fold only; selecting features with the labels of all of
# X_train beforehand would leak validation-fold information.
# scoring='f1_macro' is passed explicitly: without it cross_val_score falls
# back to the estimator's default score (accuracy for classifiers), which is
# not comparable with the other evaluations in this notebook.
feature_counts = range(1, X_train.shape[1] + 1)
scores = []

for k in feature_counts:
    select_xgb = Pipeline([
        ('select', SelectKBest(score_func=f_classif, k=k)),
        ('xgb', xgb_model_final),
    ])
    score = cross_val_score(select_xgb, X_train, y_train_encoded, cv=5, scoring='f1_macro')
    scores.append(score.mean())

# Plot the results to find the optimal k
plt.plot(feature_counts, scores, marker='o')
plt.xlabel('Number of Features (k)')
plt.ylabel('Cross-validated F1 Score (macro)')
plt.title('Model Performance vs. Number of Features')
plt.show()

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes, scored by macro F1 on the hold-out split.
nb_model = GaussianNB().fit(X_train, y_train_encoded)
y_pred_nb = nb_model.predict(X_val)
f1_nb = f1_score(y_true=y_val_encoded, y_pred=y_pred_nb, average='macro')
print(f"F1 Score with Naive Bayes: {f1_nb}")

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# K-Nearest Neighbors (k=5), scored by macro F1 on the hold-out split.
# KNN is distance based, and the date-part and ordinal columns reach this cell
# unscaled, so the features are standardised first — the same scaler step the
# logistic-regression pipelines in this notebook use.
knn_model = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=5)),
])
knn_model.fit(X_train, y_train_encoded)
y_pred_knn = knn_model.predict(X_val)
f1_knn = f1_score(y_val_encoded, y_pred_knn, average='macro')
print(f"F1 Score with KNN: {f1_knn}")

In [None]:
from sklearn.svm import SVC

# Linear-kernel SVM with balanced class weights, scored by macro F1 on the
# hold-out split.
# The margin depends on feature scale, and the date-part and ordinal columns
# reach this cell unscaled, so the features are standardised first — the same
# scaler step the logistic-regression pipelines in this notebook use.
svm_model = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(kernel='linear', class_weight='balanced')),
])
svm_model.fit(X_train, y_train_encoded)
y_pred_svm = svm_model.predict(X_val)
f1_svm = f1_score(y_val_encoded, y_pred_svm, average='macro')
print(f"F1 Score with SVM: {f1_svm}")

In [None]:
from sklearn.linear_model import SGDClassifier

# Logistic-loss linear classifier trained with stochastic gradient descent
# (L2 penalty, balanced class weights), scored by macro F1 on the hold-out split.
# Gradient steps are sensitive to feature scale, and the date-part and ordinal
# columns reach this cell unscaled, so the features are standardised first —
# the same scaler step the logistic-regression pipelines in this notebook use.
sgd_model = Pipeline([
    ('scaler', StandardScaler()),
    ('sgd', SGDClassifier(loss='log_loss', penalty='l2', max_iter=1000, random_state=42, class_weight='balanced')),
])
sgd_model.fit(X_train, y_train_encoded)
y_pred_sgd = sgd_model.predict(X_val)
f1_sgd = f1_score(y_val_encoded, y_pred_sgd, average='macro')
print(f"F1 Score with SGD: {f1_sgd}")

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Base learner: standardise, then class-weighted logistic regression.
model = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('logistic', LogisticRegression(class_weight='balanced', max_iter=500)),
])

# Bag ten copies of the base learner and score the ensemble by macro F1.
bagging_model = BaggingClassifier(estimator=model, random_state=42, n_estimators=10)
bagging_model.fit(X_train, y_train_encoded)

y_pred_bagging = bagging_model.predict(X_val)
f1_bagging = f1_score(y_true=y_val_encoded, y_pred=y_pred_bagging, average='macro')
print(f"F1 Score with Bagging: {f1_bagging}")

In [None]:
from sklearn.ensemble import StackingClassifier

# Level-0 learners: class-weighted random forest and XGBoost.
rf_base = RandomForestClassifier(class_weight='balanced', random_state=42, n_estimators=100)
xgb_base = XGBClassifier(scale_pos_weight=scale_pos_weight, random_state=42)
estimators = [('rf', rf_base), ('xgb', xgb_base)]

# Level-1 learner: class-weighted logistic regression over the base predictions.
meta_learner = LogisticRegression(class_weight='balanced', random_state=42, max_iter=500)
stacking_model = StackingClassifier(estimators=estimators, final_estimator=meta_learner)

stacking_model.fit(X_train, y_train_encoded)
y_pred_stacking = stacking_model.predict(X_val)

f1_stacking = f1_score(y_true=y_val_encoded, y_pred=y_pred_stacking, average='macro')
print(f"F1 Score with Stacking: {f1_stacking}")

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import f1_score

# Negative-to-positive ratio of the training labels, used as XGBoost's class weight.
n_negative = len(y_train_encoded[y_train_encoded == 0])
n_positive = len(y_train_encoded[y_train_encoded == 1])

# The three members of the soft-voting ensemble.
xgb_model = XGBClassifier(
    scale_pos_weight=n_negative / n_positive,
    n_estimators=200,
    max_depth=7,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
rf_model = RandomForestClassifier(class_weight='balanced', random_state=42, n_estimators=100)
lr_model = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('logistic', LogisticRegression(class_weight='balanced', max_iter=500)),
])

# Soft voting averages the members' predicted class probabilities.
ensemble_members = [('xgb', xgb_model), ('rf', rf_model), ('lr', lr_model)]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft')

voting_clf.fit(X_train, y_train_encoded)
y_val_pred_voting = voting_clf.predict(X_val)

f1_voting = f1_score(y_true=y_val_encoded, y_pred=y_val_pred_voting, average='macro')
print(f"F1 Score with Voting Classifier on validation set: {f1_voting}")

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values sampled by the randomized search for XGBoost.
xgb_param_dist = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7, 9],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    min_child_weight=[1, 3, 5],
)

# Class weight = (#negatives / #positives) in the training labels.
train_class_ratio = len(y_train_encoded[y_train_encoded == 0]) / len(y_train_encoded[y_train_encoded == 1])
xgb_search_base = XGBClassifier(scale_pos_weight=train_class_ratio, random_state=42)

# 20 sampled configurations, each scored by 3-fold macro F1.
xgb_random_search = RandomizedSearchCV(
    estimator=xgb_search_base,
    param_distributions=xgb_param_dist,
    n_iter=20,
    scoring='f1_macro',
    cv=3,
    random_state=42,
    verbose=2,
)
xgb_random_search.fit(X_train, y_train_encoded)
best_xgb = xgb_random_search.best_estimator_

print(f"Best XGBoost Parameters: {xgb_random_search.best_params_}")

In [None]:
# Candidate values sampled by the randomized search for the random forest.
rf_param_dist = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# 20 sampled configurations of a class-weighted forest, each scored by 3-fold macro F1.
rf_search_base = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_random_search = RandomizedSearchCV(
    estimator=rf_search_base,
    param_distributions=rf_param_dist,
    n_iter=20,
    scoring='f1_macro',
    cv=3,
    random_state=42,
    verbose=2,
)
rf_random_search.fit(X_train, y_train_encoded)
best_rf = rf_random_search.best_estimator_

print(f"Best RandomForest Parameters: {rf_random_search.best_params_}")

In [None]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble built from the tuned XGBoost and random-forest models
# plus the scaled logistic-regression pipeline.
tuned_members = [('xgb', best_xgb), ('rf', best_rf), ('lr', lr_model)]
voting_clf_tuned = VotingClassifier(estimators=tuned_members, voting='soft')

voting_clf_tuned.fit(X_train, y_train_encoded)
y_val_pred_voting_tuned = voting_clf_tuned.predict(X_val)

f1_voting_tuned = f1_score(y_true=y_val_encoded, y_pred=y_val_pred_voting_tuned, average='macro')
print(f"F1 Score with Tuned Voting Classifier on validation set: {f1_voting_tuned}")

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, f1_score, confusion_matrix

# Confusion matrix of the tuned voting classifier's predictions on the
# validation split (encoded labels: 0 = no, 1 = yes).
conf_matrix = confusion_matrix(y_val_encoded, y_val_pred_voting_tuned)

# Plot the matrix with readable class labels.
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=["No", "Yes"])
disp.plot(cmap="Blues")
plt.title('Confusion Matrix for Tuned Voting Classifier')
plt.show()

In [None]:
# Per-class precision, recall and F1 of the tuned voting classifier on the
# validation split; target_names label class 0 as "No" and class 1 as "Yes".
class_report = classification_report(y_val_encoded, y_val_pred_voting_tuned, target_names=["No", "Yes"])
print("Classification Report:\n", class_report)

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class of the training split with SMOTE.
# Only X_train / y_train_encoded are resampled; the validation split is left
# untouched. random_state is fixed so the synthetic samples — and every model
# fitted on them — are reproducible between runs.
smote = SMOTE(sampling_strategy='minority', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train_encoded)


In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values sampled by the randomized search for XGBoost.
xgb_param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'min_child_weight': [1, 3, 5]
}

# Randomized search (20 configurations, 3-fold macro F1) on the resampled data.
# scale_pos_weight is the negative/positive ratio of the resampled labels.
# NOTE(review): the CV folds are cut from data that was oversampled beforehand,
# so validation folds contain synthetic rows derived from training-fold rows;
# the CV scores are presumably optimistic — consider resampling inside each fold.
# This rebinds xgb_random_search and best_xgb from the earlier, non-resampled search.
xgb_random_search = RandomizedSearchCV(
    XGBClassifier(scale_pos_weight=len(y_resampled[y_resampled == 0]) / len(y_resampled[y_resampled == 1]), random_state=42),
    param_distributions=xgb_param_dist,
    n_iter=20,
    scoring='f1_macro',
    cv=3,
    random_state=42,
    verbose=2
)

xgb_random_search.fit(X_resampled, y_resampled)
best_xgb = xgb_random_search.best_estimator_

print(f"Best XGBoost Parameters: {xgb_random_search.best_params_}")

In [None]:
# Candidate values sampled by the randomized search for the random forest.
rf_param_dist = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# 20 sampled configurations, each scored by 3-fold macro F1 on the resampled
# data. No class weighting is set on the forest in this search.
rf_resampled_base = RandomForestClassifier(random_state=42)
rf_random_search = RandomizedSearchCV(
    estimator=rf_resampled_base,
    param_distributions=rf_param_dist,
    n_iter=20,
    scoring='f1_macro',
    cv=3,
    random_state=42,
    verbose=2,
)
rf_random_search.fit(X_resampled, y_resampled)
best_rf = rf_random_search.best_estimator_

print(f"Best RandomForest Parameters: {rf_random_search.best_params_}")

In [None]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble of the models tuned on the resampled data plus the
# scaled logistic-regression pipeline; fitted on the resampled training data
# and evaluated on the untouched validation split.
resampled_members = [('xgb', best_xgb), ('rf', best_rf), ('lr', lr_model)]
voting_clf_tuned = VotingClassifier(estimators=resampled_members, voting='soft')

voting_clf_tuned.fit(X_resampled, y_resampled)
y_val_pred_voting_tuned = voting_clf_tuned.predict(X_val)

f1_voting_tuned = f1_score(y_true=y_val_encoded, y_pred=y_val_pred_voting_tuned, average='macro')
print(f"F1 Score with Tuned Voting Classifier on validation set: {f1_voting_tuned}")

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, f1_score, confusion_matrix

# Confusion matrix of the voting classifier fitted on the resampled data,
# computed on the validation split (encoded labels: 0 = no, 1 = yes).
conf_matrix = confusion_matrix(y_val_encoded, y_val_pred_voting_tuned)

# Plot the matrix with readable class labels.
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=["No", "Yes"])
disp.plot(cmap="Blues")
plt.title('Confusion Matrix for Tuned Voting Classifier')
plt.show()

In [None]:
# Per-class precision, recall and F1 on the validation split for the voting
# classifier fitted on the resampled data; class 0 is "No", class 1 is "Yes".
class_report = classification_report(y_val_encoded, y_val_pred_voting_tuned, target_names=["No", "Yes"])
print("Classification Report:\n", class_report)

In [None]:
# Class probabilities for every test row from the tuned voting ensemble;
# column 1 is the probability of the positive class.
test_pred_proba_voting_tuned = voting_clf_tuned.predict_proba(test)
positive_proba = test_pred_proba_voting_tuned[:, 1]

# Threshold at 0.5, then translate the 0/1 predictions back to 'no'/'yes'.
test_pred_voting_tuned = (positive_proba > 0.5).astype(int)
int_to_label = {1: 'yes', 0: 'no'}
test_pred_mapped_voting_tuned = pd.Series(test_pred_voting_tuned).map(int_to_label)

# NOTE(review): 'id' is taken from the index of `test`; presumably this matches
# the ids expected in the sample submission — confirm before submitting.
submission = pd.DataFrame({
    'id': test.index,
    'target': test_pred_mapped_voting_tuned,
})
submission.to_csv('submission.csv', index=False)

print("Submission file created successfully!")