# 1. Data Presentation


### Import essential libraries:

- for data manipulation and visualization: Numpy, Pandas, Seaborn, matplotlib

- for preprocessing and handling data imbalance: sklearn.preprocessing, sklearn.impute, imblearn.over_sampling

- for text processing: sklearn.feature_extraction.text

- for model building and evaluation: sklearn.model_selection, sklearn.metrics

- Machine learning models: sklearn.linear_model, sklearn.ensemble, sklearn.svm, sklearn.neighbors, sklearn.naive_bayes

- Gradient boosting models: xgboost, lightgbm

- for building pipelines and transformations: sklearn.pipeline, sklearn.compose

- for statistical tests: scipy.stats

- Miscellaneous: warnings

In [None]:
# === Core packages ===
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# === Data preprocessing ===
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# === Data splitting and evaluation ===
from sklearn.model_selection import (
    train_test_split, StratifiedKFold, cross_validate, RandomizedSearchCV
)
from sklearn.metrics import (
    roc_auc_score, average_precision_score, f1_score,
    balanced_accuracy_score, matthews_corrcoef,
    classification_report, confusion_matrix,
    RocCurveDisplay, PrecisionRecallDisplay, make_scorer
)

# === Models ===
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    HistGradientBoostingClassifier,
    StackingClassifier
)

# === Optional (class imbalance & optimization) ===
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from scipy.stats import randint, uniform

# === Utilities ===
import warnings
warnings.filterwarnings('ignore')

# Plot settings
plt.style.use('fivethirtyeight')
%matplotlib inline
pd.set_option('display.max_columns', 32)

### Loading the dataset

In [None]:
# Load the raw dataset; the CSV uses ';' as its field separator.
df = pd.read_csv("bank_marketing/bank.csv", sep=";")

### First 5 rows of the dataset

In [None]:
# Preview the top of the table (DataFrame.head defaults to 5 rows).
df.head()

### Last 5 rows of the dataset

In [None]:
# Preview the bottom of the table (DataFrame.tail defaults to 5 rows).
df.tail()

### Checking how many columns (features + target) the dataset contains

In [None]:
# Number of columns in the raw frame (the target 'y' is still part of df here).
len(df.columns)

### Checking what columns the dataset has

In [None]:
# Column labels, in file order.
df.columns

### Checking the type of the columns

In [None]:
# Columns with object dtype are treated as categorical; every remaining
# column (kept in its original order) is treated as numerical.
cat_cols = list(df.select_dtypes(include=['object']).columns)
object_dtype_names = set(cat_cols)
num_cols = [name for name in df.columns if name not in object_dtype_names]

print('\nCategorical columns:', cat_cols)
print('Numerical columns:', num_cols)

### Checking the shape (rows, columns) in DataFrame

In [None]:
# (n_rows, n_columns)
df.shape

### Checking the size (number of cells) in DataFrame

In [None]:
# Total number of cells, i.e. rows * columns.
df.size

### An overview of the data types, missing values, percentage of missing values, and the number of unique values for each column in the DataFrame

In [None]:
# Overview of the non-object columns: dtype, missing-value count and
# percentage, and number of distinct values, sorted by missing count.
num_cols = df.select_dtypes(exclude='object').columns
numeric_frame = df[num_cols]
missing_per_column = numeric_frame.isnull().sum()

data_info_num = pd.DataFrame({
    'Data Type': numeric_frame.dtypes,
    'Missing Values': missing_per_column,
    'Percentage Missing': (missing_per_column / len(df)) * 100,
    'Unique Values': numeric_frame.nunique(),
}).sort_values(by='Missing Values', ascending=False)

data_info_num.style.format({'Percentage Missing': '{:.2f}%'})

In [None]:
# Overview of the object (categorical) columns. Missing data in these
# columns is encoded as the literal string 'unknown', so that is what is
# counted here instead of NaN.
cat_cols = df.select_dtypes(include='object').columns
unknown_counts = [df[name].eq('unknown').sum() for name in cat_cols]

cat_info = pd.DataFrame({
    'Data Type': df[cat_cols].dtypes,
    'Unknown Count': unknown_counts,
    'Unknown (%)': [(count / len(df)) * 100 for count in unknown_counts],
    'Unique Values': [df[name].nunique() for name in cat_cols],
}).sort_values(by='Unknown Count', ascending=False)

cat_info.style.format({'Unknown (%)': '{:.2f}%'})

### Check correlation between numerical variables

In [None]:
# Pairwise correlation of the int64/float64 columns, drawn as an annotated heatmap.
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
corr = df[num_cols].corr()

plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f")
heatmap_ax.set_title("Correlation between numerical variables")
plt.show()

### Check correlation between categorical variables

In [None]:
def cramers_v(x, y):
    """Compute the bias-corrected Cramér's V for two categorical variables.

    Parameters
    ----------
    x, y : array-like
        Paired categorical observations; they are cross-tabulated with
        ``pd.crosstab``.

    Returns
    -------
    float
        Strength of association (0 means no association). ``nan`` is returned
        when the statistic is undefined: fewer than two observations, or a
        variable whose bias-corrected number of levels is not greater than one
        (e.g. a constant column).
    """
    # Imported here because the notebook's import cell only brings
    # ``randint`` and ``uniform`` in from scipy.stats.
    from scipy.stats import chi2_contingency

    # Not called ``confusion_matrix`` so the function imported from
    # sklearn.metrics is not shadowed inside this scope.
    contingency = pd.crosstab(x, y)
    n = contingency.to_numpy().sum()
    if n <= 1:
        # The (n - 1) terms below would divide by zero.
        return float('nan')

    chi2 = chi2_contingency(contingency, correction=False)[0]
    phi2 = chi2 / n
    r, k = contingency.shape

    # Bias correction of phi^2 and of the row/column counts.
    phi2corr = max(0.0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    denominator = min(kcorr - 1, rcorr - 1)
    if denominator <= 0:
        return float('nan')
    return np.sqrt(phi2corr / denominator)

# Pairwise Cramér's V for every pair of object columns; the diagonal is set
# to 1.0 directly instead of being computed.
cat_cols = df.select_dtypes(include='object').columns

cramers_results = pd.DataFrame(
    {
        col_name: {
            row_name: 1.0 if row_name == col_name
            else cramers_v(df[row_name], df[col_name])
            for row_name in cat_cols
        }
        for col_name in cat_cols
    },
    index=cat_cols,
    columns=cat_cols,
    dtype=float,
)

plt.figure(figsize=(10, 8))
assoc_ax = sns.heatmap(cramers_results, annot=True, cmap='Blues', fmt=".2f", square=True)
assoc_ax.set_title("Cramér’s V Association between Categorical Variables", fontsize=14, pad=12)
plt.tight_layout()
plt.show()

--------------------
## Descriptive statistics
--------------------

In [None]:
# Summary statistics of the numeric columns, one row per column.
df.describe(include = 'number').T

In [None]:
# Summary of the object columns (count, unique, top, freq), one row per column.
df.describe(include = 'object').T 

### Class balance plot

In [None]:
# Share of each target class, in percent, ordered by class label.
fig = plt.figure()
class_share_pct = df['y'].value_counts(normalize=True).sort_index() * 100
class_share_pct.plot(kind='bar')
plt.title('Class distribution (percentage)')
plt.xlabel('y')
plt.ylabel('Percent')
plt.show()

-------------------
# Data cleaning, feature selection
-------------------

### Dropping duplicated rows

In [None]:
# Remove fully duplicated rows, then show the remaining number of cells.
df = df.drop_duplicates()
df.size

### Columns dropping based on the set description

The columns **`y`** (target variable) and **`duration`** will be removed from the modeling dataset to avoid data leakage,  
but both will be **saved separately** for later analysis and evaluation at the end of the project.

### Dropping columns with high correlation

In [None]:
# Remove the columns singled out as highly correlated, then list what is left.
df = df.drop(columns=[
    'pdays',
    'cons.price.idx',
    'cons.conf.idx',
    'nr.employed',
])
df.columns

### Handling 'unknown' values

For categorical variables, we treated missing or `'unknown'` values as follows:  
- For **`education`** and **`default`**, the rows were **kept** and `'unknown'` was replaced with the most frequent value (mode) of the column.  
- For **`job`**, **`marital`**, **`housing`**, and **`loan`**, we **dropped the rows** containing `'unknown'` values because their occurrence was very low (less than 3% of the dataset).  
This ensures data consistency while avoiding potential bias due to extremely rare missing categories.

In [None]:
# Rows where any of these columns is 'unknown' are discarded. A single mask
# plus an explicit copy is used so that the column assignments below write to
# an independent frame rather than to a filtered slice (which triggers
# pandas' SettingWithCopyWarning, hidden here by the global warnings filter).
row_drop_cols = ['housing', 'loan', 'job', 'marital']
has_unknown = df[row_drop_cols].eq('unknown').any(axis=1)
df = df.loc[~has_unknown].copy()

# In these two columns 'unknown' is replaced by the most frequent value,
# computed on the rows that remain after the filter above.
for col in ('education', 'default'):
    df[col] = df[col].replace('unknown', df[col].mode()[0])

# Re-build the 'unknown' overview to verify the clean-up.
cat_cols = df.select_dtypes(include='object').columns
cat_info = pd.DataFrame({
    'Data Type': df[cat_cols].dtypes,
    'Unknown Count': [(df[c] == 'unknown').sum() for c in cat_cols],
    'Unknown (%)': [(df[c].eq('unknown').sum() / len(df)) * 100 for c in cat_cols],
    'Unique Values': [df[c].nunique() for c in cat_cols]
})

cat_info = cat_info.sort_values(by='Unknown Count', ascending=False)
cat_info.style.format({'Unknown (%)': '{:.2f}%'})


### Columns dropping based on the set description

The columns **`y`** (target variable) and **`duration`** will be removed from the modeling dataset to avoid data leakage,  
but both will be **saved separately** for later analysis and evaluation at the end of the project.

In [None]:
# Detach the target and the call duration from the feature frame while
# keeping both series available for later use.
y = df.pop('y')
duration = df.pop('duration')

In [None]:
# Column groups, derived from dtype.
cat_cols = df.select_dtypes(include='object').columns.tolist()
num_cols = df.select_dtypes(exclude='object').columns.tolist()

# Category order for the columns handled as ordinal. A column is only used
# if it is actually present among the categorical columns.
# 'unknown' sits at the end of the education order.
category_orders = {
    'education': [
        'illiterate', 'basic.4y', 'basic.6y', 'basic.9y',
        'high.school', 'professional.course', 'university.degree', 'unknown'
    ],
    'month': [
        'jan', 'feb', 'mar', 'apr', 'may', 'jun',
        'jul', 'aug', 'sep', 'oct', 'nov', 'dec'
    ],
}
ordinal_features = [name for name in category_orders if name in cat_cols]
ordinal_mappings = [category_orders[name] for name in ordinal_features]

# Every other categorical column is one-hot encoded.
nominal_features = [name for name in cat_cols if name not in ordinal_features]

# One small pipeline per column group.
numeric_preprocess = Pipeline([('scaler', StandardScaler())])
ordinal_preprocess = Pipeline([('encoder', OrdinalEncoder(categories=ordinal_mappings))])
nominal_preprocess = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group to its pipeline.
preprocess = ColumnTransformer([
    ('num', numeric_preprocess, num_cols),
    ('ord', ordinal_preprocess, ordinal_features),
    ('nom', nominal_preprocess, nominal_features),
])

# Smoke test: fit once on the full feature frame to catch configuration errors.
preprocess.fit(df)

print("✅ Preprocessing pipeline ready.")
print(f"Numeric: {len(num_cols)} | Ordinal: {len(ordinal_features)} | Nominal: {len(nominal_features)}")


In [None]:
# NOTE: everything below is a string literal, so none of it executes.
# It holds an earlier, manual preprocessing attempt (month/day dummies,
# MinMax scaling, get_dummies on the categorical columns, null check).
# The inner comments are in Portuguese and are left untouched because they
# are part of the string.
""" # Normalizar nomes das colunas
#df.columns = df.columns.str.strip().str.lower()

#   converter meses para valores ordinais
month_mapping = {"jan":1, "feb":2, "mar":3, "apr":4, "may":5, "jun":6,
                 "jul":7, "aug":8, "sep":9, "oct":10, "nov":11, "dec":12}

df["month_ordinal"] = df["month"].map(month_mapping)

#one hot encoding para meses
df_month = pd.get_dummies(df["month"], prefix="month")
df = pd.concat([df, df_month], axis=1)
df.drop("month", axis=1, inplace=True)

#converter dias da semana para valores ordinais
df_day = pd.get_dummies(df["day_of_week"], prefix="day")
df = pd.concat([df, df_day], axis=1)
df.drop("day_of_week", axis=1, inplace=True)


#importar min e max scaler 
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df[['age', 'campaign', 'previous']] = scaler.fit_transform(df[['age', 'campaign', 'previous']])

#padronizar colunas categoricas
df = pd.get_dummies(df, columns=['job', 'marital', 'education', 'default', 
                                 'housing', 'loan', 'contact', 'poutcome'], drop_first=True)

#verificar se existe valores nulos  
print("\nValores nulos por coluna:")
print(df.isnull().sum())
 """



In [None]:
# NOTE: everything below is a string literal, so none of it executes.
# It sketches a stratified sample, a date-based train/val/test split, a
# SMOTE pipeline, GroupKFold CV and a precision@k helper.
# NOTE(review): it would not run as written against this notebook. It groups
# df by 'y' (already removed from df in an earlier cell), uses 'customer_id'
# and 'date' columns and a `model` object that are not created in the visible
# cells, and parses 'day_of_week' as a datetime. Adapt before re-enabling.
# The inner comments are in Portuguese and are left untouched because they
# are part of the string.
""" #Amostra exploratória estratificada
# df tem colunas: 'customer_id', 'date', 'y'
sample_frac = 0.05

def sample_group(x):
    n = max(1, int(round(len(x) * sample_frac)))  # pelo menos 1 por grupo
    return x.sample(n=n, random_state=42)

df_small = df.groupby('y', group_keys=False).apply(sample_group).reset_index(drop=True)


#Split temporal (treino/val/test por datas)
df['day_of_week'] = pd.to_datetime(df['day_of_week'])

# Define limites (ajusta as datas conforme os teus dados)
train_end = '2011-12-31'
val_end   = '2012-12-31'

train = df[df['day_of_week'] <= train_end].copy()
val   = df[(df['day_of_week'] > train_end) & (df['day_of_week'] <= val_end)].copy()
test  = df[df['day_of_week'] > val_end].copy()

print(len(train), len(val), len(test))

# oversampling
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

X_train = train.drop(columns=['y','customer_id','date'])
y_train = train['y']

# Pipeline exemplo
pipe = ImbPipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('clf', LogisticRegression(max_iter=1000))
])

pipe.fit(X_train, y_train)
# GroupKFold por customer_id (para CV que evita vazamento entre clientes)
from sklearn.model_selection import GroupKFold, cross_val_score

X = train.drop(columns=['y','date'])
y = train['y']
groups = train['customer_id']

gkf = GroupKFold(n_splits=5)
for fold, (tr_idx, val_idx) in enumerate(gkf.split(X, y, groups=groups)):
    X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]
    # aplica oversampling apenas em X_tr/y_tr se quiser


import numpy as np

def precision_at_k(y_true, y_score, k=0.05):
    # k como fracção do dataset (ex. 0.05 = top 5%)
    n = int(len(y_score) * k)
    idx = np.argsort(y_score)[::-1][:n]
    return y_true.iloc[idx].mean()

# após treinar o modelo e obter scores no test:
y_scores = model.predict_proba(X_test)[:,1]
print("Precision@5%:", precision_at_k(y_test.reset_index(drop=True), pd.Series(y_scores), k=0.05))
 """




In [None]:
# --- Modeling setup ---

# Features are whatever is left in df ('y' and 'duration' were split off
# earlier). The target is encoded as 1 = 'yes', 0 = 'no' unless it is already
# numeric; checking for "not numeric" also covers non-object string dtypes
# and makes re-running this cell a no-op.
X = df.copy()
if not pd.api.types.is_numeric_dtype(y):
    y = y.map({'yes': 1, 'no': 0})

# Guard against running this cell before the preprocessing cell.
assert 'preprocess' in globals(), "Preprocessing pipeline not found. Please run the Data Preparation section first."

# Stratified hold-out split (20% test).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Metrics collected during cross-validation.
# 'pr_auc' uses scikit-learn's built-in 'average_precision' scorer instead of
# make_scorer(..., needs_proba=True): `needs_proba` was deprecated in
# scikit-learn 1.4 and later removed, so the old form fails on current
# versions.
scoring = {
    'roc_auc': 'roc_auc',
    'pr_auc': 'average_precision',
    'f1': 'f1',
    'bal_acc': 'balanced_accuracy',
    'mcc': make_scorer(matthews_corrcoef)
}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Candidate models; each one is wrapped with the shared preprocessing step.
models = [
    ('LogReg', Pipeline([
        ('prep', preprocess),
        ('clf', LogisticRegression(max_iter=1000, class_weight='balanced',
                                   solver='saga', random_state=42))
    ])),
    ('DecisionTree', Pipeline([
        ('prep', preprocess),
        ('clf', DecisionTreeClassifier(class_weight='balanced', random_state=42))
    ])),
    ('RandomForest', Pipeline([
        ('prep', preprocess),
        ('clf', RandomForestClassifier(n_estimators=400, class_weight='balanced',
                                       n_jobs=-1, random_state=42))
    ])),
    ('GradientBoosting', Pipeline([
        ('prep', preprocess),
        ('clf', GradientBoostingClassifier(random_state=42))
    ])),
    ('HistGradientBoosting', Pipeline([
        ('prep', preprocess),
        ('clf', HistGradientBoostingClassifier(random_state=42))
    ])),
    ('kNN', Pipeline([
        ('prep', preprocess),
        ('clf', KNeighborsClassifier(n_neighbors=7))
    ]))
]


def cv_table(name, model):
    """Cross-validate `model` on the training split and tabulate the scores.

    Parameters
    ----------
    name : str
        Label stored in the 'model' column of the result.
    model : estimator
        Estimator (here a pipeline) passed to ``cross_validate``.

    Returns
    -------
    pandas.DataFrame
        One row per metric in ``scoring`` with the columns 'index' (metric
        name), 'mean', 'std' (over the CV folds) and 'model'.
    """
    scores = cross_validate(model, X_train, y_train, cv=cv,
                            scoring=scoring, n_jobs=-1)
    df_out = pd.DataFrame({
        metric: {
            'mean': scores[f'test_{metric}'].mean(),
            'std': scores[f'test_{metric}'].std(),
        }
        for metric in scoring
    }).T
    df_out['model'] = name
    return df_out.reset_index()


# Evaluate all models via cross-validation.
cv_results = []
for name, mdl in models:
    cv_results.append(cv_table(name, mdl))

cv_df = pd.concat(cv_results, ignore_index=True)

# One row per model, one column per metric (fold means), best PR-AUC first.
cv_summary = (
    cv_df.pivot(index='model', columns='index', values='mean')
          .loc[:, ['pr_auc', 'f1', 'roc_auc', 'bal_acc', 'mcc']]
          .sort_values(by=['pr_auc', 'f1', 'roc_auc'], ascending=False)
)

cv_summary


In [None]:
# --- Class rebalancing with SMOTE (oversampling the minority class) ---

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Classifiers to retrain on SMOTE-resampled data. No class_weight is set here.
# NOTE(review): SMOTE runs after `preprocess`, so it also interpolates the
# encoded categorical columns; SMOTENC may be a better fit. Confirm.
smote_classifiers = [
    ('LogReg_SMOTE',
     LogisticRegression(max_iter=1000, solver='saga', random_state=42)),
    ('RandomForest_SMOTE',
     RandomForestClassifier(n_estimators=400, n_jobs=-1, random_state=42)),
    ('HistGradientBoosting_SMOTE',
     HistGradientBoostingClassifier(random_state=42)),
]

# SMOTE is a pipeline step, placed between preprocessing and the classifier.
smote_models = [
    (label, ImbPipeline([
        ('prep', preprocess),
        ('smote', SMOTE(random_state=42)),
        ('clf', estimator)
    ]))
    for label, estimator in smote_classifiers
]

# Cross-validate with the same folds and metrics as the baseline models.
cv_results_smote = [cv_table(label, pipeline) for label, pipeline in smote_models]
cv_df_smote = pd.concat(cv_results_smote, ignore_index=True)

# Same layout as the baseline summary so the two tables can be compared.
metric_means = cv_df_smote.pivot(index='model', columns='index', values='mean')
cv_summary_smote = (
    metric_means.loc[:, ['pr_auc', 'f1', 'roc_auc', 'bal_acc', 'mcc']]
                .sort_values(by=['pr_auc', 'f1', 'roc_auc'], ascending=False)
)

cv_summary_smote
