In [69]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [70]:
# Read the semicolon-delimited bank CSV and preview its first rows.
df = pd.read_csv("../data/bank-full.csv", delimiter=";")
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [71]:
# Print per-column dtypes, non-null counts and the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [72]:
# Drop rows that contain any missing value, mutating `df` in place.
# NOTE(review): the `df.info()` summary reports every column fully non-null,
# so this is expected to remove nothing and acts only as a safeguard.
df.dropna(inplace=True)

In [73]:
from sklearn.preprocessing import PowerTransformer

In [74]:
# `pdays` uses -1 as a sentinel for "not previously contacted": split it into
# a 0/1 indicator plus a day count where the sentinel is replaced by 0.
has_prior_contact = df["pdays"] != -1
df["pdays_contacted"] = has_prior_contact.astype(int)
df["pdays_days"] = df["pdays"].where(has_prior_contact, 0)

# `previous` is a zero-inflated count: keep the raw count and add a 0/1
# indicator for "at least one earlier contact".
has_previous = df["previous"] > 0
df["previous_contacted"] = has_previous.astype(int)
df["previous_count"] = df["previous"]

# The source columns are now fully represented by the derived ones.
df = df.drop(columns=["pdays", "previous"])


In [75]:
# Keep only rows with a known job, then show the remaining job frequencies.
job_is_known = df['job'].ne('unknown')
df = df[job_is_known]
df['job'].value_counts()

job
blue-collar      9732
management       9458
technician       7597
admin.           5171
services         4154
retired          2264
self-employed    1579
entrepreneur     1487
unemployed       1303
housemaid        1240
student           938
Name: count, dtype: int64

In [76]:
# Print the category frequencies of every object-dtype column, one after the
# other, each followed by a separator line.
object_cols = [name for name in df.columns if df[name].dtype == 'object']
for col in object_cols:
    print(df[col].value_counts())
    print('================================\n')

job
blue-collar      9732
management       9458
technician       7597
admin.           5171
services         4154
retired          2264
self-employed    1579
entrepreneur     1487
unemployed       1303
housemaid        1240
student           938
Name: count, dtype: int64

marital
married     27011
single      12722
divorced     5190
Name: count, dtype: int64

education
secondary    23131
tertiary     13262
primary       6800
unknown       1730
Name: count, dtype: int64

default
no     44110
yes      813
Name: count, dtype: int64

housing
yes    25104
no     19819
Name: count, dtype: int64

loan
no     37683
yes     7240
Name: count, dtype: int64

contact
cellular     29154
unknown      12909
telephone     2860
Name: count, dtype: int64

month
may    13735
jul     6864
aug     6184
jun     5251
nov     3956
apr     2925
feb     2636
jan     1388
oct      727
sep      570
mar      474
dec      213
Name: count, dtype: int64

poutcome
unknown    36704
failure     4881
other       1838
succ

In [77]:
# Remove `poutcome` and `contact` entirely (presumably because of their large
# 'unknown' share -- confirm), then discard rows with an unknown education level.
df = df.drop(columns=['poutcome', 'contact'])
df = df[df['education'].ne('unknown')]

# Re-inspect the category frequencies of the remaining object-dtype columns.
for col in df.columns[df.dtypes == 'object']:
    print(df[col].value_counts())
    print('================================\n')

job
blue-collar      9278
management       9216
technician       7355
admin.           5000
services         4004
retired          2145
self-employed    1540
entrepreneur     1411
unemployed       1274
housemaid        1195
student           775
Name: count, dtype: int64

marital
married     25946
single      12219
divorced     5028
Name: count, dtype: int64

education
secondary    23131
tertiary     13262
primary       6800
Name: count, dtype: int64

default
no     42411
yes      782
Name: count, dtype: int64

housing
yes    24292
no     18901
Name: count, dtype: int64

loan
no     36086
yes     7107
Name: count, dtype: int64

month
may    13192
jul     6601
aug     6037
jun     4980
nov     3842
apr     2820
feb     2533
jan     1318
oct      690
sep      532
mar      448
dec      200
Name: count, dtype: int64

y
no     38172
yes     5021
Name: count, dtype: int64



In [78]:
# Feature matrix: every column except the target.
X = df.drop(columns="y")
# Target: "no" -> 0, "yes" -> 1.
y = df["y"].map(dict(no=0, yes=1))


In [79]:
# ---- numeric feature groups ----
# Strongly skewed numerics.
skewed_numeric_cols = ["balance", "duration", "campaign", "pdays_days", "previous_count"]
# Mild / roughly symmetric numerics ("day" is the day of the month).
other_numeric_cols = ["age", "day"]

# ---- categorical feature groups ----
# Two-valued columns: three yes/no fields followed by two engineered 0/1 flags.
binary_cols = ["default", "housing", "loan", "pdays_contacted", "previous_contacted"]
# Categories with a natural order.
ordinal_cols = ["education"]
# Unordered categories.
nominal_cols = ["job", "marital"]
# Categories that wrap around (december is next to january).
cyclic_cols = ["month"]


In [80]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
class MonthCyclicEncoder(BaseEstimator, TransformerMixin):
    """Encode a month abbreviation as a (sin, cos) pair on the unit circle.

    A month ``m`` (``"jan"`` = 1 ... ``"dec"`` = 12) is mapped to
    ``(sin(2*pi*m/12), cos(2*pi*m/12))`` so that december and january end up
    next to each other instead of eleven steps apart.

    The transformer is stateless (``fit`` learns nothing) and reads only the
    first column of whatever it is given.
    """

    def __init__(self):
        # Lowercase three-letter abbreviation -> month number.
        self.month_map = {
            "jan":1, "feb":2, "mar":3, "apr":4,
            "may":5, "jun":6, "jul":7, "aug":8,
            "sep":9, "oct":10, "nov":11, "dec":12
        }

    def fit(self, X, y=None):
        """No-op fit; returns ``self`` so the encoder can be used in pipelines."""
        return self

    def transform(self, X):
        """Return an ``(n_samples, 2)`` float array of ``[sin, cos]`` values.

        Parameters
        ----------
        X : pandas.DataFrame, pandas.Series or array-like
            Month labels. For 2-D input only the first column is used.

        Returns
        -------
        numpy.ndarray
            Column 0 is the sine component, column 1 the cosine component.
            Missing input values are passed through as NaN in both columns.

        Raises
        ------
        ValueError
            If a non-missing label is not a key of ``month_map``. Such labels
            would otherwise turn into NaN features without any warning.
        """
        months = self._first_column(X)
        codes = months.map(self.month_map)

        # A NaN code for a non-missing label means the label is not in the map.
        unrecognised = codes.isna() & months.notna()
        if unrecognised.any():
            bad_labels = sorted({str(label) for label in months[unrecognised]})
            raise ValueError(
                f"MonthCyclicEncoder: unrecognised month label(s) {bad_labels}; "
                f"expected one of {list(self.month_map)}"
            )

        m = codes.astype(float).to_numpy()
        angle = 2 * np.pi * m / 12
        return np.c_[np.sin(angle), np.cos(angle)]

    def get_feature_names_out(self, input_features=None):
        """Return the names of the two output columns."""
        return np.array(["month_sin", "month_cos"])

    @staticmethod
    def _first_column(X):
        """Return the first column of ``X`` as a pandas Series."""
        if isinstance(X, pd.Series):
            return X
        if isinstance(X, pd.DataFrame):
            return X.iloc[:, 0]
        # Plain arrays / lists: dtype=object keeps the string labels untouched.
        values = np.asarray(X, dtype=object)
        if values.ndim == 1:
            values = values.reshape(-1, 1)
        return pd.Series(values[:, 0])


In [81]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, PowerTransformer


In [82]:
# Column-wise preprocessing. Every entry is (name, transformer, columns);
# columns not claimed by any entry are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        # Skewed numerics: Yeo-Johnson power transform with standardisation.
        ("num_skewed", PowerTransformer(method="yeo-johnson", standardize=True), skewed_numeric_cols),

        # Remaining numerics: standard scaling.
        ("num", StandardScaler(), other_numeric_cols),

        # Binary columns: three yes/no fields, then the two engineered 0/1
        # flags. One category list per column, in the order of `binary_cols`.
        (
            "bin",
            OrdinalEncoder(
                categories=[
                    ["no", "yes"],
                    ["no", "yes"],
                    ["no", "yes"],
                    [0, 1],
                    [0, 1],
                ]
            ),
            binary_cols,
        ),

        # Education: ordered primary < secondary < tertiary.
        ("edu", OrdinalEncoder(categories=[["primary", "secondary", "tertiary"]]), ordinal_cols),

        # Nominal categoricals: dense one-hot encoding, unknown categories ignored.
        ("nom", OneHotEncoder(handle_unknown="ignore", sparse_output=False), nominal_cols),

        # month -> (sin, cos)
        ("month", MonthCyclicEncoder(), cyclic_cols),
    ],
    remainder="drop",
)


In [83]:
from sklearn.model_selection import train_test_split
import pandas as pd

# --- features / target ---
X = df.drop(columns="y")
y = df["y"].map({"no": 0, "yes": 1})

# --- stratified 80/20 split with a fixed seed ---
X_pool, X_test, y_pool, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# --- fit the preprocessor on the pool only, then apply it to the test split ---
X_pool_p = preprocessor.fit_transform(X_pool)
X_test_p = preprocessor.transform(X_test)

# --- rebuild labelled frames: transformed features plus the target as last column ---
feature_names = preprocessor.get_feature_names_out()
train_df = pd.DataFrame(X_pool_p, columns=feature_names).assign(y=y_pool.values)
test_df = pd.DataFrame(X_test_p, columns=feature_names).assign(y=y_test.values)

# --- write both splits to disk without the index column ---
for frame, out_path in (
    (train_df, "../data/bank_marketing_train.csv"),
    (test_df, "../data/bank_marketing_test.csv"),
):
    frame.to_csv(out_path, index=False)

print("Saved with feature names.")


Saved with feature names.
